diff options
Diffstat (limited to 'usr/src/uts/common/inet/ip/ip_if.c')
-rw-r--r-- | usr/src/uts/common/inet/ip/ip_if.c | 8887 |
1 file changed, 2430 insertions, 6457 deletions
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index 0597245499..9771c87721 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -46,6 +46,7 @@ #include <sys/bitmap.h> #include <sys/cpuvar.h> #include <sys/time.h> +#include <sys/ctype.h> #include <sys/kmem.h> #include <sys/systm.h> #include <sys/param.h> @@ -61,10 +62,10 @@ #include <netinet/ip6.h> #include <netinet/icmp6.h> #include <netinet/igmp_var.h> -#include <sys/strsun.h> #include <sys/policy.h> #include <sys/ethernet.h> #include <sys/callb.h> +#include <sys/md5.h> #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ #include <inet/mi.h> @@ -85,7 +86,6 @@ #include <inet/tun.h> #include <inet/sctp_ip.h> #include <inet/ip_netinfo.h> -#include <inet/mib2.h> #include <net/pfkeyv2.h> #include <inet/ipsec_info.h> @@ -93,7 +93,6 @@ #include <inet/ipsec_impl.h> #include <sys/iphada.h> - #include <netinet/igmp.h> #include <inet/ip_listutils.h> #include <inet/ipclassifier.h> @@ -158,7 +157,7 @@ static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, static void ipsq_delete(ipsq_t *); static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type, - boolean_t initialize); + boolean_t initialize, boolean_t insert); static void ipif_check_bcast_ires(ipif_t *test_ipif); static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep); static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, @@ -169,7 +168,6 @@ static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); static void ipif_free(ipif_t *ipif); static void ipif_free_tail(ipif_t *ipif); static void ipif_mtu_change(ire_t *ire, char *ipif_arg); -static void ipif_multicast_down(ipif_t *ipif); 
static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif); static void ipif_set_default(ipif_t *ipif); static int ipif_set_values(queue_t *q, mblk_t *mp, @@ -179,8 +177,7 @@ static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *); -static int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp); -static void ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp); +static void ipif_update_other_ipifs(ipif_t *old_ipif); static int ill_alloc_ppa(ill_if_t *, ill_t *); static int ill_arp_off(ill_t *ill); @@ -192,33 +189,18 @@ static void ill_down(ill_t *ill); static void ill_downi(ire_t *ire, char *ill_arg); static void ill_free_mib(ill_t *ill); static void ill_glist_delete(ill_t *); -static boolean_t ill_has_usable_ipif(ill_t *); -static int ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int); -static void ill_nominate_bcast_rcv(ill_group_t *illgrp); -static void ill_phyint_free(ill_t *ill); static void ill_phyint_reinit(ill_t *ill); static void ill_set_nce_router_flags(ill_t *, boolean_t); static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *); -static void ill_signal_ipsq_ills(ipsq_t *, boolean_t); -static boolean_t ill_split_ipsq(ipsq_t *cur_sq); -static void ill_stq_cache_delete(ire_t *, char *); - -static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *); -static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *); -static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - in6_addr_t *); -static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - ipaddr_t *); -static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *); -static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - in6_addr_t *); -static boolean_t 
ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - ipaddr_t *); - +static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid; +static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid; +static ip_v6mapinfo_func_t ip_ether_v6mapinfo, ip_ib_v6mapinfo; +static ip_v4mapinfo_func_t ip_ether_v4mapinfo, ip_ib_v4mapinfo; static void ipif_save_ire(ipif_t *, ire_t *); static void ipif_remove_ire(ipif_t *, ire_t *); static void ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *); static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *); +static void phyint_free(phyint_t *); /* * Per-ill IPsec capabilities management. @@ -250,18 +232,14 @@ static void ill_capability_ack_thr(void *); static void ill_capability_lso_enable(ill_t *); static void ill_capability_send(ill_t *, mblk_t *); -static void illgrp_cache_delete(ire_t *, char *); -static void illgrp_delete(ill_t *ill); -static void illgrp_reset_schednext(ill_t *ill); - static ill_t *ill_prev_usesrc(ill_t *); static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); static void ill_disband_usesrc_group(ill_t *); static void conn_cleanup_stale_ire(conn_t *, caddr_t); #ifdef DEBUG -static void ill_trace_cleanup(const ill_t *); -static void ipif_trace_cleanup(const ipif_t *); +static void ill_trace_cleanup(const ill_t *); +static void ipif_trace_cleanup(const ipif_t *); #endif /* @@ -491,6 +469,7 @@ static nv_t ipif_nv_tbl[] = { { PHYI_STANDBY, "STANDBY" }, { PHYI_INACTIVE, "INACTIVE" }, { PHYI_OFFLINE, "OFFLINE" }, + { PHYI_IPMP, "IPMP" } }; static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; @@ -508,7 +487,8 @@ static ip_m_t ip_m_tbl[] = { ip_ether_v6intfid }, { DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo, ip_ib_v6intfid }, - { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL}, + { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL }, + { SUNW_DL_IPMP, IFT_OTHER, NULL, NULL, ip_ipmp_v6intfid }, { DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid } }; @@ -529,14 +509,6 
@@ static ipif_t ipif_zero; */ uint_t ill_no_arena = 12; /* Setable in /etc/system */ -static uint_t -ipif_rand(ip_stack_t *ipst) -{ - ipst->ips_ipif_src_random = ipst->ips_ipif_src_random * 1103515245 + - 12345; - return ((ipst->ips_ipif_src_random >> 16) & 0x7fff); -} - /* * Allocate per-interface mibs. * Returns true if ok. False otherwise. @@ -623,7 +595,7 @@ ill_allocate_mibs(ill_t *ill) * (Always called as writer.) */ mblk_t * -ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr) +ill_arp_alloc(ill_t *ill, const uchar_t *template, caddr_t addr) { arc_t *arc = (arc_t *)template; char *cp; @@ -669,17 +641,69 @@ ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr) } mblk_t * -ipif_area_alloc(ipif_t *ipif) +ipif_area_alloc(ipif_t *ipif, uint_t optflags) { - return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template, - (char *)&ipif->ipif_lcl_addr)); + caddr_t addr; + mblk_t *mp; + area_t *area; + uchar_t *areap; + ill_t *ill = ipif->ipif_ill; + + if (ill->ill_isv6) { + ASSERT(ill->ill_flags & ILLF_XRESOLV); + addr = (caddr_t)&ipif->ipif_v6lcl_addr; + areap = (uchar_t *)&ip6_area_template; + } else { + addr = (caddr_t)&ipif->ipif_lcl_addr; + areap = (uchar_t *)&ip_area_template; + } + + if ((mp = ill_arp_alloc(ill, areap, addr)) == NULL) + return (NULL); + + /* + * IPMP requires that the hardware address be included in all + * AR_ENTRY_ADD requests so that ARP can deduce the arl to send on. + * If there are no active underlying ills in the group (and thus no + * hardware address, DAD will be deferred until an underlying ill + * becomes active. 
+ */ + if (IS_IPMP(ill)) { + if ((ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { + freemsg(mp); + return (NULL); + } + } else { + ill_refhold(ill); + } + + area = (area_t *)mp->b_rptr; + area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR; + area->area_flags |= optflags; + area->area_hw_addr_length = ill->ill_phys_addr_length; + bcopy(ill->ill_phys_addr, mp->b_rptr + area->area_hw_addr_offset, + area->area_hw_addr_length); + + ill_refrele(ill); + return (mp); } mblk_t * ipif_ared_alloc(ipif_t *ipif) { - return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template, - (char *)&ipif->ipif_lcl_addr)); + caddr_t addr; + uchar_t *aredp; + + if (ipif->ipif_ill->ill_isv6) { + ASSERT(ipif->ipif_ill->ill_flags & ILLF_XRESOLV); + addr = (caddr_t)&ipif->ipif_v6lcl_addr; + aredp = (uchar_t *)&ip6_ared_template; + } else { + addr = (caddr_t)&ipif->ipif_lcl_addr; + aredp = (uchar_t *)&ip_ared_template; + } + + return (ill_arp_alloc(ipif->ipif_ill, aredp, addr)); } mblk_t * @@ -689,6 +713,19 @@ ill_ared_alloc(ill_t *ill, ipaddr_t addr) (char *)&addr)); } +mblk_t * +ill_arie_alloc(ill_t *ill, const char *grifname, const void *template) +{ + mblk_t *mp = ill_arp_alloc(ill, template, 0); + arie_t *arie; + + if (mp != NULL) { + arie = (arie_t *)mp->b_rptr; + (void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ); + } + return (mp); +} + /* * Completely vaporize a lower level tap and all associated interfaces. * ill_delete is called only out of ip_close when the device control @@ -751,6 +788,12 @@ ill_delete(ill_t *ill) ip_purge_allmulti(ill); /* + * If the ill being deleted is under IPMP, boot it out of the illgrp. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_leave_illgrp(ill); + + /* * ill_down will arrange to blow off any IRE's dependent on this * ILL, and shut down fragmentation reassembly. */ @@ -890,8 +933,19 @@ ill_delete_tail(ill_t *ill) * ill references. */ ASSERT(ilm_walk_ill(ill) == 0); + /* - * Take us out of the list of ILLs. 
ill_glist_delete -> ill_phyint_free + * If this ill is an IPMP meta-interface, blow away the illgrp. This + * is safe to do because the illgrp has already been unlinked from the + * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it. + */ + if (IS_IPMP(ill)) { + ipmp_illgrp_destroy(ill->ill_grp); + ill->ill_grp = NULL; + } + + /* + * Take us out of the list of ILLs. ill_glist_delete -> phyint_free * could free the phyint. No more reference to the phyint after this * point. */ @@ -1139,7 +1193,7 @@ ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id) * Add the pending mp to the list. There can be only 1 pending mp * in the list. Any exclusive ioctl that needs to wait for a response * from another module or driver needs to use this function to set - * the ipsq_pending_mp to the ioctl mblk and wait for the response from + * the ipx_pending_mp to the ioctl mblk and wait for the response from * the other module/driver. This is also used while waiting for the * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. */ @@ -1147,19 +1201,19 @@ boolean_t ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, int waitfor) { - ipsq_t *ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; + ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop; ASSERT(IAM_WRITER_IPIF(ipif)); ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); - ASSERT(ipsq->ipsq_pending_mp == NULL); + ASSERT(ipx->ipx_pending_mp == NULL); /* * The caller may be using a different ipif than the one passed into * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT - * that `ipsq_current_ipif == ipif'. + * that `ipx_current_ipif == ipif'. 
*/ - ASSERT(ipsq->ipsq_current_ipif != NULL); + ASSERT(ipx->ipx_current_ipif != NULL); /* * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls, @@ -1180,8 +1234,8 @@ ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, if (connp->conn_state_flags & CONN_CLOSING) return (B_FALSE); } - mutex_enter(&ipsq->ipsq_lock); - ipsq->ipsq_pending_ipif = ipif; + mutex_enter(&ipx->ipx_lock); + ipx->ipx_pending_ipif = ipif; /* * Note down the queue in b_queue. This will be returned by * ipsq_pending_mp_get. Caller will then use these values to restart @@ -1189,38 +1243,40 @@ ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, */ add_mp->b_next = NULL; add_mp->b_queue = q; - ipsq->ipsq_pending_mp = add_mp; - ipsq->ipsq_waitfor = waitfor; + ipx->ipx_pending_mp = add_mp; + ipx->ipx_waitfor = waitfor; + mutex_exit(&ipx->ipx_lock); if (connp != NULL) connp->conn_oper_pending_ill = ipif->ipif_ill; - mutex_exit(&ipsq->ipsq_lock); + return (B_TRUE); } /* - * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp + * Retrieve the ipx_pending_mp and return it. There can be only 1 mp * queued in the list. 
*/ mblk_t * ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) { mblk_t *curr = NULL; + ipxop_t *ipx = ipsq->ipsq_xop; - mutex_enter(&ipsq->ipsq_lock); *connpp = NULL; - if (ipsq->ipsq_pending_mp == NULL) { - mutex_exit(&ipsq->ipsq_lock); + mutex_enter(&ipx->ipx_lock); + if (ipx->ipx_pending_mp == NULL) { + mutex_exit(&ipx->ipx_lock); return (NULL); } /* There can be only 1 such excl message */ - curr = ipsq->ipsq_pending_mp; - ASSERT(curr != NULL && curr->b_next == NULL); - ipsq->ipsq_pending_ipif = NULL; - ipsq->ipsq_pending_mp = NULL; - ipsq->ipsq_waitfor = 0; - mutex_exit(&ipsq->ipsq_lock); + curr = ipx->ipx_pending_mp; + ASSERT(curr->b_next == NULL); + ipx->ipx_pending_ipif = NULL; + ipx->ipx_pending_mp = NULL; + ipx->ipx_waitfor = 0; + mutex_exit(&ipx->ipx_lock); if (CONN_Q(curr->b_queue)) { /* @@ -1237,7 +1293,7 @@ ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) } /* - * Cleanup the ioctl mp queued in ipsq_pending_mp + * Cleanup the ioctl mp queued in ipx_pending_mp * - Called in the ill_delete path * - Called in the M_ERROR or M_HANGUP path on the ill. * - Called in the conn close path. @@ -1246,48 +1302,41 @@ boolean_t ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) { mblk_t *mp; - ipsq_t *ipsq; + ipxop_t *ipx; queue_t *q; ipif_t *ipif; ASSERT(IAM_WRITER_ILL(ill)); - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); + ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; + /* - * If connp is null, unconditionally clean up the ipsq_pending_mp. + * If connp is null, unconditionally clean up the ipx_pending_mp. * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl * even if it is meant for another ill, since we have to enqueue - * a new mp now in ipsq_pending_mp to complete the ipif_down. + * a new mp now in ipx_pending_mp to complete the ipif_down. * If connp is non-null we are called from the conn close path. 
*/ - mp = ipsq->ipsq_pending_mp; + mutex_enter(&ipx->ipx_lock); + mp = ipx->ipx_pending_mp; if (mp == NULL || (connp != NULL && mp->b_queue != CONNP_TO_WQ(connp))) { - mutex_exit(&ipsq->ipsq_lock); + mutex_exit(&ipx->ipx_lock); return (B_FALSE); } - /* Now remove from the ipsq_pending_mp */ - ipsq->ipsq_pending_mp = NULL; + /* Now remove from the ipx_pending_mp */ + ipx->ipx_pending_mp = NULL; q = mp->b_queue; mp->b_next = NULL; mp->b_prev = NULL; mp->b_queue = NULL; - /* If MOVE was in progress, clear the move_in_progress fields also. */ - ill = ipsq->ipsq_pending_ipif->ipif_ill; - if (ill->ill_move_in_progress) { - ILL_CLEAR_MOVE(ill); - } else if (ill->ill_up_ipifs) { - ill_group_cleanup(ill); - } - - ipif = ipsq->ipsq_pending_ipif; - ipsq->ipsq_pending_ipif = NULL; - ipsq->ipsq_waitfor = 0; - ipsq->ipsq_current_ipif = NULL; - ipsq->ipsq_current_ioctl = 0; - ipsq->ipsq_current_done = B_TRUE; - mutex_exit(&ipsq->ipsq_lock); + ipif = ipx->ipx_pending_ipif; + ipx->ipx_pending_ipif = NULL; + ipx->ipx_waitfor = 0; + ipx->ipx_current_ipif = NULL; + ipx->ipx_current_ioctl = 0; + ipx->ipx_current_done = B_TRUE; + mutex_exit(&ipx->ipx_lock); if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { if (connp == NULL) { @@ -1437,7 +1486,7 @@ conn_ioctl_cleanup(conn_t *connp) * Is any exclusive ioctl pending ? If so clean it up. If the * ioctl has not yet started, the mp is pending in the list headed by * ipsq_xopq_head. If the ioctl has started the mp could be present in - * ipsq_pending_mp. If the ioctl timed out in the streamhead but + * ipx_pending_mp. If the ioctl timed out in the streamhead but * is currently executing now the mp is not queued anywhere but * conn_oper_pending_ill is null. The conn close will wait * till the conn_ref drops to zero. @@ -1468,9 +1517,9 @@ conn_ioctl_cleanup(conn_t *connp) ill_waiter_dcr(ill); /* * Check whether this ioctl has started and is - * pending now in ipsq_pending_mp. 
If it is not - * found there then check whether this ioctl has - * not even started and is in the ipsq_xopq list. + * pending. If it is not found there then check + * whether this ioctl has not even started and is in + * the ipsq_xopq list. */ if (!ipsq_pending_mp_cleanup(ill, connp)) ipsq_xopq_mp_cleanup(ill, connp); @@ -1506,16 +1555,11 @@ conn_cleanup_ill(conn_t *connp, caddr_t arg) if (connp->conn_multicast_ill == ill) { /* Revert to late binding */ connp->conn_multicast_ill = NULL; - connp->conn_orig_multicast_ifindex = 0; } if (connp->conn_incoming_ill == ill) connp->conn_incoming_ill = NULL; if (connp->conn_outgoing_ill == ill) connp->conn_outgoing_ill = NULL; - if (connp->conn_outgoing_pill == ill) - connp->conn_outgoing_pill = NULL; - if (connp->conn_nofailover_ill == ill) - connp->conn_nofailover_ill = NULL; if (connp->conn_dhcpinit_ill == ill) { connp->conn_dhcpinit_ill = NULL; ASSERT(ill->ill_dhcpinit != 0); @@ -1524,11 +1568,11 @@ conn_cleanup_ill(conn_t *connp, caddr_t arg) if (connp->conn_ire_cache != NULL) { ire = connp->conn_ire_cache; /* - * ip_newroute creates IRE_CACHE with ire_stq coming from - * interface X and ipif coming from interface Y, if interface - * X and Y are part of the same IPMPgroup. Thus whenever - * interface X goes down, remove all references to it by - * checking both on ire_ipif and ire_stq. + * Source address selection makes it possible for IRE_CACHE + * entries to be created with ire_stq coming from interface X + * and ipif coming from interface Y. Thus whenever interface + * X goes down, remove all references to it by checking both + * on ire_ipif and ire_stq. */ if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || (ire->ire_type == IRE_CACHE && @@ -1601,14 +1645,10 @@ ill_down(ill_t *ill) ip_stack_t *ipst = ill->ill_ipst; /* Blow off any IREs dependent on this ILL. 
*/ - ire_walk(ill_downi, (char *)ill, ipst); + ire_walk(ill_downi, ill, ipst); /* Remove any conn_*_ill depending on this ill */ ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst); - - if (ill->ill_group != NULL) { - illgrp_delete(ill); - } } /* @@ -1621,9 +1661,9 @@ ill_downi(ire_t *ire, char *ill_arg) ill_t *ill = (ill_t *)ill_arg; /* - * ip_newroute creates IRE_CACHE with ire_stq coming from - * interface X and ipif coming from interface Y, if interface - * X and Y are part of the same IPMP group. Thus whenever interface + * Source address selection makes it possible for IRE_CACHE + * entries to be created with ire_stq coming from interface X + * and ipif coming from interface Y. Thus whenever interface * X goes down, remove all references to it by checking both * on ire_ipif and ire_stq. */ @@ -3696,16 +3736,39 @@ nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, } /* - * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an - * IPMP group, make sure all ill's in the group adopt the new policy. Send - * up RTS_IFINFO routing socket messages for each interface whose flags we - * change. + * Helper function for ill_forward_set(). + */ +static void +ill_forward_set_on_ill(ill_t *ill, boolean_t enable) +{ + ip_stack_t *ipst = ill->ill_ipst; + + ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); + + ip1dbg(("ill_forward_set: %s %s forwarding on %s", + (enable ? "Enabling" : "Disabling"), + (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); + mutex_enter(&ill->ill_lock); + if (enable) + ill->ill_flags |= ILLF_ROUTER; + else + ill->ill_flags &= ~ILLF_ROUTER; + mutex_exit(&ill->ill_lock); + if (ill->ill_isv6) + ill_set_nce_router_flags(ill, enable); + /* Notify routing socket listeners of this change. */ + ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); +} + +/* + * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing + * socket messages for each interface whose flags we change. 
*/ int ill_forward_set(ill_t *ill, boolean_t enable) { - ill_group_t *illgrp; - ip_stack_t *ipst = ill->ill_ipst; + ipmp_illgrp_t *illg; + ip_stack_t *ipst = ill->ill_ipst; ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); @@ -3716,47 +3779,23 @@ ill_forward_set(ill_t *ill, boolean_t enable) if (IS_LOOPBACK(ill)) return (EINVAL); - /* - * If the ill is in an IPMP group, set the forwarding policy on all - * members of the group to the same value. - */ - illgrp = ill->ill_group; - if (illgrp != NULL) { - ill_t *tmp_ill; + if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { + /* + * Update all of the interfaces in the group. + */ + illg = ill->ill_grp; + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) + ill_forward_set_on_ill(ill, enable); - for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL; - tmp_ill = tmp_ill->ill_group_next) { - ip1dbg(("ill_forward_set: %s %s forwarding on %s", - (enable ? "Enabling" : "Disabling"), - (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"), - tmp_ill->ill_name)); - mutex_enter(&tmp_ill->ill_lock); - if (enable) - tmp_ill->ill_flags |= ILLF_ROUTER; - else - tmp_ill->ill_flags &= ~ILLF_ROUTER; - mutex_exit(&tmp_ill->ill_lock); - if (tmp_ill->ill_isv6) - ill_set_nce_router_flags(tmp_ill, enable); - /* Notify routing socket listeners of this change. */ - ip_rts_ifmsg(tmp_ill->ill_ipif); - } - } else { - ip1dbg(("ill_forward_set: %s %s forwarding on %s", - (enable ? "Enabling" : "Disabling"), - (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); - mutex_enter(&ill->ill_lock); - if (enable) - ill->ill_flags |= ILLF_ROUTER; - else - ill->ill_flags &= ~ILLF_ROUTER; - mutex_exit(&ill->ill_lock); - if (ill->ill_isv6) - ill_set_nce_router_flags(ill, enable); - /* Notify routing socket listeners of this change. */ - ip_rts_ifmsg(ill->ill_ipif); + /* + * Update the IPMP meta-interface. 
+ */ + ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); + return (0); } + ill_forward_set_on_ill(ill, enable); return (0); } @@ -3772,7 +3811,12 @@ ill_set_nce_router_flags(ill_t *ill, boolean_t enable) nce_t *nce; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE); + /* + * NOTE: we're called separately for each ill in an illgrp, + * so don't match across the illgrp. + */ + nce = ndp_lookup_v6(ill, B_FALSE, &ipif->ipif_v6lcl_addr, + B_FALSE); if (nce != NULL) { mutex_enter(&nce->nce_lock); if (enable) @@ -3928,36 +3972,45 @@ ill_next(ill_walk_context_t *ctx, ill_t *lastill) } /* - * Check interface name for correct format which is name+ppa. - * name can contain characters and digits, the right most digits - * make up the ppa number. use of octal is not allowed, name must contain - * a ppa, return pointer to the start of ppa. - * In case of error return NULL. + * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+ + * The final number (PPA) must not have any leading zeros. Upon success, a + * pointer to the start of the PPA is returned; otherwise NULL is returned. */ static char * ill_get_ppa_ptr(char *name) { - int namelen = mi_strlen(name); + int namelen = strlen(name); + int end_ndx = namelen - 1; + int ppa_ndx, i; - int len = namelen; + /* + * Check that the first character is [a-zA-Z], and that the last + * character is [0-9]. + */ + if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx])) + return (NULL); - name += len; - while (len > 0) { - name--; - if (*name < '0' || *name > '9') + /* + * Set `ppa_ndx' to the PPA start, and check for leading zeroes. 
+ */ + for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--) + if (!isdigit(name[ppa_ndx - 1])) break; - len--; - } - /* empty string, all digits, or no trailing digits */ - if (len == 0 || len == (int)namelen) + if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx) return (NULL); - name++; - /* check for attempted use of octal */ - if (*name == '0' && len != (int)namelen - 1) - return (NULL); - return (name); + /* + * Check that the intermediate characters are [a-z0-9.] + */ + for (i = 1; i < ppa_ndx; i++) { + if (!isalpha(name[i]) && !isdigit(name[i]) && + name[i] != '.' && name[i] != '_') { + return (NULL); + } + } + + return (name + ppa_ndx); } /* @@ -4037,8 +4090,10 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, } else if (ILL_CAN_WAIT(ill, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -4102,6 +4157,7 @@ static void ill_glist_delete(ill_t *ill) { ip_stack_t *ipst; + phyint_t *phyi; if (ill == NULL) return; @@ -4139,8 +4195,41 @@ ill_glist_delete(ill_t *ill) ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name, ill->ill_name_length); - ill_phyint_free(ill); + ASSERT(ill->ill_phyint != NULL); + phyi = ill->ill_phyint; + ill->ill_phyint = NULL; + + /* + * ill_init allocates a phyint always to store the copy + * of flags relevant to phyint. At that point in time, we could + * not assign the name and hence phyint_illv4/v6 could not be + * initialized. Later in ipif_set_values, we assign the name to + * the ill, at which point in time we assign phyint_illv4/v6. + * Thus we don't rely on phyint_illv6 to be initialized always. 
+ */ + if (ill->ill_flags & ILLF_IPV6) + phyi->phyint_illv6 = NULL; + else + phyi->phyint_illv4 = NULL; + + if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + return; + } + + /* + * There are no ills left on this phyint; pull it out of the phyint + * avl trees, and free it. + */ + if (phyi->phyint_ifindex > 0) { + avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + phyi); + avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, + phyi); + } rw_exit(&ipst->ips_ill_g_lock); + + phyint_free(phyi); } /* @@ -4367,30 +4456,32 @@ ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) return (0); } -/* Initialize the per phyint (per IPMP group) ipsq used for serialization */ +/* Initialize the per phyint ipsq used for serialization */ static boolean_t -ipsq_init(ill_t *ill) +ipsq_init(ill_t *ill, boolean_t enter) { ipsq_t *ipsq; + ipxop_t *ipx; - /* Init the ipsq and impicitly enter as writer */ - ill->ill_phyint->phyint_ipsq = - kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); - if (ill->ill_phyint->phyint_ipsq == NULL) + if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL) return (B_FALSE); - ipsq = ill->ill_phyint->phyint_ipsq; - ipsq->ipsq_phyint_list = ill->ill_phyint; - ill->ill_phyint->phyint_ipsq_next = NULL; + + ill->ill_phyint->phyint_ipsq = ipsq; + ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop; + ipx->ipx_ipsq = ipsq; + ipsq->ipsq_next = ipsq; + ipsq->ipsq_phyint = ill->ill_phyint; mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); - ipsq->ipsq_refs = 1; - ipsq->ipsq_writer = curthread; - ipsq->ipsq_reentry_cnt = 1; + mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0); ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ + if (enter) { + ipx->ipx_writer = curthread; + ipx->ipx_forced = B_FALSE; + ipx->ipx_reentry_cnt = 1; #ifdef DEBUG - ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack, - IPSQ_STACK_DEPTH); + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); #endif - 
(void) strcpy(ipsq->ipsq_name, ill->ill_name); + } return (B_TRUE); } @@ -4468,7 +4559,7 @@ ill_init(queue_t *q, ill_t *ill) ill->ill_ppa = UINT_MAX; ill->ill_fastpath_list = &ill->ill_fastpath_list; - if (!ipsq_init(ill)) { + if (!ipsq_init(ill, B_TRUE)) { freemsg(info_mp); mi_free(frag_ptr); mi_free(ill->ill_phyint); @@ -4589,29 +4680,16 @@ loopback_kstat_update(kstat_t *ksp, int rw) } /* - * Has ifindex been plumbed already. - * Compares both phyint_ifindex and phyint_group_ifindex. + * Has ifindex been plumbed already? */ static boolean_t phyint_exists(uint_t index, ip_stack_t *ipst) { - phyint_t *phyi; - ASSERT(index != 0); ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - /* - * Indexes are stored in the phyint - a common structure - * to both IPv4 and IPv6. - */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - if (phyi->phyint_ifindex == index || - phyi->phyint_group_ifindex == index) - return (B_TRUE); - } - return (B_FALSE); + + return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + &index, NULL) != NULL); } /* Pick a unique ifindex */ @@ -4675,9 +4753,9 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, { ill_t *ill; ipif_t *ipif; + ipsq_t *ipsq; kstat_named_t *kn; boolean_t isloopback; - ipsq_t *old_ipsq; in6_addr_t ov6addr; isloopback = mi_strcmp(name, ipif_loopback_name) == 0; @@ -4761,16 +4839,10 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, ill->ill_net_type = IRE_LOOPBACK; /* Initialize the ipsq */ - if (!ipsq_init(ill)) + if (!ipsq_init(ill, B_FALSE)) goto done; - ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL; - ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--; - ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0); -#ifdef DEBUG - ill->ill_phyint->phyint_ipsq->ipsq_depth = 0; -#endif - ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE); + ipif 
= ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE); if (ipif == NULL) goto done; @@ -4807,7 +4879,7 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, ill->ill_frag_free_num_pkts = 0; ill->ill_last_frag_clean_time = 0; - old_ipsq = ill->ill_phyint->phyint_ipsq; + ipsq = ill->ill_phyint->phyint_ipsq; if (ill_glist_insert(ill, "lo", isv6) != 0) cmn_err(CE_PANIC, "cannot insert loopback interface"); @@ -4824,13 +4896,11 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, sctp_update_ipif_addr(ipif, ov6addr); /* - * If the ipsq was changed in ill_phyint_reinit free the old ipsq. + * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs. + * If so, free our original one. */ - if (old_ipsq != ill->ill_phyint->phyint_ipsq) { - /* Loopback ills aren't in any IPMP group */ - ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP)); - ipsq_delete(old_ipsq); - } + if (ipsq != ill->ill_phyint->phyint_ipsq) + ipsq_delete(ipsq); /* * Delay this till the ipif is allocated as ipif_allocate @@ -4871,12 +4941,10 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, done: if (ill != NULL) { if (ill->ill_phyint != NULL) { - ipsq_t *ipsq; - ipsq = ill->ill_phyint->phyint_ipsq; if (ipsq != NULL) { - ipsq->ipsq_ipst = NULL; - kmem_free(ipsq, sizeof (ipsq_t)); + ipsq->ipsq_phyint = NULL; + ipsq_delete(ipsq); } mi_free(ill->ill_phyint); } @@ -4954,9 +5022,11 @@ ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, } else if (ILL_CAN_WAIT(ill, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); rw_exit(&ipst->ips_ill_g_lock); mutex_exit(&ill->ill_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (err != NULL) @@ -5294,6 +5364,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) dl_info_ack_t *dlia; ip_m_t *ipm; dl_qos_cl_sel1_t *sel1; + int min_mtu; 
ASSERT(IAM_WRITER_ILL(ill)); @@ -5336,7 +5407,14 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) ill->ill_bcast_addr_length = brdcst_addr_length; ill->ill_phys_addr_length = phys_addr_length; ill->ill_sap_length = sap_length; - ill->ill_max_frag = dlia->dl_max_sdu; + + /* + * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU, + * but we must ensure a minimum IP MTU is used since other bits of + * IP will fly apart otherwise. + */ + min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; + ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); ill->ill_max_mtu = ill->ill_max_frag; ill->ill_type = ipm->ip_m_type; @@ -5358,7 +5436,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) * the wakeup. */ (void) ipif_allocate(ill, 0, IRE_LOCAL, - dlia->dl_provider_style == DL_STYLE2 ? B_FALSE : B_TRUE); + dlia->dl_provider_style != DL_STYLE2, B_TRUE); mutex_enter(&ill->ill_lock); ASSERT(ill->ill_dlpi_style_set == 0); ill->ill_dlpi_style_set = 1; @@ -5397,8 +5475,13 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) /* * Free ill_resolver_mp and ill_bcast_mp as things could have * changed now. + * + * NOTE: The IPMP meta-interface is special-cased because it starts + * with no underlying interfaces (and thus an unknown broadcast + * address length), but we enforce that an interface is broadcast- + * capable as part of allowing it to join a group. 
*/ - if (ill->ill_bcast_addr_length == 0) { + if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) { if (ill->ill_resolver_mp != NULL) freemsg(ill->ill_resolver_mp); if (ill->ill_bcast_mp != NULL) @@ -5451,6 +5534,11 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) if (!ill->ill_isv6) ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; } + + /* For IPMP, PHYI_IPMP should already be set by ipif_allocate() */ + if (ill->ill_mactype == SUNW_DL_IPMP) + ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP); + /* By default an interface does not support any CoS marking */ ill->ill_flags &= ~ILLF_COS_ENABLED; @@ -5552,16 +5640,18 @@ ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) } /* - * Find any non-virtual, not condemned, and up multicast capable interface - * given an IP instance and zoneid. Order of preference is: + * Find a mulitcast-capable ipif given an IP instance and zoneid. + * The ipif must be up, and its ill must multicast-capable, not + * condemned, not an underlying interface in an IPMP group, and + * not a VNI interface. Order of preference: * - * 1. normal - * 1.1 normal, but deprecated - * 2. point to point - * 2.1 point to point, but deprecated - * 3. link local - * 3.1 link local, but deprecated - * 4. loopback. + * 1a. normal + * 1b. normal, but deprecated + * 2a. point to point + * 2b. point to point, but deprecated + * 3a. link local + * 3b. link local, but deprecated + * 4. loopback. 
*/ ipif_t * ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) @@ -5580,7 +5670,7 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) for (; ill != NULL; ill = ill_next(&ctx, ill)) { mutex_enter(&ill->ill_lock); - if (IS_VNI(ill) || !ILL_CAN_LOOKUP(ill) || + if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || !ILL_CAN_LOOKUP(ill) || !(ill->ill_flags & ILLF_MULTICAST)) { mutex_exit(&ill->ill_lock); continue; @@ -5736,10 +5826,12 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -5761,15 +5853,12 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, } /* - * Look for an ipif with the specified address. For point-point links - * we look for matches on either the destination address and the local - * address, but we ignore the check on the local address if IPIF_UNNUMBERED - * is set. - * Matches on a specific ill if match_ill is set. + * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). 
*/ -ipif_t * -ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, - mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +static ipif_t * +ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, boolean_t match_illgrp, + zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, + ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; @@ -5788,7 +5877,8 @@ ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, repeat: ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (match_ill != NULL && ill != match_ill) { + if (match_ill != NULL && ill != match_ill && + (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { continue; } GRAB_CONN_LOCK(q); @@ -5817,10 +5907,12 @@ repeat: } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -5894,11 +5986,40 @@ ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) } /* + * Lookup an ipif with the specified address. For point-to-point links we + * look for matches on either the destination address or the local address, + * but we skip the local address check if IPIF_UNNUMBERED is set. If the + * `match_ill' argument is non-NULL, the lookup is restricted to that ill + * (or illgrp if `match_ill' is in an IPMP group). + */ +ipif_t * +ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, + mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +{ + return (ipif_lookup_addr_common(addr, match_ill, B_TRUE, zoneid, q, mp, + func, error, ipst)); +} + +/* + * Special abbreviated version of ipif_lookup_addr() that doesn't match + * `match_ill' across the IPMP group. 
This function is only needed in some + * corner-cases; almost everything should use ipif_lookup_addr(). + */ +static ipif_t * +ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) +{ + ASSERT(match_ill != NULL); + return (ipif_lookup_addr_common(addr, match_ill, B_FALSE, ALL_ZONES, + NULL, NULL, NULL, NULL, ipst)); +} + +/* * Look for an ipif with the specified address. For point-point links * we look for matches on either the destination address and the local * address, but we ignore the check on the local address if IPIF_UNNUMBERED * is set. - * Matches on a specific ill if match_ill is set. + * If the `match_ill' argument is non-NULL, the lookup is restricted to that + * ill (or illgrp if `match_ill' is in an IPMP group). * Return the zoneid for the ipif which matches. ALL_ZONES if no match. */ zoneid_t @@ -5918,7 +6039,8 @@ ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) repeat: ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (match_ill != NULL && ill != match_ill) { + if (match_ill != NULL && ill != match_ill && + !IS_IN_SAME_ILLGRP(ill, match_ill)) { continue; } mutex_enter(&ill->ill_lock); @@ -6008,7 +6130,7 @@ ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) /* * The callers of this function wants to know the * interface on which they have to send the replies - * back. For IRE_CACHES that have ire_stq and ire_ipif + * back. For IREs that have ire_stq and ire_ipif * derived from different ills, we really don't care * what we return here. */ @@ -6109,30 +6231,6 @@ ipif_is_freeable(ipif_t *ipif) } /* - * This func does not prevent refcnt from increasing. But if - * the caller has taken steps to that effect, then this func - * can be used to determine whether the ipifs marked with IPIF_MOVING - * have become quiescent and can be moved in a failover/failback. 
- */ -static ipif_t * -ill_quiescent_to_move(ill_t *ill) -{ - ipif_t *ipif; - - ASSERT(MUTEX_HELD(&ill->ill_lock)); - - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ipif->ipif_state_flags & IPIF_MOVING) { - if (ipif->ipif_refcnt != 0 || - !IPIF_DOWN_OK(ipif)) { - return (ipif); - } - } - } - return (NULL); -} - -/* * The ipif/ill/ire has been refreled. Do the tail processing. * Determine if the ipif or ill in question has become quiescent and if so * wakeup close and/or restart any queued pending ioctl that is waiting @@ -6144,87 +6242,61 @@ ipif_ill_refrele_tail(ill_t *ill) mblk_t *mp; conn_t *connp; ipsq_t *ipsq; + ipxop_t *ipx; ipif_t *ipif; dl_notify_ind_t *dlindp; ASSERT(MUTEX_HELD(&ill->ill_lock)); - if ((ill->ill_state_flags & ILL_CONDEMNED) && - ill_is_freeable(ill)) { - /* ill_close may be waiting */ + if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) { + /* ip_modclose() may be waiting */ cv_broadcast(&ill->ill_cv); } - /* ipsq can't change because ill_lock is held */ ipsq = ill->ill_phyint->phyint_ipsq; - if (ipsq->ipsq_waitfor == 0) { - /* Not waiting for anything, just return. */ - mutex_exit(&ill->ill_lock); - return; - } - ASSERT(ipsq->ipsq_pending_mp != NULL && - ipsq->ipsq_pending_ipif != NULL); - /* - * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF. - * Last ipif going down needs to down the ill, so ill_ire_cnt must - * be zero for restarting an ioctl that ends up downing the ill. - */ - ipif = ipsq->ipsq_pending_ipif; - if (ipif->ipif_ill != ill) { - /* The ioctl is pending on some other ill. 
*/ - mutex_exit(&ill->ill_lock); - return; - } + mutex_enter(&ipsq->ipsq_lock); + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */ + goto unlock; + + ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL); + + ipif = ipx->ipx_pending_ipif; + if (ipif->ipif_ill != ill) /* wait is for another ill; bail */ + goto unlock; - switch (ipsq->ipsq_waitfor) { + switch (ipx->ipx_waitfor) { case IPIF_DOWN: - if (!ipif_is_quiescent(ipif)) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ipif_is_quiescent(ipif)) + goto unlock; break; case IPIF_FREE: - if (!ipif_is_freeable(ipif)) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ipif_is_freeable(ipif)) + goto unlock; break; - case ILL_DOWN: - if (!ill_is_quiescent(ill)) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ill_is_quiescent(ill)) + goto unlock; break; case ILL_FREE: /* - * case ILL_FREE arises only for loopback. otherwise ill_delete - * waits synchronously in ip_close, and no message is queued in - * ipsq_pending_mp at all in this case + * ILL_FREE is only for loopback; normal ill teardown waits + * synchronously in ip_modclose() without using ipx_waitfor, + * handled by the cv_broadcast() at the top of this function. 
*/ - if (!ill_is_freeable(ill)) { - mutex_exit(&ill->ill_lock); - return; - } - break; - - case ILL_MOVE_OK: - if (ill_quiescent_to_move(ill) != NULL) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ill_is_freeable(ill)) + goto unlock; break; default: - cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n", - (void *)ipsq, ipsq->ipsq_waitfor); + cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n", + (void *)ipsq, ipx->ipx_waitfor); } - /* - * Incr refcnt for the qwriter_ip call below which - * does a refrele - */ - ill_refhold_locked(ill); + ill_refhold_locked(ill); /* for qwriter_ip() call below */ + mutex_exit(&ipx->ipx_lock); mp = ipsq_pending_mp_get(ipsq, &connp); + mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); ASSERT(mp != NULL); @@ -6249,6 +6321,7 @@ ipif_ill_refrele_tail(ill_t *ill) return; default: ASSERT(0); + ill_refrele(ill); } break; @@ -6268,6 +6341,11 @@ ipif_ill_refrele_tail(ill_t *ill) cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " "db_type %d\n", (void *)mp, mp->b_datap->db_type); } + return; +unlock: + mutex_exit(&ipsq->ipsq_lock); + mutex_exit(&ipx->ipx_lock); + mutex_exit(&ill->ill_lock); } #ifdef DEBUG @@ -6902,10 +6980,23 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, ipif = ipif_arg; if (ipif_arg != NULL) match_flags |= MATCH_IRE_ILL; +again: gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); - if (gw_ire == NULL) + if (gw_ire == NULL) { + /* + * With IPMP, we allow host routes to influence in.mpathd's + * target selection. However, if the test addresses are on + * their own network, the above lookup will fail since the + * underlying IRE_INTERFACEs are marked hidden. So allow + * hidden test IREs to be found and try again. 
+ */ + if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) { + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + goto again; + } return (ENETUNREACH); + } /* * We create one of three types of IREs as a result of this request @@ -7355,9 +7446,11 @@ void ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, ill_t *pending_ill) { - conn_t *connp = NULL; + conn_t *connp; + ipxop_t *ipx = ipsq->ipsq_xop; ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); + ASSERT(MUTEX_HELD(&ipx->ipx_lock)); ASSERT(func != NULL); mp->b_queue = q; @@ -7366,14 +7459,14 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, switch (type) { case CUR_OP: - if (ipsq->ipsq_mptail != NULL) { - ASSERT(ipsq->ipsq_mphead != NULL); - ipsq->ipsq_mptail->b_next = mp; + if (ipx->ipx_mptail != NULL) { + ASSERT(ipx->ipx_mphead != NULL); + ipx->ipx_mptail->b_next = mp; } else { - ASSERT(ipsq->ipsq_mphead == NULL); - ipsq->ipsq_mphead = mp; + ASSERT(ipx->ipx_mphead == NULL); + ipx->ipx_mphead = mp; } - ipsq->ipsq_mptail = mp; + ipx->ipx_mptail = mp; break; case NEW_OP: @@ -7385,6 +7478,15 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, ipsq->ipsq_xopq_mphead = mp; } ipsq->ipsq_xopq_mptail = mp; + ipx->ipx_ipsq_queued = B_TRUE; + break; + + case SWITCH_OP: + ASSERT(ipsq->ipsq_swxop != NULL); + /* only one switch operation is currently allowed */ + ASSERT(ipsq->ipsq_switch_mp == NULL); + ipsq->ipsq_switch_mp = mp; + ipx->ipx_ipsq_queued = B_TRUE; break; default: cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); @@ -7392,55 +7494,273 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, if (CONN_Q(q) && pending_ill != NULL) { connp = Q_TO_CONN(q); - ASSERT(MUTEX_HELD(&connp->conn_lock)); connp->conn_oper_pending_ill = pending_ill; } } /* - * Return the mp at the head of the ipsq. After emptying the ipsq - * look at the next ioctl, if this ioctl is complete. Otherwise - * return, we will resume when we complete the current ioctl. 
- * The current ioctl will wait till it gets a response from the - * driver below. + * Dequeue the next message that requested exclusive access to this IPSQ's + * xop. Specifically: + * + * 1. If we're still processing the current operation on `ipsq', then + * dequeue the next message for the operation (from ipx_mphead), or + * return NULL if there are no queued messages for the operation. + * These messages are queued via CUR_OP to qwriter_ip() and friends. + * + * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is + * not set) see if the ipsq has requested an xop switch. If so, switch + * `ipsq' to a different xop. Xop switches only happen when joining or + * leaving IPMP groups and require a careful dance -- see the comments + * in-line below for details. If we're leaving a group xop or if we're + * joining a group xop and become writer on it, then we proceed to (3). + * Otherwise, we return NULL and exit the xop. + * + * 3. For each IPSQ in the xop, return any switch operation stored on + * ipsq_switch_mp (set via SWITCH_OP); these must be processed before + * any other messages queued on the IPSQ. Otherwise, dequeue the next + * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead. + * Note that if the phyint tied to `ipsq' is not using IPMP there will + * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for + * each phyint in the group, including the IPMP meta-interface phyint. */ static mblk_t * ipsq_dq(ipsq_t *ipsq) { + ill_t *illv4, *illv6; mblk_t *mp; + ipsq_t *xopipsq; + ipsq_t *leftipsq = NULL; + ipxop_t *ipx; + phyint_t *phyi = ipsq->ipsq_phyint; + ip_stack_t *ipst = ipsq->ipsq_ipst; + boolean_t emptied = B_FALSE; - ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); + /* + * Grab all the locks we need in the defined order (ill_g_lock -> + * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next. + */ + rw_enter(&ipst->ips_ill_g_lock, + ipsq->ipsq_swxop != NULL ? 
RW_WRITER : RW_READER); + mutex_enter(&ipsq->ipsq_lock); + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); - mp = ipsq->ipsq_mphead; - if (mp != NULL) { - ipsq->ipsq_mphead = mp->b_next; - if (ipsq->ipsq_mphead == NULL) - ipsq->ipsq_mptail = NULL; - mp->b_next = NULL; - return (mp); + /* + * Dequeue the next message associated with the current exclusive + * operation, if any. + */ + if ((mp = ipx->ipx_mphead) != NULL) { + ipx->ipx_mphead = mp->b_next; + if (ipx->ipx_mphead == NULL) + ipx->ipx_mptail = NULL; + mp->b_next = (void *)ipsq; + goto out; } - if (ipsq->ipsq_current_ipif != NULL) - return (NULL); - mp = ipsq->ipsq_xopq_mphead; - if (mp != NULL) { - ipsq->ipsq_xopq_mphead = mp->b_next; - if (ipsq->ipsq_xopq_mphead == NULL) - ipsq->ipsq_xopq_mptail = NULL; - mp->b_next = NULL; - return (mp); + + if (ipx->ipx_current_ipif != NULL) + goto empty; + + if (ipsq->ipsq_swxop != NULL) { + /* + * The exclusive operation that is now being completed has + * requested a switch to a different xop. This happens + * when an interface joins or leaves an IPMP group. Joins + * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()). + * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb + * (phyint_free()), or interface plumb for an ill type + * not in the IPMP group (ip_rput_dlpi_writer()). + * + * Xop switches are not allowed on the IPMP meta-interface. + */ + ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP)); + ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); + DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq); + + if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) { + /* + * We're switching back to our own xop, so we have two + * xop's to drain/exit: our own, and the group xop + * that we are leaving. + * + * First, pull ourselves out of the group ipsq list. + * This is safe since we're writer on ill_g_lock. 
+ */ + ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop); + + xopipsq = ipx->ipx_ipsq; + while (xopipsq->ipsq_next != ipsq) + xopipsq = xopipsq->ipsq_next; + + xopipsq->ipsq_next = ipsq->ipsq_next; + ipsq->ipsq_next = ipsq; + ipsq->ipsq_xop = ipsq->ipsq_swxop; + ipsq->ipsq_swxop = NULL; + + /* + * Second, prepare to exit the group xop. The actual + * ipsq_exit() is done at the end of this function + * since we cannot hold any locks across ipsq_exit(). + * Note that although we drop the group's ipx_lock, no + * threads can proceed since we're still ipx_writer. + */ + leftipsq = xopipsq; + mutex_exit(&ipx->ipx_lock); + + /* + * Third, set ipx to point to our own xop (which was + * inactive and therefore can be entered). + */ + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + ASSERT(ipx->ipx_writer == NULL); + ASSERT(ipx->ipx_current_ipif == NULL); + } else { + /* + * We're switching from our own xop to a group xop. + * The requestor of the switch must ensure that the + * group xop cannot go away (e.g. by ensuring the + * phyint associated with the xop cannot go away). + * + * If we can become writer on our new xop, then we'll + * do the drain. Otherwise, the current writer of our + * new xop will do the drain when it exits. + * + * First, splice ourselves into the group IPSQ list. + * This is safe since we're writer on ill_g_lock. + */ + ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); + + xopipsq = ipsq->ipsq_swxop->ipx_ipsq; + while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq) + xopipsq = xopipsq->ipsq_next; + + xopipsq->ipsq_next = ipsq; + ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq; + ipsq->ipsq_xop = ipsq->ipsq_swxop; + ipsq->ipsq_swxop = NULL; + + /* + * Second, exit our own xop, since it's now unused. + * This is safe since we've got the only reference. 
+ */ + ASSERT(ipx->ipx_writer == curthread); + ipx->ipx_writer = NULL; + VERIFY(--ipx->ipx_reentry_cnt == 0); + ipx->ipx_ipsq_queued = B_FALSE; + mutex_exit(&ipx->ipx_lock); + + /* + * Third, set ipx to point to our new xop, and check + * if we can become writer on it. If we cannot, then + * the current writer will drain the IPSQ group when + * it exits. Our ipsq_xop is guaranteed to be stable + * because we're still holding ipsq_lock. + */ + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + if (ipx->ipx_writer != NULL || + ipx->ipx_current_ipif != NULL) { + goto out; + } + } + + /* + * Fourth, become writer on our new ipx before we continue + * with the drain. Note that we never dropped ipsq_lock + * above, so no other thread could've raced with us to + * become writer first. Also, we're holding ipx_lock, so + * no other thread can examine the ipx right now. + */ + ASSERT(ipx->ipx_current_ipif == NULL); + ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); + VERIFY(ipx->ipx_reentry_cnt++ == 0); + ipx->ipx_writer = curthread; + ipx->ipx_forced = B_FALSE; +#ifdef DEBUG + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); +#endif } - return (NULL); + + xopipsq = ipsq; + do { + /* + * So that other operations operate on a consistent and + * complete phyint, a switch message on an IPSQ must be + * handled prior to any other operations on that IPSQ. + */ + if ((mp = xopipsq->ipsq_switch_mp) != NULL) { + xopipsq->ipsq_switch_mp = NULL; + ASSERT(mp->b_next == NULL); + mp->b_next = (void *)xopipsq; + goto out; + } + + if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) { + xopipsq->ipsq_xopq_mphead = mp->b_next; + if (xopipsq->ipsq_xopq_mphead == NULL) + xopipsq->ipsq_xopq_mptail = NULL; + mp->b_next = (void *)xopipsq; + goto out; + } + } while ((xopipsq = xopipsq->ipsq_next) != ipsq); +empty: + /* + * There are no messages. Further, we are holding ipx_lock, hence no + * new messages can end up on any IPSQ in the xop. 
+ */ + ipx->ipx_writer = NULL; + ipx->ipx_forced = B_FALSE; + VERIFY(--ipx->ipx_reentry_cnt == 0); + ipx->ipx_ipsq_queued = B_FALSE; + emptied = B_TRUE; +#ifdef DEBUG + ipx->ipx_depth = 0; +#endif +out: + mutex_exit(&ipx->ipx_lock); + mutex_exit(&ipsq->ipsq_lock); + + /* + * If we completely emptied the xop, then wake up any threads waiting + * to enter any of the IPSQ's associated with it. + */ + if (emptied) { + xopipsq = ipsq; + do { + if ((phyi = xopipsq->ipsq_phyint) == NULL) + continue; + + illv4 = phyi->phyint_illv4; + illv6 = phyi->phyint_illv6; + + GRAB_ILL_LOCKS(illv4, illv6); + if (illv4 != NULL) + cv_broadcast(&illv4->ill_cv); + if (illv6 != NULL) + cv_broadcast(&illv6->ill_cv); + RELEASE_ILL_LOCKS(illv4, illv6); + } while ((xopipsq = xopipsq->ipsq_next) != ipsq); + } + rw_exit(&ipst->ips_ill_g_lock); + + /* + * Now that all locks are dropped, exit the IPSQ we left. + */ + if (leftipsq != NULL) + ipsq_exit(leftipsq); + + return (mp); } /* * Enter the ipsq corresponding to ill, by waiting synchronously till * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq * will have to drain completely before ipsq_enter returns success. - * ipsq_current_ipif will be set if some exclusive ioctl is in progress, - * and the ipsq_exit logic will start the next enqueued ioctl after - * completion of the current ioctl. If 'force' is used, we don't wait - * for the enqueued ioctls. This is needed when a conn_close wants to + * ipx_current_ipif will be set if some exclusive op is in progress, + * and the ipsq_exit logic will start the next enqueued op after + * completion of the current op. If 'force' is used, we don't wait + * for the enqueued ops. This is needed when a conn_close wants to * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb * of an ill can also use this option. But we dont' use it currently. 
*/ @@ -7449,13 +7769,16 @@ boolean_t ipsq_enter(ill_t *ill, boolean_t force, int type) { ipsq_t *ipsq; + ipxop_t *ipx; boolean_t waited_enough = B_FALSE; /* - * Holding the ill_lock prevents <ill-ipsq> assocs from changing. - * Since the <ill-ipsq> assocs could change while we wait for the - * writer, it is easier to wait on a fixed global rather than try to - * cv_wait on a changing ipsq. + * Note that the relationship between ill and ipsq is fixed as long as + * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the + * relationship between the IPSQ and xop cannot change. However, + * since we cannot hold ipsq_lock across the cv_wait(), it may change + * while we're waiting. We wait on ill_cv and rely on ipsq_exit() + * waking up all ills in the xop when it becomes available. */ mutex_enter(&ill->ill_lock); for (;;) { @@ -7466,34 +7789,35 @@ ipsq_enter(ill_t *ill, boolean_t force, int type) ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); - if (ipsq->ipsq_writer == NULL && - (type == CUR_OP || ipsq->ipsq_current_ipif == NULL || - waited_enough)) { + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + + if (ipx->ipx_writer == NULL && (type == CUR_OP || + ipx->ipx_current_ipif == NULL || waited_enough)) break; - } else if (ipsq->ipsq_writer != NULL) { + + if (!force || ipx->ipx_writer != NULL) { + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); cv_wait(&ill->ill_cv, &ill->ill_lock); } else { + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); - if (force) { - (void) cv_timedwait(&ill->ill_cv, - &ill->ill_lock, - lbolt + ENTER_SQ_WAIT_TICKS); - waited_enough = B_TRUE; - continue; - } else { - cv_wait(&ill->ill_cv, &ill->ill_lock); - } + (void) cv_timedwait(&ill->ill_cv, + &ill->ill_lock, lbolt + ENTER_SQ_WAIT_TICKS); + waited_enough = B_TRUE; } } - ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL); - ASSERT(ipsq->ipsq_reentry_cnt == 0); - ipsq->ipsq_writer = curthread; - ipsq->ipsq_reentry_cnt++; + 
ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); + ASSERT(ipx->ipx_reentry_cnt == 0); + ipx->ipx_writer = curthread; + ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); + ipx->ipx_reentry_cnt++; #ifdef DEBUG - ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IPSQ_STACK_DEPTH); + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); #endif + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); return (B_TRUE); @@ -7513,14 +7837,13 @@ ill_perim_exit(ill_t *ill) /* * The ipsq_t (ipsq) is the synchronization data structure used to serialize - * certain critical operations like plumbing (i.e. most set ioctls), - * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP - * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per - * IPMP group. The ipsq serializes exclusive ioctls issued by applications - * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple - * threads executing in the ipsq. Responses from the driver pertain to the - * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated - * as part of bringing up the interface) and are enqueued in ipsq_mphead. + * certain critical operations like plumbing (i.e. most set ioctls), multicast + * joins, igmp/mld timers, etc. There is one ipsq per phyint. The ipsq + * serializes exclusive ioctls issued by applications on a per ipsq basis in + * ipsq_xopq_mphead. It also protects against multiple threads executing in + * the ipsq. Responses from the driver pertain to the current ioctl (say a + * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing + * up the interface) and are enqueued in ipx_mphead. * * If a thread does not want to reenter the ipsq when it is already writer, * it must make sure that the specified reentry point to be called later @@ -7528,29 +7851,33 @@ ill_perim_exit(ill_t *ill) * point must never ever try to enter the ipsq again. 
Otherwise it can lead * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example. * When the thread that is currently exclusive finishes, it (ipsq_exit) - * dequeues the requests waiting to become exclusive in ipsq_mphead and calls - * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit + * dequeues the requests waiting to become exclusive in ipx_mphead and calls + * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next * ioctl if the current ioctl has completed. If the current ioctl is still * in progress it simply returns. The current ioctl could be waiting for - * a response from another module (arp_ or the driver or could be waiting for - * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp - * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the + * a response from another module (arp or the driver or could be waiting for + * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp + * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the * execution of the ioctl and ipsq_exit does not start the next ioctl unless - * ipsq_current_ipif is clear which happens only on ioctl completion. + * ipx_current_ipif is NULL which happens only once the ioctl is complete and + * all associated DLPI operations have completed. */ /* - * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of - * ipif or ill can be specified). The caller ensures ipif or ill is valid by - * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued - * completion. + * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif' + * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ + * on success, or NULL on failure. The caller ensures ipif/ill is valid by + * refholding it as necessary. 
If the IPSQ cannot be entered and `func' is + * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ + * can be entered. If `func' is NULL, then `q' and `mp' are ignored. */ ipsq_t * ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, boolean_t reentry_ok) { ipsq_t *ipsq; + ipxop_t *ipx; /* Only 1 of ipif or ill can be specified */ ASSERT((ipif != NULL) ^ (ill != NULL)); @@ -7558,13 +7885,15 @@ ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, ill = ipif->ipif_ill; /* - * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock - * ipsq of an ill can't change when ill_lock is held. + * lock ordering: conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. + * ipx of an ipsq can't change when ipsq_lock is held. */ GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); /* * 1. Enter the ipsq if we are already writer and reentry is ok. @@ -7572,30 +7901,32 @@ ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, * 'func' nor any of its callees must ever attempt to enter the ipsq * again. Otherwise it can lead to an infinite loop * 2. Enter the ipsq if there is no current writer and this attempted - * entry is part of the current ioctl or operation + * entry is part of the current operation * 3. 
Enter the ipsq if there is no current writer and this is a new - * ioctl (or operation) and the ioctl (or operation) queue is - * empty and there is no ioctl (or operation) currently in progress + * operation and the operation queue is empty and there is no + * operation currently in progress */ - if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) || - (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL && - ipsq->ipsq_current_ipif == NULL))) || - (ipsq->ipsq_writer == curthread && reentry_ok)) { + if ((ipx->ipx_writer == curthread && reentry_ok) || + (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && + !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL)))) { /* Success. */ - ipsq->ipsq_reentry_cnt++; - ipsq->ipsq_writer = curthread; + ipx->ipx_reentry_cnt++; + ipx->ipx_writer = curthread; + ipx->ipx_forced = B_FALSE; + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); RELEASE_CONN_LOCK(q); #ifdef DEBUG - ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, - IPSQ_STACK_DEPTH); + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); #endif return (ipsq); } - ipsq_enq(ipsq, q, mp, func, type, ill); + if (func != NULL) + ipsq_enq(ipsq, q, mp, func, type, ill); + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); RELEASE_CONN_LOCK(q); @@ -7630,188 +7961,58 @@ qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, } /* - * If there are more than ILL_GRP_CNT ills in a group, - * we use kmem alloc'd buffers, else use the stack - */ -#define ILL_GRP_CNT 14 -/* - * Drain the ipsq, if there are messages on it, and then leave the ipsq. - * Called by a thread that is currently exclusive on this ipsq. + * Exit the specified IPSQ. If this is the final exit on it then drain it + * prior to exiting. Caller must be writer on the specified IPSQ. 
*/ void ipsq_exit(ipsq_t *ipsq) { + mblk_t *mp; + ipsq_t *mp_ipsq; queue_t *q; - mblk_t *mp; - ipsq_func_t func; - int next; - ill_t **ill_list = NULL; - size_t ill_list_size = 0; - int cnt = 0; - boolean_t need_ipsq_free = B_FALSE; - ip_stack_t *ipst = ipsq->ipsq_ipst; + phyint_t *phyi; + ipsq_func_t func; ASSERT(IAM_WRITER_IPSQ(ipsq)); - mutex_enter(&ipsq->ipsq_lock); - ASSERT(ipsq->ipsq_reentry_cnt >= 1); - if (ipsq->ipsq_reentry_cnt != 1) { - ipsq->ipsq_reentry_cnt--; - mutex_exit(&ipsq->ipsq_lock); + + ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); + if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { + ipsq->ipsq_xop->ipx_reentry_cnt--; return; } - mp = ipsq_dq(ipsq); - while (mp != NULL) { -again: - mutex_exit(&ipsq->ipsq_lock); - func = (ipsq_func_t)mp->b_prev; - q = (queue_t *)mp->b_queue; - mp->b_prev = NULL; - mp->b_queue = NULL; - - /* - * If 'q' is an conn queue, it is valid, since we did a - * a refhold on the connp, at the start of the ioctl. - * If 'q' is an ill queue, it is valid, since close of an - * ill will clean up the 'ipsq'. - */ - (*func)(ipsq, q, mp, NULL); - - mutex_enter(&ipsq->ipsq_lock); + for (;;) { + phyi = ipsq->ipsq_phyint; mp = ipsq_dq(ipsq); - } - - mutex_exit(&ipsq->ipsq_lock); - - /* - * Need to grab the locks in the right order. Need to - * atomically check (under ipsq_lock) that there are no - * messages before relinquishing the ipsq. Also need to - * atomically wakeup waiters on ill_cv while holding ill_lock. - * Holding ill_g_lock ensures that ipsq list of ills is stable. - * If we need to call ill_split_ipsq and change <ill-ipsq> we need - * to grab ill_g_lock as writer. - */ - rw_enter(&ipst->ips_ill_g_lock, - ipsq->ipsq_split ? RW_WRITER : RW_READER); + mp_ipsq = (mp == NULL) ? 
NULL : (ipsq_t *)mp->b_next; - /* ipsq_refs can't change while ill_g_lock is held as reader */ - if (ipsq->ipsq_refs != 0) { - /* At most 2 ills v4/v6 per phyint */ - cnt = ipsq->ipsq_refs << 1; - ill_list_size = cnt * sizeof (ill_t *); /* - * If memory allocation fails, we will do the split - * the next time ipsq_exit is called for whatever reason. - * As long as the ipsq_split flag is set the need to - * split is remembered. + * If we've changed to a new IPSQ, and the phyint associated + * with the old one has gone away, free the old IPSQ. Note + * that this cannot happen while the IPSQ is in a group. */ - ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); - if (ill_list != NULL) - cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt); - } - mutex_enter(&ipsq->ipsq_lock); - mp = ipsq_dq(ipsq); - if (mp != NULL) { - /* oops, some message has landed up, we can't get out */ - if (ill_list != NULL) - ill_unlock_ills(ill_list, cnt); - rw_exit(&ipst->ips_ill_g_lock); - if (ill_list != NULL) - kmem_free(ill_list, ill_list_size); - ill_list = NULL; - ill_list_size = 0; - cnt = 0; - goto again; - } + if (mp_ipsq != ipsq && phyi == NULL) { + ASSERT(ipsq->ipsq_next == ipsq); + ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); + ipsq_delete(ipsq); + } - /* - * Split only if no ioctl is pending and if memory alloc succeeded - * above. - */ - if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL && - ill_list != NULL) { - /* - * No new ill can join this ipsq since we are holding the - * ill_g_lock. Hence ill_split_ipsq can safely traverse the - * ipsq. ill_split_ipsq may fail due to memory shortage. - * If so we will retry on the next ipsq_exit. - */ - ipsq->ipsq_split = ill_split_ipsq(ipsq); - } + if (mp == NULL) + break; - /* - * We are holding the ipsq lock, hence no new messages can - * land up on the ipsq, and there are no messages currently. - * Now safe to get out. Wake up waiters and relinquish ipsq - * atomically while holding ill locks. 
- */ - ipsq->ipsq_writer = NULL; - ipsq->ipsq_reentry_cnt--; - ASSERT(ipsq->ipsq_reentry_cnt == 0); -#ifdef DEBUG - ipsq->ipsq_depth = 0; -#endif - mutex_exit(&ipsq->ipsq_lock); - /* - * For IPMP this should wake up all ills in this ipsq. - * We need to hold the ill_lock while waking up waiters to - * avoid missed wakeups. But there is no need to acquire all - * the ill locks and then wakeup. If we have not acquired all - * the locks (due to memory failure above) ill_signal_ipsq_ills - * wakes up ills one at a time after getting the right ill_lock - */ - ill_signal_ipsq_ills(ipsq, ill_list != NULL); - if (ill_list != NULL) - ill_unlock_ills(ill_list, cnt); - if (ipsq->ipsq_refs == 0) - need_ipsq_free = B_TRUE; - rw_exit(&ipst->ips_ill_g_lock); - if (ill_list != 0) - kmem_free(ill_list, ill_list_size); + q = mp->b_queue; + func = (ipsq_func_t)mp->b_prev; + ipsq = mp_ipsq; + mp->b_next = mp->b_prev = NULL; + mp->b_queue = NULL; - if (need_ipsq_free) { /* - * Free the ipsq. ipsq_refs can't increase because ipsq can't be - * looked up. ipsq can be looked up only thru ill or phyint - * and there are no ills/phyint on this ipsq. + * If 'q' is an conn queue, it is valid, since we did a + * a refhold on the conn at the start of the ioctl. + * If 'q' is an ill queue, it is valid, since close of an + * ill will clean up its IPSQ. */ - ipsq_delete(ipsq); - } - - /* - * Now that we're outside the IPSQ, start any IGMP/MLD timers. We - * can't start these inside the IPSQ since e.g. igmp_start_timers() -> - * untimeout() (inside the IPSQ, waiting for an executing timeout to - * finish) could deadlock with igmp_timeout_handler() -> ipsq_enter() - * (executing the timeout, waiting to get inside the IPSQ). - * - * However, there is one exception to the above: if this thread *is* - * the IGMP/MLD timeout handler thread, then we must not start its - * timer until the current handler is done. 
- */ - mutex_enter(&ipst->ips_igmp_timer_lock); - if (curthread != ipst->ips_igmp_timer_thread) { - next = ipst->ips_igmp_deferred_next; - ipst->ips_igmp_deferred_next = INFINITY; - mutex_exit(&ipst->ips_igmp_timer_lock); - - if (next != INFINITY) - igmp_start_timers(next, ipst); - } else { - mutex_exit(&ipst->ips_igmp_timer_lock); - } - - mutex_enter(&ipst->ips_mld_timer_lock); - if (curthread != ipst->ips_mld_timer_thread) { - next = ipst->ips_mld_deferred_next; - ipst->ips_mld_deferred_next = INFINITY; - mutex_exit(&ipst->ips_mld_timer_lock); - - if (next != INFINITY) - mld_start_timers(next, ipst); - } else { - mutex_exit(&ipst->ips_mld_timer_lock); + (*func)(ipsq, q, mp, NULL); } } @@ -7822,15 +8023,17 @@ again: void ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) { + ipxop_t *ipx = ipsq->ipsq_xop; + ASSERT(IAM_WRITER_IPSQ(ipsq)); + ASSERT(ipx->ipx_current_ipif == NULL); + ASSERT(ipx->ipx_current_ioctl == 0); - mutex_enter(&ipsq->ipsq_lock); - ASSERT(ipsq->ipsq_current_ipif == NULL); - ASSERT(ipsq->ipsq_current_ioctl == 0); - ipsq->ipsq_current_done = B_FALSE; - ipsq->ipsq_current_ipif = ipif; - ipsq->ipsq_current_ioctl = ioccmd; - mutex_exit(&ipsq->ipsq_lock); + ipx->ipx_current_done = B_FALSE; + ipx->ipx_current_ioctl = ioccmd; + mutex_enter(&ipx->ipx_lock); + ipx->ipx_current_ipif = ipif; + mutex_exit(&ipx->ipx_lock); } /* @@ -7844,17 +8047,18 @@ ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) void ipsq_current_finish(ipsq_t *ipsq) { - ipif_t *ipif = ipsq->ipsq_current_ipif; + ipxop_t *ipx = ipsq->ipsq_xop; t_uscalar_t dlpi_pending = DL_PRIM_INVAL; + ipif_t *ipif = ipx->ipx_current_ipif; ASSERT(IAM_WRITER_IPSQ(ipsq)); /* - * For SIOCSLIFREMOVEIF, the ipif has been already been blown away + * For SIOCLIFREMOVEIF, the ipif has been already been blown away * (but in that case, IPIF_CHANGING will already be clear and no * pending DLPI messages can remain). 
*/ - if (ipsq->ipsq_current_ioctl != SIOCLIFREMOVEIF) { + if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) { ill_t *ill = ipif->ipif_ill; mutex_enter(&ill->ill_lock); @@ -7863,12 +8067,14 @@ ipsq_current_finish(ipsq_t *ipsq) mutex_exit(&ill->ill_lock); } - mutex_enter(&ipsq->ipsq_lock); - ipsq->ipsq_current_ioctl = 0; - ipsq->ipsq_current_done = B_TRUE; - if (dlpi_pending == DL_PRIM_INVAL) - ipsq->ipsq_current_ipif = NULL; - mutex_exit(&ipsq->ipsq_lock); + ASSERT(!ipx->ipx_current_done); + ipx->ipx_current_done = B_TRUE; + ipx->ipx_current_ioctl = 0; + if (dlpi_pending == DL_PRIM_INVAL) { + mutex_enter(&ipx->ipx_lock); + ipx->ipx_current_ipif = NULL; + mutex_exit(&ipx->ipx_lock); + } } /* @@ -7884,123 +8090,38 @@ ipsq_flush(ill_t *ill) mblk_t *prev; mblk_t *mp; mblk_t *mp_next; - ipsq_t *ipsq; + ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; ASSERT(IAM_WRITER_ILL(ill)); - ipsq = ill->ill_phyint->phyint_ipsq; + /* * Flush any messages sent up by the driver. */ - mutex_enter(&ipsq->ipsq_lock); - for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) { + mutex_enter(&ipx->ipx_lock); + for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) { mp_next = mp->b_next; q = mp->b_queue; if (q == ill->ill_rq || q == ill->ill_wq) { - /* Remove the mp from the ipsq */ + /* dequeue mp */ if (prev == NULL) - ipsq->ipsq_mphead = mp->b_next; + ipx->ipx_mphead = mp->b_next; else prev->b_next = mp->b_next; - if (ipsq->ipsq_mptail == mp) { + if (ipx->ipx_mptail == mp) { ASSERT(mp_next == NULL); - ipsq->ipsq_mptail = prev; + ipx->ipx_mptail = prev; } inet_freemsg(mp); } else { prev = mp; } } - mutex_exit(&ipsq->ipsq_lock); + mutex_exit(&ipx->ipx_lock); (void) ipsq_pending_mp_cleanup(ill, NULL); ipsq_xopq_mp_cleanup(ill, NULL); ill_pending_mp_cleanup(ill); } -/* ARGSUSED */ -int -ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *ifreq) -{ - ill_t *ill; - struct lifreq *lifr = (struct lifreq *)ifreq; - 
boolean_t isv6; - conn_t *connp; - ip_stack_t *ipst; - - connp = Q_TO_CONN(q); - ipst = connp->conn_netstack->netstack_ip; - isv6 = connp->conn_af_isv6; - /* - * Set original index. - * Failover and failback move logical interfaces - * from one physical interface to another. The - * original index indicates the parent of a logical - * interface, in other words, the physical interface - * the logical interface will be moved back to on - * failback. - */ - - /* - * Don't allow the original index to be changed - * for non-failover addresses, autoconfigured - * addresses, or IPv6 link local addresses. - */ - if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != NULL) || - (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) { - return (EINVAL); - } - /* - * The new original index must be in use by some - * physical interface. - */ - ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL, - NULL, NULL, ipst); - if (ill == NULL) - return (ENXIO); - ill_refrele(ill); - - ipif->ipif_orig_ifindex = lifr->lifr_index; - /* - * When this ipif gets failed back, don't - * preserve the original id, as it is no - * longer applicable. - */ - ipif->ipif_orig_ipifid = 0; - /* - * For IPv4, change the original index of any - * multicast addresses associated with the - * ipif to the new value. - */ - if (!isv6) { - ilm_t *ilm; - - mutex_enter(&ipif->ipif_ill->ill_lock); - for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_ipif == ipif) { - ilm->ilm_orig_ifindex = lifr->lifr_index; - } - } - mutex_exit(&ipif->ipif_ill->ill_lock); - } - return (0); -} - -/* ARGSUSED */ -int -ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *ifreq) -{ - struct lifreq *lifr = (struct lifreq *)ifreq; - - /* - * Get the original interface index i.e the one - * before FAILOVER if it ever happened. 
- */ - lifr->lifr_index = ipif->ipif_orig_ifindex; - return (0); -} - /* * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls, * refhold and return the associated ipif @@ -8087,8 +8208,6 @@ int ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, cmd_info_t *ci, ipsq_func_t func) { - sin_t *sin; - sin6_t *sin6; char *name; struct ifreq *ifr; struct lifreq *lifr; @@ -8132,9 +8251,8 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, * be trusted. */ ifr->ifr_name[IFNAMSIZ - 1] = '\0'; - sin = (sin_t *)&ifr->ifr_addr; name = ifr->ifr_name; - ci->ci_sin = sin; + ci->ci_sin = (sin_t *)&ifr->ifr_addr; ci->ci_sin6 = NULL; ci->ci_lifr = (struct lifreq *)ifr; } else { @@ -8148,14 +8266,8 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, */ lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; name = lifr->lifr_name; - sin = (sin_t *)&lifr->lifr_addr; - sin6 = (sin6_t *)&lifr->lifr_addr; - if (ipip->ipi_cmd == SIOCSLIFGROUPNAME) { - (void) strncpy(ci->ci_groupname, lifr->lifr_groupname, - LIFNAMSIZ); - } - ci->ci_sin = sin; - ci->ci_sin6 = sin6; + ci->ci_sin = (sin_t *)&lifr->lifr_addr; + ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr; ci->ci_lifr = lifr; } @@ -8181,21 +8293,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, if (ipif == NULL) { if (err == EINPROGRESS) return (err); - if (ipip->ipi_cmd == SIOCLIFFAILOVER || - ipip->ipi_cmd == SIOCLIFFAILBACK) { - /* - * Need to try both v4 and v6 since this - * ioctl can come down either v4 or v6 - * socket. The lifreq.lifr_family passed - * down by this ioctl is AF_UNSPEC. - */ - ipif = ipif_lookup_on_name(name, - mi_strlen(name), B_FALSE, &exists, !isv6, - zoneid, (connp == NULL) ? 
q : - CONNP_TO_WQ(connp), mp, func, &err, ipst); - if (err == EINPROGRESS) - return (err); - } err = 0; /* Ensure we don't use it below */ } } @@ -8221,15 +8318,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, if (ipif == NULL) return (ENXIO); - /* - * Allow only GET operations if this ipif has been created - * temporarily due to a MOVE operation. - */ - if (ipif->ipif_replace_zero && !(ipip->ipi_flags & IPI_REPL)) { - ipif_refrele(ipif); - return (EINVAL); - } - ci->ci_ipif = ipif; return (0); } @@ -8247,15 +8335,15 @@ ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); - - while (ill != NULL) { + for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill)) + continue; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (ipif->ipif_zoneid == zoneid || ipif->ipif_zoneid == ALL_ZONES) numifs++; } - ill = ill_next(&ctx, ill); } rw_exit(&ipst->ips_ill_g_lock); return (numifs); @@ -8283,6 +8371,9 @@ ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) ill = ILL_START_WALK_ALL(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) + continue; + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if ((ipif->ipif_flags & IPIF_NOXMIT) && @@ -8491,6 +8582,8 @@ ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill)) + continue; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (zoneid != ipif->ipif_zoneid && @@ -8760,6 +8853,9 @@ ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ill_first(list, list, &ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if 
(IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP)) + continue; + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if ((ipif->ipif_flags & IPIF_NOXMIT) && @@ -8795,6 +8891,7 @@ ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name)); + lifr->lifr_type = ill->ill_type; if (ipif->ipif_isv6) { sin6 = (sin6_t *)&lifr->lifr_addr; *sin6 = sin6_null; @@ -8828,23 +8925,6 @@ lif_copydone: return (0); } -/* ARGSUSED */ -int -ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin, - queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) -{ - ip_stack_t *ipst; - - if (q->q_next == NULL) - ipst = CONNQ_TO_IPST(q); - else - ipst = ILLQ_TO_IPST(q); - - /* Existence of b_cont->b_cont checked in ip_wput_nondata */ - ipst->ips_ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr; - return (0); -} - static void ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) { @@ -9038,8 +9118,7 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); } else { src_ipif = ipif_select_source_v6(dst_ill, - daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, - zoneid); + daddr, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); } if (src_ipif == NULL) goto next_dst; @@ -9325,10 +9404,14 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, struct arpreq *ar; struct xarpreq *xar; int flags, alength; - char *lladdr; - ip_stack_t *ipst; + uchar_t *lladdr; + ire_t *ire; + ip_stack_t *ipst; ill_t *ill = ipif->ipif_ill; + ill_t *proxy_ill = NULL; + ipmp_arpent_t *entp = NULL; boolean_t if_arp_ioctl = B_FALSE; + boolean_t proxyarp = B_FALSE; ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); connp = Q_TO_CONN(q); @@ -9340,7 +9423,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ar = NULL; flags = xar->xarp_flags; - lladdr = LLADDR(&xar->xarp_ha); + lladdr = (uchar_t *)LLADDR(&xar->xarp_ha); if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 
0); /* * Validate against user's link layer address length @@ -9359,7 +9442,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, xar = NULL; flags = ar->arp_flags; - lladdr = ar->arp_ha.sa_data; + lladdr = (uchar_t *)ar->arp_ha.sa_data; /* * Theoretically, the sa_family could tell us what link * layer type this operation is trying to deal with. By @@ -9379,6 +9462,51 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } } + ipaddr = sin->sin_addr.s_addr; + + /* + * IPMP ARP special handling: + * + * 1. Since ARP mappings must appear consistent across the group, + * prohibit changing ARP mappings on the underlying interfaces. + * + * 2. Since ARP mappings for IPMP data addresses are maintained by + * IP itself, prohibit changing them. + * + * 3. For proxy ARP, use a functioning hardware address in the group, + * provided one exists. If one doesn't, just add the entry as-is; + * ipmp_illgrp_refresh_arpent() will refresh it if things change. + */ + if (IS_UNDER_IPMP(ill)) { + if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP) + return (EPERM); + } + if (IS_IPMP(ill)) { + ipmp_illgrp_t *illg = ill->ill_grp; + + switch (ipip->ipi_cmd) { + case SIOCSARP: + case SIOCSXARP: + proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength); + if (proxy_ill != NULL) { + proxyarp = B_TRUE; + if (!ipmp_ill_is_active(proxy_ill)) + proxy_ill = ipmp_illgrp_next_ill(illg); + if (proxy_ill != NULL) + lladdr = proxy_ill->ill_phys_addr; + } + /* FALLTHRU */ + case SIOCDARP: + case SIOCDXARP: + ire = ire_ctable_lookup(ipaddr, 0, IRE_LOCAL, NULL, + ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); + if (ire != NULL) { + ire_refrele(ire); + return (EPERM); + } + } + } + /* * We are going to pass up to ARP a packet chain that looks * like: @@ -9400,8 +9528,6 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (ENOMEM); } - ipaddr = sin->sin_addr.s_addr; - mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, (caddr_t)&ipaddr); if (mp2 == 
NULL) { @@ -9481,6 +9607,30 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, area->area_flags |= ACE_F_AUTHORITY; /* + * If this is a permanent AR_ENTRY_ADD on the IPMP interface, track it + * so that IP can update ARP as the active ills in the group change. + */ + if (IS_IPMP(ill) && area->area_cmd == AR_ENTRY_ADD && + (area->area_flags & ACE_F_PERMANENT)) { + entp = ipmp_illgrp_create_arpent(ill->ill_grp, mp2, proxyarp); + + /* + * The second part of the conditional below handles a corner + * case: if this is proxy ARP and the IPMP group has no active + * interfaces, we can't send the request to ARP now since it + * won't be able to build an ACE. So we return success and + * notify ARP about the proxy ARP entry once an interface + * becomes active. + */ + if (entp == NULL || (proxyarp && proxy_ill == NULL)) { + mp2->b_cont = NULL; + inet_freemsg(mp1); + inet_freemsg(pending_mp); + return (entp == NULL ? ENOMEM : 0); + } + } + + /* * Before sending 'mp' to ARP, we have to clear the b_next * and b_prev. Otherwise if STREAMS encounters such a message * in freemsg(), (because ARP can close any time) it can cause @@ -9497,7 +9647,12 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, mutex_enter(&connp->conn_lock); mutex_enter(&ill->ill_lock); /* conn has not yet started closing, hence this can't fail */ - VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0); + if (ipip->ipi_flags & IPI_WR) { + VERIFY(ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), + pending_mp, 0) != 0); + } else { + VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0); + } mutex_exit(&ill->ill_lock); mutex_exit(&connp->conn_lock); @@ -9506,6 +9661,13 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * M_IOCACK, and will be handed to ip_sioctl_iocack() for completion. */ putnext(ill->ill_rq, mp1); + + /* + * If we created an IPMP ARP entry, mark that we've notified ARP. 
+ */ + if (entp != NULL) + ipmp_illgrp_mark_arpent(ill->ill_grp, entp); + return (EINPROGRESS); } @@ -9564,55 +9726,114 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, mp, func, &err, ipst); if (ipif == NULL) return (err); - if (ipif->ipif_id != 0 || - ipif->ipif_net_type != IRE_IF_RESOLVER) { + if (ipif->ipif_id != 0) { ipif_refrele(ipif); return (ENXIO); } } else { /* - * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with sdl_nlen == - * 0: use the IP address to figure out the ill. In the IPMP - * case, a simple forwarding table lookup will return the - * IRE_IF_RESOLVER for the first interface in the group, which - * might not be the interface on which the requested IP - * address was resolved due to the ill selection algorithm - * (see ip_newroute_get_dst_ill()). So we do a cache table - * lookup first: if the IRE cache entry for the IP address is - * still there, it will contain the ill pointer for the right - * interface, so we use that. If the cache entry has been - * flushed, we fall back to the forwarding table lookup. This - * should be rare enough since IRE cache entries have a longer - * life expectancy than ARP cache entries. + * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen + * of 0: use the IP address to find the ipif. If the IP + * address is an IPMP test address, ire_ftable_lookup() will + * find the wrong ill, so we first do an ipif_lookup_addr(). 
*/ - ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL, - ipst); - if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || - ((ill = ire_to_ill(ire)) == NULL) || - (ill->ill_net_type != IRE_IF_RESOLVER)) { - if (ire != NULL) - ire_refrele(ire); - ire = ire_ftable_lookup(sin->sin_addr.s_addr, - 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, - NULL, MATCH_IRE_TYPE, ipst); + ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES, + CONNP_TO_WQ(connp), mp, func, &err, ipst); + if (ipif == NULL) { + ire = ire_ftable_lookup(sin->sin_addr.s_addr, 0, 0, + IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, NULL, + MATCH_IRE_TYPE, ipst); if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) { - if (ire != NULL) ire_refrele(ire); return (ENXIO); } + ipif = ill->ill_ipif; + ipif_refhold(ipif); + ire_refrele(ire); } - ASSERT(ire != NULL && ill != NULL); - ipif = ill->ill_ipif; - ipif_refhold(ipif); - ire_refrele(ire); } + + if (ipif->ipif_net_type != IRE_IF_RESOLVER) { + ipif_refrele(ipif); + return (ENXIO); + } + ci->ci_sin = sin; ci->ci_ipif = ipif; return (0); } /* + * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the + * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is + * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it + * up and thus an ill can join that illgrp. + * + * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than + * open()/close() primarily because close() is not allowed to fail or block + * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason + * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure + * symmetric behavior (e.g., doing an I_PLINK after and I_PUNLINK undoes the + * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts + * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent + * state if I_UNLINK didn't occur. 
+ * + * Note that for each plumb/unplumb operation, we may end up here more than + * once because of the way ifconfig works. However, it's OK to link the same + * illgrp more than once, or unlink an illgrp that's already unlinked. + */ +static int +ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) +{ + int err; + ip_stack_t *ipst = ill->ill_ipst; + + ASSERT(IS_IPMP(ill)); + ASSERT(IAM_WRITER_ILL(ill)); + + switch (ioccmd) { + case I_LINK: + return (ENOTSUP); + + case I_PLINK: + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp); + rw_exit(&ipst->ips_ipmp_lock); + break; + + case I_PUNLINK: + /* + * Require all UP ipifs be brought down prior to unlinking the + * illgrp so any associated IREs (and other state) is torched. + */ + if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) + return (EBUSY); + + /* + * NOTE: We hold ipmp_lock across the unlink to prevent a race + * with an SIOCSLIFGROUPNAME request from an ill trying to + * join this group. Specifically: ills trying to join grab + * ipmp_lock and bump a "pending join" counter checked by + * ipmp_illgrp_unlink_grp(). During the unlink no new pending + * joins can occur (since we have ipmp_lock). Once we drop + * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not + * find the illgrp (since we unlinked it) and will return + * EAFNOSUPPORT. This will then take them back through the + * IPMP meta-interface plumbing logic in ifconfig, and thus + * back through I_PLINK above. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + err = ipmp_illgrp_unlink_grp(ill->ill_grp); + rw_exit(&ipst->ips_ipmp_lock); + return (err); + default: + break; + } + return (0); +} + +/* * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also * atomically set/clear the muxids. Also complete the ioctl by acking or * naking it. 
Note that the code is structured such that the link type, @@ -9697,7 +9918,7 @@ ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) if (ipsq == NULL) { ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, - NEW_OP, B_TRUE); + NEW_OP, B_FALSE); if (ipsq == NULL) { ill_refrele(ill); return; @@ -9728,6 +9949,11 @@ ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) err = EINVAL; goto done; } + + if (IS_IPMP(ill) && + (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) + goto done; + ill->ill_arp_muxid = islink ? li->l_index : 0; } else { /* @@ -9763,6 +9989,7 @@ static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, struct linkblk *li, boolean_t doconsist) { + int err = 0; ill_t *ill; queue_t *ipwq, *dwq; const char *name; @@ -9796,7 +10023,7 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, if (ipsq == NULL) { ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, - NEW_OP, B_TRUE); + NEW_OP, B_FALSE); if (ipsq == NULL) return (EINPROGRESS); entered_ipsq = B_TRUE; @@ -9811,12 +10038,14 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, */ if ((islink && ill->ill_ip_muxid != 0) || (!islink && ill->ill_arp_muxid != 0)) { - if (entered_ipsq) - ipsq_exit(ipsq); - return (EINVAL); + err = EINVAL; + goto done; } } + if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) + goto done; + /* * As part of I_{P}LINKing, stash the number of downstream modules and * the read queue of the module immediately below IP in the ill. 
@@ -9853,11 +10082,11 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, ill_capability_reset(ill, B_FALSE); } ipsq_current_finish(ipsq); - +done: if (entered_ipsq) ipsq_exit(ipsq); - return (0); + return (err); } /* @@ -10124,8 +10353,9 @@ nak: } /* ip_wput hands off ARP IOCTL responses to us */ +/* ARGSUSED3 */ void -ip_sioctl_iocack(queue_t *q, mblk_t *mp) +ip_sioctl_iocack(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) { struct arpreq *ar; struct xarpreq *xar; @@ -10136,7 +10366,6 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) struct iocblk *orig_iocp; ill_t *ill; conn_t *connp = NULL; - uint_t ioc_id; mblk_t *pending_mp; int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE; int *flagsp; @@ -10146,6 +10375,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) int err; ip_stack_t *ipst; + ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); ill = q->q_ptr; ASSERT(ill != NULL); ipst = ill->ill_ipst; @@ -10185,10 +10415,14 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) iocp = (struct iocblk *)mp->b_rptr; /* - * Pick out the originating queue based on the ioc_id. + * Find the pending message; if we're exclusive, it'll be on our IPSQ. + * Otherwise, we can find it from our ioc_id. */ - ioc_id = iocp->ioc_id; - pending_mp = ill_pending_mp_get(ill, &connp, ioc_id); + if (ipsq != NULL) + pending_mp = ipsq_pending_mp_get(ipsq, &connp); + else + pending_mp = ill_pending_mp_get(ill, &connp, iocp->ioc_id); + if (pending_mp == NULL) { ASSERT(connp == NULL); inet_freemsg(mp); @@ -10271,7 +10505,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) ire_refrele(ire); freemsg(mp); ip_ioctl_finish(q, orig_ioc_mp, - EINVAL, NO_COPYOUT, NULL); + EINVAL, NO_COPYOUT, ipsq); return; } *flagsp |= ATF_COM; @@ -10297,12 +10531,27 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) /* Ditch the internal IOCTL. 
*/ freemsg(mp); ire_refrele(ire); - ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); + ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); return; } } /* + * If this was a failed AR_ENTRY_ADD or a successful AR_ENTRY_DELETE + * on the IPMP meta-interface, ensure any ARP entries added in + * ip_sioctl_arp() are deleted. + */ + if (IS_IPMP(ill) && + ((iocp->ioc_error != 0 && iocp->ioc_cmd == AR_ENTRY_ADD) || + ((iocp->ioc_error == 0 && iocp->ioc_cmd == AR_ENTRY_DELETE)))) { + ipmp_illgrp_t *illg = ill->ill_grp; + ipmp_arpent_t *entp; + + if ((entp = ipmp_illgrp_lookup_arpent(illg, &addr)) != NULL) + ipmp_illgrp_destroy_arpent(illg, entp); + } + + /* * Delete the coresponding IRE_CACHE if any. * Reset the error if there was one (in case there was no entry * in arp.) @@ -10341,7 +10590,7 @@ errack: if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { err = iocp->ioc_error; freemsg(mp); - ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL); + ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, ipsq); return; } @@ -10355,7 +10604,7 @@ errack: sizeof (xar->xarp_ha.sdl_data)) { freemsg(mp); ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT, - NULL); + ipsq); return; } } @@ -10382,7 +10631,7 @@ errack: /* Ditch the internal IOCTL. */ freemsg(mp); /* Complete the original. */ - ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); + ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); } /* @@ -10397,7 +10646,7 @@ errack: * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. * - * Executed as a writer on the ill or ill group. + * Executed as a writer on the ill. * So no lock is needed to traverse the ipif chain, or examine the * phyint flags. 
*/ @@ -10423,7 +10672,6 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, boolean_t found_sep = B_FALSE; conn_t *connp; zoneid_t zoneid; - int orig_ifindex = 0; ip_stack_t *ipst = CONNQ_TO_IPST(q); ASSERT(q->q_next == NULL); @@ -10513,61 +10761,10 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, if (ipsq == NULL) return (EINPROGRESS); - /* - * If the interface is failed, inactive or offlined, look for a working - * interface in the ill group and create the ipif there. If we can't - * find a good interface, create the ipif anyway so that in.mpathd can - * move it to the first repaired interface. - */ - if ((ill->ill_phyint->phyint_flags & - (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && - ill->ill_phyint->phyint_groupname_len != 0) { - phyint_t *phyi; - char *groupname = ill->ill_phyint->phyint_groupname; - - /* - * We're looking for a working interface, but it doesn't matter - * if it's up or down; so instead of following the group lists, - * we look at each physical interface and compare the groupname. - * We're only interested in interfaces with IPv4 (resp. IPv6) - * plumbed when we're adding an IPv4 (resp. IPv6) ipif. - * Otherwise we create the ipif on the failed interface. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - phyi = avl_first(&ipst->ips_phyint_g_list-> - phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list-> - phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - if (phyi->phyint_groupname_len == 0) - continue; - ASSERT(phyi->phyint_groupname != NULL); - if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 && - !(phyi->phyint_flags & - (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && - (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) : - (phyi->phyint_illv4 != NULL))) { - break; - } - } - rw_exit(&ipst->ips_ill_g_lock); - - if (phyi != NULL) { - orig_ifindex = ill->ill_phyint->phyint_ifindex; - ill = (ill->ill_isv6 ? 
phyi->phyint_illv6 : - phyi->phyint_illv4); - } - } - - /* - * We are now exclusive on the ipsq, so an ill move will be serialized - * before or after us. - */ + /* We are now exclusive on the IPSQ */ ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ill->ill_move_in_progress == B_FALSE); - if (found_sep && orig_ifindex == 0) { + if (found_sep) { /* Now see if there is an IPIF with this unit number. */ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -10580,14 +10777,11 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, /* * We use IRE_LOCAL for lo0:1 etc. for "receive only" use - * of lo0. We never come here when we plumb lo0:0. It - * happens in ipif_lookup_on_name. - * The specified unit number is ignored when we create the ipif on a - * different interface. However, we save it in ipif_orig_ipifid below so - * that the ipif fails back to the right position. - */ - if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ? - id : -1, IRE_LOCAL, B_TRUE)) == NULL) { + * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name() + * instead. + */ + if ((ipif = ipif_allocate(ill, found_sep ? 
id : -1, IRE_LOCAL, + B_TRUE, B_TRUE)) == NULL) { err = ENOBUFS; goto done; } @@ -10604,14 +10798,6 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); } - /* Set ifindex and unit number for failback */ - if (err == 0 && orig_ifindex != 0) { - ipif->ipif_orig_ifindex = orig_ifindex; - if (found_sep) { - ipif->ipif_orig_ipifid = id; - } - } - done: ipsq_exit(ipsq); return (err); @@ -10672,7 +10858,6 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill_delete(ill); mutex_enter(&connp->conn_lock); mutex_enter(&ill->ill_lock); - ASSERT(ill->ill_group == NULL); /* Are any references to this ill active */ if (ill_is_freeable(ill)) { @@ -10693,14 +10878,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } } - /* - * We are exclusive on the ipsq, so an ill move will be serialized - * before or after us. - */ - ASSERT(ill->ill_move_in_progress == B_FALSE); - if (ipif->ipif_id == 0) { - ipsq_t *ipsq; /* Find based on address */ @@ -10712,35 +10890,15 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, sin6 = (sin6_t *)sin; /* We are a writer, so we should be able to lookup */ - ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, - ill, ALL_ZONES, NULL, NULL, NULL, NULL, ipst); - if (ipif == NULL) { - /* - * Maybe the address in on another interface in - * the same IPMP group? We check this below. - */ - ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, - NULL, ALL_ZONES, NULL, NULL, NULL, NULL, - ipst); - } + ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill, + ipst); } else { - ipaddr_t addr; - if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); - addr = sin->sin_addr.s_addr; /* We are a writer, so we should be able to lookup */ - ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL, - NULL, NULL, NULL, ipst); - if (ipif == NULL) { - /* - * Maybe the address in on another interface in - * the same IPMP group? We check this below. 
- */ - ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES, - NULL, NULL, NULL, NULL, ipst); - } + ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill, + ipst); } if (ipif == NULL) { return (EADDRNOTAVAIL); @@ -10750,32 +10908,11 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * It is possible for a user to send an SIOCLIFREMOVEIF with * lifr_name of the physical interface but with an ip address * lifr_addr of a logical interface plumbed over it. - * So update ipsq_current_ipif once ipif points to the - * correct interface after doing ipif_lookup_addr(). + * So update ipx_current_ipif now that ipif points to the + * correct one. */ ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; - ASSERT(ipsq != NULL); - - mutex_enter(&ipsq->ipsq_lock); - ipsq->ipsq_current_ipif = ipif; - mutex_exit(&ipsq->ipsq_lock); - - /* - * When the address to be removed is hosted on a different - * interface, we check if the interface is in the same IPMP - * group as the specified one; if so we proceed with the - * removal. - * ill->ill_group is NULL when the ill is down, so we have to - * compare the group names instead. 
- */ - if (ipif->ipif_ill != ill && - (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 || - ill->ill_phyint->phyint_groupname_len == 0 || - mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname, - ill->ill_phyint->phyint_groupname) != 0)) { - ipif_refrele(ipif); - return (EADDRNOTAVAIL); - } + ipsq->ipsq_xop->ipx_current_ipif = ipif; /* This is a writer */ ipif_refrele(ipif); @@ -11072,7 +11209,7 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (need_dl_down) ill_dl_down(ill); if (need_arp_down) - ipif_arp_down(ipif); + ipif_resolver_down(ipif); return (err); } @@ -11272,9 +11409,9 @@ ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (need_dl_down) ill_dl_down(ill); - if (need_arp_down) - ipif_arp_down(ipif); + ipif_resolver_down(ipif); + return (err); } @@ -11323,144 +11460,8 @@ ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } /* - * part of ipmp, make this func return the active/inactive state and - * caller can set once atomically instead of multiple mutex_enter/mutex_exit - */ -/* - * This function either sets or clears the IFF_INACTIVE flag. - * - * As long as there are some addresses or multicast memberships on the - * IPv4 or IPv6 interface of the "phyi" that does not belong in here, we - * will consider it to be ACTIVE (clear IFF_INACTIVE) i.e the interface - * will be used for outbound packets. - * - * Caller needs to verify the validity of setting IFF_INACTIVE. 
- */ -static void -phyint_inactive(phyint_t *phyi) -{ - ill_t *ill_v4; - ill_t *ill_v6; - ipif_t *ipif; - ilm_t *ilm; - - ill_v4 = phyi->phyint_illv4; - ill_v6 = phyi->phyint_illv6; - - /* - * No need for a lock while traversing the list since iam - * a writer - */ - if (ill_v4 != NULL) { - ASSERT(IAM_WRITER_ILL(ill_v4)); - for (ipif = ill_v4->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - for (ilm = ill_v4->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - } - if (ill_v6 != NULL) { - ill_v6 = phyi->phyint_illv6; - for (ipif = ill_v6->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - for (ilm = ill_v6->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - } - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags |= PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); -} - -/* - * This function is called only when the phyint flags change. Currently - * called from ip_sioctl_flags. We re-do the broadcast nomination so - * that we can select a good ill. 
- */ -static void -ip_redo_nomination(phyint_t *phyi) -{ - ill_t *ill_v4; - - ill_v4 = phyi->phyint_illv4; - - if (ill_v4 != NULL && ill_v4->ill_group != NULL) { - ASSERT(IAM_WRITER_ILL(ill_v4)); - if (ill_v4->ill_group->illgrp_ill_count > 1) - ill_nominate_bcast_rcv(ill_v4->ill_group); - } -} - -/* - * Heuristic to check if ill is INACTIVE. - * Checks if ill has an ipif with an usable ip address. - * - * Return values: - * B_TRUE - ill is INACTIVE; has no usable ipif - * B_FALSE - ill is not INACTIVE; ill has at least one usable ipif - */ -static boolean_t -ill_is_inactive(ill_t *ill) -{ - ipif_t *ipif; - - /* Check whether it is in an IPMP group */ - if (ill->ill_phyint->phyint_groupname == NULL) - return (B_FALSE); - - if (ill->ill_ipif_up_count == 0) - return (B_TRUE); - - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - uint64_t flags = ipif->ipif_flags; - - /* - * This ipif is usable if it is IPIF_UP and not a - * dedicated test address. A dedicated test address - * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED - * (note in particular that V6 test addresses are - * link-local data addresses and thus are marked - * IPIF_NOFAILOVER but not IPIF_DEPRECATED). - */ - if ((flags & IPIF_UP) && - ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) != - (IPIF_DEPRECATED|IPIF_NOFAILOVER))) - return (B_FALSE); - } - return (B_TRUE); -} - -/* - * Set interface flags. - * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, - * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST, - * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE. + * Set interface flags. Many flags require special handling (e.g., + * bringing the interface down); see below for details. * * NOTE : We really don't enforce that ipif_id zero should be used * for setting any flags other than IFF_LOGINT_FLAGS. 
This @@ -11478,17 +11479,16 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, { uint64_t turn_on; uint64_t turn_off; - int err; + int err = 0; phyint_t *phyi; ill_t *ill; - uint64_t intf_flags; + uint64_t intf_flags, cantchange_flags; boolean_t phyint_flags_modified = B_FALSE; uint64_t flags; struct ifreq *ifr; struct lifreq *lifr; boolean_t set_linklocal = B_FALSE; boolean_t zero_source = B_FALSE; - ip_stack_t *ipst; ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -11497,11 +11497,10 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill = ipif->ipif_ill; phyi = ill->ill_phyint; - ipst = ill->ill_ipst; if (ipip->ipi_cmd_type == IF_CMD) { ifr = (struct ifreq *)if_req; - flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); + flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); } else { lifr = (struct lifreq *)if_req; flags = lifr->lifr_flags; @@ -11524,25 +11523,60 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, flags |= intf_flags & ~0xFFFF; /* - * First check which bits will change and then which will - * go on and off + * Explicitly fail attempts to change flags that are always invalid on + * an IPMP meta-interface. */ - turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE; - if (!turn_on) + if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID)) + return (EINVAL); + + /* + * Check which flags will change; silently ignore flags which userland + * is not allowed to control. (Because these flags may change between + * SIOCGLIFFLAGS and SIOCSLIFFLAGS, and that's outside of userland's + * control, we need to silently ignore them rather than fail.) 
+ */ + cantchange_flags = IFF_CANTCHANGE; + if (IS_IPMP(ill)) + cantchange_flags |= IFF_IPMP_CANTCHANGE; + + turn_on = (flags ^ intf_flags) & ~cantchange_flags; + if (turn_on == 0) return (0); /* No change */ turn_off = intf_flags & turn_on; turn_on ^= turn_off; - err = 0; /* - * Don't allow any bits belonging to the logical interface - * to be set or cleared on the replacement ipif that was - * created temporarily during a MOVE. + * All test addresses must be IFF_DEPRECATED (to ensure source address + * selection avoids them) -- so force IFF_DEPRECATED on, and do not + * allow it to be turned off. */ - if (ipif->ipif_replace_zero && - ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) { + if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED && + (turn_on|intf_flags) & IFF_NOFAILOVER) return (EINVAL); + + if (turn_on & IFF_NOFAILOVER) { + turn_on |= IFF_DEPRECATED; + flags |= IFF_DEPRECATED; + } + + /* + * On underlying interfaces, only allow applications to manage test + * addresses -- otherwise, they may get confused when the address + * moves as part of being brought up. Likewise, prevent an + * application-managed test address from being converted to a data + * address. To prevent migration of administratively up addresses in + * the kernel, we don't allow them to be converted either. + */ + if (IS_UNDER_IPMP(ill)) { + const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF; + + if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER)) + return (EINVAL); + + if ((turn_off & IFF_NOFAILOVER) && + (flags & (appflags | IFF_UP | IFF_DUPLICATE))) + return (EINVAL); } /* @@ -11583,16 +11617,6 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } /* - * ILL cannot be part of a usesrc group and and IPMP group at the - * same time. 
No need to grab ill_g_usesrc_lock here, see - * synchronization notes in ip.c - */ - if (turn_on & PHYI_STANDBY && - ipif->ipif_ill->ill_usesrc_grp_next != NULL) { - return (EINVAL); - } - - /* * If we modify physical interface flags, we'll potentially need to * send up two routing socket messages for the changes (one for the * IPv4 ill, and another for the IPv6 ill). Note that here. @@ -11601,98 +11625,44 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, phyint_flags_modified = B_TRUE; /* - * If we are setting or clearing FAILED or STANDBY or OFFLINE, - * we need to flush the IRE_CACHES belonging to this ill. - * We handle this case here without doing the DOWN/UP dance - * like it is done for other flags. If some other flags are - * being turned on/off with FAILED/STANDBY/OFFLINE, the code - * below will handle it by bringing it down and then - * bringing it UP. + * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE + * (otherwise, we'd immediately use them, defeating standby). Also, + * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not + * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already + * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We + * also don't allow PHYI_STANDBY if VNI is enabled since its semantics + * will not be honored. */ - if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) { - ill_t *ill_v4, *ill_v6; - - ill_v4 = phyi->phyint_illv4; - ill_v6 = phyi->phyint_illv6; - + if (turn_on & PHYI_STANDBY) { /* - * First set the INACTIVE flag if needed. Then delete the ires. - * ire_add will atomically prevent creating new IRE_CACHEs - * unless hidden flag is set. - * PHYI_FAILED and PHYI_INACTIVE are exclusive + * No need to grab ill_g_usesrc_lock here; see the + * synchronization notes in ip.c. 
*/ - if ((turn_on & PHYI_FAILED) && - ((intf_flags & PHYI_STANDBY) || - !ipst->ips_ipmp_enable_failback)) { - /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */ - phyi->phyint_flags &= ~PHYI_INACTIVE; - } - if ((turn_off & PHYI_FAILED) && - ((intf_flags & PHYI_STANDBY) || - (!ipst->ips_ipmp_enable_failback && - ill_is_inactive(ill)))) { - phyint_inactive(phyi); - } - - if (turn_on & PHYI_STANDBY) { - /* - * We implicitly set INACTIVE only when STANDBY is set. - * INACTIVE is also set on non-STANDBY phyint when user - * disables FAILBACK using configuration file. - * Do not allow STANDBY to be set on such INACTIVE - * phyint - */ - if (phyi->phyint_flags & PHYI_INACTIVE) - return (EINVAL); - if (!(phyi->phyint_flags & PHYI_FAILED)) - phyint_inactive(phyi); - } - if (turn_off & PHYI_STANDBY) { - if (ipst->ips_ipmp_enable_failback) { - /* - * Reset PHYI_INACTIVE. - */ - phyi->phyint_flags &= ~PHYI_INACTIVE; - } else if (ill_is_inactive(ill) && - !(phyi->phyint_flags & PHYI_FAILED)) { - /* - * Need to set INACTIVE, when user sets - * STANDBY on a non-STANDBY phyint and - * later resets STANDBY - */ - phyint_inactive(phyi); - } + if (ill->ill_usesrc_grp_next != NULL || + intf_flags & PHYI_INACTIVE) + return (EINVAL); + if (!(flags & PHYI_FAILED)) { + flags |= PHYI_INACTIVE; + turn_on |= PHYI_INACTIVE; } - /* - * We should always send up a message so that the - * daemons come to know of it. Note that the zeroth - * interface can be down and the check below for IPIF_UP - * will not make sense as we are actually setting - * a phyint flag here. We assume that the ipif used - * is always the zeroth ipif. (ip_rts_ifmsg does not - * send up any message for non-zero ipifs). 
- */ - phyint_flags_modified = B_TRUE; + } - if (ill_v4 != NULL) { - ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, ill_stq_cache_delete, - (char *)ill_v4, ill_v4); - illgrp_reset_schednext(ill_v4); - } - if (ill_v6 != NULL) { - ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, ill_stq_cache_delete, - (char *)ill_v6, ill_v6); - illgrp_reset_schednext(ill_v6); - } + if (turn_off & PHYI_STANDBY) { + flags &= ~PHYI_INACTIVE; + turn_off |= PHYI_INACTIVE; } /* + * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both + * would end up on. + */ + if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) == + (PHYI_FAILED | PHYI_INACTIVE)) + return (EINVAL); + + /* * If ILLF_ROUTER changes, we need to change the ip forwarding - * status of the interface and, if the interface is part of an IPMP - * group, all other interfaces that are part of the same IPMP - * group. + * status of the interface. */ if ((turn_on | turn_off) & ILLF_ROUTER) (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); @@ -11718,33 +11688,31 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, mutex_exit(&ill->ill_phyint->phyint_lock); /* - * We do the broadcast and nomination here rather - * than waiting for a FAILOVER/FAILBACK to happen. In - * the case of FAILBACK from INACTIVE standby to the - * interface that has been repaired, PHYI_FAILED has not - * been cleared yet. If there are only two interfaces in - * that group, all we have is a FAILED and INACTIVE - * interface. If we do the nomination soon after a failback, - * the broadcast nomination code would select the - * INACTIVE interface for receiving broadcasts as FAILED is - * not yet cleared. As we don't want STANDBY/INACTIVE to - * receive broadcast packets, we need to redo nomination - * when the FAILED is cleared here. Thus, in general we - * always do the nomination here for FAILED, STANDBY - * and OFFLINE. 
+ * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the + * same to the kernel: if any of them has been set by + * userland, the interface cannot be used for data traffic. */ - if (((turn_on | turn_off) & - (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) { - ip_redo_nomination(phyi); + if ((turn_on|turn_off) & + (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { + ASSERT(!IS_IPMP(ill)); + /* + * It's possible the ill is part of an "anonymous" + * IPMP group rather than a real group. In that case, + * there are no other interfaces in the group and thus + * no need to call ipmp_phyint_refresh_active(). + */ + if (IS_UNDER_IPMP(ill)) + ipmp_phyint_refresh_active(phyi); } + if (phyint_flags_modified) { if (phyi->phyint_illv4 != NULL) { ip_rts_ifmsg(phyi->phyint_illv4-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } if (phyi->phyint_illv6 != NULL) { ip_rts_ifmsg(phyi->phyint_illv6-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } } return (0); @@ -11785,15 +11753,17 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } /* - * The only flag changes that we currently take specific action on - * is IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, - * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and - * IPIF_PREFERRED. This is done by bring the ipif down, changing - * the flags and bringing it back up again. + * The only flag changes that we currently take specific action on are + * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP, + * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and + * IPIF_NOFAILOVER. This is done by bring the ipif down, changing the + * flags and bringing it back up again. For IPIF_NOFAILOVER, the act + * of bringing it back up will trigger the address to be moved. 
*/ if ((turn_on|turn_off) & (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| - ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) { + ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| + IPIF_NOFAILOVER)) { /* * Taking this ipif down, make sure we have * valid net and subnet bcast ire's for other @@ -11822,9 +11792,8 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) { ill_t *ill; phyint_t *phyi; - uint64_t turn_on; - uint64_t turn_off; - uint64_t intf_flags; + uint64_t turn_on, turn_off; + uint64_t intf_flags, cantchange_flags; boolean_t phyint_flags_modified = B_FALSE; int err = 0; boolean_t set_linklocal = B_FALSE; @@ -11839,12 +11808,15 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) phyi = ill->ill_phyint; intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; - turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP); + cantchange_flags = IFF_CANTCHANGE | IFF_UP; + if (IS_IPMP(ill)) + cantchange_flags |= IFF_IPMP_CANTCHANGE; + turn_on = (flags ^ intf_flags) & ~cantchange_flags; turn_off = intf_flags & turn_on; turn_on ^= turn_off; - if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) + if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) phyint_flags_modified = B_TRUE; /* @@ -11870,9 +11842,6 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) mutex_exit(&ill->ill_lock); mutex_exit(&phyi->phyint_lock); - if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) - ip_redo_nomination(phyi); - if (set_linklocal) (void) ipif_setlinklocal(ipif); @@ -11881,12 +11850,29 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) else ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; + /* + * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to + * the kernel: if any of them has been set by userland, the interface + * cannot be used for data traffic. 
+ */ + if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { + ASSERT(!IS_IPMP(ill)); + /* + * It's possible the ill is part of an "anonymous" IPMP group + * rather than a real group. In that case, there are no other + * interfaces in the group and thus no need for us to call + * ipmp_phyint_refresh_active(). + */ + if (IS_UNDER_IPMP(ill)) + ipmp_phyint_refresh_active(phyi); + } + if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) { /* * XXX ipif_up really does not know whether a phyint flags * was modified or not. So, it sends up information on * only one routing sockets message. As we don't bring up - * the interface and also set STANDBY/FAILED simultaneously + * the interface and also set PHYI_ flags simultaneously * it should be okay. */ err = ipif_up(ipif, q, mp); @@ -11898,14 +11884,14 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) if (phyint_flags_modified) { if (phyi->phyint_illv4 != NULL) { ip_rts_ifmsg(phyi->phyint_illv4-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } if (phyi->phyint_illv6 != NULL) { ip_rts_ifmsg(phyi->phyint_illv6-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } } else { - ip_rts_ifmsg(ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); } /* * Update the flags in SCTP's IPIF list, ipif_up() will do @@ -12101,10 +12087,7 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * broadcast address makes sense. If it does, * there should be an IRE for it already. * Don't match on ipif, only on the ill - * since we are sharing these now. Don't use - * MATCH_IRE_ILL_GROUP as we are looking for - * the broadcast ire on this ill and each ill - * in the group has its own broadcast ire. + * since we are sharing these now. 
*/ ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif, ALL_ZONES, NULL, @@ -12302,9 +12285,16 @@ int ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *if_req) { - ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); + + /* + * Since no applications should ever be setting metrics on underlying + * interfaces, we explicitly fail to smoke 'em out. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + return (EINVAL); + /* * Set interface metric. We don't use this for * anything but we keep track of it in case it is @@ -12332,6 +12322,7 @@ ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* Get interface metric. */ ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); + if (ipip->ipi_cmd_type == IF_CMD) { struct ifreq *ifr; @@ -12766,13 +12757,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, nipif->ipif_state_flags |= IPIF_CHANGING; } - mutex_exit(&ill->ill_lock); - if (lir->lir_maxmtu != 0) { ill->ill_max_mtu = lir->lir_maxmtu; - ill->ill_mtu_userspecified = 1; + ill->ill_user_mtu = lir->lir_maxmtu; mtu_walk = B_TRUE; } + mutex_exit(&ill->ill_lock); if (lir->lir_reachtime != 0) ill->ill_reachable_time = lir->lir_reachtime; @@ -12821,6 +12811,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ILL_UNMARK_CHANGING(ill); mutex_exit(&ill->ill_lock); + /* + * Refresh IPMP meta-interface MTU if necessary. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_illgrp_refresh_mtu(ill->ill_grp); + return (0); } @@ -13032,13 +13028,117 @@ ipif_assign_seqid(ipif_t *ipif) } /* + * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are + * administratively down (i.e., no DAD), of the same type, and locked. Note + * that the clone is complete -- including the seqid -- and the expectation is + * that the caller will either free or overwrite `sipif' before it's unlocked. 
+ */ +static void +ipif_clone(const ipif_t *sipif, ipif_t *dipif) +{ + ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock)); + ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock)); + ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); + ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); + ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); + ASSERT(sipif->ipif_arp_del_mp == NULL); + ASSERT(dipif->ipif_arp_del_mp == NULL); + ASSERT(sipif->ipif_igmp_rpt == NULL); + ASSERT(dipif->ipif_igmp_rpt == NULL); + ASSERT(sipif->ipif_multicast_up == 0); + ASSERT(dipif->ipif_multicast_up == 0); + ASSERT(sipif->ipif_joined_allhosts == 0); + ASSERT(dipif->ipif_joined_allhosts == 0); + + dipif->ipif_mtu = sipif->ipif_mtu; + dipif->ipif_flags = sipif->ipif_flags; + dipif->ipif_metric = sipif->ipif_metric; + dipif->ipif_zoneid = sipif->ipif_zoneid; + dipif->ipif_v6subnet = sipif->ipif_v6subnet; + dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; + dipif->ipif_v6src_addr = sipif->ipif_v6src_addr; + dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; + dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; + dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; + + /* + * While dipif is down right now, it might've been up before. Since + * it's changing identity, its packet counters need to be reset. + */ + dipif->ipif_ib_pkt_count = 0; + dipif->ipif_ob_pkt_count = 0; + dipif->ipif_fo_pkt_count = 0; + + /* + * As per the comment atop the function, we assume that these sipif + * fields will be changed before sipif is unlocked. + */ + dipif->ipif_seqid = sipif->ipif_seqid; + dipif->ipif_saved_ire_mp = sipif->ipif_saved_ire_mp; + dipif->ipif_saved_ire_cnt = sipif->ipif_saved_ire_cnt; + dipif->ipif_state_flags = sipif->ipif_state_flags; +} + +/* + * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif' + * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin + * (unreferenced) ipif. 
Also, if `sipif' is used by the current xop, then + * transfer the xop to `dipif'. Requires that all ipifs are administratively + * down (i.e., no DAD), of the same type, and unlocked. + */ +static void +ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) +{ + ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq; + int ipx_current_ioctl; + + ASSERT(sipif != dipif); + ASSERT(sipif != virgipif); + + /* + * Grab all of the locks that protect the ipif in a defined order. + */ + GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); + if (sipif > dipif) { + mutex_enter(&sipif->ipif_saved_ire_lock); + mutex_enter(&dipif->ipif_saved_ire_lock); + } else { + mutex_enter(&dipif->ipif_saved_ire_lock); + mutex_enter(&sipif->ipif_saved_ire_lock); + } + + ipif_clone(sipif, dipif); + if (virgipif != NULL) { + ipif_clone(virgipif, sipif); + mi_free(virgipif); + } + + mutex_exit(&sipif->ipif_saved_ire_lock); + mutex_exit(&dipif->ipif_saved_ire_lock); + RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); + + /* + * Transfer ownership of the current xop, if necessary. + */ + if (ipsq->ipsq_xop->ipx_current_ipif == sipif) { + ASSERT(ipsq->ipsq_xop->ipx_pending_ipif == NULL); + ipx_current_ioctl = ipsq->ipsq_xop->ipx_current_ioctl; + ipsq_current_finish(ipsq); + ipsq_current_start(ipsq, dipif, ipx_current_ioctl); + } + + if (virgipif == NULL) + mi_free(sipif); +} + +/* * Insert the ipif, so that the list of ipifs on the ill will be sorted * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will * be inserted into the first space available in the list. The value of * ipif_id will then be set to the appropriate value for its position. 
*/ static int -ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) +ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock) { ill_t *ill; ipif_t *tipif; @@ -13056,12 +13156,11 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) /* * In the case of lo0:0 we already hold the ill_g_lock. * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> - * ipif_insert. Another such caller is ipif_move. + * ipif_insert. */ if (acquire_g_lock) rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - if (acquire_ill_lock) - mutex_enter(&ill->ill_lock); + mutex_enter(&ill->ill_lock); id = ipif->ipif_id; tipifp = &(ill->ill_ipif); if (id == -1) { /* need to find a real id */ @@ -13075,8 +13174,7 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) } /* limit number of logical interfaces */ if (id >= ipst->ips_ip_addrs_per_if) { - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); if (acquire_g_lock) rw_exit(&ipst->ips_ill_g_lock); return (-1); @@ -13091,8 +13189,7 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) tipifp = &(tipif->ipif_next); } } else { - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); if (acquire_g_lock) rw_exit(&ipst->ips_ill_g_lock); return (-1); @@ -13102,25 +13199,22 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) ipif->ipif_next = tipif; *tipifp = ipif; - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); if (acquire_g_lock) rw_exit(&ipst->ips_ill_g_lock); + return (0); } static void -ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) +ipif_remove(ipif_t *ipif) { ipif_t **ipifp; ill_t *ill = ipif->ipif_ill; ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock)); - if (acquire_ill_lock) - mutex_enter(&ill->ill_lock); - else - ASSERT(MUTEX_HELD(&ill->ill_lock)); + mutex_enter(&ill->ill_lock); ipifp = &ill->ill_ipif; for 
(; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { if (*ipifp == ipif) { @@ -13128,9 +13222,7 @@ ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) break; } } - - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); } /* @@ -13149,10 +13241,12 @@ ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) * second DL_INFO_ACK comes in from the driver. */ static ipif_t * -ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) +ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, + boolean_t insert) { ipif_t *ipif; - phyint_t *phyi; + phyint_t *phyi = ill->ill_phyint; + ip_stack_t *ipst = ill->ill_ipst; ip1dbg(("ipif_allocate(%s:%d ill %p)\n", ill->ill_name, id, (void *)ill)); @@ -13175,23 +13269,61 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) ipif->ipif_refcnt = 0; ipif->ipif_saved_ire_cnt = 0; - if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) { - mi_free(ipif); - return (NULL); + if (insert) { + if (ipif_insert(ipif, ire_type != IRE_LOOPBACK) != 0) { + mi_free(ipif); + return (NULL); + } + /* -1 id should have been replaced by real id */ + id = ipif->ipif_id; + ASSERT(id >= 0); } - /* -1 id should have been replaced by real id */ - id = ipif->ipif_id; - ASSERT(id >= 0); if (ill->ill_name[0] != '\0') ipif_assign_seqid(ipif); /* - * Keep a copy of original id in ipif_orig_ipifid. Failback - * will attempt to restore the original id. The SIOCSLIFOINDEX - * ioctl sets ipif_orig_ipifid to zero. + * If this is ipif zero, configure ill/phyint-wide information. + * Defer most configuration until we're guaranteed we're attached. */ - ipif->ipif_orig_ipifid = id; + if (id == 0) { + if (ill->ill_mactype == SUNW_DL_IPMP) { + /* + * Set PHYI_IPMP and also set PHYI_FAILED since there + * are no active interfaces. Similarly, PHYI_RUNNING + * isn't set until the group has an active interface. 
+ */ + mutex_enter(&phyi->phyint_lock); + phyi->phyint_flags |= (PHYI_IPMP | PHYI_FAILED); + mutex_exit(&phyi->phyint_lock); + + /* + * Create the illgrp (which must not exist yet because + * the zeroth ipif is created once per ill). However, + * do not not link it to the ipmp_grp_t until I_PLINK + * is called; see ip_sioctl_plink_ipmp() for details. + */ + if (ipmp_illgrp_create(ill) == NULL) { + if (insert) { + rw_enter(&ipst->ips_ill_g_lock, + RW_WRITER); + ipif_remove(ipif); + rw_exit(&ipst->ips_ill_g_lock); + } + mi_free(ipif); + return (NULL); + } + } else { + /* + * By default, PHYI_RUNNING is set when the zeroth + * ipif is created. For other ipifs, we don't touch + * it since DLPI notifications may have changed it. + */ + mutex_enter(&phyi->phyint_lock); + phyi->phyint_flags |= PHYI_RUNNING; + mutex_exit(&phyi->phyint_lock); + } + } /* * We grab the ill_lock and phyint_lock to protect the flag changes. @@ -13199,18 +13331,9 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) * ioctl completes and the IPIF_CHANGING flag is cleared. */ mutex_enter(&ill->ill_lock); - mutex_enter(&ill->ill_phyint->phyint_lock); - /* - * Set the running flag when logical interface zero is created. - * For subsequent logical interfaces, a DLPI link down - * notification message may have cleared the running flag to - * indicate the link is down, so we shouldn't just blindly set it. - */ - if (id == 0) - ill->ill_phyint->phyint_flags |= PHYI_RUNNING; + mutex_enter(&phyi->phyint_lock); + ipif->ipif_ire_type = ire_type; - phyi = ill->ill_phyint; - ipif->ipif_orig_ifindex = phyi->phyint_ifindex; if (ipif->ipif_isv6) { ill->ill_flags |= ILLF_IPV6; @@ -13238,14 +13361,18 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) * Don't set the interface flags etc. now, will do it in * ip_ll_subnet_defaults. 
*/ - if (!initialize) { - mutex_exit(&ill->ill_lock); - mutex_exit(&ill->ill_phyint->phyint_lock); - return (ipif); - } + if (!initialize) + goto out; + ipif->ipif_mtu = ill->ill_max_mtu; - if (ill->ill_bcast_addr_length != 0) { + /* + * NOTE: The IPMP meta-interface is special-cased because it starts + * with no underlying interfaces (and thus an unknown broadcast + * address length), but all interfaces that can be placed into an IPMP + * group are required to be broadcast-capable. + */ + if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) { /* * Later detect lack of DLPI driver multicast * capability by catching DL_ENABMULTI errors in @@ -13269,8 +13396,7 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) ill->ill_flags |= ILLF_NOARP; } if (ill->ill_phys_addr_length == 0) { - if (ill->ill_media && - ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { + if (ill->ill_mactype == SUNW_DL_VNI) { ipif->ipif_flags |= IPIF_NOXMIT; phyi->phyint_flags |= PHYI_VIRTUAL; } else { @@ -13285,8 +13411,9 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) } } } +out: + mutex_exit(&phyi->phyint_lock); mutex_exit(&ill->ill_lock); - mutex_exit(&ill->ill_phyint->phyint_lock); return (ipif); } @@ -13300,34 +13427,49 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) * for details. 
*/ void -ipif_arp_down(ipif_t *ipif) +ipif_resolver_down(ipif_t *ipif) { mblk_t *mp; ill_t *ill = ipif->ipif_ill; - ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); + ip1dbg(("ipif_resolver_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); ASSERT(IAM_WRITER_IPIF(ipif)); + if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) + return; + /* Delete the mapping for the local address */ mp = ipif->ipif_arp_del_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, mp); ipif->ipif_arp_del_mp = NULL; } /* + * Make IPMP aware of the deleted data address. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + + /* * If this is the last ipif that is going down and there are no * duplicate addresses we may yet attempt to re-probe, then we need to * clean up ARP completely. */ if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) { + /* + * If this was the last ipif on an IPMP interface, purge any + * IPMP ARP entries associated with it. 
+ */ + if (IS_IPMP(ill)) + ipmp_illgrp_refresh_arpent(ill->ill_grp); /* Send up AR_INTERFACE_DOWN message */ mp = ill->ill_arp_down_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, mp); @@ -13337,7 +13479,7 @@ ipif_arp_down(ipif_t *ipif) /* Tell ARP to delete the multicast mappings */ mp = ill->ill_arp_del_mapping_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, mp); @@ -13377,6 +13519,13 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) return (0); /* + * IPMP meta-interfaces don't have any inherent multicast mappings, + * and instead use the ones on the underlying interfaces. + */ + if (IS_IPMP(ill)) + return (0); + + /* * Delete the existing mapping from ARP. Normally ipif_down * -> ipif_arp_down should send this up to ARP. The only * reason we would find this when we are switching from @@ -13473,26 +13622,23 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) } /* - * Get the resolver set up for a new interface address. - * (Always called as writer.) - * Called both for IPv4 and IPv6 interfaces, - * though it only sets up the resolver for v6 - * if it's an xresolv interface (one using an external resolver). - * Honors ILLF_NOARP. - * The enumerated value res_act is used to tune the behavior. - * If set to Res_act_initial, then we set up all the resolver - * structures for a new interface. If set to Res_act_move, then - * we just send an AR_ENTRY_ADD message up to ARP for IPv4 - * interfaces; this is called by ip_rput_dlpi_writer() to handle - * asynchronous hardware address change notification. 
If set to - * Res_act_defend, then we tell ARP that it needs to send a single - * gratuitous message in defense of the address. + * Get the resolver set up for a new IP address. (Always called as writer.) + * Called both for IPv4 and IPv6 interfaces, though it only sets up the + * resolver for v6 if it's an ILLF_XRESOLV interface. Honors ILLF_NOARP. + * + * The enumerated value res_act tunes the behavior: + * * Res_act_initial: set up all the resolver structures for a new + * IP address. + * * Res_act_defend: tell ARP that it needs to send a single gratuitous + * ARP message in defense of the address. + * * Res_act_rebind: tell ARP to change the hardware address for an IP + * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif(). + * * Returns error on failure. */ int ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) { - caddr_t addr; mblk_t *arp_up_mp = NULL; mblk_t *arp_down_mp = NULL; mblk_t *arp_add_mp = NULL; @@ -13500,9 +13646,9 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) mblk_t *arp_add_mapping_mp = NULL; mblk_t *arp_del_mapping_mp = NULL; ill_t *ill = ipif->ipif_ill; - uchar_t *area_p = NULL; - uchar_t *ared_p = NULL; int err = ENOMEM; + boolean_t added_ipif = B_FALSE; + boolean_t publish; boolean_t was_dup; ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", @@ -13540,11 +13686,7 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) * External resolver for IPv6 */ ASSERT(res_act == Res_act_initial); - if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { - addr = (caddr_t)&ipif->ipif_v6lcl_addr; - area_p = (uchar_t *)&ip6_area_template; - ared_p = (uchar_t *)&ip6_ared_template; - } + publish = !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr); } else { /* * IPv4 arp case. 
If the ARP stream has already started @@ -13562,41 +13704,39 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) ill->ill_arp_bringup_pending = 1; mutex_exit(&ill->ill_lock); } - if (ipif->ipif_lcl_addr != INADDR_ANY) { - addr = (caddr_t)&ipif->ipif_lcl_addr; - area_p = (uchar_t *)&ip_area_template; - ared_p = (uchar_t *)&ip_ared_template; + publish = (ipif->ipif_lcl_addr != INADDR_ANY); + } + + if (IS_IPMP(ill) && publish) { + /* + * If we're here via ipif_up(), then the ipif won't be bound + * yet -- add it to the group, which will bind it if possible. + * (We would add it in ipif_up(), but deleting on failure + * there is gruesome.) If we're here via ipmp_ill_bind_ipif(), + * then the ipif has already been added to the group and we + * just need to use the binding. + */ + if (ipmp_ipif_bound_ill(ipif) == NULL) { + if (ipmp_illgrp_add_ipif(ill->ill_grp, ipif) == NULL) { + /* + * We couldn't bind the ipif to an ill yet, + * so we have nothing to publish. + */ + publish = B_FALSE; + } + added_ipif = B_TRUE; } } /* * Add an entry for the local address in ARP only if it - * is not UNNUMBERED and the address is not INADDR_ANY. + * is not UNNUMBERED and it is suitable for publishing. */ - if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) { - area_t *area; - - /* Now ask ARP to publish our address. */ - arp_add_mp = ill_arp_alloc(ill, area_p, addr); - if (arp_add_mp == NULL) - goto failed; - area = (area_t *)arp_add_mp->b_rptr; - if (res_act != Res_act_initial) { - /* - * Copy the new hardware address and length into - * arp_add_mp to be sent to ARP. 
- */ - area->area_hw_addr_length = ill->ill_phys_addr_length; - bcopy(ill->ill_phys_addr, - ((char *)area + area->area_hw_addr_offset), - area->area_hw_addr_length); - } - - area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | - ACE_F_MYADDR; - + if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && publish) { if (res_act == Res_act_defend) { - area->area_flags |= ACE_F_DEFEND; + arp_add_mp = ipif_area_alloc(ipif, ACE_F_DEFEND); + if (arp_add_mp == NULL) + goto failed; /* * If we're just defending our address now, then * there's no need to set up ARP multicast mappings. @@ -13605,17 +13745,18 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) goto done; } - if (res_act != Res_act_initial) - goto arp_setup_multicast; - /* - * Allocate an ARP deletion message so we know we can tell ARP - * when the interface goes down. + * Allocate an ARP add message and an ARP delete message (the + * latter is saved for use when the address goes down). */ - arp_del_mp = ill_arp_alloc(ill, ared_p, addr); - if (arp_del_mp == NULL) + if ((arp_add_mp = ipif_area_alloc(ipif, 0)) == NULL) + goto failed; + + if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) goto failed; + if (res_act != Res_act_initial) + goto arp_setup_multicast; } else { if (res_act != Res_act_initial) goto done; @@ -13624,14 +13765,11 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) * Need to bring up ARP or setup multicast mapping only * when the first interface is coming UP. */ - if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || - was_dup) { + if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0 || was_dup) goto done; - } /* - * Allocate an ARP down message (to be saved) and an ARP up - * message. + * Allocate an ARP down message (to be saved) and an ARP up message. */ arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0); if (arp_down_mp == NULL) @@ -13648,33 +13786,21 @@ arp_setup_multicast: /* * Setup the multicast mappings. 
This function initializes * ill_arp_del_mapping_mp also. This does not need to be done for - * IPv6. + * IPv6, or for the IPMP interface (since it has no link-layer). */ - if (!ill->ill_isv6) { + if (!ill->ill_isv6 && !IS_IPMP(ill)) { err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp); if (err != 0) goto failed; ASSERT(ill->ill_arp_del_mapping_mp != NULL); ASSERT(arp_add_mapping_mp != NULL); } - done: - if (arp_del_mp != NULL) { - ASSERT(ipif->ipif_arp_del_mp == NULL); - ipif->ipif_arp_del_mp = arp_del_mp; - } - if (arp_down_mp != NULL) { - ASSERT(ill->ill_arp_down_mp == NULL); - ill->ill_arp_down_mp = arp_down_mp; - } - if (arp_del_mapping_mp != NULL) { - ASSERT(ill->ill_arp_del_mapping_mp == NULL); - ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; - } if (arp_up_mp != NULL) { ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, arp_up_mp); + arp_up_mp = NULL; } if (arp_add_mp != NULL) { ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", @@ -13686,6 +13812,7 @@ done: if (!ill->ill_arp_extend) ipif->ipif_addr_ready = 1; putnext(ill->ill_rq, arp_add_mp); + arp_add_mp = NULL; } else { ipif->ipif_addr_ready = 1; } @@ -13693,29 +13820,40 @@ done: ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, arp_add_mapping_mp); + arp_add_mapping_mp = NULL; } - if (res_act != Res_act_initial) - return (0); - if (ill->ill_flags & ILLF_NOARP) - err = ill_arp_off(ill); - else - err = ill_arp_on(ill); - if (err != 0) { - ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err)); - freemsg(ipif->ipif_arp_del_mp); - freemsg(ill->ill_arp_down_mp); - freemsg(ill->ill_arp_del_mapping_mp); - ipif->ipif_arp_del_mp = NULL; - ill->ill_arp_down_mp = NULL; - ill->ill_arp_del_mapping_mp = NULL; - return (err); + if (res_act == Res_act_initial) { + if (ill->ill_flags & ILLF_NOARP) + err = ill_arp_off(ill); + else + err = ill_arp_on(ill); + if (err != 0) { + ip0dbg(("ipif_resolver_up: 
arp_on/off failed %d\n", + err)); + goto failed; + } } + + if (arp_del_mp != NULL) { + ASSERT(ipif->ipif_arp_del_mp == NULL); + ipif->ipif_arp_del_mp = arp_del_mp; + } + if (arp_down_mp != NULL) { + ASSERT(ill->ill_arp_down_mp == NULL); + ill->ill_arp_down_mp = arp_down_mp; + } + if (arp_del_mapping_mp != NULL) { + ASSERT(ill->ill_arp_del_mapping_mp == NULL); + ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; + } + return ((ill->ill_ipif_up_count != 0 || was_dup || ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS); - failed: ip1dbg(("ipif_resolver_up: FAILED\n")); + if (added_ipif) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); freemsg(arp_add_mp); freemsg(arp_del_mp); freemsg(arp_add_mapping_mp); @@ -13734,13 +13872,12 @@ ipif_arp_start_dad(ipif_t *ipif) { ill_t *ill = ipif->ipif_ill; mblk_t *arp_add_mp; - area_t *area; + /* ACE_F_UNVERIFIED restarts DAD */ if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || (ipif->ipif_flags & IPIF_UNNUMBERED) || ipif->ipif_lcl_addr == INADDR_ANY || - (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, - (char *)&ipif->ipif_lcl_addr)) == NULL) { + (arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) { /* * If we can't contact ARP for some reason, that's not really a * problem. 
Just send out the routing socket notification that @@ -13752,10 +13889,6 @@ ipif_arp_start_dad(ipif_t *ipif) return; } - /* Setting the 'unverified' flag restarts DAD */ - area = (area_t *)arp_add_mp->b_rptr; - area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | - ACE_F_UNVERIFIED; putnext(ill->ill_rq, arp_add_mp); } @@ -13764,7 +13897,8 @@ ipif_ndp_start_dad(ipif_t *ipif) { nce_t *nce; - nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE); + nce = ndp_lookup_v6(ipif->ipif_ill, B_TRUE, &ipif->ipif_v6lcl_addr, + B_FALSE); if (nce == NULL) return; @@ -13805,7 +13939,7 @@ ill_restart_dad(ill_t *ill, boolean_t went_up) */ if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || (!ill->ill_isv6 && !ill->ill_arp_extend)) { - ip_rts_ifmsg(ill->ill_ipif); + ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); return; } @@ -13838,8 +13972,10 @@ ill_restart_dad(ill_t *ill, boolean_t went_up) * we'll handle eventual routing socket * notification via DAD completion.) */ - if (ipif == ill->ill_ipif) - ip_rts_ifmsg(ill->ill_ipif); + if (ipif == ill->ill_ipif) { + ip_rts_ifmsg(ill->ill_ipif, + RTSQ_DEFAULT); + } } } else { /* @@ -13855,285 +13991,30 @@ ill_restart_dad(ill_t *ill, boolean_t went_up) * If we've torn down links, then notify the user right away. */ if (!went_up) - ip_rts_ifmsg(ill->ill_ipif); -} - -/* - * Wakeup all threads waiting to enter the ipsq, and sleeping - * on any of the ills in this ipsq. 
The ill_lock of the ill - * must be held so that waiters don't miss wakeups - */ -static void -ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock) -{ - phyint_t *phyint; - - phyint = ipsq->ipsq_phyint_list; - while (phyint != NULL) { - if (phyint->phyint_illv4) { - if (!caller_holds_lock) - mutex_enter(&phyint->phyint_illv4->ill_lock); - ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); - cv_broadcast(&phyint->phyint_illv4->ill_cv); - if (!caller_holds_lock) - mutex_exit(&phyint->phyint_illv4->ill_lock); - } - if (phyint->phyint_illv6) { - if (!caller_holds_lock) - mutex_enter(&phyint->phyint_illv6->ill_lock); - ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); - cv_broadcast(&phyint->phyint_illv6->ill_cv); - if (!caller_holds_lock) - mutex_exit(&phyint->phyint_illv6->ill_lock); - } - phyint = phyint->phyint_ipsq_next; - } -} - -static ipsq_t * -ipsq_create(char *groupname, ip_stack_t *ipst) -{ - ipsq_t *ipsq; - - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); - if (ipsq == NULL) { - return (NULL); - } - - if (groupname != NULL) - (void) strcpy(ipsq->ipsq_name, groupname); - else - ipsq->ipsq_name[0] = '\0'; - - mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL); - ipsq->ipsq_flags |= IPSQ_GROUP; - ipsq->ipsq_next = ipst->ips_ipsq_g_head; - ipst->ips_ipsq_g_head = ipsq; - ipsq->ipsq_ipst = ipst; /* No netstack_hold */ - return (ipsq); -} - -/* - * Return an ipsq correspoding to the groupname. If 'create' is true - * allocate a new ipsq if one does not exist. Usually an ipsq is associated - * uniquely with an IPMP group. However during IPMP groupname operations, - * multiple IPMP groups may be associated with a single ipsq. But no - * IPMP group can be associated with more than 1 ipsq at any time. 
- * For example - * Interfaces IPMP grpname ipsq ipsq_name ipsq_refs - * hme1, hme2 mpk17-84 ipsq1 mpk17-84 2 - * hme3, hme4 mpk17-85 ipsq2 mpk17-85 2 - * - * Now the command ifconfig hme3 group mpk17-84 results in the temporary - * status shown below during the execution of the above command. - * hme1, hme2, hme3, hme4 mpk17-84, mpk17-85 ipsq1 mpk17-84 4 - * - * After the completion of the above groupname command we return to the stable - * state shown below. - * hme1, hme2, hme3 mpk17-84 ipsq1 mpk17-84 3 - * hme4 mpk17-85 ipsq2 mpk17-85 1 - * - * Because of the above, we don't search based on the ipsq_name since that - * would miss the correct ipsq during certain windows as shown above. - * The ipsq_name is only used during split of an ipsq to return the ipsq to its - * natural state. - */ -static ipsq_t * -ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq, - ip_stack_t *ipst) -{ - ipsq_t *ipsq; - int group_len; - phyint_t *phyint; - - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - - group_len = strlen(groupname); - ASSERT(group_len != 0); - group_len++; - - for (ipsq = ipst->ips_ipsq_g_head; - ipsq != NULL; - ipsq = ipsq->ipsq_next) { - /* - * When an ipsq is being split, and ill_split_ipsq - * calls this function, we exclude it from being considered. - */ - if (ipsq == exclude_ipsq) - continue; - - /* - * Compare against the ipsq_name. The groupname change happens - * in 2 phases. The 1st phase merges the from group into - * the to group's ipsq, by calling ill_merge_groups and restarts - * the ioctl. The 2nd phase then locates the ipsq again thru - * ipsq_name. At this point the phyint_groupname has not been - * updated. - */ - if ((group_len == strlen(ipsq->ipsq_name) + 1) && - (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) { - /* - * Verify that an ipmp groupname is exactly - * part of 1 ipsq and is not found in any other - * ipsq. 
- */ - ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, ipst) == - NULL); - return (ipsq); - } - - /* - * Comparison against ipsq_name alone is not sufficient. - * In the case when groups are currently being - * merged, the ipsq could hold other IPMP groups temporarily. - * so we walk the phyint list and compare against the - * phyint_groupname as well. - */ - phyint = ipsq->ipsq_phyint_list; - while (phyint != NULL) { - if ((group_len == phyint->phyint_groupname_len) && - (bcmp(phyint->phyint_groupname, groupname, - group_len) == 0)) { - /* - * Verify that an ipmp groupname is exactly - * part of 1 ipsq and is not found in any other - * ipsq. - */ - ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, - ipst) == NULL); - return (ipsq); - } - phyint = phyint->phyint_ipsq_next; - } - } - if (create) - ipsq = ipsq_create(groupname, ipst); - return (ipsq); + ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); } static void ipsq_delete(ipsq_t *ipsq) { - ipsq_t *nipsq; - ipsq_t *pipsq = NULL; - ip_stack_t *ipst = ipsq->ipsq_ipst; - - /* - * We don't hold the ipsq lock, but we are sure no new - * messages can land up, since the ipsq_refs is zero. - * i.e. this ipsq is unnamed and no phyint or phyint group - * is associated with this ipsq. (Lookups are based on ill_name - * or phyint_groupname) - */ - ASSERT(ipsq->ipsq_refs == 0); - ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL); - ASSERT(ipsq->ipsq_pending_mp == NULL); - if (!(ipsq->ipsq_flags & IPSQ_GROUP)) { - /* - * This is not the ipsq of an IPMP group. - */ - ipsq->ipsq_ipst = NULL; - kmem_free(ipsq, sizeof (ipsq_t)); - return; - } - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - - /* - * Locate the ipsq before we can remove it from - * the singly linked list of ipsq's. 
- */ - for (nipsq = ipst->ips_ipsq_g_head; nipsq != NULL; - nipsq = nipsq->ipsq_next) { - if (nipsq == ipsq) { - break; - } - pipsq = nipsq; - } - - ASSERT(nipsq == ipsq); + ipxop_t *ipx = ipsq->ipsq_xop; - /* unlink ipsq from the list */ - if (pipsq != NULL) - pipsq->ipsq_next = ipsq->ipsq_next; - else - ipst->ips_ipsq_g_head = ipsq->ipsq_next; ipsq->ipsq_ipst = NULL; + ASSERT(ipsq->ipsq_phyint == NULL); + ASSERT(ipsq->ipsq_xop != NULL); + ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL); + ASSERT(ipx->ipx_pending_mp == NULL); kmem_free(ipsq, sizeof (ipsq_t)); - rw_exit(&ipst->ips_ill_g_lock); -} - -static void -ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp, - queue_t *q) -{ - ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock)); - ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL); - ASSERT(old_ipsq->ipsq_pending_ipif == NULL); - ASSERT(old_ipsq->ipsq_pending_mp == NULL); - ASSERT(current_mp != NULL); - - ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl, - NEW_OP, NULL); - - ASSERT(new_ipsq->ipsq_xopq_mptail != NULL && - new_ipsq->ipsq_xopq_mphead != NULL); - - /* - * move from old ipsq to the new ipsq. 
- */ - new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead; - if (old_ipsq->ipsq_xopq_mphead != NULL) - new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail; - - old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL; } -void -ill_group_cleanup(ill_t *ill) -{ - ill_t *ill_v4; - ill_t *ill_v6; - ipif_t *ipif; - - ill_v4 = ill->ill_phyint->phyint_illv4; - ill_v6 = ill->ill_phyint->phyint_illv6; - - if (ill_v4 != NULL) { - mutex_enter(&ill_v4->ill_lock); - for (ipif = ill_v4->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - IPIF_UNMARK_MOVING(ipif); - } - ill_v4->ill_up_ipifs = B_FALSE; - mutex_exit(&ill_v4->ill_lock); - } - - if (ill_v6 != NULL) { - mutex_enter(&ill_v6->ill_lock); - for (ipif = ill_v6->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - IPIF_UNMARK_MOVING(ipif); - } - ill_v6->ill_up_ipifs = B_FALSE; - mutex_exit(&ill_v6->ill_lock); - } -} -/* - * This function is called when an ill has had a change in its group status - * to bring up all the ipifs that were up before the change. - */ -int -ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) +static int +ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) { + int err; ipif_t *ipif; - ill_t *ill_v4; - ill_t *ill_v6; - ill_t *from_ill; - int err = 0; - ASSERT(IAM_WRITER_ILL(ill)); + if (ill == NULL) + return (0); /* * Except for ipif_state_flags and ill_state_flags the other @@ -14142,389 +14023,86 @@ ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) * even an ipif that was already down, in ill_down_ipifs. So we * just blindly clear the IPIF_CHANGING flag here on all ipifs. 
*/ - ill_v4 = ill->ill_phyint->phyint_illv4; - ill_v6 = ill->ill_phyint->phyint_illv6; - if (ill_v4 != NULL) { - ill_v4->ill_up_ipifs = B_TRUE; - for (ipif = ill_v4->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - mutex_enter(&ill_v4->ill_lock); - ipif->ipif_state_flags &= ~IPIF_CHANGING; - IPIF_UNMARK_MOVING(ipif); - mutex_exit(&ill_v4->ill_lock); - if (ipif->ipif_was_up) { - if (!(ipif->ipif_flags & IPIF_UP)) - err = ipif_up(ipif, q, mp); - ipif->ipif_was_up = B_FALSE; - if (err != 0) { - /* - * Can there be any other error ? - */ - ASSERT(err == EINPROGRESS); - return (err); - } - } - } - mutex_enter(&ill_v4->ill_lock); - ill_v4->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&ill_v4->ill_lock); - ill_v4->ill_up_ipifs = B_FALSE; - if (ill_v4->ill_move_in_progress) { - ASSERT(ill_v4->ill_move_peer != NULL); - ill_v4->ill_move_in_progress = B_FALSE; - from_ill = ill_v4->ill_move_peer; - from_ill->ill_move_in_progress = B_FALSE; - from_ill->ill_move_peer = NULL; - mutex_enter(&from_ill->ill_lock); - from_ill->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&from_ill->ill_lock); - if (ill_v6 == NULL) { - if (from_ill->ill_phyint->phyint_flags & - PHYI_STANDBY) { - phyint_inactive(from_ill->ill_phyint); - } - if (ill_v4->ill_phyint->phyint_flags & - PHYI_STANDBY) { - phyint_inactive(ill_v4->ill_phyint); - } - } - ill_v4->ill_move_peer = NULL; - } - } + ASSERT(IAM_WRITER_ILL(ill)); - if (ill_v6 != NULL) { - ill_v6->ill_up_ipifs = B_TRUE; - for (ipif = ill_v6->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - mutex_enter(&ill_v6->ill_lock); - ipif->ipif_state_flags &= ~IPIF_CHANGING; - IPIF_UNMARK_MOVING(ipif); - mutex_exit(&ill_v6->ill_lock); - if (ipif->ipif_was_up) { - if (!(ipif->ipif_flags & IPIF_UP)) - err = ipif_up(ipif, q, mp); - ipif->ipif_was_up = B_FALSE; - if (err != 0) { - /* - * Can there be any other error ? 
- */ - ASSERT(err == EINPROGRESS); - return (err); - } - } - } - mutex_enter(&ill_v6->ill_lock); - ill_v6->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&ill_v6->ill_lock); - ill_v6->ill_up_ipifs = B_FALSE; - if (ill_v6->ill_move_in_progress) { - ASSERT(ill_v6->ill_move_peer != NULL); - ill_v6->ill_move_in_progress = B_FALSE; - from_ill = ill_v6->ill_move_peer; - from_ill->ill_move_in_progress = B_FALSE; - from_ill->ill_move_peer = NULL; - mutex_enter(&from_ill->ill_lock); - from_ill->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&from_ill->ill_lock); - if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { - phyint_inactive(from_ill->ill_phyint); - } - if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) { - phyint_inactive(ill_v6->ill_phyint); + ill->ill_up_ipifs = B_TRUE; + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + mutex_enter(&ill->ill_lock); + ipif->ipif_state_flags &= ~IPIF_CHANGING; + mutex_exit(&ill->ill_lock); + if (ipif->ipif_was_up) { + if (!(ipif->ipif_flags & IPIF_UP)) + err = ipif_up(ipif, q, mp); + ipif->ipif_was_up = B_FALSE; + if (err != 0) { + ASSERT(err == EINPROGRESS); + return (err); } - ill_v6->ill_move_peer = NULL; } } + mutex_enter(&ill->ill_lock); + ill->ill_state_flags &= ~ILL_CHANGING; + mutex_exit(&ill->ill_lock); + ill->ill_up_ipifs = B_FALSE; return (0); } /* - * bring down all the approriate ipifs. + * This function is called to bring up all the ipifs that were up before + * bringing the ill down via ill_down_ipifs(). 
*/ -/* ARGSUSED */ -static void -ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover) +int +ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) { - ipif_t *ipif; + int err; ASSERT(IAM_WRITER_ILL(ill)); - /* - * Except for ipif_state_flags the other fields of the ipif/ill that - * are modified below are protected implicitly since we are a writer - */ - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER)) - continue; - /* - * Don't bring down the LINK LOCAL addresses as they are tied - * to physical interface and they don't move. Treat them as - * IPIF_NOFAILOVER. - */ - if (chk_nofailover && ill->ill_isv6 && - IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) - continue; - if (index == 0 || index == ipif->ipif_orig_ifindex) { - /* - * We go through the ipif_down logic even if the ipif - * is already down, since routes can be added based - * on down ipifs. Going through ipif_down once again - * will delete any IREs created based on these routes. - */ - if (ipif->ipif_flags & IPIF_UP) - ipif->ipif_was_up = B_TRUE; - /* - * If called with chk_nofailover true ipif is moving. - */ - mutex_enter(&ill->ill_lock); - if (chk_nofailover) { - ipif->ipif_state_flags |= - IPIF_MOVING | IPIF_CHANGING; - } else { - ipif->ipif_state_flags |= IPIF_CHANGING; - } - mutex_exit(&ill->ill_lock); - /* - * Need to re-create net/subnet bcast ires if - * they are dependent on ipif. 
- */ - if (!ipif->ipif_isv6) - ipif_check_bcast_ires(ipif); - (void) ipif_logical_down(ipif, NULL, NULL); - ipif_non_duplicate(ipif); - ipif_down_tail(ipif); - } - } -} - -#define IPSQ_INC_REF(ipsq, ipst) { \ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \ - (ipsq)->ipsq_refs++; \ -} + err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp); + if (err != 0) + return (err); -#define IPSQ_DEC_REF(ipsq, ipst) { \ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \ - (ipsq)->ipsq_refs--; \ - if ((ipsq)->ipsq_refs == 0) \ - (ipsq)->ipsq_name[0] = '\0'; \ + return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp)); } /* - * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to - * new_ipsq. + * Bring down any IPIF_UP ipifs on ill. */ static void -ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq, ip_stack_t *ipst) +ill_down_ipifs(ill_t *ill) { - phyint_t *phyint; - phyint_t *next_phyint; - - /* - * To change the ipsq of an ill, we need to hold the ill_g_lock as - * writer and the ill_lock of the ill in question. Also the dest - * ipsq can't vanish while we hold the ill_g_lock as writer. - */ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - - phyint = cur_ipsq->ipsq_phyint_list; - cur_ipsq->ipsq_phyint_list = NULL; - while (phyint != NULL) { - next_phyint = phyint->phyint_ipsq_next; - IPSQ_DEC_REF(cur_ipsq, ipst); - phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list; - new_ipsq->ipsq_phyint_list = phyint; - IPSQ_INC_REF(new_ipsq, ipst); - phyint->phyint_ipsq = new_ipsq; - phyint = next_phyint; - } -} - -#define SPLIT_SUCCESS 0 -#define SPLIT_NOT_NEEDED 1 -#define SPLIT_FAILED 2 - -int -ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry, - ip_stack_t *ipst) -{ - ipsq_t *newipsq = NULL; - - /* - * Assertions denote pre-requisites for changing the ipsq of - * a phyint - */ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - /* - * <ill-phyint> assocs can't change while ill_g_lock - * is held as writer. 
See ill_phyint_reinit() - */ - ASSERT(phyint->phyint_illv4 == NULL || - MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); - ASSERT(phyint->phyint_illv6 == NULL || - MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); - - if ((phyint->phyint_groupname_len != - (strlen(cur_ipsq->ipsq_name) + 1) || - bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name, - phyint->phyint_groupname_len) != 0)) { - /* - * Once we fail in creating a new ipsq due to memory shortage, - * don't attempt to create new ipsq again, based on another - * phyint, since we want all phyints belonging to an IPMP group - * to be in the same ipsq even in the event of mem alloc fails. - */ - newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry, - cur_ipsq, ipst); - if (newipsq == NULL) { - /* Memory allocation failure */ - return (SPLIT_FAILED); - } else { - /* ipsq_refs protected by ill_g_lock (writer) */ - IPSQ_DEC_REF(cur_ipsq, ipst); - phyint->phyint_ipsq = newipsq; - phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list; - newipsq->ipsq_phyint_list = phyint; - IPSQ_INC_REF(newipsq, ipst); - return (SPLIT_SUCCESS); - } - } - return (SPLIT_NOT_NEEDED); -} + ipif_t *ipif; -/* - * The ill locks of the phyint and the ill_g_lock (writer) must be held - * to do this split - */ -static int -ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, ip_stack_t *ipst) -{ - ipsq_t *newipsq; + ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); /* - * <ill-phyint> assocs can't change while ill_g_lock - * is held as writer. See ill_phyint_reinit() + * Except for ipif_state_flags the other fields of the ipif/ill that + * are modified below are protected implicitly since we are a writer */ - - ASSERT(phyint->phyint_illv4 == NULL || - MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); - ASSERT(phyint->phyint_illv6 == NULL || - MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); - - if (!ipsq_init((phyint->phyint_illv4 != NULL) ? 
- phyint->phyint_illv4: phyint->phyint_illv6)) { + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { /* - * ipsq_init failed due to no memory - * caller will use the same ipsq + * We go through the ipif_down logic even if the ipif + * is already down, since routes can be added based + * on down ipifs. Going through ipif_down once again + * will delete any IREs created based on these routes. */ - return (SPLIT_FAILED); - } - - /* ipsq_ref is protected by ill_g_lock (writer) */ - IPSQ_DEC_REF(cur_ipsq, ipst); - - /* - * This is a new ipsq that is unknown to the world. - * So we don't need to hold ipsq_lock, - */ - newipsq = phyint->phyint_ipsq; - newipsq->ipsq_writer = NULL; - newipsq->ipsq_reentry_cnt--; - ASSERT(newipsq->ipsq_reentry_cnt == 0); -#ifdef DEBUG - newipsq->ipsq_depth = 0; -#endif - - return (SPLIT_SUCCESS); -} + if (ipif->ipif_flags & IPIF_UP) + ipif->ipif_was_up = B_TRUE; -/* - * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to - * ipsq's representing their individual groups or themselves. Return - * whether split needs to be retried again later. - */ -static boolean_t -ill_split_ipsq(ipsq_t *cur_ipsq) -{ - phyint_t *phyint; - phyint_t *next_phyint; - int error; - boolean_t need_retry = B_FALSE; - ip_stack_t *ipst = cur_ipsq->ipsq_ipst; + mutex_enter(&ill->ill_lock); + ipif->ipif_state_flags |= IPIF_CHANGING; + mutex_exit(&ill->ill_lock); - phyint = cur_ipsq->ipsq_phyint_list; - cur_ipsq->ipsq_phyint_list = NULL; - while (phyint != NULL) { - next_phyint = phyint->phyint_ipsq_next; /* - * 'created' will tell us whether the callee actually - * created an ipsq. Lack of memory may force the callee - * to return without creating an ipsq. + * Need to re-create net/subnet bcast ires if + * they are dependent on ipif. 
*/ - if (phyint->phyint_groupname == NULL) { - error = ill_split_to_own_ipsq(phyint, cur_ipsq, ipst); - } else { - error = ill_split_to_grp_ipsq(phyint, cur_ipsq, - need_retry, ipst); - } - - switch (error) { - case SPLIT_FAILED: - need_retry = B_TRUE; - /* FALLTHRU */ - case SPLIT_NOT_NEEDED: - /* - * Keep it on the list. - */ - phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list; - cur_ipsq->ipsq_phyint_list = phyint; - break; - case SPLIT_SUCCESS: - break; - default: - ASSERT(0); - } - - phyint = next_phyint; - } - return (need_retry); -} - -/* - * given an ipsq 'ipsq' lock all ills associated with this ipsq. - * and return the ills in the list. This list will be - * needed to unlock all the ills later on by the caller. - * The <ill-ipsq> associations could change between the - * lock and unlock. Hence the unlock can't traverse the - * ipsq to get the list of ills. - */ -static int -ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max) -{ - int cnt = 0; - phyint_t *phyint; - ip_stack_t *ipst = ipsq->ipsq_ipst; - - /* - * The caller holds ill_g_lock to ensure that the ill memberships - * of the ipsq don't change - */ - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - - phyint = ipsq->ipsq_phyint_list; - while (phyint != NULL) { - if (phyint->phyint_illv4 != NULL) { - ASSERT(cnt < list_max); - list[cnt++] = phyint->phyint_illv4; - } - if (phyint->phyint_illv6 != NULL) { - ASSERT(cnt < list_max); - list[cnt++] = phyint->phyint_illv6; - } - phyint = phyint->phyint_ipsq_next; + if (!ipif->ipif_isv6) + ipif_check_bcast_ires(ipif); + (void) ipif_logical_down(ipif, NULL, NULL); + ipif_non_duplicate(ipif); + ipif_down_tail(ipif); } - ill_lock_ills(list, cnt); - return (cnt); } void @@ -14577,3504 +14155,251 @@ ill_unlock_ills(ill_t **list, int cnt) } /* - * Merge all the ills from 1 ipsq group into another ipsq group. - * The source ipsq group is specified by the ipsq associated with - * 'from_ill'. 
The destination ipsq group is specified by the ipsq - * associated with 'to_ill' or 'groupname' respectively. - * Note that ipsq itself does not have a reference count mechanism - * and functions don't look up an ipsq and pass it around. Instead - * functions pass around an ill or groupname, and the ipsq is looked - * up from the ill or groupname and the required operation performed - * atomically with the lookup on the ipsq. + * Redo source address selection. This is called when a + * non-NOLOCAL/DEPRECATED/ANYCAST ipif comes up. */ -static int -ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp, - queue_t *q) -{ - ipsq_t *old_ipsq; - ipsq_t *new_ipsq; - ill_t **ill_list; - int cnt; - size_t ill_list_size; - boolean_t became_writer_on_new_sq = B_FALSE; - ip_stack_t *ipst = from_ill->ill_ipst; - - ASSERT(to_ill == NULL || ipst == to_ill->ill_ipst); - /* Exactly 1 of 'to_ill' and groupname can be specified. */ - ASSERT((to_ill != NULL) ^ (groupname != NULL)); - - /* - * Need to hold ill_g_lock as writer and also the ill_lock to - * change the <ill-ipsq> assoc of an ill. Need to hold the - * ipsq_lock to prevent new messages from landing on an ipsq. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - - old_ipsq = from_ill->ill_phyint->phyint_ipsq; - if (groupname != NULL) - new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL, ipst); - else { - new_ipsq = to_ill->ill_phyint->phyint_ipsq; - } - - ASSERT(old_ipsq != NULL && new_ipsq != NULL); - - /* - * both groups are on the same ipsq. 
- */ - if (old_ipsq == new_ipsq) { - rw_exit(&ipst->ips_ill_g_lock); - return (0); - } - - cnt = old_ipsq->ipsq_refs << 1; - ill_list_size = cnt * sizeof (ill_t *); - ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); - if (ill_list == NULL) { - rw_exit(&ipst->ips_ill_g_lock); - return (ENOMEM); - } - cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt); - - /* Need ipsq lock to enque messages on new ipsq or to become writer */ - mutex_enter(&new_ipsq->ipsq_lock); - if ((new_ipsq->ipsq_writer == NULL && - new_ipsq->ipsq_current_ipif == NULL) || - (new_ipsq->ipsq_writer == curthread)) { - new_ipsq->ipsq_writer = curthread; - new_ipsq->ipsq_reentry_cnt++; - became_writer_on_new_sq = B_TRUE; - } - - /* - * We are holding ill_g_lock as writer and all the ill locks of - * the old ipsq. So the old_ipsq can't be looked up, and hence no new - * message can land up on the old ipsq even though we don't hold the - * ipsq_lock of the old_ipsq. Now move all messages to the newipsq. - */ - ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q); - - /* - * now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'. - * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq> - * assocs. till we release the ill_g_lock, and hence it can't vanish. - */ - ill_merge_ipsq(old_ipsq, new_ipsq, ipst); - - /* - * Mark the new ipsq as needing a split since it is currently - * being shared by more than 1 IPMP group. The split will - * occur at the end of ipsq_exit - */ - new_ipsq->ipsq_split = B_TRUE; - - /* Now release all the locks */ - mutex_exit(&new_ipsq->ipsq_lock); - ill_unlock_ills(ill_list, cnt); - rw_exit(&ipst->ips_ill_g_lock); - - kmem_free(ill_list, ill_list_size); - - /* - * If we succeeded in becoming writer on the new ipsq, then - * drain the new ipsq and start processing all enqueued messages - * including the current ioctl we are processing which is either - * a set groupname or failover/failback. 
- */ - if (became_writer_on_new_sq) - ipsq_exit(new_ipsq); - - /* - * syncq has been changed and all the messages have been moved. - */ - mutex_enter(&old_ipsq->ipsq_lock); - old_ipsq->ipsq_current_ipif = NULL; - old_ipsq->ipsq_current_ioctl = 0; - old_ipsq->ipsq_current_done = B_TRUE; - mutex_exit(&old_ipsq->ipsq_lock); - return (EINPROGRESS); -} - -/* - * Delete and add the loopback copy and non-loopback copy of - * the BROADCAST ire corresponding to ill and addr. Used to - * group broadcast ires together when ill becomes part of - * a group. - * - * This function is also called when ill is leaving the group - * so that the ires belonging to the group gets re-grouped. - */ -static void -ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr) -{ - ire_t *ire, *nire, *nire_next, *ire_head = NULL; - ire_t **ire_ptpn = &ire_head; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * The loopback and non-loopback IREs are inserted in the order in which - * they're found, on the basis that they are correctly ordered (loopback - * first). - */ - for (;;) { - ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, - ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - if (ire == NULL) - break; - - /* - * we are passing in KM_SLEEP because it is not easy to - * go back to a sane state in case of memory failure. - */ - nire = kmem_cache_alloc(ire_cache, KM_SLEEP); - ASSERT(nire != NULL); - bzero(nire, sizeof (ire_t)); - /* - * Don't use ire_max_frag directly since we don't - * hold on to 'ire' until we add the new ire 'nire' and - * we don't want the new ire to have a dangling reference - * to 'ire'. The ire_max_frag of a broadcast ire must - * be in sync with the ipif_mtu of the associate ipif. - * For eg. this happens as a result of SIOCSLIFNAME, - * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE inititated by - * the driver. 
A change in ire_max_frag triggered as - * as a result of path mtu discovery, or due to an - * IP_IOC_IRE_ADVISE_NOREPLY from the transport or due a - * route change -mtu command does not apply to broadcast ires. - * - * XXX We need a recovery strategy here if ire_init fails - */ - if (ire_init(nire, - (uchar_t *)&ire->ire_addr, - (uchar_t *)&ire->ire_mask, - (uchar_t *)&ire->ire_src_addr, - (uchar_t *)&ire->ire_gateway_addr, - ire->ire_stq == NULL ? &ip_loopback_mtu : - &ire->ire_ipif->ipif_mtu, - ire->ire_nce, - ire->ire_rfq, - ire->ire_stq, - ire->ire_type, - ire->ire_ipif, - ire->ire_cmask, - ire->ire_phandle, - ire->ire_ihandle, - ire->ire_flags, - &ire->ire_uinfo, - NULL, - NULL, - ipst) == NULL) { - cmn_err(CE_PANIC, "ire_init() failed"); - } - ire_delete(ire); - ire_refrele(ire); - - /* - * The newly created IREs are inserted at the tail of the list - * starting with ire_head. As we've just allocated them no one - * knows about them so it's safe. - */ - *ire_ptpn = nire; - ire_ptpn = &nire->ire_next; - } - - for (nire = ire_head; nire != NULL; nire = nire_next) { - int error; - ire_t *oire; - /* unlink the IRE from our list before calling ire_add() */ - nire_next = nire->ire_next; - nire->ire_next = NULL; - - /* ire_add adds the ire at the right place in the list */ - oire = nire; - error = ire_add(&nire, NULL, NULL, NULL, B_FALSE); - ASSERT(error == 0); - ASSERT(oire == nire); - ire_refrele(nire); /* Held in ire_add */ - } -} - -/* - * This function is usually called when an ill is inserted in - * a group and all the ipifs are already UP. As all the ipifs - * are already UP, the broadcast ires have already been created - * and been inserted. But, ire_add_v4 would not have grouped properly. - * We need to re-group for the benefit of ip_wput_ire which - * expects BROADCAST ires to be grouped properly to avoid sending - * more than one copy of the broadcast packet per group. 
- * - * NOTE : We don't check for ill_ipif_up_count to be non-zero here - * because when ipif_up_done ends up calling this, ires have - * already been added before illgrp_insert i.e before ill_group - * has been initialized. - */ -static void -ill_group_bcast_for_xmit(ill_t *ill) +void +ill_update_source_selection(ill_t *ill) { - ill_group_t *illgrp; ipif_t *ipif; - ipaddr_t addr; - ipaddr_t net_mask; - ipaddr_t subnet_netmask; - illgrp = ill->ill_group; + ASSERT(IAM_WRITER_ILL(ill)); /* - * This function is called even when an ill is deleted from - * the group. Hence, illgrp could be null. + * Underlying interfaces are only used for test traffic and thus + * should always send with their (deprecated) source addresses. */ - if (illgrp != NULL && illgrp->illgrp_ill_count == 1) + if (IS_UNDER_IPMP(ill)) return; - /* - * Delete all the BROADCAST ires matching this ill and add - * them back. This time, ire_add_v4 should take care of - * grouping them with others because ill is part of the - * group. - */ - ill_bcast_delete_and_add(ill, 0); - ill_bcast_delete_and_add(ill, INADDR_BROADCAST); - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - ill_bcast_delete_and_add(ill, addr); - ill_bcast_delete_and_add(ill, ~net_mask | addr); - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - ill_bcast_delete_and_add(ill, addr); - ill_bcast_delete_and_add(ill, ~subnet_netmask | addr); - } -} - -/* - * This function is called from illgrp_delete when ill is being deleted - * from the group. - * - * As ill is not there in the group anymore, any address belonging - * to this ill should be cleared of IRE_MARK_NORECV. 
- */ -static void -ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr) -{ - ire_t *ire; - irb_t *irb; - ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(ill->ill_group == NULL); - - ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, - ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - - if (ire != NULL) { - /* - * IPMP and plumbing operations are serialized on the ipsq, so - * no one will insert or delete a broadcast ire under our feet. - */ - irb = ire->ire_bucket; - rw_enter(&irb->irb_lock, RW_READER); - ire_refrele(ire); - - for (; ire != NULL; ire = ire->ire_next) { - if (ire->ire_addr != addr) - break; - if (ire_to_ill(ire) != ill) - continue; - - ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED)); - ire->ire_marks &= ~IRE_MARK_NORECV; - } - rw_exit(&irb->irb_lock); - } -} - -ire_t * -irep_insert(ill_group_t *illgrp, ipaddr_t addr, ire_t *ire, ire_t ***pirep) -{ - boolean_t first = B_TRUE; - ire_t *clear_ire = NULL; - ire_t *start_ire = NULL; - uint64_t match_flags; - uint64_t phyi_flags; - boolean_t fallback = B_FALSE; - - /* - * irb_lock must be held by the caller. - * Get to the first ire matching the address and the - * group. If the address does not match we are done - * as we could not find the IRE. If the address matches - * we should get to the first one matching the group. - */ - while (ire != NULL) { - if (ire->ire_addr != addr || - ire->ire_ipif->ipif_ill->ill_group == illgrp) { - break; - } - ire = ire->ire_next; - } - match_flags = PHYI_FAILED | PHYI_INACTIVE; - start_ire = ire; -redo: - while (ire != NULL && ire->ire_addr == addr && - ire->ire_ipif->ipif_ill->ill_group == illgrp) { - /* - * The first ire for any address within a group - * should always be the one with IRE_MARK_NORECV cleared - * so that ip_wput_ire can avoid searching for one. - * Note down the insertion point which will be used - * later. - */ - if (first && (*pirep == NULL)) - *pirep = ire->ire_ptpn; - /* - * PHYI_FAILED is set when the interface fails. 
- * This interface might have become good, but the - * daemon has not yet detected. We should still - * not receive on this. PHYI_OFFLINE should never - * be picked as this has been offlined and soon - * be removed. - */ - phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags; - if (phyi_flags & PHYI_OFFLINE) { - ire->ire_marks |= IRE_MARK_NORECV; - ire = ire->ire_next; - continue; - } - if (phyi_flags & match_flags) { - ire->ire_marks |= IRE_MARK_NORECV; - ire = ire->ire_next; - if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) == - PHYI_INACTIVE) { - fallback = B_TRUE; - } - continue; - } - if (first) { - /* - * We will move this to the front of the list later - * on. - */ - clear_ire = ire; - ire->ire_marks &= ~IRE_MARK_NORECV; - } else { - ire->ire_marks |= IRE_MARK_NORECV; - } - first = B_FALSE; - ire = ire->ire_next; - } - /* - * If we never nominated anybody, try nominating at least - * an INACTIVE, if we found one. Do it only once though. - */ - if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) && - fallback) { - match_flags = PHYI_FAILED; - ire = start_ire; - *pirep = NULL; - goto redo; - } - return (clear_ire); -} - -/* - * This function must be called only after the broadcast ires - * have been grouped together. For a given address addr, nominate - * only one of the ires whose interface is not FAILED or OFFLINE. - * - * This is also called when an ipif goes down, so that we can nominate - * a different ire with the same address for receiving. 
- */ -static void -ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr, ip_stack_t *ipst) -{ - irb_t *irb; - ire_t *ire; - ire_t *ire1; - ire_t *save_ire; - ire_t **irep = NULL; - ire_t *clear_ire = NULL; - ire_t *new_lb_ire; - ire_t *new_nlb_ire; - boolean_t new_lb_ire_used = B_FALSE; - boolean_t new_nlb_ire_used = B_FALSE; - boolean_t refrele_lb_ire = B_FALSE; - boolean_t refrele_nlb_ire = B_FALSE; - uint_t max_frag; - - ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES, - NULL, MATCH_IRE_TYPE, ipst); - /* - * We may not be able to find some ires if a previous - * ire_create failed. This happens when an ipif goes - * down and we are unable to create BROADCAST ires due - * to memory failure. Thus, we have to check for NULL - * below. This should handle the case for LOOPBACK, - * POINTOPOINT and interfaces with some POINTOPOINT - * logicals for which there are no BROADCAST ires. - */ - if (ire == NULL) - return; - /* - * Currently IRE_BROADCASTS are deleted when an ipif - * goes down which runs exclusively. Thus, setting - * IRE_MARK_RCVD should not race with ire_delete marking - * IRE_MARK_CONDEMNED. We grab the lock below just to - * be consistent with other parts of the code that walks - * a given bucket. - */ - save_ire = ire; - irb = ire->ire_bucket; - new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); - if (new_lb_ire == NULL) { - ire_refrele(ire); - return; - } - new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); - if (new_nlb_ire == NULL) { - ire_refrele(ire); - kmem_cache_free(ire_cache, new_lb_ire); - return; - } - IRB_REFHOLD(irb); - rw_enter(&irb->irb_lock, RW_WRITER); - clear_ire = irep_insert(illgrp, addr, ire, &irep); - - /* - * irep non-NULL indicates that we entered the while loop - * above. If clear_ire is at the insertion point, we don't - * have to do anything. clear_ire will be NULL if all the - * interfaces are failed. 
- * - * We cannot unlink and reinsert the ire at the right place - * in the list since there can be other walkers of this bucket. - * Instead we delete and recreate the ire - */ - if (clear_ire != NULL && irep != NULL && *irep != clear_ire) { - ire_t *clear_ire_stq = NULL; - ire_t *clr_ire = NULL; - ire_t *ire_next = NULL; - - if (clear_ire->ire_stq == NULL) - ire_next = clear_ire->ire_next; - - rw_exit(&irb->irb_lock); - - bzero(new_lb_ire, sizeof (ire_t)); - /* XXX We need a recovery strategy here. */ - if (ire_init(new_lb_ire, - (uchar_t *)&clear_ire->ire_addr, - (uchar_t *)&clear_ire->ire_mask, - (uchar_t *)&clear_ire->ire_src_addr, - (uchar_t *)&clear_ire->ire_gateway_addr, - &clear_ire->ire_max_frag, - NULL, /* let ire_nce_init derive the resolver info */ - clear_ire->ire_rfq, - clear_ire->ire_stq, - clear_ire->ire_type, - clear_ire->ire_ipif, - clear_ire->ire_cmask, - clear_ire->ire_phandle, - clear_ire->ire_ihandle, - clear_ire->ire_flags, - &clear_ire->ire_uinfo, - NULL, - NULL, - ipst) == NULL) - cmn_err(CE_PANIC, "ire_init() failed"); - - refrele_lb_ire = B_TRUE; - - if (ire_next != NULL && - ire_next->ire_stq != NULL && - ire_next->ire_addr == clear_ire->ire_addr && - ire_next->ire_ipif->ipif_ill == - clear_ire->ire_ipif->ipif_ill) { - clear_ire_stq = ire_next; - - bzero(new_nlb_ire, sizeof (ire_t)); - /* XXX We need a recovery strategy here. 
*/ - if (ire_init(new_nlb_ire, - (uchar_t *)&clear_ire_stq->ire_addr, - (uchar_t *)&clear_ire_stq->ire_mask, - (uchar_t *)&clear_ire_stq->ire_src_addr, - (uchar_t *)&clear_ire_stq->ire_gateway_addr, - &clear_ire_stq->ire_max_frag, - NULL, - clear_ire_stq->ire_rfq, - clear_ire_stq->ire_stq, - clear_ire_stq->ire_type, - clear_ire_stq->ire_ipif, - clear_ire_stq->ire_cmask, - clear_ire_stq->ire_phandle, - clear_ire_stq->ire_ihandle, - clear_ire_stq->ire_flags, - &clear_ire_stq->ire_uinfo, - NULL, - NULL, - ipst) == NULL) - cmn_err(CE_PANIC, "ire_init() failed"); - - refrele_nlb_ire = B_TRUE; - } - - rw_enter(&irb->irb_lock, RW_WRITER); - /* - * irb_lock was dropped across call to ire_init() due to - * lock ordering issue with ipst->ips_ndp{4,6}->ndp_g_lock - * mutex lock. Therefore irep could have changed. call - * irep_insert() to get the new insertion point (irep) and - * recheck all known conditions. - */ - irep = NULL; - clr_ire = irep_insert(illgrp, addr, save_ire, &irep); - if ((irep != NULL) && (*irep != clear_ire) && - (clr_ire == clear_ire)) { - if ((clear_ire_stq != NULL) && - (clr_ire->ire_next != clear_ire_stq)) - clear_ire_stq = NULL; - /* - * Delete the ire. We can't call ire_delete() since - * we are holding the bucket lock. We can't release the - * bucket lock since we can't allow irep to change. - * So just mark it CONDEMNED. - * The IRB_REFRELE will delete the ire from the list - * and do the refrele. - */ - clear_ire->ire_marks |= IRE_MARK_CONDEMNED; - irb->irb_marks |= IRB_MARK_CONDEMNED; - - if (clear_ire_stq != NULL && - clear_ire_stq->ire_nce != NULL) { - nce_fastpath_list_delete( - clear_ire_stq->ire_nce); - clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED; - } - - /* - * Also take care of otherfields like ib/ob pkt count - * etc. Need to dup them. 
- * ditto in ill_bcast_delete_and_add - */ - - /* Set the max_frag before adding the ire */ - max_frag = *new_lb_ire->ire_max_fragp; - new_lb_ire->ire_max_fragp = NULL; - new_lb_ire->ire_max_frag = max_frag; - - /* Add the new ire's. Insert at *irep */ - new_lb_ire->ire_bucket = clear_ire->ire_bucket; - ire1 = *irep; - if (ire1 != NULL) - ire1->ire_ptpn = &new_lb_ire->ire_next; - new_lb_ire->ire_next = ire1; - /* Link the new one in. */ - new_lb_ire->ire_ptpn = irep; - membar_producer(); - *irep = new_lb_ire; - new_lb_ire_used = B_TRUE; - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, - ire_stats_inserted); - new_lb_ire->ire_bucket->irb_ire_cnt++; - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), - new_lb_ire->ire_ipif, - (char *), "ire", (void *), new_lb_ire); - new_lb_ire->ire_ipif->ipif_ire_cnt++; - - if (clear_ire_stq != NULL) { - ill_t *ire_ill; - /* Set the max_frag before adding the ire */ - max_frag = *new_nlb_ire->ire_max_fragp; - new_nlb_ire->ire_max_fragp = NULL; - new_nlb_ire->ire_max_frag = max_frag; - - new_nlb_ire->ire_bucket = clear_ire->ire_bucket; - irep = &new_lb_ire->ire_next; - /* Add the new ire. Insert at *irep */ - ire1 = *irep; - if (ire1 != NULL) - ire1->ire_ptpn = &new_nlb_ire->ire_next; - new_nlb_ire->ire_next = ire1; - /* Link the new one in. 
*/ - new_nlb_ire->ire_ptpn = irep; - membar_producer(); - *irep = new_nlb_ire; - new_nlb_ire_used = B_TRUE; - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, - ire_stats_inserted); - new_nlb_ire->ire_bucket->irb_ire_cnt++; - DTRACE_PROBE3(ipif__incr__cnt, - (ipif_t *), new_nlb_ire->ire_ipif, - (char *), "ire", (void *), new_nlb_ire); - new_nlb_ire->ire_ipif->ipif_ire_cnt++; - DTRACE_PROBE3(ill__incr__cnt, - (ill_t *), new_nlb_ire->ire_stq->q_ptr, - (char *), "ire", (void *), new_nlb_ire); - ire_ill = (ill_t *)new_nlb_ire->ire_stq->q_ptr; - ire_ill->ill_ire_cnt++; - } - } - } - ire_refrele(save_ire); - rw_exit(&irb->irb_lock); - /* - * Since we dropped the irb_lock across call to ire_init() - * and rechecking known conditions, it is possible that - * the checks might fail, therefore undo the work done by - * ire_init() by calling ire_refrele() on the newly created ire. - */ - if (!new_lb_ire_used) { - if (refrele_lb_ire) { - ire_refrele(new_lb_ire); - } else { - kmem_cache_free(ire_cache, new_lb_ire); - } - } - if (!new_nlb_ire_used) { - if (refrele_nlb_ire) { - ire_refrele(new_nlb_ire); - } else { - kmem_cache_free(ire_cache, new_nlb_ire); - } - } - IRB_REFRELE(irb); -} - -/* - * Whenever an ipif goes down we have to renominate a different - * broadcast ire to receive. Whenever an ipif comes up, we need - * to make sure that we have only one nominated to receive. - */ -static void -ipif_renominate_bcast(ipif_t *ipif) -{ - ill_t *ill = ipif->ipif_ill; - ipaddr_t subnet_addr; - ipaddr_t net_addr; - ipaddr_t net_mask = 0; - ipaddr_t subnet_netmask; - ipaddr_t addr; - ill_group_t *illgrp; - ip_stack_t *ipst = ill->ill_ipst; - - illgrp = ill->ill_group; - /* - * If this is the last ipif going down, it might take - * the ill out of the group. In that case ipif_down -> - * illgrp_delete takes care of doing the nomination. - * ipif_down does not call for this case. 
- */ - ASSERT(illgrp != NULL); - - /* There could not have been any ires associated with this */ - if (ipif->ipif_subnet == 0) - return; - - ill_mark_bcast(illgrp, 0, ipst); - ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst); - - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - ill_mark_bcast(illgrp, addr, ipst); - - net_addr = ~net_mask | addr; - ill_mark_bcast(illgrp, net_addr, ipst); - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - ill_mark_bcast(illgrp, addr, ipst); - - subnet_addr = ~subnet_netmask | addr; - ill_mark_bcast(illgrp, subnet_addr, ipst); -} - -/* - * Whenever we form or delete ill groups, we need to nominate one set of - * BROADCAST ires for receiving in the group. - * - * 1) When ipif_up_done -> ilgrp_insert calls this function, BROADCAST ires - * have been added, but ill_ipif_up_count is 0. Thus, we don't assert - * for ill_ipif_up_count to be non-zero. This is the only case where - * ill_ipif_up_count is zero and we would still find the ires. - * - * 2) ip_sioctl_group_name/ifgrp_insert calls this function, at least one - * ipif is UP and we just have to do the nomination. - * - * 3) When ill_handoff_responsibility calls us, some ill has been removed - * from the group. So, we have to do the nomination. - * - * Because of (3), there could be just one ill in the group. But we have - * to nominate still as IRE_MARK_NORCV may have been marked on this. - * Thus, this function does not optimize when there is only one ill as - * it is not correct for (3). 
- */ -static void -ill_nominate_bcast_rcv(ill_group_t *illgrp) -{ - ill_t *ill; - ipif_t *ipif; - ipaddr_t subnet_addr; - ipaddr_t prev_subnet_addr = 0; - ipaddr_t net_addr; - ipaddr_t prev_net_addr = 0; - ipaddr_t net_mask = 0; - ipaddr_t subnet_netmask; - ipaddr_t addr; - ip_stack_t *ipst; - - /* - * When the last memeber is leaving, there is nothing to - * nominate. - */ - if (illgrp->illgrp_ill_count == 0) { - ASSERT(illgrp->illgrp_ill == NULL); - return; - } - - ill = illgrp->illgrp_ill; - ASSERT(!ill->ill_isv6); - ipst = ill->ill_ipst; - /* - * We assume that ires with same address and belonging to the - * same group, has been grouped together. Nominating a *single* - * ill in the group for sending and receiving broadcast is done - * by making sure that the first BROADCAST ire (which will be - * the one returned by ire_ctable_lookup for ip_rput and the - * one that will be used in ip_wput_ire) will be the one that - * will not have IRE_MARK_NORECV set. - * - * 1) ip_rput checks and discards packets received on ires marked - * with IRE_MARK_NORECV. Thus, we don't send up duplicate - * broadcast packets. We need to clear IRE_MARK_NORECV on the - * first ire in the group for every broadcast address in the group. - * ip_rput will accept packets only on the first ire i.e only - * one copy of the ill. - * - * 2) ip_wput_ire needs to send out just one copy of the broadcast - * packet for the whole group. It needs to send out on the ill - * whose ire has not been marked with IRE_MARK_NORECV. If it sends - * on the one marked with IRE_MARK_NORECV, ip_rput will accept - * the copy echoed back on other port where the ire is not marked - * with IRE_MARK_NORECV. - * - * Note that we just need to have the first IRE either loopback or - * non-loopback (either of them may not exist if ire_create failed - * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will - * always hit the first one and hence will always accept one copy. 
- * - * We have a broadcast ire per ill for all the unique prefixes - * hosted on that ill. As we don't have a way of knowing the - * unique prefixes on a given ill and hence in the whole group, - * we just call ill_mark_bcast on all the prefixes that exist - * in the group. For the common case of one prefix, the code - * below optimizes by remebering the last address used for - * markng. In the case of multiple prefixes, this will still - * optimize depending the order of prefixes. - * - * The only unique address across the whole group is 0.0.0.0 and - * 255.255.255.255 and thus we call only once. ill_mark_bcast enables - * the first ire in the bucket for receiving and disables the - * others. - */ - ill_mark_bcast(illgrp, 0, ipst); - ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst); - for (; ill != NULL; ill = ill->ill_group_next) { - - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (!(ipif->ipif_flags & IPIF_UP) || - ipif->ipif_subnet == 0) { - continue; - } - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - if (prev_net_addr == 0 || prev_net_addr != addr) { - ill_mark_bcast(illgrp, addr, ipst); - net_addr = ~net_mask | addr; - ill_mark_bcast(illgrp, net_addr, ipst); - } - prev_net_addr = addr; - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - if (prev_subnet_addr == 0 || - prev_subnet_addr != addr) { - ill_mark_bcast(illgrp, addr, ipst); - subnet_addr = ~subnet_netmask | addr; - ill_mark_bcast(illgrp, subnet_addr, ipst); - } - prev_subnet_addr = addr; - } - } -} - -/* - * This function is called while forming ill groups. - * - * Currently, we handle only allmulti groups. We want to join - * allmulti on only one of the ills in the groups. 
In future, - * when we have link aggregation, we may have to join normal - * multicast groups on multiple ills as switch does inbound load - * balancing. Following are the functions that calls this - * function : - * - * 1) ill_recover_multicast : Interface is coming back UP. - * When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6 - * will call ill_recover_multicast to recover all the multicast - * groups. We need to make sure that only one member is joined - * in the ill group. - * - * 2) ip_addmulti/ip_addmulti_v6 : ill groups has already been formed. - * Somebody is joining allmulti. We need to make sure that only one - * member is joined in the group. - * - * 3) illgrp_insert : If allmulti has already joined, we need to make - * sure that only one member is joined in the group. - * - * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving - * allmulti who we have nominated. We need to pick someother ill. - * - * 5) illgrp_delete : The ill we nominated is leaving the group, - * we need to pick a new ill to join the group. - * - * For (1), (2), (5) - we just have to check whether there is - * a good ill joined in the group. If we could not find any ills - * joined the group, we should join. - * - * For (4), the one that was nominated to receive, left the group. - * There could be nobody joined in the group when this function is - * called. - * - * For (3) - we need to explicitly check whether there are multiple - * ills joined in the group. - * - * For simplicity, we don't differentiate any of the above cases. We - * just leave the group if it is joined on any of them and join on - * the first good ill. - */ -int -ill_nominate_mcast_rcv(ill_group_t *illgrp) -{ - ilm_t *ilm; - ill_t *ill; - ill_t *fallback_inactive_ill = NULL; - ill_t *fallback_failed_ill = NULL; - int ret = 0; - - /* - * Leave the allmulti on all the ills and start fresh. 
- */ - for (ill = illgrp->illgrp_ill; ill != NULL; - ill = ill->ill_group_next) { - if (ill->ill_join_allmulti) - ill_leave_allmulti(ill); - } - - /* - * Choose a good ill. Fallback to inactive or failed if - * none available. We need to fallback to FAILED in the - * case where we have 2 interfaces in a group - where - * one of them is failed and another is a good one and - * the good one (not marked inactive) is leaving the group. - */ - for (ill = illgrp->illgrp_ill; ill != NULL; ill = ill->ill_group_next) { - if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE) - continue; - if (ill->ill_phyint->phyint_flags & PHYI_FAILED) { - fallback_failed_ill = ill; - continue; - } - if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) { - fallback_inactive_ill = ill; - continue; - } - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - ret = ill_join_allmulti(ill); - /* - * ill_join_allmulti() can fail because of - * memory failures so make sure we join at - * least on one ill. - */ - if (ill->ill_join_allmulti) - return (0); - } - } - } - if (ret != 0) { - /* - * If we tried nominating above and failed to do so, - * return error. We might have tried multiple times. - * But, return the latest error. - */ - return (ret); - } - if ((ill = fallback_inactive_ill) != NULL) { - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) - return (ill_join_allmulti(ill)); - } - } else if ((ill = fallback_failed_ill) != NULL) { - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) - return (ill_join_allmulti(ill)); - } - } - return (0); -} - -/* - * This function is called from illgrp_delete after it is - * deleted from the group to reschedule responsibilities - * to a different ill. 
- */ -static void -ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp) -{ - ilm_t *ilm; - ipif_t *ipif; - ipaddr_t subnet_addr; - ipaddr_t net_addr; - ipaddr_t net_mask = 0; - ipaddr_t subnet_netmask; - ipaddr_t addr; - ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(ill->ill_group == NULL); - /* - * Broadcast Responsibility: - * - * 1. If this ill has been nominated for receiving broadcast - * packets, we need to find a new one. Before we find a new - * one, we need to re-group the ires that are part of this new - * group (assumed by ill_nominate_bcast_rcv). We do this by - * calling ill_group_bcast_for_xmit(ill) which will do the right - * thing for us. - * - * 2. If this ill was not nominated for receiving broadcast - * packets, we need to clear the IRE_MARK_NORECV flag - * so that we continue to send up broadcast packets. - */ - if (!ill->ill_isv6) { - /* - * Case 1 above : No optimization here. Just redo the - * nomination. - */ - ill_group_bcast_for_xmit(ill); - ill_nominate_bcast_rcv(illgrp); - - /* - * Case 2 above : Lookup and clear IRE_MARK_NORECV. - */ - ill_clear_bcast_mark(ill, 0); - ill_clear_bcast_mark(ill, INADDR_BROADCAST); - - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (!(ipif->ipif_flags & IPIF_UP) || - ipif->ipif_subnet == 0) { - continue; - } - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - ill_clear_bcast_mark(ill, addr); - - net_addr = ~net_mask | addr; - ill_clear_bcast_mark(ill, net_addr); - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - ill_clear_bcast_mark(ill, addr); - - subnet_addr = ~subnet_netmask | addr; - ill_clear_bcast_mark(ill, subnet_addr); - } - } - - /* - * Multicast Responsibility. - * - * If we have joined allmulti on this one, find a new member - * in the group to join allmulti. 
As this ill is already part - * of allmulti, we don't have to join on this one. - * - * If we have not joined allmulti on this one, there is no - * responsibility to handoff. But we need to take new - * responsibility i.e, join allmulti on this one if we need - * to. - */ - if (ill->ill_join_allmulti) { - (void) ill_nominate_mcast_rcv(illgrp); - } else { - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - (void) ill_join_allmulti(ill); - break; - } - } - } - - /* - * We intentionally do the flushing of IRE_CACHES only matching - * on the ill and not on groups. Note that we are already deleted - * from the group. - * - * This will make sure that all IRE_CACHES whose stq is pointing - * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get - * deleted and IRE_CACHES that are not pointing at this ill will - * be left alone. - */ - ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, - illgrp_cache_delete, ill, ill); - - /* - * Some conn may have cached one of the IREs deleted above. By removing - * the ire reference, we clean up the extra reference to the ill held in - * ire->ire_stq. - */ - ipcl_walk(conn_cleanup_stale_ire, NULL, ipst); - - /* - * Re-do source address selection for all the members in the - * group, if they borrowed source address from one of the ipifs - * in this ill. - */ - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ill->ill_isv6) { - ipif_update_other_ipifs_v6(ipif, illgrp); - } else { - ipif_update_other_ipifs(ipif, illgrp); - } + if (ill->ill_isv6) + ipif_recreate_interface_routes_v6(NULL, ipif); + else + ipif_recreate_interface_routes(NULL, ipif); } } /* - * Delete the ill from the group. The caller makes sure that it is - * in a group and it okay to delete from the group. So, we always - * delete here. + * Finish the group join started in ip_sioctl_groupname(). 
*/ +/* ARGSUSED */ static void -illgrp_delete(ill_t *ill) -{ - ill_group_t *illgrp; - ill_group_t *tmpg; - ill_t *tmp_ill; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * Reset illgrp_ill_schednext if it was pointing at us. - * We need to do this before we set ill_group to NULL. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - mutex_enter(&ill->ill_lock); - - illgrp_reset_schednext(ill); - - illgrp = ill->ill_group; - - /* Delete the ill from illgrp. */ - if (illgrp->illgrp_ill == ill) { - illgrp->illgrp_ill = ill->ill_group_next; - } else { - tmp_ill = illgrp->illgrp_ill; - while (tmp_ill->ill_group_next != ill) { - tmp_ill = tmp_ill->ill_group_next; - ASSERT(tmp_ill != NULL); - } - tmp_ill->ill_group_next = ill->ill_group_next; - } - ill->ill_group = NULL; - ill->ill_group_next = NULL; - - illgrp->illgrp_ill_count--; - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - - /* - * As this ill is leaving the group, we need to hand off - * the responsibilities to the other ills in the group, if - * this ill had some responsibilities. 
- */ - - ill_handoff_responsibility(ill, illgrp); - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - - if (illgrp->illgrp_ill_count == 0) { - - ASSERT(illgrp->illgrp_ill == NULL); - if (ill->ill_isv6) { - if (illgrp == ipst->ips_illgrp_head_v6) { - ipst->ips_illgrp_head_v6 = illgrp->illgrp_next; - } else { - tmpg = ipst->ips_illgrp_head_v6; - while (tmpg->illgrp_next != illgrp) { - tmpg = tmpg->illgrp_next; - ASSERT(tmpg != NULL); - } - tmpg->illgrp_next = illgrp->illgrp_next; - } - } else { - if (illgrp == ipst->ips_illgrp_head_v4) { - ipst->ips_illgrp_head_v4 = illgrp->illgrp_next; - } else { - tmpg = ipst->ips_illgrp_head_v4; - while (tmpg->illgrp_next != illgrp) { - tmpg = tmpg->illgrp_next; - ASSERT(tmpg != NULL); - } - tmpg->illgrp_next = illgrp->illgrp_next; - } - } - mutex_destroy(&illgrp->illgrp_lock); - mi_free(illgrp); - } - rw_exit(&ipst->ips_ill_g_lock); - - /* - * Even though the ill is out of the group its not necessary - * to set ipsq_split as TRUE as the ipifs could be down temporarily - * We will split the ipsq when phyint_groupname is set to NULL. - */ - - /* - * Send a routing sockets message if we are deleting from - * groups with names. - */ - if (ill->ill_phyint->phyint_groupname_len != 0) - ip_rts_ifmsg(ill->ill_ipif); -} - -/* - * Re-do source address selection. This is normally called when - * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST - * ipif comes up. - */ -void -ill_update_source_selection(ill_t *ill) +ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) { - ipif_t *ipif; - - ASSERT(IAM_WRITER_ILL(ill)); - - if (ill->ill_group != NULL) - ill = ill->ill_group->illgrp_ill; - - for (; ill != NULL; ill = ill->ill_group_next) { - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ill->ill_isv6) - ipif_recreate_interface_routes_v6(NULL, ipif); - else - ipif_recreate_interface_routes(NULL, ipif); - } - } -} - -/* - * Insert ill in a group headed by illgrp_head. 
The caller can either - * pass a groupname in which case we search for a group with the - * same name to insert in or pass a group to insert in. This function - * would only search groups with names. - * - * NOTE : The caller should make sure that there is at least one ipif - * UP on this ill so that illgrp_scheduler can pick this ill - * for outbound packets. If ill_ipif_up_count is zero, we have - * already sent a DL_UNBIND to the driver and we don't want to - * send anymore packets. We don't assert for ipif_up_count - * to be greater than zero, because ipif_up_done wants to call - * this function before bumping up the ipif_up_count. See - * ipif_up_done() for details. - */ -int -illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname, - ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up) -{ - ill_group_t *illgrp; - ill_t *prev_ill; - phyint_t *phyi; + ill_t *ill = q->q_ptr; + phyint_t *phyi = ill->ill_phyint; + ipmp_grp_t *grp = phyi->phyint_grp; ip_stack_t *ipst = ill->ill_ipst; - ASSERT(ill->ill_group == NULL); - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - mutex_enter(&ill->ill_lock); - - if (groupname != NULL) { - /* - * Look for a group with a matching groupname to insert. - */ - for (illgrp = *illgrp_head; illgrp != NULL; - illgrp = illgrp->illgrp_next) { - - ill_t *tmp_ill; - - /* - * If we have an ill_group_t in the list which has - * no ill_t assigned then we must be in the process of - * removing this group. We skip this as illgrp_delete() - * will remove it from the list. - */ - if ((tmp_ill = illgrp->illgrp_ill) == NULL) { - ASSERT(illgrp->illgrp_ill_count == 0); - continue; - } - - ASSERT(tmp_ill->ill_phyint != NULL); - phyi = tmp_ill->ill_phyint; - /* - * Look at groups which has names only. - */ - if (phyi->phyint_groupname_len == 0) - continue; - /* - * Names are stored in the phyint common to both - * IPv4 and IPv6. 
- */ - if (mi_strcmp(phyi->phyint_groupname, - groupname) == 0) { - break; - } - } - } else { - /* - * If the caller passes in a NULL "grp_to_insert", we - * allocate one below and insert this singleton. - */ - illgrp = grp_to_insert; - } - - ill->ill_group_next = NULL; - - if (illgrp == NULL) { - illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t)); - if (illgrp == NULL) { - return (ENOMEM); - } - illgrp->illgrp_next = *illgrp_head; - *illgrp_head = illgrp; - illgrp->illgrp_ill = ill; - illgrp->illgrp_ill_count = 1; - ill->ill_group = illgrp; - /* - * Used in illgrp_scheduler to protect multiple threads - * from traversing the list. - */ - mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0); - } else { - ASSERT(ill->ill_net_type == - illgrp->illgrp_ill->ill_net_type); - ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type); - - /* Insert ill at tail of this group */ - prev_ill = illgrp->illgrp_ill; - while (prev_ill->ill_group_next != NULL) - prev_ill = prev_ill->ill_group_next; - prev_ill->ill_group_next = ill; - ill->ill_group = illgrp; - illgrp->illgrp_ill_count++; - /* - * Inherit group properties. Currently only forwarding - * is the property we try to keep the same with all the - * ills. When there are more, we will abstract this into - * a function. - */ - ill->ill_flags &= ~ILLF_ROUTER; - ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER); - } - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - - /* - * 1) When ipif_up_done() calls this function, ipif_up_count - * may be zero as it has not yet been bumped. But the ires - * have already been added. So, we do the nomination here - * itself. But, when ip_sioctl_groupname calls this, it checks - * for ill_ipif_up_count != 0. Thus we don't check for - * ill_ipif_up_count here while nominating broadcast ires for - * receive. 
- * - * 2) Similarly, we need to call ill_group_bcast_for_xmit here - * to group them properly as ire_add() has already happened - * in the ipif_up_done() case. For ip_sioctl_groupname/ifgrp_insert - * case, we need to do it here anyway. - */ - if (!ill->ill_isv6) { - ill_group_bcast_for_xmit(ill); - ill_nominate_bcast_rcv(illgrp); - } - - if (!ipif_is_coming_up) { - /* - * When ipif_up_done() calls this function, the multicast - * groups have not been joined yet. So, there is no point in - * nomination. ill_join_allmulti() will handle groups when - * ill_recover_multicast() is called from ipif_up_done() later. - */ - (void) ill_nominate_mcast_rcv(illgrp); - /* - * ipif_up_done calls ill_update_source_selection - * anyway. Moreover, we don't want to re-create - * interface routes while ipif_up_done() still has reference - * to them. Refer to ipif_up_done() for more details. - */ - ill_update_source_selection(ill); - } - - /* - * Send a routing sockets message if we are inserting into - * groups with names. - */ - if (groupname != NULL) - ip_rts_ifmsg(ill->ill_ipif); - return (0); -} - -/* - * Return the first phyint matching the groupname. There could - * be more than one when there are ill groups. - * - * If 'usable' is set, then we exclude ones that are marked with any of - * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE). - * Needs work: called only from ip_sioctl_groupname and from the ipmp/netinfo - * emulation of ipmp. - */ -phyint_t * -phyint_lookup_group(char *groupname, boolean_t usable, ip_stack_t *ipst) -{ - phyint_t *phyi; - - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - /* - * Group names are stored in the phyint - a common structure - * to both IPv4 and IPv6. 
- */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - if (phyi->phyint_groupname_len == 0) - continue; - /* - * Skip the ones that should not be used since the callers - * sometime use this for sending packets. - */ - if (usable && (phyi->phyint_flags & - (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE))) - continue; + /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */ + ASSERT(!IS_IPMP(ill) && grp != NULL); + ASSERT(IAM_WRITER_IPSQ(ipsq)); - ASSERT(phyi->phyint_groupname != NULL); - if (mi_strcmp(groupname, phyi->phyint_groupname) == 0) - return (phyi); + if (phyi->phyint_illv4 != NULL) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + VERIFY(grp->gr_pendv4-- > 0); + rw_exit(&ipst->ips_ipmp_lock); + ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4); } - return (NULL); -} - - -/* - * Return the first usable phyint matching the group index. By 'usable' - * we exclude ones that are marked ununsable with any of - * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE). - * - * Used only for the ipmp/netinfo emulation of ipmp. - */ -phyint_t * -phyint_lookup_group_ifindex(uint_t group_ifindex, ip_stack_t *ipst) -{ - phyint_t *phyi; - - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - - if (!ipst->ips_ipmp_hook_emulation) - return (NULL); - - /* - * Group indicies are stored in the phyint - a common structure - * to both IPv4 and IPv6. - */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - /* Ignore the ones that do not have a group */ - if (phyi->phyint_groupname_len == 0) - continue; - - ASSERT(phyi->phyint_group_ifindex != 0); - /* - * Skip the ones that should not be used since the callers - * sometime use this for sending packets. 
- */ - if (phyi->phyint_flags & - (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE)) - continue; - if (phyi->phyint_group_ifindex == group_ifindex) - return (phyi); + if (phyi->phyint_illv6 != NULL) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + VERIFY(grp->gr_pendv6-- > 0); + rw_exit(&ipst->ips_ipmp_lock); + ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6); } - return (NULL); + freemsg(mp); } /* - * MT notes on creation and deletion of IPMP groups - * - * Creation and deletion of IPMP groups introduce the need to merge or - * split the associated serialization objects i.e the ipsq's. Normally all - * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled - * an ill-pair(v4, v6) i.e. phyint would map to a single ipsq. However during - * the execution of the SIOCSLIFGROUPNAME command the picture changes. There - * is a need to change the <ill-ipsq> association and we have to operate on both - * the source and destination IPMP groups. For eg. attempting to set the - * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to - * handle 2 IPMP groups and 2 ipsqs. All the ills belonging to either of the - * source or destination IPMP group are mapped to a single ipsq for executing - * the SIOCSLIFGROUPNAME command. This is termed as a merge of the ipsq's. - * The <ill-ipsq> mapping is restored back to normal at a later point. This is - * termed as a split of the ipsq. The converse of the merge i.e. a split of the - * ipsq happens while unwinding from ipsq_exit. If at least 1 set groupname - * occurred on the ipsq, then the ipsq_split flag is set. This indicates the - * ipsq has to be examined for redoing the <ill-ipsq> associations. - * - * In the above example the ioctl handling code locates the current ipsq of hme0 - * which is ipsq(mpk17-84). It then enters the above ipsq immediately or - * eventually (after queueing the ioctl in ipsq(mpk17-84)). 
Then it locates - * the destination ipsq which is ipsq(mpk17-85) and merges the source ipsq into - * the destination ipsq. If the destination ipsq is not busy, it also enters - * the destination ipsq exclusively. Now the actual groupname setting operation - * can proceed. If the destination ipsq is busy, the operation is enqueued - * on the destination (merged) ipsq and will be handled in the unwind from - * ipsq_exit. - * - * To prevent other threads accessing the ill while the group name change is - * in progres, we bring down the ipifs which also removes the ill from the - * group. The group is changed in phyint and when the first ipif on the ill - * is brought up, the ill is inserted into the right IPMP group by - * illgrp_insert. + * Process an SIOCSLIFGROUPNAME request. */ /* ARGSUSED */ int ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) { - int i; - char *tmp; - int namelen; - ill_t *ill = ipif->ipif_ill; - ill_t *ill_v4, *ill_v6; - int err = 0; - phyint_t *phyi; - phyint_t *phyi_tmp; - struct lifreq *lifr; - mblk_t *mp1; - char *groupname; - ipsq_t *ipsq; + struct lifreq *lifr = ifreq; + ill_t *ill = ipif->ipif_ill; ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(IAM_WRITER_IPIF(ipif)); - - /* Existance verified in ip_wput_nondata */ - mp1 = mp->b_cont->b_cont; - lifr = (struct lifreq *)mp1->b_rptr; - groupname = lifr->lifr_groupname; - - if (ipif->ipif_id != 0) - return (EINVAL); - - phyi = ill->ill_phyint; - ASSERT(phyi != NULL); - - if (phyi->phyint_flags & PHYI_VIRTUAL) - return (EINVAL); - - tmp = groupname; - for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++) - ; - - if (i == LIFNAMSIZ) { - /* no null termination */ - return (EINVAL); - } + phyint_t *phyi = ill->ill_phyint; + ipmp_grp_t *grp = phyi->phyint_grp; + mblk_t *ipsq_mp; + int err = 0; /* - * Calculate the namelen exclusive of the null - * termination character. + * Note that phyint_grp can only change here, where we're exclusive. 
*/ - namelen = tmp - groupname; - - ill_v4 = phyi->phyint_illv4; - ill_v6 = phyi->phyint_illv6; + ASSERT(IAM_WRITER_ILL(ill)); - /* - * ILL cannot be part of a usesrc group and and IPMP group at the - * same time. No need to grab the ill_g_usesrc_lock here, see - * synchronization notes in ip.c - */ - if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) { + if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL || + (phyi->phyint_flags & PHYI_VIRTUAL)) return (EINVAL); - } - - /* - * mark the ill as changing. - * this should queue all new requests on the syncq. - */ - GRAB_ILL_LOCKS(ill_v4, ill_v6); - - if (ill_v4 != NULL) - ill_v4->ill_state_flags |= ILL_CHANGING; - if (ill_v6 != NULL) - ill_v6->ill_state_flags |= ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - - if (namelen == 0) { - /* - * Null string means remove this interface from the - * existing group. - */ - if (phyi->phyint_groupname_len == 0) { - /* - * Never was in a group. - */ - err = 0; - goto done; - } - - /* - * IPv4 or IPv6 may be temporarily out of the group when all - * the ipifs are down. Thus, we need to check for ill_group to - * be non-NULL. 
- */ - if (ill_v4 != NULL && ill_v4->ill_group != NULL) { - ill_down_ipifs(ill_v4, mp, 0, B_FALSE); - mutex_enter(&ill_v4->ill_lock); - if (!ill_is_quiescent(ill_v4)) { - /* - * ipsq_pending_mp_add will not fail since - * connp is NULL - */ - (void) ipsq_pending_mp_add(NULL, - ill_v4->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v4->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v4->ill_lock); - } - - if (ill_v6 != NULL && ill_v6->ill_group != NULL) { - ill_down_ipifs(ill_v6, mp, 0, B_FALSE); - mutex_enter(&ill_v6->ill_lock); - if (!ill_is_quiescent(ill_v6)) { - (void) ipsq_pending_mp_add(NULL, - ill_v6->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v6->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v6->ill_lock); - } - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - GRAB_ILL_LOCKS(ill_v4, ill_v6); - mutex_enter(&phyi->phyint_lock); - ASSERT(phyi->phyint_groupname != NULL); - mi_free(phyi->phyint_groupname); - phyi->phyint_groupname = NULL; - phyi->phyint_groupname_len = 0; - - /* Restore the ifindex used to be the per interface one */ - phyi->phyint_group_ifindex = 0; - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - mutex_exit(&phyi->phyint_lock); - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - rw_exit(&ipst->ips_ill_g_lock); - err = ill_up_ipifs(ill, q, mp); - /* - * set the split flag so that the ipsq can be split - */ - mutex_enter(&phyi->phyint_ipsq->ipsq_lock); - phyi->phyint_ipsq->ipsq_split = B_TRUE; - mutex_exit(&phyi->phyint_ipsq->ipsq_lock); + lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0'; - } else { - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - /* Are we inserting in the same group ? */ - if (mi_strcmp(groupname, - phyi->phyint_groupname) == 0) { - err = 0; - goto done; - } - } + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - /* - * Merge ipsq for the group's. 
- * This check is here as multiple groups/ills might be - * sharing the same ipsq. - * If we have to merege than the operation is restarted - * on the new ipsq. - */ - ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL, ipst); - if (phyi->phyint_ipsq != ipsq) { - rw_exit(&ipst->ips_ill_g_lock); - err = ill_merge_groups(ill, NULL, groupname, mp, q); - goto done; - } - /* - * Running exclusive on new ipsq. - */ - - ASSERT(ipsq != NULL); - ASSERT(ipsq->ipsq_writer == curthread); - - /* - * Check whether the ill_type and ill_net_type matches before - * we allocate any memory so that the cleanup is easier. - * - * We can't group dissimilar ones as we can't load spread - * packets across the group because of potential link-level - * header differences. - */ - phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst); - if (phyi_tmp != NULL) { - if ((ill_v4 != NULL && - phyi_tmp->phyint_illv4 != NULL) && - ((ill_v4->ill_net_type != - phyi_tmp->phyint_illv4->ill_net_type) || - (ill_v4->ill_type != - phyi_tmp->phyint_illv4->ill_type))) { - mutex_enter(&phyi->phyint_ipsq->ipsq_lock); - phyi->phyint_ipsq->ipsq_split = B_TRUE; - mutex_exit(&phyi->phyint_ipsq->ipsq_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (EINVAL); - } - if ((ill_v6 != NULL && - phyi_tmp->phyint_illv6 != NULL) && - ((ill_v6->ill_net_type != - phyi_tmp->phyint_illv6->ill_net_type) || - (ill_v6->ill_type != - phyi_tmp->phyint_illv6->ill_type))) { - mutex_enter(&phyi->phyint_ipsq->ipsq_lock); - phyi->phyint_ipsq->ipsq_split = B_TRUE; - mutex_exit(&phyi->phyint_ipsq->ipsq_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (EINVAL); - } - } - - rw_exit(&ipst->ips_ill_g_lock); - - /* - * bring down all v4 ipifs. - */ - if (ill_v4 != NULL) { - ill_down_ipifs(ill_v4, mp, 0, B_FALSE); - } - - /* - * bring down all v6 ipifs. - */ - if (ill_v6 != NULL) { - ill_down_ipifs(ill_v6, mp, 0, B_FALSE); - } - - /* - * make sure all ipifs are down and there are no active - * references. 
Call to ipsq_pending_mp_add will not fail - * since connp is NULL. - */ - if (ill_v4 != NULL) { - mutex_enter(&ill_v4->ill_lock); - if (!ill_is_quiescent(ill_v4)) { - (void) ipsq_pending_mp_add(NULL, - ill_v4->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v4->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v4->ill_lock); - } - - if (ill_v6 != NULL) { - mutex_enter(&ill_v6->ill_lock); - if (!ill_is_quiescent(ill_v6)) { - (void) ipsq_pending_mp_add(NULL, - ill_v6->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v6->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v6->ill_lock); - } - - /* - * allocate including space for null terminator - * before we insert. - */ - tmp = (char *)mi_alloc(namelen + 1, BPRI_MED); - if (tmp == NULL) - return (ENOMEM); - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - GRAB_ILL_LOCKS(ill_v4, ill_v6); - mutex_enter(&phyi->phyint_lock); - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - mi_free(phyi->phyint_groupname); - } - - /* - * setup the new group name. - */ - phyi->phyint_groupname = tmp; - bcopy(groupname, phyi->phyint_groupname, namelen + 1); - phyi->phyint_groupname_len = namelen + 1; - - if (ipst->ips_ipmp_hook_emulation) { - /* - * If the group already exists we use the existing - * group_ifindex, otherwise we pick a new index here. - */ - if (phyi_tmp != NULL) { - phyi->phyint_group_ifindex = - phyi_tmp->phyint_group_ifindex; - } else { - /* XXX We need a recovery strategy here. */ - if (!ip_assign_ifindex( - &phyi->phyint_group_ifindex, ipst)) - cmn_err(CE_PANIC, - "ip_assign_ifindex() failed"); - } - } - /* - * Select whether the netinfo and hook use the per-interface - * or per-group ifindex. 
- */ - if (ipst->ips_ipmp_hook_emulation) - phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex; - else - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - - if (ipst->ips_ipmp_hook_emulation && - phyi_tmp != NULL) { - /* First phyint in group - group PLUMB event */ - ill_nic_event_plumb(ill, B_TRUE); - } - mutex_exit(&phyi->phyint_lock); - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - rw_exit(&ipst->ips_ill_g_lock); - - err = ill_up_ipifs(ill, q, mp); - } - -done: /* - * normally ILL_CHANGING is cleared in ill_up_ipifs. + * If the name hasn't changed, there's nothing to do. */ - if (err != EINPROGRESS) { - GRAB_ILL_LOCKS(ill_v4, ill_v6); - if (ill_v4 != NULL) - ill_v4->ill_state_flags &= ~ILL_CHANGING; - if (ill_v6 != NULL) - ill_v6->ill_state_flags &= ~ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - } - return (err); -} + if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0) + goto unlock; -/* ARGSUSED */ -int -ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, - mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) -{ - ill_t *ill; - phyint_t *phyi; - struct lifreq *lifr; - mblk_t *mp1; - - /* Existence verified in ip_wput_nondata */ - mp1 = mp->b_cont->b_cont; - lifr = (struct lifreq *)mp1->b_rptr; - ill = ipif->ipif_ill; - phyi = ill->ill_phyint; - - lifr->lifr_groupname[0] = '\0'; /* - * ill_group may be null if all the interfaces - * are down. But still, the phyint should always - * hold the name. - */ - if (phyi->phyint_groupname_len != 0) { - bcopy(phyi->phyint_groupname, lifr->lifr_groupname, - phyi->phyint_groupname_len); - } - - return (0); -} - - -typedef struct conn_move_s { - ill_t *cm_from_ill; - ill_t *cm_to_ill; - int cm_ifindex; -} conn_move_t; - -/* - * ipcl_walk function for moving conn_multicast_ill for a given ill. 
- */ -static void -conn_move(conn_t *connp, caddr_t arg) -{ - conn_move_t *connm; - int ifindex; - int i; - ill_t *from_ill; - ill_t *to_ill; - ilg_t *ilg; - ilm_t *ret_ilm; - - connm = (conn_move_t *)arg; - ifindex = connm->cm_ifindex; - from_ill = connm->cm_from_ill; - to_ill = connm->cm_to_ill; - - /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. */ - - /* All multicast fields protected by conn_lock */ - mutex_enter(&connp->conn_lock); - ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); - if ((connp->conn_outgoing_ill == from_ill) && - (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) { - connp->conn_outgoing_ill = to_ill; - connp->conn_incoming_ill = to_ill; - } - - /* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */ - - if ((connp->conn_multicast_ill == from_ill) && - (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) { - connp->conn_multicast_ill = connm->cm_to_ill; - } - - /* - * Change the ilg_ill to point to the new one. This assumes - * ilm_move_v6 has moved the ilms to new_ill and the driver - * has been told to receive packets on this interface. - * ilm_move_v6 FAILBACKS all the ilms successfully always. - * But when doing a FAILOVER, it might fail with ENOMEM and so - * some ilms may not have moved. We check to see whether - * the ilms have moved to to_ill. We can't check on from_ill - * as in the process of moving, we could have split an ilm - * in to two - which has the same orig_ifindex and v6group. + * Handle requests to rename an IPMP meta-interface. * - * For IPv4, ilg_ipif moves implicitly. The code below really - * does not do anything for IPv4 as ilg_ill is NULL for IPv4. 
- */ - for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { - ilg = &connp->conn_ilg[i]; - if ((ilg->ilg_ill == from_ill) && - (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) { - /* ifindex != 0 indicates failback */ - if (ifindex != 0) { - connp->conn_ilg[i].ilg_ill = to_ill; - continue; - } - - mutex_enter(&to_ill->ill_lock); - ret_ilm = ilm_lookup_ill_index_v6(to_ill, - &ilg->ilg_v6group, ilg->ilg_orig_ifindex, - connp->conn_zoneid); - mutex_exit(&to_ill->ill_lock); - - if (ret_ilm != NULL) - connp->conn_ilg[i].ilg_ill = to_ill; - } + * Note that creation of the IPMP meta-interface is handled in + * userland through the standard plumbing sequence. As part of the + * plumbing the IPMP meta-interface, its initial groupname is set to + * the name of the interface (see ipif_set_values_tail()). + */ + if (IS_IPMP(ill)) { + err = ipmp_grp_rename(grp, lifr->lifr_groupname); + goto unlock; } - mutex_exit(&connp->conn_lock); -} - -static void -conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex) -{ - conn_move_t connm; - ip_stack_t *ipst = from_ill->ill_ipst; - - connm.cm_from_ill = from_ill; - connm.cm_to_ill = to_ill; - connm.cm_ifindex = ifindex; - - ipcl_walk(conn_move, (caddr_t)&connm, ipst); -} - -/* - * ilm has been moved from from_ill to to_ill. - * Send DL_DISABMULTI_REQ to ill and DL_ENABMULTI_REQ on to_ill. - * appropriately. - * - * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because - * the code there de-references ipif_ill to get the ill to - * send multicast requests. It does not work as ipif is on its - * move and already moved when this function is called. - * Thus, we need to use from_ill and to_ill send down multicast - * requests. - */ -static void -ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill) -{ - ipif_t *ipif; - ilm_t *ilm; /* - * See whether we need to send down DL_ENABMULTI_REQ on - * to_ill as ilm has just been added. + * Handle requests to add or remove an IP interface from a group. 
*/ - ASSERT(IAM_WRITER_ILL(to_ill)); - ASSERT(IAM_WRITER_ILL(from_ill)); - - ILM_WALKER_HOLD(to_ill); - for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - - if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED)) - continue; - /* - * no locks held, ill/ipif cannot dissappear as long - * as we are writer. - */ - ipif = to_ill->ill_ipif; + if (lifr->lifr_groupname[0] != '\0') { /* add */ /* - * No need to hold any lock as we are the writer and this - * can only be changed by a writer. + * Moves are handled by first removing the interface from + * its existing group, and then adding it to another group. + * So, fail if it's already in a group. */ - ilm->ilm_is_new = B_FALSE; - - if (to_ill->ill_net_type != IRE_IF_RESOLVER || - ipif->ipif_flags & IPIF_POINTOPOINT) { - ip1dbg(("ilm_send_multicast_reqs: to_ill not " - "resolver\n")); - continue; /* Must be IRE_IF_NORESOLVER */ - } - - if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { - ip1dbg(("ilm_send_multicast_reqs: " - "to_ill MULTI_BCAST\n")); - goto from; + if (IS_UNDER_IPMP(ill)) { + err = EALREADY; + goto unlock; } - if (to_ill->ill_isv6) - mld_joingroup(ilm); - else - igmp_joingroup(ilm); - - if (to_ill->ill_ipif_up_count == 0) { - /* - * Nobody there. All multicast addresses will be - * re-joined when we get the DL_BIND_ACK bringing the - * interface up. - */ - ilm->ilm_notify_driver = B_FALSE; - ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n")); - goto from; + grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst); + if (grp == NULL) { + err = ENOENT; + goto unlock; } /* - * For allmulti address, we want to join on only one interface. - * Checking for ilm_numentries_v6 is not correct as you may - * find an ilm with zero address on to_ill, but we may not - * have nominated to_ill for receiving. 
Thus, if we have - * nominated from_ill (ill_join_allmulti is set), nominate - * only if to_ill is not already nominated (to_ill normally - * should not have been nominated if "from_ill" has already - * been nominated. As we don't prevent failovers from happening - * across groups, we don't assert). + * Check if the phyint and its ills are suitable for + * inclusion into the group. */ - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - /* - * There is no need to hold ill locks as we are - * writer on both ills and when ill_join_allmulti() - * is called the thread is always a writer. - */ - if (from_ill->ill_join_allmulti && - !to_ill->ill_join_allmulti) { - (void) ill_join_allmulti(to_ill); - } - } else if (ilm->ilm_notify_driver) { - - /* - * This is a newly moved ilm so we need to tell the - * driver about the new group. There can be more than - * one ilm's for the same group in the list each with a - * different orig_ifindex. We have to inform the driver - * once. In ilm_move_v[4,6] we only set the flag - * ilm_notify_driver for the first ilm. - */ - - (void) ip_ll_send_enabmulti_req(to_ill, - &ilm->ilm_v6addr); - } - - ilm->ilm_notify_driver = B_FALSE; + if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0) + goto unlock; /* - * See whether we need to send down DL_DISABMULTI_REQ on - * from_ill as ilm has just been removed. + * Checks pass; join the group, and enqueue the remaining + * illgrp joins for when we've become part of the group xop + * and are exclusive across its IPSQs. Since qwriter_ip() + * requires an mblk_t to scribble on, and since `mp' will be + * freed as part of completing the ioctl, allocate another. 
*/ -from: - ipif = from_ill->ill_ipif; - if (from_ill->ill_net_type != IRE_IF_RESOLVER || - ipif->ipif_flags & IPIF_POINTOPOINT) { - ip1dbg(("ilm_send_multicast_reqs: " - "from_ill not resolver\n")); - continue; /* Must be IRE_IF_NORESOLVER */ - } - - if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { - ip1dbg(("ilm_send_multicast_reqs: " - "from_ill MULTI_BCAST\n")); - continue; - } - - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - if (from_ill->ill_join_allmulti) - ill_leave_allmulti(from_ill); - } else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) { - (void) ip_ll_send_disabmulti_req(from_ill, - &ilm->ilm_v6addr); + if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) { + err = ENOMEM; + goto unlock; } - } - ILM_WALKER_RELE(to_ill); -} - -/* - * This function is called when all multicast memberships needs - * to be moved from "from_ill" to "to_ill" for IPv6. This function is - * called only once unlike the IPv4 counterpart where it is called after - * every logical interface is moved. The reason is due to multicast - * memberships are joined using an interface address in IPv4 while in - * IPv6, interface index is used. - */ -static void -ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex) -{ - ilm_t *ilm; - ilm_t *ilm_next; - ilm_t *new_ilm; - ilm_t **ilmp; - int count; - char buf[INET6_ADDRSTRLEN]; - in6_addr_t ipv6_snm = ipv6_solicited_node_mcast; - ip_stack_t *ipst = from_ill->ill_ipst; - - ASSERT(MUTEX_HELD(&to_ill->ill_lock)); - ASSERT(MUTEX_HELD(&from_ill->ill_lock)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - if (ifindex == 0) { /* - * Form the solicited node mcast address which is used later. + * Before we drop ipmp_lock, bump gr_pend* to ensure that the + * IPMP meta-interface ills needed by `phyi' cannot go away + * before ip_join_illgrps() is called back. See the comments + * in ip_sioctl_plink_ipmp() for more. 
*/ - ipif_t *ipif; - - ipif = from_ill->ill_ipif; - ASSERT(ipif->ipif_id == 0); - - ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; - } - - ilmp = &from_ill->ill_ilm; - for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { - ilm_next = ilm->ilm_next; - - if (ilm->ilm_flags & ILM_DELETED) { - ilmp = &ilm->ilm_next; - continue; - } + if (phyi->phyint_illv4 != NULL) + grp->gr_pendv4++; + if (phyi->phyint_illv6 != NULL) + grp->gr_pendv6++; - new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr, - ilm->ilm_orig_ifindex, ilm->ilm_zoneid); - ASSERT(ilm->ilm_orig_ifindex != 0); - if (ilm->ilm_orig_ifindex == ifindex) { - /* - * We are failing back multicast memberships. - * If the same ilm exists in to_ill, it means somebody - * has joined the same group there e.g. ff02::1 - * is joined within the kernel when the interfaces - * came UP. - */ - ASSERT(ilm->ilm_ipif == NULL); - if (new_ilm != NULL) { - new_ilm->ilm_refcnt += ilm->ilm_refcnt; - if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || - !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { - new_ilm->ilm_is_new = B_TRUE; - } - } else { - /* - * check if we can just move the ilm - */ - if (from_ill->ill_ilm_walker_cnt != 0) { - /* - * We have walkers we cannot move - * the ilm, so allocate a new ilm, - * this (old) ilm will be marked - * ILM_DELETED at the end of the loop - * and will be freed when the - * last walker exits. - */ - new_ilm = (ilm_t *)mi_zalloc - (sizeof (ilm_t)); - if (new_ilm == NULL) { - ip0dbg(("ilm_move_v6: " - "FAILBACK of IPv6" - " multicast address %s : " - "from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)), - from_ill->ill_name, - to_ill->ill_name)); - - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - /* - * we don't want new_ilm linked to - * ilm's filter list. - */ - new_ilm->ilm_filter = NULL; - } else { - /* - * No walkers we can move the ilm. - * lets take it out of the list. 
- */ - *ilmp = ilm->ilm_next; - ilm->ilm_next = NULL; - DTRACE_PROBE3(ill__decr__cnt, - (ill_t *), from_ill, - (char *), "ilm", (void *), ilm); - ASSERT(from_ill->ill_ilm_cnt > 0); - from_ill->ill_ilm_cnt--; - - new_ilm = ilm; - } + rw_exit(&ipst->ips_ipmp_lock); - /* - * if this is the first ilm for the group - * set ilm_notify_driver so that we notify the - * driver in ilm_send_multicast_reqs. - */ - if (ilm_lookup_ill_v6(to_ill, - &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill, - (char *), "ilm", (void *), new_ilm); - new_ilm->ilm_ill = to_ill; - to_ill->ill_ilm_cnt++; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - /* - * set the flag so that mld_joingroup is - * called in ilm_send_multicast_reqs(). - */ - new_ilm->ilm_is_new = B_TRUE; - } - goto bottom; - } else if (ifindex != 0) { - /* - * If this is FAILBACK (ifindex != 0) and the ifindex - * has not matched above, look at the next ilm. - */ - ilmp = &ilm->ilm_next; - continue; - } - /* - * If we are here, it means ifindex is 0. Failover - * everything. - * - * We need to handle solicited node mcast address - * and all_nodes mcast address differently as they - * are joined witin the kenrel (ipif_multicast_up) - * and potentially from the userland. We are called - * after the ipifs of from_ill has been moved. - * If we still find ilms on ill with solicited node - * mcast address or all_nodes mcast address, it must - * belong to the UP interface that has not moved e.g. - * ipif_id 0 with the link local prefix does not move. - * We join this on the new ill accounting for all the - * userland memberships so that applications don't - * see any failure. - * - * We need to make sure that we account only for the - * solicited node and all node multicast addresses - * that was brought UP on these. 
In the case of - * a failover from A to B, we might have ilms belonging - * to A (ilm_orig_ifindex pointing at A) on B accounting - * for the membership from the userland. If we are failing - * over from B to C now, we will find the ones belonging - * to A on B. These don't account for the ill_ipif_up_count. - * They just move from B to C. The check below on - * ilm_orig_ifindex ensures that. - */ - if ((ilm->ilm_orig_ifindex == - from_ill->ill_phyint->phyint_ifindex) && - (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) || - IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, - &ilm->ilm_v6addr))) { - ASSERT(ilm->ilm_refcnt > 0); - count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count; - /* - * For indentation reasons, we are not using a - * "else" here. - */ - if (count == 0) { - ilmp = &ilm->ilm_next; - continue; - } - ilm->ilm_refcnt -= count; - if (new_ilm != NULL) { - /* - * Can find one with the same - * ilm_orig_ifindex, if we are failing - * over to a STANDBY. This happens - * when somebody wants to join a group - * on a STANDBY interface and we - * internally join on a different one. - * If we had joined on from_ill then, a - * failover now will find a new ilm - * with this index. 
- */ - ip1dbg(("ilm_move_v6: FAILOVER, found" - " new ilm on %s, group address %s\n", - to_ill->ill_name, - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)))); - new_ilm->ilm_refcnt += count; - if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || - !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { - new_ilm->ilm_is_new = B_TRUE; - } - } else { - new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); - if (new_ilm == NULL) { - ip0dbg(("ilm_move_v6: FAILOVER of IPv6" - " multicast address %s : from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)), from_ill->ill_name, - to_ill->ill_name)); - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - new_ilm->ilm_filter = NULL; - new_ilm->ilm_refcnt = count; - new_ilm->ilm_timer = INFINITY; - new_ilm->ilm_rtx.rtx_timer = INFINITY; - new_ilm->ilm_is_new = B_TRUE; - /* - * If the to_ill has not joined this - * group we need to tell the driver in - * ill_send_multicast_reqs. - */ - if (ilm_lookup_ill_v6(to_ill, - &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - new_ilm->ilm_ill = to_ill; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill, - (char *), "ilm", (void *), new_ilm); - to_ill->ill_ilm_cnt++; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - ASSERT(new_ilm->ilm_ipif == NULL); - } - if (ilm->ilm_refcnt == 0) { - goto bottom; - } else { - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); - ilmp = &ilm->ilm_next; - } - continue; - } else { - /* - * ifindex = 0 means, move everything pointing at - * from_ill. We are doing this becuase ill has - * either FAILED or became INACTIVE. - * - * As we would like to move things later back to - * from_ill, we want to retain the identity of this - * ilm. Thus, we don't blindly increment the reference - * count on the ilms matching the address alone. We - * need to match on the ilm_orig_index also. 
new_ilm - * was obtained by matching ilm_orig_index also. - */ - if (new_ilm != NULL) { - /* - * This is possible only if a previous restore - * was incomplete i.e restore to - * ilm_orig_ifindex left some ilms because - * of some failures. Thus when we are failing - * again, we might find our old friends there. - */ - ip1dbg(("ilm_move_v6: FAILOVER, found new ilm" - " on %s, group address %s\n", - to_ill->ill_name, - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)))); - new_ilm->ilm_refcnt += ilm->ilm_refcnt; - if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || - !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { - new_ilm->ilm_is_new = B_TRUE; - } - } else { - if (from_ill->ill_ilm_walker_cnt != 0) { - new_ilm = (ilm_t *) - mi_zalloc(sizeof (ilm_t)); - if (new_ilm == NULL) { - ip0dbg(("ilm_move_v6: " - "FAILOVER of IPv6" - " multicast address %s : " - "from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)), - from_ill->ill_name, - to_ill->ill_name)); - - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - new_ilm->ilm_filter = NULL; - } else { - *ilmp = ilm->ilm_next; - DTRACE_PROBE3(ill__decr__cnt, - (ill_t *), from_ill, - (char *), "ilm", (void *), ilm); - ASSERT(from_ill->ill_ilm_cnt > 0); - from_ill->ill_ilm_cnt--; - - new_ilm = ilm; - } - /* - * If the to_ill has not joined this - * group we need to tell the driver in - * ill_send_multicast_reqs. - */ - if (ilm_lookup_ill_v6(to_ill, - &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - ASSERT(ilm->ilm_ipif == NULL); - new_ilm->ilm_ill = to_ill; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill, - (char *), "ilm", (void *), new_ilm); - to_ill->ill_ilm_cnt++; - new_ilm->ilm_is_new = B_TRUE; - } - - } - -bottom: - /* - * Revert multicast filter state to (EXCLUDE, NULL). - * new_ilm->ilm_is_new should already be set if needed. 
- */ - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); + ipmp_phyint_join_grp(phyi, grp); + ill_refhold(ill); + qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps, + SWITCH_OP, B_FALSE); + return (0); + } else { /* - * We allocated/got a new ilm, free the old one. + * Request to remove the interface from a group. If the + * interface is not in a group, this trivially succeeds. */ - if (new_ilm != ilm) { - if (from_ill->ill_ilm_walker_cnt == 0) { - *ilmp = ilm->ilm_next; - - ASSERT(ilm->ilm_ipif == NULL); /* ipv6 */ - DTRACE_PROBE3(ill__decr__cnt, (ill_t *), - from_ill, (char *), "ilm", (void *), ilm); - ASSERT(from_ill->ill_ilm_cnt > 0); - from_ill->ill_ilm_cnt--; - - ilm_inactive(ilm); /* frees this ilm */ - - } else { - ilm->ilm_flags |= ILM_DELETED; - from_ill->ill_ilm_cleanup_reqd = 1; - ilmp = &ilm->ilm_next; - } - } + rw_exit(&ipst->ips_ipmp_lock); + if (IS_UNDER_IPMP(ill)) + ipmp_phyint_leave_grp(phyi); + return (0); } +unlock: + rw_exit(&ipst->ips_ipmp_lock); + return (err); } /* - * Move all the multicast memberships to to_ill. Called when - * an ipif moves from "from_ill" to "to_ill". This function is slightly - * different from IPv6 counterpart as multicast memberships are associated - * with ills in IPv6. This function is called after every ipif is moved - * unlike IPv6, where it is moved only once. + * Process an SIOCGLIFBINDING request. 
*/ -static void -ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif) -{ - ilm_t *ilm; - ilm_t *ilm_next; - ilm_t *new_ilm; - ilm_t **ilmp; - ip_stack_t *ipst = from_ill->ill_ipst; - - ASSERT(MUTEX_HELD(&to_ill->ill_lock)); - ASSERT(MUTEX_HELD(&from_ill->ill_lock)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - - ilmp = &from_ill->ill_ilm; - for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { - ilm_next = ilm->ilm_next; - - if (ilm->ilm_flags & ILM_DELETED) { - ilmp = &ilm->ilm_next; - continue; - } - - ASSERT(ilm->ilm_ipif != NULL); - - if (ilm->ilm_ipif != ipif) { - ilmp = &ilm->ilm_next; - continue; - } - - if (V4_PART_OF_V6(ilm->ilm_v6addr) == - htonl(INADDR_ALLHOSTS_GROUP)) { - new_ilm = ilm_lookup_ipif(ipif, - V4_PART_OF_V6(ilm->ilm_v6addr)); - if (new_ilm != NULL) { - new_ilm->ilm_refcnt += ilm->ilm_refcnt; - /* - * We still need to deal with the from_ill. - */ - new_ilm->ilm_is_new = B_TRUE; - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); - ASSERT(ilm->ilm_ipif == ipif); - ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0); - if (from_ill->ill_ilm_walker_cnt == 0) { - DTRACE_PROBE3(ill__decr__cnt, - (ill_t *), from_ill, - (char *), "ilm", (void *), ilm); - ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0); - } - goto delete_ilm; - } - /* - * If we could not find one e.g. ipif is - * still down on to_ill, we add this ilm - * on ill_new to preserve the reference - * count. - */ - } - /* - * When ipifs move, ilms always move with it - * to the NEW ill. Thus we should never be - * able to find ilm till we really move it here. 
- */ - ASSERT(ilm_lookup_ipif(ipif, - V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL); - - if (from_ill->ill_ilm_walker_cnt != 0) { - new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); - if (new_ilm == NULL) { - char buf[INET6_ADDRSTRLEN]; - ip0dbg(("ilm_move_v4: FAILBACK of IPv4" - " multicast address %s : " - "from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET, - &ilm->ilm_v6addr, buf, - sizeof (buf)), - from_ill->ill_name, - to_ill->ill_name)); - - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ipif, - (char *), "ilm", (void *), ilm); - new_ilm->ilm_ipif->ipif_ilm_cnt++; - /* We don't want new_ilm linked to ilm's filter list */ - new_ilm->ilm_filter = NULL; - } else { - /* Remove from the list */ - *ilmp = ilm->ilm_next; - new_ilm = ilm; - } - - /* - * If we have never joined this group on the to_ill - * make sure we tell the driver. - */ - if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr, - ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - new_ilm->ilm_is_new = B_TRUE; - - /* - * Revert multicast filter state to (EXCLUDE, NULL) - */ - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); - - /* - * Delete only if we have allocated a new ilm. 
- */ - if (new_ilm != ilm) { -delete_ilm: - if (from_ill->ill_ilm_walker_cnt == 0) { - /* Remove from the list */ - *ilmp = ilm->ilm_next; - ilm->ilm_next = NULL; - DTRACE_PROBE3(ipif__decr__cnt, - (ipif_t *), ilm->ilm_ipif, - (char *), "ilm", (void *), ilm); - ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0); - ilm->ilm_ipif->ipif_ilm_cnt--; - ilm_inactive(ilm); - } else { - ilm->ilm_flags |= ILM_DELETED; - from_ill->ill_ilm_cleanup_reqd = 1; - ilmp = &ilm->ilm_next; - } - } - } -} - -static uint_t -ipif_get_id(ill_t *ill, uint_t id) -{ - uint_t unit; - ipif_t *tipif; - boolean_t found = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * During failback, we want to go back to the same id - * instead of the smallest id so that the original - * configuration is maintained. id is non-zero in that - * case. - */ - if (id != 0) { - /* - * While failing back, if we still have an ipif with - * MAX_ADDRS_PER_IF, it means this will be replaced - * as soon as we return from this function. It was - * to set to MAX_ADDRS_PER_IF by the caller so that - * we can choose the smallest id. Thus we return zero - * in that case ignoring the hint. - */ - if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF) - return (0); - for (tipif = ill->ill_ipif; tipif != NULL; - tipif = tipif->ipif_next) { - if (tipif->ipif_id == id) { - found = B_TRUE; - break; - } - } - /* - * If somebody already plumbed another logical - * with the same id, we won't be able to find it. 
- */ - if (!found) - return (id); - } - for (unit = 0; unit <= ipst->ips_ip_addrs_per_if; unit++) { - found = B_FALSE; - for (tipif = ill->ill_ipif; tipif != NULL; - tipif = tipif->ipif_next) { - if (tipif->ipif_id == unit) { - found = B_TRUE; - break; - } - } - if (!found) - break; - } - return (unit); -} - /* ARGSUSED */ -static int -ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp, - ipif_t **rep_ipif_ptr) +int +ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, + ip_ioctl_cmd_t *ipip, void *ifreq) { - ill_t *from_ill; - ipif_t *rep_ipif; - uint_t unit; - int err = 0; - ipif_t *to_ipif; - struct iocblk *iocp; - boolean_t failback_cmd; - boolean_t remove_ipif; - int rc; - ip_stack_t *ipst; - - ASSERT(IAM_WRITER_ILL(to_ill)); - ASSERT(IAM_WRITER_IPIF(ipif)); - - iocp = (struct iocblk *)mp->b_rptr; - failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK); - remove_ipif = B_FALSE; - - from_ill = ipif->ipif_ill; - ipst = from_ill->ill_ipst; - - ASSERT(MUTEX_HELD(&to_ill->ill_lock)); - ASSERT(MUTEX_HELD(&from_ill->ill_lock)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - - /* - * Don't move LINK LOCAL addresses as they are tied to - * physical interface. - */ - if (from_ill->ill_isv6 && - IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) { - ipif->ipif_was_up = B_FALSE; - IPIF_UNMARK_MOVING(ipif); - return (0); - } - - /* - * We set the ipif_id to maximum so that the search for - * ipif_id will pick the lowest number i.e 0 in the - * following 2 cases : - * - * 1) We have a replacement ipif at the head of to_ill. - * We can't remove it yet as we can exceed ip_addrs_per_if - * on to_ill and hence the MOVE might fail. We want to - * remove it only if we could move the ipif. Thus, by - * setting it to the MAX value, we make the search in - * ipif_get_id return the zeroth id. - * - * 2) When DR pulls out the NIC and re-plumbs the interface, - * we might just have a zero address plumbed on the ipif - * with zero id in the case of IPv4. 
We remove that while - * doing the failback. We want to remove it only if we - * could move the ipif. Thus, by setting it to the MAX - * value, we make the search in ipif_get_id return the - * zeroth id. - * - * Both (1) and (2) are done only when when we are moving - * an ipif (either due to failover/failback) which originally - * belonged to this interface i.e the ipif_orig_ifindex is - * the same as to_ill's ifindex. This is needed so that - * FAILOVER from A -> B ( A failed) followed by FAILOVER - * from B -> A (B is being removed from the group) and - * FAILBACK from A -> B restores the original configuration. - * Without the check for orig_ifindex, the second FAILOVER - * could make the ipif belonging to B replace the A's zeroth - * ipif and the subsequent failback re-creating the replacement - * ipif again. - * - * NOTE : We created the replacement ipif when we did a - * FAILOVER (See below). We could check for FAILBACK and - * then look for replacement ipif to be removed. But we don't - * want to do that because we wan't to allow the possibility - * of a FAILOVER from A -> B (which creates the replacement ipif), - * followed by a *FAILOVER* from B -> A instead of a FAILBACK - * from B -> A. - */ - to_ipif = to_ill->ill_ipif; - if ((to_ill->ill_phyint->phyint_ifindex == - ipif->ipif_orig_ifindex) && - to_ipif->ipif_replace_zero) { - ASSERT(to_ipif->ipif_id == 0); - remove_ipif = B_TRUE; - to_ipif->ipif_id = MAX_ADDRS_PER_IF; - } - /* - * Find the lowest logical unit number on the to_ill. - * If we are failing back, try to get the original id - * rather than the lowest one so that the original - * configuration is maintained. - * - * XXX need a better scheme for this. 
- */ - if (failback_cmd) { - unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid); - } else { - unit = ipif_get_id(to_ill, 0); - } - - /* Reset back to zero in case we fail below */ - if (to_ipif->ipif_id == MAX_ADDRS_PER_IF) - to_ipif->ipif_id = 0; + ill_t *bound_ill; + struct lifreq *lifr = ifreq; - if (unit == ipst->ips_ip_addrs_per_if) { - ipif->ipif_was_up = B_FALSE; - IPIF_UNMARK_MOVING(ipif); + if (!IS_IPMP(ipif->ipif_ill)) return (EINVAL); - } - - /* - * ipif is ready to move from "from_ill" to "to_ill". - * - * 1) If we are moving ipif with id zero, create a - * replacement ipif for this ipif on from_ill. If this fails - * fail the MOVE operation. - * - * 2) Remove the replacement ipif on to_ill if any. - * We could remove the replacement ipif when we are moving - * the ipif with id zero. But what if somebody already - * unplumbed it ? Thus we always remove it if it is present. - * We want to do it only if we are sure we are going to - * move the ipif to to_ill which is why there are no - * returns due to error till ipif is linked to to_ill. - * Note that the first ipif that we failback will always - * be zero if it is present. - */ - if (ipif->ipif_id == 0) { - ipaddr_t inaddr_any = INADDR_ANY; - rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED); - if (rep_ipif == NULL) { - ipif->ipif_was_up = B_FALSE; - IPIF_UNMARK_MOVING(ipif); - return (ENOMEM); - } - *rep_ipif = ipif_zero; - /* - * Before we put the ipif on the list, store the addresses - * as mapped addresses as some of the ioctls e.g SIOCGIFADDR - * assumes so. This logic is not any different from what - * ipif_allocate does. 
- */ - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6lcl_addr); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6src_addr); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6subnet); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6net_mask); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6brd_addr); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6pp_dst_addr); - /* - * We mark IPIF_NOFAILOVER so that this can never - * move. - */ - rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER; - rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE; - rep_ipif->ipif_replace_zero = B_TRUE; - mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL, - MUTEX_DEFAULT, NULL); - rep_ipif->ipif_id = 0; - rep_ipif->ipif_ire_type = ipif->ipif_ire_type; - rep_ipif->ipif_ill = from_ill; - rep_ipif->ipif_orig_ifindex = - from_ill->ill_phyint->phyint_ifindex; - /* Insert at head */ - rep_ipif->ipif_next = from_ill->ill_ipif; - from_ill->ill_ipif = rep_ipif; - /* - * We don't really care to let apps know about - * this interface. - */ - } - - if (remove_ipif) { - /* - * We set to a max value above for this case to get - * id zero. ASSERT that we did get one. - */ - ASSERT((to_ipif->ipif_id == 0) && (unit == 0)); - rep_ipif = to_ipif; - to_ill->ill_ipif = rep_ipif->ipif_next; - rep_ipif->ipif_next = NULL; - /* - * If some apps scanned and find this interface, - * it is time to let them know, so that they can - * delete it. - */ - - *rep_ipif_ptr = rep_ipif; - } - - /* Get it out of the ILL interface list. */ - ipif_remove(ipif, B_FALSE); - - /* Assign the new ill */ - ipif->ipif_ill = to_ill; - ipif->ipif_id = unit; - /* id has already been checked */ - rc = ipif_insert(ipif, B_FALSE, B_FALSE); - ASSERT(rc == 0); - /* Let SCTP update its list */ - sctp_move_ipif(ipif, from_ill, to_ill); - /* - * Handle the failover and failback of ipif_t between - * ill_t that have differing maximum mtu values. 
- */ - if (ipif->ipif_mtu > to_ill->ill_max_mtu) { - if (ipif->ipif_saved_mtu == 0) { - /* - * As this ipif_t is moving to an ill_t - * that has a lower ill_max_mtu, its - * ipif_mtu needs to be saved so it can - * be restored during failback or during - * failover to an ill_t which has a - * higher ill_max_mtu. - */ - ipif->ipif_saved_mtu = ipif->ipif_mtu; - ipif->ipif_mtu = to_ill->ill_max_mtu; - } else { - /* - * The ipif_t is, once again, moving to - * an ill_t that has a lower maximum mtu - * value. - */ - ipif->ipif_mtu = to_ill->ill_max_mtu; - } - } else if (ipif->ipif_mtu < to_ill->ill_max_mtu && - ipif->ipif_saved_mtu != 0) { - /* - * The mtu of this ipif_t had to be reduced - * during an earlier failover; this is an - * opportunity for it to be increased (either as - * part of another failover or a failback). - */ - if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) { - ipif->ipif_mtu = ipif->ipif_saved_mtu; - ipif->ipif_saved_mtu = 0; - } else { - ipif->ipif_mtu = to_ill->ill_max_mtu; - } + if ((bound_ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { + lifr->lifr_binding[0] = '\0'; + return (0); } - /* - * We preserve all the other fields of the ipif including - * ipif_saved_ire_mp. The routes that are saved here will - * be recreated on the new interface and back on the old - * interface when we move back. - */ - ASSERT(ipif->ipif_arp_del_mp == NULL); - - return (err); -} - -static int -ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp, - int ifindex, ipif_t **rep_ipif_ptr) -{ - ipif_t *mipif; - ipif_t *ipif_next; - int err; - - /* - * We don't really try to MOVE back things if some of the - * operations fail. The daemon will take care of moving again - * later on. 
- */ - for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) { - ipif_next = mipif->ipif_next; - if (!(mipif->ipif_flags & IPIF_NOFAILOVER) && - (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) { - - err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr); - - /* - * When the MOVE fails, it is the job of the - * application to take care of this properly - * i.e try again if it is ENOMEM. - */ - if (mipif->ipif_ill != from_ill) { - /* - * ipif has moved. - * - * Move the multicast memberships associated - * with this ipif to the new ill. For IPv6, we - * do it once after all the ipifs are moved - * (in ill_move) as they are not associated - * with ipifs. - * - * We need to move the ilms as the ipif has - * already been moved to a new ill even - * in the case of errors. Neither - * ilm_free(ipif) will find the ilm - * when somebody unplumbs this ipif nor - * ilm_delete(ilm) will be able to find the - * ilm, if we don't move now. - */ - if (!from_ill->ill_isv6) - ilm_move_v4(from_ill, to_ill, mipif); - } - - if (err != 0) - return (err); - } - } + (void) strlcpy(lifr->lifr_binding, bound_ill->ill_name, LIFNAMSIZ); + ill_refrele(bound_ill); return (0); } -static int -ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp) -{ - int ifindex; - int err; - struct iocblk *iocp; - ipif_t *ipif; - ipif_t *rep_ipif_ptr = NULL; - ipif_t *from_ipif = NULL; - boolean_t check_rep_if = B_FALSE; - ip_stack_t *ipst = from_ill->ill_ipst; - - iocp = (struct iocblk *)mp->b_rptr; - if (iocp->ioc_cmd == SIOCLIFFAILOVER) { - /* - * Move everything pointing at from_ill to to_ill. - * We acheive this by passing in 0 as ifindex. - */ - ifindex = 0; - } else { - /* - * Move everything pointing at from_ill whose original - * ifindex of connp, ipif, ilm points at to_ill->ill_index. - * We acheive this by passing in ifindex rather than 0. - * Multicast vifs, ilgs move implicitly because ipifs move. 
- */ - ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK); - ifindex = to_ill->ill_phyint->phyint_ifindex; - } - - /* - * Determine if there is at least one ipif that would move from - * 'from_ill' to 'to_ill'. If so, it is possible that the replacement - * ipif (if it exists) on the to_ill would be consumed as a result of - * the move, in which case we need to quiesce the replacement ipif also. - */ - for (from_ipif = from_ill->ill_ipif; from_ipif != NULL; - from_ipif = from_ipif->ipif_next) { - if (((ifindex == 0) || - (ifindex == from_ipif->ipif_orig_ifindex)) && - !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) { - check_rep_if = B_TRUE; - break; - } - } - - ill_down_ipifs(from_ill, mp, ifindex, B_TRUE); - - GRAB_ILL_LOCKS(from_ill, to_ill); - if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) { - (void) ipsq_pending_mp_add(NULL, ipif, q, - mp, ILL_MOVE_OK); - RELEASE_ILL_LOCKS(from_ill, to_ill); - return (EINPROGRESS); - } - - /* Check if the replacement ipif is quiescent to delete */ - if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif, - (iocp->ioc_cmd == SIOCLIFFAILBACK))) { - to_ill->ill_ipif->ipif_state_flags |= - IPIF_MOVING | IPIF_CHANGING; - if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) { - (void) ipsq_pending_mp_add(NULL, ipif, q, - mp, ILL_MOVE_OK); - RELEASE_ILL_LOCKS(from_ill, to_ill); - return (EINPROGRESS); - } - } - RELEASE_ILL_LOCKS(from_ill, to_ill); - - ASSERT(!MUTEX_HELD(&to_ill->ill_lock)); - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - GRAB_ILL_LOCKS(from_ill, to_ill); - err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr); - - /* ilm_move is done inside ipif_move for IPv4 */ - if (err == 0 && from_ill->ill_isv6) - ilm_move_v6(from_ill, to_ill, ifindex); - - RELEASE_ILL_LOCKS(from_ill, to_ill); - rw_exit(&ipst->ips_ill_g_lock); - - /* - * send rts messages and multicast messages. 
- */ - if (rep_ipif_ptr != NULL) { - if (rep_ipif_ptr->ipif_recovery_id != 0) { - (void) untimeout(rep_ipif_ptr->ipif_recovery_id); - rep_ipif_ptr->ipif_recovery_id = 0; - } - ip_rts_ifmsg(rep_ipif_ptr); - ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr); -#ifdef DEBUG - ipif_trace_cleanup(rep_ipif_ptr); -#endif - mi_free(rep_ipif_ptr); - } - - conn_move_ill(from_ill, to_ill, ifindex); - - return (err); -} - /* - * Used to extract arguments for FAILOVER/FAILBACK ioctls. - * Also checks for the validity of the arguments. - * Note: We are already exclusive inside the from group. - * It is upto the caller to release refcnt on the to_ill's. + * Process an SIOCGLIFGROUPNAME request. */ -static int -ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4, - ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6) +/* ARGSUSED */ +int +ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, + ip_ioctl_cmd_t *ipip, void *ifreq) { - int dst_index; - ipif_t *ipif_v4, *ipif_v6; - struct lifreq *lifr; - mblk_t *mp1; - boolean_t exists; - sin_t *sin; - int err = 0; - ip_stack_t *ipst; + ipmp_grp_t *grp; + struct lifreq *lifr = ifreq; + ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - if (CONN_Q(q)) - ipst = CONNQ_TO_IPST(q); + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL) + lifr->lifr_groupname[0] = '\0'; else - ipst = ILLQ_TO_IPST(q); - - if ((mp1 = mp->b_cont) == NULL) - return (EPROTO); - - if ((mp1 = mp1->b_cont) == NULL) - return (EPROTO); - - lifr = (struct lifreq *)mp1->b_rptr; - sin = (sin_t *)&lifr->lifr_addr; - - /* - * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6 - * specific operations. - */ - if (sin->sin_family != AF_UNSPEC) - return (EINVAL); - - /* - * Get ipif with id 0. We are writer on the from ill. So we can pass - * NULLs for the last 4 args and we know the lookup won't fail - * with EINPROGRESS. 
- */ - ipif_v4 = ipif_lookup_on_name(lifr->lifr_name, - mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE, - ALL_ZONES, NULL, NULL, NULL, NULL, ipst); - ipif_v6 = ipif_lookup_on_name(lifr->lifr_name, - mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE, - ALL_ZONES, NULL, NULL, NULL, NULL, ipst); - - if (ipif_v4 == NULL && ipif_v6 == NULL) - return (ENXIO); - - if (ipif_v4 != NULL) { - ASSERT(ipif_v4->ipif_refcnt != 0); - if (ipif_v4->ipif_id != 0) { - err = EINVAL; - goto done; - } - - ASSERT(IAM_WRITER_IPIF(ipif_v4)); - *ill_from_v4 = ipif_v4->ipif_ill; - } - - if (ipif_v6 != NULL) { - ASSERT(ipif_v6->ipif_refcnt != 0); - if (ipif_v6->ipif_id != 0) { - err = EINVAL; - goto done; - } - - ASSERT(IAM_WRITER_IPIF(ipif_v6)); - *ill_from_v6 = ipif_v6->ipif_ill; - } - - err = 0; - dst_index = lifr->lifr_movetoindex; - *ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE, - q, mp, ip_process_ioctl, &err, ipst); - if (err != 0) { - /* - * A move may be in progress, EINPROGRESS looking up the "to" - * ill means changes already done to the "from" ipsq need to - * be undone to avoid potential deadlocks. - * - * ENXIO will usually be because there is only v6 on the ill, - * that's not treated as an error unless an ENXIO is also - * seen when looking up the v6 "to" ill. - * - * If EINPROGRESS, the mp has been enqueued and can not be - * used to look up the v6 "to" ill, but a preemptive clean - * up of changes to the v6 "from" ipsq is done. 
- */ - if (err == EINPROGRESS) { - if (*ill_from_v4 != NULL) { - ill_t *from_ill; - ipsq_t *from_ipsq; - - from_ill = ipif_v4->ipif_ill; - from_ipsq = from_ill->ill_phyint->phyint_ipsq; - - mutex_enter(&from_ipsq->ipsq_lock); - from_ipsq->ipsq_current_ipif = NULL; - mutex_exit(&from_ipsq->ipsq_lock); - } - if (*ill_from_v6 != NULL) { - ill_t *from_ill; - ipsq_t *from_ipsq; - - from_ill = ipif_v6->ipif_ill; - from_ipsq = from_ill->ill_phyint->phyint_ipsq; - - mutex_enter(&from_ipsq->ipsq_lock); - from_ipsq->ipsq_current_ipif = NULL; - mutex_exit(&from_ipsq->ipsq_lock); - } - goto done; - } - ASSERT(err == ENXIO); - err = 0; - } - - *ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE, - q, mp, ip_process_ioctl, &err, ipst); - if (err != 0) { - /* - * A move may be in progress, EINPROGRESS looking up the "to" - * ill means changes already done to the "from" ipsq need to - * be undone to avoid potential deadlocks. - */ - if (err == EINPROGRESS) { - if (*ill_from_v6 != NULL) { - ill_t *from_ill; - ipsq_t *from_ipsq; - - from_ill = ipif_v6->ipif_ill; - from_ipsq = from_ill->ill_phyint->phyint_ipsq; - - mutex_enter(&from_ipsq->ipsq_lock); - from_ipsq->ipsq_current_ipif = NULL; - mutex_exit(&from_ipsq->ipsq_lock); - } - goto done; - } - ASSERT(err == ENXIO); - - /* Both v4 and v6 lookup failed */ - if (*ill_to_v4 == NULL) { - err = ENXIO; - goto done; - } - err = 0; - } - - /* - * If we have something to MOVE i.e "from" not NULL, - * "to" should be non-NULL. - */ - if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) || - (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) { - err = EINVAL; - } - -done: - if (ipif_v4 != NULL) - ipif_refrele(ipif_v4); - if (ipif_v6 != NULL) - ipif_refrele(ipif_v6); - return (err); + (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ); + rw_exit(&ipst->ips_ipmp_lock); + return (0); } /* - * FAILOVER and FAILBACK are modelled as MOVE operations. 
- * - * We don't check whether the MOVE is within the same group or - * not, because this ioctl can be used as a generic mechanism - * to failover from interface A to B, though things will function - * only if they are really part of the same group. Moreover, - * all ipifs may be down and hence temporarily out of the group. - * - * ipif's that need to be moved are first brought down; V4 ipifs are brought - * down first and then V6. For each we wait for the ipif's to become quiescent. - * Bringing down the ipifs ensures that all ires pointing to these ipifs's - * have been deleted and there are no active references. Once quiescent the - * ipif's are moved and brought up on the new ill. - * - * Normally the source ill and destination ill belong to the same IPMP group - * and hence the same ipsq_t. In the event they don't belong to the same - * same group the two ipsq's are first merged into one ipsq - that of the - * to_ill. The multicast memberships on the source and destination ill cannot - * change during the move operation since multicast joins/leaves also have to - * execute on the same ipsq and are hence serialized. + * Process an SIOCGLIFGROUPINFO request. */ /* ARGSUSED */ int -ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *ifreq) +ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp, + ip_ioctl_cmd_t *ipip, void *dummy) { - ill_t *ill_to_v4 = NULL; - ill_t *ill_to_v6 = NULL; - ill_t *ill_from_v4 = NULL; - ill_t *ill_from_v6 = NULL; - int err = 0; - - /* - * setup from and to ill's, we can get EINPROGRESS only for - * to_ill's. - */ - err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6, - &ill_to_v4, &ill_to_v6); - - if (err != 0) { - ip0dbg(("ip_sioctl_move: extract args failed\n")); - goto done; - } - - /* - * nothing to do. - */ - if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) { - goto done; - } - - /* - * nothing to do. 
- */ - if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) { - goto done; - } - - /* - * Mark the ill as changing. - * ILL_CHANGING flag is cleared when the ipif's are brought up - * in ill_up_ipifs in case of error they are cleared below. - */ - - GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); - if (ill_from_v4 != NULL) - ill_from_v4->ill_state_flags |= ILL_CHANGING; - if (ill_from_v6 != NULL) - ill_from_v6->ill_state_flags |= ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); - - /* - * Make sure that both src and dst are - * in the same syncq group. If not make it happen. - * We are not holding any locks because we are the writer - * on the from_ipsq and we will hold locks in ill_merge_groups - * to protect to_ipsq against changing. - */ - if (ill_from_v4 != NULL) { - if (ill_from_v4->ill_phyint->phyint_ipsq != - ill_to_v4->ill_phyint->phyint_ipsq) { - err = ill_merge_groups(ill_from_v4, ill_to_v4, - NULL, mp, q); - goto err_ret; - - } - ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock)); - } else { - - if (ill_from_v6->ill_phyint->phyint_ipsq != - ill_to_v6->ill_phyint->phyint_ipsq) { - err = ill_merge_groups(ill_from_v6, ill_to_v6, - NULL, mp, q); - goto err_ret; - - } - ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock)); - } - - /* - * Now that the ipsq's have been merged and we are the writer - * lets mark to_ill as changing as well. - */ - - GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); - if (ill_to_v4 != NULL) - ill_to_v4->ill_state_flags |= ILL_CHANGING; - if (ill_to_v6 != NULL) - ill_to_v6->ill_state_flags |= ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); - - /* - * Its ok for us to proceed with the move even if - * ill_pending_mp is non null on one of the from ill's as the reply - * should not be looking at the ipif, it should only care about the - * ill itself. - */ - - /* - * lets move ipv4 first. 
- */ - if (ill_from_v4 != NULL) { - ASSERT(IAM_WRITER_ILL(ill_to_v4)); - ill_from_v4->ill_move_in_progress = B_TRUE; - ill_to_v4->ill_move_in_progress = B_TRUE; - ill_to_v4->ill_move_peer = ill_from_v4; - ill_from_v4->ill_move_peer = ill_to_v4; - err = ill_move(ill_from_v4, ill_to_v4, q, mp); - } - - /* - * Now lets move ipv6. - */ - if (err == 0 && ill_from_v6 != NULL) { - ASSERT(IAM_WRITER_ILL(ill_to_v6)); - ill_from_v6->ill_move_in_progress = B_TRUE; - ill_to_v6->ill_move_in_progress = B_TRUE; - ill_to_v6->ill_move_peer = ill_from_v6; - ill_from_v6->ill_move_peer = ill_to_v6; - err = ill_move(ill_from_v6, ill_to_v6, q, mp); - } - -err_ret: - /* - * EINPROGRESS means we are waiting for the ipif's that need to be - * moved to become quiescent. - */ - if (err == EINPROGRESS) { - goto done; - } - - /* - * if err is set ill_up_ipifs will not be called - * lets clear the flags. - */ - - GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); - GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); - /* - * Some of the clearing may be redundant. But it is simple - * not making any extra checks. - */ - if (ill_from_v6 != NULL) { - ill_from_v6->ill_move_in_progress = B_FALSE; - ill_from_v6->ill_move_peer = NULL; - ill_from_v6->ill_state_flags &= ~ILL_CHANGING; - } - if (ill_from_v4 != NULL) { - ill_from_v4->ill_move_in_progress = B_FALSE; - ill_from_v4->ill_move_peer = NULL; - ill_from_v4->ill_state_flags &= ~ILL_CHANGING; - } - if (ill_to_v6 != NULL) { - ill_to_v6->ill_move_in_progress = B_FALSE; - ill_to_v6->ill_move_peer = NULL; - ill_to_v6->ill_state_flags &= ~ILL_CHANGING; - } - if (ill_to_v4 != NULL) { - ill_to_v4->ill_move_in_progress = B_FALSE; - ill_to_v4->ill_move_peer = NULL; - ill_to_v4->ill_state_flags &= ~ILL_CHANGING; - } - - /* - * Check for setting INACTIVE, if STANDBY is set and FAILED is not set. - * Do this always to maintain proper state i.e even in case of errors. - * As phyint_inactive looks at both v4 and v6 interfaces, - * we need not call on both v4 and v6 interfaces. 
- */ - if (ill_from_v4 != NULL) { - if ((ill_from_v4->ill_phyint->phyint_flags & - (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { - phyint_inactive(ill_from_v4->ill_phyint); - } - } else if (ill_from_v6 != NULL) { - if ((ill_from_v6->ill_phyint->phyint_flags & - (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { - phyint_inactive(ill_from_v6->ill_phyint); - } - } - - if (ill_to_v4 != NULL) { - if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) { - ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; - } - } else if (ill_to_v6 != NULL) { - if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) { - ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; - } - } - - RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); - RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); - -no_err: - /* - * lets bring the interfaces up on the to_ill. - */ - if (err == 0) { - err = ill_up_ipifs(ill_to_v4 == NULL ? ill_to_v6:ill_to_v4, - q, mp); - } - - if (err == 0) { - if (ill_from_v4 != NULL && ill_to_v4 != NULL) - ilm_send_multicast_reqs(ill_from_v4, ill_to_v4); + lifgroupinfo_t *lifgr; + ipmp_grp_t *grp; + ip_stack_t *ipst = CONNQ_TO_IPST(q); - if (ill_from_v6 != NULL && ill_to_v6 != NULL) - ilm_send_multicast_reqs(ill_from_v6, ill_to_v6); - } -done: + /* ip_wput_nondata() verified mp->b_cont->b_cont */ + lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr; + lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0'; - if (ill_to_v4 != NULL) { - ill_refrele(ill_to_v4); - } - if (ill_to_v6 != NULL) { - ill_refrele(ill_to_v6); + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) { + rw_exit(&ipst->ips_ipmp_lock); + return (ENOENT); } - - return (err); + ipmp_grp_info(grp, lifgr); + rw_exit(&ipst->ips_ipmp_lock); + return (0); } static void @@ -18167,10 +14492,9 @@ ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) * we only wait for the ACK of the DL_UNBIND_REQ. 
*/ mutex_enter(&ill->ill_lock); - if (!(ill->ill_state_flags & ILL_CONDEMNED) || - (prim == DL_UNBIND_REQ)) { + if (!(ill->ill_state_flags & ILL_CONDEMNED) || (prim == DL_UNBIND_REQ)) ill->ill_dlpi_pending = prim; - } + mutex_exit(&ill->ill_lock); putnext(ill->ill_wq, mp); } @@ -18324,6 +14648,7 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim) { mblk_t *mp; ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; + ipxop_t *ipx = ipsq->ipsq_xop; ASSERT(IAM_WRITER_IPSQ(ipsq)); mutex_enter(&ill->ill_lock); @@ -18336,12 +14661,11 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim) if ((mp = ill->ill_dlpi_deferred) == NULL) { ill->ill_dlpi_pending = DL_PRIM_INVAL; - - mutex_enter(&ipsq->ipsq_lock); - if (ipsq->ipsq_current_done) - ipsq->ipsq_current_ipif = NULL; - mutex_exit(&ipsq->ipsq_lock); - + if (ipx->ipx_current_done) { + mutex_enter(&ipx->ipx_lock); + ipx->ipx_current_ipif = NULL; + mutex_exit(&ipx->ipx_lock); + } cv_signal(&ill->ill_cv); mutex_exit(&ill->ill_lock); return; @@ -18379,7 +14703,7 @@ conn_delete_ire(conn_t *connp, caddr_t arg) } /* - * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number + * Some operations (e.g., ipif_down()) conditionally delete a number * of IREs. Those IREs may have been previously cached in the conn structure. * This ipcl_walk() walker function releases all references to such IREs based * on the condemned flag. @@ -18403,7 +14727,6 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) /* * Take down a specific interface, but don't lose any information about it. - * Also delete interface from its interface group (ifgrp). * (Always called as writer.) * This function goes through the down sequence even if the interface is * already down. There are 2 reasons. @@ -18501,7 +14824,7 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) * For eg. bind, and route operations (Eg. route add / delete) cannot return * failure if the ipif is currently undergoing an exclusive operation, and * hence pass the flag. 
The mblk is then enqueued in the ipsq and the operation - * is restarted by ipsq_exit() when the currently exclusive ioctl completes. + * is restarted by ipsq_exit() when the current exclusive operation completes. * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't * change while the ill_lock is held. Before dropping the ill_lock we acquire @@ -18522,7 +14845,6 @@ int ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) { ill_t *ill = ipif->ipif_ill; - phyint_t *phyi; conn_t *connp; boolean_t success; boolean_t ipif_was_up = B_FALSE; @@ -18569,20 +14891,7 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) } /* - * Before we delete the ill from the group (if any), we need - * to make sure that we delete all the routes dependent on - * this and also any ipifs dependent on this ipif for - * source address. We need to do before we delete from - * the group because - * - * 1) ipif_down_delete_ire de-references ill->ill_group. - * - * 2) ipif_update_other_ipifs needs to walk the whole group - * for re-doing source address selection. Note that - * ipif_select_source[_v6] called from - * ipif_update_other_ipifs[_v6] will not pick this ipif - * because we have already marked down here i.e cleared - * IPIF_UP. + * Delete all IRE's pointing at this ipif or its source address. */ if (ipif->ipif_isv6) { ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, @@ -18592,6 +14901,17 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) ipst); } + if (ipif_was_up && ill->ill_ipif_up_count == 0) { + /* + * Since the interface is now down, it may have just become + * inactive. Note that this needs to be done even for a + * lll_logical_down(), or ARP entries will not get correctly + * restored when the interface comes back up. 
+ */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + } + /* * Cleaning up the conn_ire_cache or conns must be done only after the * ires have been deleted above. Otherwise a thread could end up @@ -18609,53 +14929,9 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) * entries for such ipifs. */ if (ipif->ipif_isv6) - ipif_update_other_ipifs_v6(ipif, ill->ill_group); + ipif_update_other_ipifs_v6(ipif); else - ipif_update_other_ipifs(ipif, ill->ill_group); - - if (ipif_was_up) { - /* - * Check whether it is last ipif to leave this group. - * If this is the last ipif to leave, we should remove - * this ill from the group as ipif_select_source will not - * be able to find any useful ipifs if this ill is selected - * for load balancing. - * - * For nameless groups, we should call ifgrp_delete if this - * belongs to some group. As this ipif is going down, we may - * need to reconstruct groups. - */ - phyi = ill->ill_phyint; - /* - * If the phyint_groupname_len is 0, it may or may not - * be in the nameless group. If the phyint_groupname_len is - * not 0, then this ill should be part of some group. - * As we always insert this ill in the group if - * phyint_groupname_len is not zero when the first ipif - * comes up (in ipif_up_done), it should be in a group - * when the namelen is not 0. - * - * NOTE : When we delete the ill from the group,it will - * blow away all the IRE_CACHES pointing either at this ipif or - * ill_wq (illgrp_cache_delete does this). Thus, no IRES - * should be pointing at this ill. - */ - ASSERT(phyi->phyint_groupname_len == 0 || - (phyi->phyint_groupname != NULL && ill->ill_group != NULL)); - - if (phyi->phyint_groupname_len != 0) { - if (ill->ill_ipif_up_count == 0) - illgrp_delete(ill); - } - - /* - * If we have deleted some of the broadcast ires associated - * with this ipif, we need to re-nominate somebody else if - * the ires that we deleted were the nominated ones. 
- */ - if (ill->ill_group != NULL && !ill->ill_isv6) - ipif_renominate_bcast(ipif); - } + ipif_update_other_ipifs(ipif); /* * neighbor-discovery or arp entries for this interface. @@ -18734,17 +15010,12 @@ ipif_down_tail(ipif_t *ipif) ill->ill_logical_down = 0; /* - * Have to be after removing the routes in ipif_down_delete_ire. + * Has to be after removing the routes in ipif_down_delete_ire. */ - if (ipif->ipif_isv6) { - if (ill->ill_flags & ILLF_XRESOLV) - ipif_arp_down(ipif); - } else { - ipif_arp_down(ipif); - } + ipif_resolver_down(ipif); - ip_rts_ifmsg(ipif); - ip_rts_newaddrmsg(RTM_DELETE, 0, ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); + ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT); } /* @@ -18804,39 +15075,11 @@ static void ipif_down_delete_ire(ire_t *ire, char *ipif_arg) { ipif_t *ipif = (ipif_t *)ipif_arg; - ill_t *ire_ill; - ill_t *ipif_ill; ASSERT(IAM_WRITER_IPIF(ipif)); if (ire->ire_ipif == NULL) return; - /* - * For IPv4, we derive source addresses for an IRE from ipif's - * belonging to the same IPMP group as the IRE's outgoing - * interface. If an IRE's outgoing interface isn't in the - * same IPMP group as a particular ipif, then that ipif - * couldn't have been used as a source address for this IRE. - * - * For IPv6, source addresses are only restricted to the IPMP group - * if the IRE is for a link-local address or a multicast address. - * Otherwise, source addresses for an IRE can be chosen from - * interfaces other than the the outgoing interface for that IRE. - * - * For source address selection details, see ipif_select_source() - * and ipif_select_source_v6(). - */ - if (ire->ire_ipversion == IPV4_VERSION || - IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) || - IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { - ire_ill = ire->ire_ipif->ipif_ill; - ipif_ill = ipif->ipif_ill; - - if (ire_ill->ill_group != ipif_ill->ill_group) { - return; - } - } - if (ire->ire_ipif != ipif) { /* * Look for a matching source address. 
@@ -18875,83 +15118,53 @@ void ill_ipif_cache_delete(ire_t *ire, char *ill_arg) { ill_t *ill = (ill_t *)ill_arg; - ill_t *ipif_ill; ASSERT(IAM_WRITER_ILL(ill)); - /* - * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. - * Hence this should be IRE_CACHE. - */ ASSERT(ire->ire_type == IRE_CACHE); /* - * We are called for IRE_CACHES whose ire_ipif matches ill. - * We are only interested in IRE_CACHES that has borrowed - * the source address from ill_arg e.g. ipif_up_done[_v6] - * for which we need to look at ire_ipif->ipif_ill match - * with ill. + * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches + * ill, but we only want to delete the IRE if ire_ipif matches. */ ASSERT(ire->ire_ipif != NULL); - ipif_ill = ire->ire_ipif->ipif_ill; - if (ipif_ill == ill || (ill->ill_group != NULL && - ipif_ill->ill_group == ill->ill_group)) { + if (ill == ire->ire_ipif->ipif_ill) ire_delete(ire); - } } /* - * Delete all the ire whose stq references ill_arg. + * Delete all the IREs whose ire_stq's reference `ill_arg'. IPMP uses this + * instead of ill_ipif_cache_delete() because ire_ipif->ipif_ill references + * the IPMP ill. */ -static void +void ill_stq_cache_delete(ire_t *ire, char *ill_arg) { ill_t *ill = (ill_t *)ill_arg; - ill_t *ire_ill; ASSERT(IAM_WRITER_ILL(ill)); - /* - * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. - * Hence this should be IRE_CACHE. - */ ASSERT(ire->ire_type == IRE_CACHE); /* - * We are called for IRE_CACHES whose ire_stq and ire_ipif - * matches ill. We are only interested in IRE_CACHES that - * has ire_stq->q_ptr pointing at ill_arg. Thus we do the - * filtering here. + * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches + * ill, but we only want to delete the IRE if ire_stq matches. */ - ire_ill = (ill_t *)ire->ire_stq->q_ptr; - - if (ire_ill == ill) + if (ire->ire_stq->q_ptr == ill_arg) ire_delete(ire); } /* - * This is called when an ill leaves the group. 
We want to delete - * all IRE_CACHES whose stq is pointing at ill_wq or ire_ipif is - * pointing at ill. + * Delete all broadcast IREs with a source address on `ill_arg'. */ static void -illgrp_cache_delete(ire_t *ire, char *ill_arg) +ill_broadcast_delete(ire_t *ire, char *ill_arg) { - ill_t *ill = (ill_t *)ill_arg; + ill_t *ill = (ill_t *)ill_arg; ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ill->ill_group == NULL); - /* - * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. - * Hence this should be IRE_CACHE. - */ - ASSERT(ire->ire_type == IRE_CACHE); - /* - * We are called for IRE_CACHES whose ire_stq and ire_ipif - * matches ill. We are interested in both. - */ - ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) || - (ire->ire_ipif->ipif_ill == ill)); + ASSERT(ire->ire_type == IRE_BROADCAST); - ire_delete(ire); + if (ire->ire_ipif->ipif_ill == ill) + ire_delete(ire); } /* @@ -18997,13 +15210,12 @@ ipif_free(ipif_t *ipif) rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); /* Remove pointers to this ill in the multicast routing tables */ reset_mrt_vif_ipif(ipif); + /* If necessary, clear the cached source ipif rotor. */ + if (ipif->ipif_ill->ill_src_ipif == ipif) + ipif->ipif_ill->ill_src_ipif = NULL; rw_exit(&ipst->ips_ill_g_lock); } -/* - * Warning: this is not the only function that calls mi_free on an ipif_t. See - * also ill_move(). - */ static void ipif_free_tail(ipif_t *ipif) { @@ -19036,7 +15248,7 @@ ipif_free_tail(ipif_t *ipif) sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); /* Get it out of the ILL interface list. 
*/ - ipif_remove(ipif, B_TRUE); + ipif_remove(ipif); rw_exit(&ipst->ips_ill_g_lock); mutex_destroy(&ipif->ipif_saved_ire_lock); @@ -19208,8 +15420,10 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); ill_refrele(ill); @@ -19244,7 +15458,7 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, ire_type = IRE_LOOPBACK; else ire_type = IRE_LOCAL; - ipif = ipif_allocate(ill, id, ire_type, B_TRUE); + ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE); if (ipif != NULL) ipif_refhold_locked(ipif); else if (error != NULL) @@ -19342,65 +15556,62 @@ ill_mtu_change(ire_t *ire, char *ill_arg) void ipif_multicast_up(ipif_t *ipif) { - int err, index; + int err; ill_t *ill; ASSERT(IAM_WRITER_IPIF(ipif)); ill = ipif->ipif_ill; - index = ill->ill_phyint->phyint_ifindex; ip1dbg(("ipif_multicast_up\n")); if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) return; if (ipif->ipif_isv6) { + in6_addr_t v6allmc = ipv6_all_hosts_mcast; + in6_addr_t v6solmc = ipv6_solicited_node_mcast; + + v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; + if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) return; - /* Join the all hosts multicast address */ ip1dbg(("ipif_multicast_up - addmulti\n")); + /* - * Passing B_TRUE means we have to join the multicast - * membership on this interface even though this is - * FAILED. If we join on a different one in the group, - * we will not be able to delete the membership later - * as we currently don't track where we join when we - * join within the kernel unlike applications where - * we have ilg/ilg_orig_index. See ip_addmulti_v6 - * for more on this. 
+ * Join the all hosts multicast address. We skip this for + * underlying IPMP interfaces since they should be invisible. */ - err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index, - ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); - if (err != 0) { - ip0dbg(("ipif_multicast_up: " - "all_hosts_mcast failed %d\n", - err)); - return; + if (!IS_UNDER_IPMP(ill)) { + err = ip_addmulti_v6(&v6allmc, ill, ipif->ipif_zoneid, + ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); + if (err != 0) { + ip0dbg(("ipif_multicast_up: " + "all_hosts_mcast failed %d\n", err)); + return; + } + ipif->ipif_joined_allhosts = 1; } + /* * Enable multicast for the solicited node multicast address */ if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { - in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; - - ipv6_multi.s6_addr32[3] |= - ipif->ipif_v6lcl_addr.s6_addr32[3]; - - err = ip_addmulti_v6(&ipv6_multi, ill, index, - ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, - NULL); + err = ip_addmulti_v6(&v6solmc, ill, ipif->ipif_zoneid, + ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); if (err != 0) { ip0dbg(("ipif_multicast_up: solicited MC" " failed %d\n", err)); - (void) ip_delmulti_v6(&ipv6_all_hosts_mcast, - ill, ill->ill_phyint->phyint_ifindex, - ipif->ipif_zoneid, B_TRUE, B_TRUE); + if (ipif->ipif_joined_allhosts) { + (void) ip_delmulti_v6(&v6allmc, ill, + ipif->ipif_zoneid, B_TRUE, B_TRUE); + ipif->ipif_joined_allhosts = 0; + } return; } } } else { - if (ipif->ipif_lcl_addr == INADDR_ANY) + if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) return; /* Join the all hosts multicast address */ @@ -19420,7 +15631,7 @@ ipif_multicast_up(ipif_t *ipif) * (Explicit memberships are blown away in ill_leave_multicast() when the * ill is brought down.) */ -static void +void ipif_multicast_down(ipif_t *ipif) { int err; @@ -19444,19 +15655,18 @@ ipif_multicast_down(ipif_t *ipif) } /* - * Leave the all hosts multicast address. 
Similar to ip_addmulti_v6, - * we should look for ilms on this ill rather than the ones that have - * been failed over here. They are here temporarily. As - * ipif_multicast_up has joined on this ill, we should delete only - * from this ill. + * Leave the all-hosts multicast address. */ - err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, - ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, - B_TRUE, B_TRUE); - if (err != 0) { - ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n", - err)); + if (ipif->ipif_joined_allhosts) { + err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, + ipif->ipif_zoneid, B_TRUE, B_TRUE); + if (err != 0) { + ip0dbg(("ipif_multicast_down: all_hosts_mcast " + "failed %d\n", err)); + } + ipif->ipif_joined_allhosts = 0; } + /* * Disable multicast for the solicited node multicast address */ @@ -19467,9 +15677,7 @@ ipif_multicast_down(ipif_t *ipif) ipif->ipif_v6lcl_addr.s6_addr32[3]; err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, - ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, B_TRUE, B_TRUE); - if (err != 0) { ip0dbg(("ipif_multicast_down: sol MC failed %d\n", err)); @@ -19683,9 +15891,8 @@ ipif_set_default(ipif_t *ipif) * Return 0 if this address can be used as local address without causing * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address * is already up on a different ill, and EADDRINUSE if it's up on the same ill. - * Special checks are needed to allow the same IPv6 link-local address - * on different ills. - * TODO: allowing the same site-local address on different ill's. + * Note that the same IPv6 link-local address is allowed as long as the ills + * are not on the same link. 
*/ int ip_addr_availability_check(ipif_t *new_ipif) @@ -19717,30 +15924,26 @@ ip_addr_availability_check(ipif_t *new_ipif) ipif = ipif->ipif_next) { if ((ipif == new_ipif) || !(ipif->ipif_flags & IPIF_UP) || - (ipif->ipif_flags & IPIF_UNNUMBERED)) + (ipif->ipif_flags & IPIF_UNNUMBERED) || + !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, + &our_v6addr)) continue; - if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, - &our_v6addr)) { - if (new_ipif->ipif_flags & IPIF_POINTOPOINT) - new_ipif->ipif_flags |= IPIF_UNNUMBERED; - else if (ipif->ipif_flags & IPIF_POINTOPOINT) - ipif->ipif_flags |= IPIF_UNNUMBERED; - else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) && - new_ipif->ipif_ill != ill) - continue; - else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) && - new_ipif->ipif_ill != ill) - continue; - else if (new_ipif->ipif_zoneid != - ipif->ipif_zoneid && - ipif->ipif_zoneid != ALL_ZONES && - IS_LOOPBACK(ill)) - continue; - else if (new_ipif->ipif_ill == ill) - return (EADDRINUSE); - else - return (EADDRNOTAVAIL); - } + + if (new_ipif->ipif_flags & IPIF_POINTOPOINT) + new_ipif->ipif_flags |= IPIF_UNNUMBERED; + else if (ipif->ipif_flags & IPIF_POINTOPOINT) + ipif->ipif_flags |= IPIF_UNNUMBERED; + else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) || + IN6_IS_ADDR_SITELOCAL(&our_v6addr)) && + !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill)) + continue; + else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid && + ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill)) + continue; + else if (new_ipif->ipif_ill == ill) + return (EADDRINUSE); + else + return (EADDRNOTAVAIL); } } @@ -19753,13 +15956,15 @@ ip_addr_availability_check(ipif_t *new_ipif) * When the routine returns EINPROGRESS then mp has been consumed and * the ioctl will be acked from ip_rput_dlpi. 
*/ -static int +int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) { - ill_t *ill = ipif->ipif_ill; - boolean_t isv6 = ipif->ipif_isv6; - int err = 0; - boolean_t success; + ill_t *ill = ipif->ipif_ill; + boolean_t isv6 = ipif->ipif_isv6; + int err = 0; + boolean_t success; + uint_t ipif_orig_id; + ip_stack_t *ipst = ill->ill_ipst; ASSERT(IAM_WRITER_IPIF(ipif)); @@ -19769,6 +15974,123 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) if (ipif->ipif_flags & IPIF_UP) return (EALREADY); + /* + * If this is a request to bring up a data address on an interface + * under IPMP, then move the address to its IPMP meta-interface and + * try to bring it up. One complication is that the zeroth ipif for + * an ill is special, in that every ill always has one, and that code + * throughout IP deferences ill->ill_ipif without holding any locks. + */ + if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) && + (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) { + ipif_t *stubipif = NULL, *moveipif = NULL; + ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); + + /* + * The ipif being brought up should be quiesced. If it's not, + * something has gone amiss and we need to bail out. (If it's + * quiesced, we know it will remain so via IPIF_CHANGING.) + */ + mutex_enter(&ill->ill_lock); + if (!ipif_is_quiescent(ipif)) { + mutex_exit(&ill->ill_lock); + return (EINVAL); + } + mutex_exit(&ill->ill_lock); + + /* + * If we're going to need to allocate ipifs, do it prior + * to starting the move (and grabbing locks). + */ + if (ipif->ipif_id == 0) { + moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, + B_FALSE); + stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, + B_FALSE); + if (moveipif == NULL || stubipif == NULL) { + mi_free(moveipif); + mi_free(stubipif); + return (ENOMEM); + } + } + + /* + * Grab or transfer the ipif to move. During the move, keep + * ill_g_lock held to prevent any ill walker threads from + * seeing things in an inconsistent state. 
+ */ + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + if (ipif->ipif_id != 0) { + ipif_remove(ipif); + } else { + ipif_transfer(ipif, moveipif, stubipif); + ipif = moveipif; + } + + /* + * Place the ipif on the IPMP ill. If the zeroth ipif on + * the IPMP ill is a stub (0.0.0.0 down address) then we + * replace that one. Otherwise, pick the next available slot. + */ + ipif->ipif_ill = ipmp_ill; + ipif_orig_id = ipif->ipif_id; + + if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) { + ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL); + ipif = ipmp_ill->ill_ipif; + } else { + ipif->ipif_id = -1; + if (ipif_insert(ipif, B_FALSE) != 0) { + /* + * No more available ipif_id's -- put it back + * on the original ill and fail the operation. + * Since we're writer on the ill, we can be + * sure our old slot is still available. + */ + ipif->ipif_id = ipif_orig_id; + ipif->ipif_ill = ill; + if (ipif_orig_id == 0) { + ipif_transfer(ipif, ill->ill_ipif, + NULL); + } else { + VERIFY(ipif_insert(ipif, B_FALSE) == 0); + } + rw_exit(&ipst->ips_ill_g_lock); + return (ENOMEM); + } + } + rw_exit(&ipst->ips_ill_g_lock); + + /* + * Tell SCTP that the ipif has moved. Note that even if we + * had to allocate a new ipif, the original sequence id was + * preserved and therefore SCTP won't know. + */ + sctp_move_ipif(ipif, ill, ipmp_ill); + + /* + * If the ipif being brought up was on slot zero, then we + * first need to bring up the placeholder we stuck there. In + * ip_rput_dlpi_writer(), ip_arp_done(), or the recursive call + * to ipif_up() itself, if we successfully bring up the + * placeholder, we'll check ill_move_ipif and bring it up too. + */ + if (ipif_orig_id == 0) { + ASSERT(ill->ill_move_ipif == NULL); + ill->ill_move_ipif = ipif; + if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0) + ASSERT(ill->ill_move_ipif == NULL); + if (err != EINPROGRESS) + ill->ill_move_ipif = NULL; + return (err); + } + + /* + * Bring it up on the IPMP ill. 
+ */ + return (ipif_up(ipif, q, mp)); + } + /* Skip arp/ndp for any loopback interface. */ if (ill->ill_wq != NULL) { conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; @@ -19798,7 +16120,6 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) */ ASSERT(connp != NULL || !CONN_Q(q)); - ASSERT(ipsq->ipsq_pending_mp == NULL); if (connp != NULL) mutex_enter(&connp->conn_lock); mutex_enter(&ill->ill_lock); @@ -19810,27 +16131,25 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) return (EINTR); /* - * Crank up IPv6 neighbor discovery - * Unlike ARP, this should complete when - * ipif_ndp_up returns. However, for - * ILLF_XRESOLV interfaces we also send a - * AR_INTERFACE_UP to the external resolver. - * That ioctl will complete in ip_rput. + * Crank up the resolver. For IPv6, this cranks up the + * external resolver if one is configured, but even if an + * external resolver isn't configured, it must be called to + * reset DAD state. For IPv6, if an external resolver is not + * being used, ipif_resolver_up() will never return + * EINPROGRESS, so we can always call ipif_ndp_up() here. + * Note that if an external resolver is being used, there's no + * need to call ipif_ndp_up() since it will do nothing. */ - if (isv6) { - err = ipif_ndp_up(ipif); - if (err != 0) { - if (err != EINPROGRESS) - mp = ipsq_pending_mp_get(ipsq, &connp); - return (err); - } - } - /* Now, ARP */ err = ipif_resolver_up(ipif, Res_act_initial); if (err == EINPROGRESS) { - /* We will complete it in ip_arp_done */ + /* We will complete it in ip_arp_done() */ return (err); } + + if (isv6 && err == 0) + err = ipif_ndp_up(ipif, B_TRUE); + + ASSERT(err != EINPROGRESS); mp = ipsq_pending_mp_get(ipsq, &connp); ASSERT(mp != NULL); if (err != 0) @@ -19843,7 +16162,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); ipif->ipif_addr_ready = 1; } - return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); + + err = isv6 ? 
ipif_up_done_v6(ipif) : ipif_up_done(ipif); + if (err == 0 && ill->ill_move_ipif != NULL) { + ipif = ill->ill_move_ipif; + ill->ill_move_ipif = NULL; + return (ipif_up(ipif, q, mp)); + } + return (err); } /* @@ -19939,13 +16265,6 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) return (EINPROGRESS); bad: ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); - /* - * We don't have to check for possible removal from illgrp - * as we have not yet inserted in illgrp. For groups - * without names, this ipif is still not UP and hence - * this could not have possibly had any influence in forming - * groups. - */ freemsg(bind_mp); freemsg(unbind_mp); @@ -19974,12 +16293,10 @@ ipif_up_done(ipif_t *ipif) ipif_t *tmp_ipif; boolean_t flush_ire_cache = B_TRUE; int err = 0; - phyint_t *phyi; ire_t **ipif_saved_irep = NULL; int ipif_saved_ire_cnt; int cnt; boolean_t src_ipif_held = B_FALSE; - boolean_t ire_added = B_FALSE; boolean_t loopback = B_FALSE; ip_stack_t *ipst = ill->ill_ipst; @@ -20010,7 +16327,7 @@ ipif_up_done(ipif_t *ipif) break; } if (flush_ire_cache) - ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE, + ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); /* @@ -20044,7 +16361,9 @@ ipif_up_done(ipif_t *ipif) ipif->ipif_ire_type = IRE_LOCAL; } - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) { + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) || + ((ipif->ipif_flags & IPIF_DEPRECATED) && + !(ipif->ipif_flags & IPIF_NOFAILOVER))) { /* * Can't use our source address. 
Select a different * source address for the IRE_INTERFACE and IRE_LOCAL @@ -20189,11 +16508,9 @@ ipif_up_done(ipif_t *ipif) } /* - * Need to atomically check for ip_addr_availablity_check - * under ip_addr_avail_lock, and if it fails got bad, and remove - * from group also.The ill_g_lock is grabbed as reader - * just to make sure no new ills or new ipifs are being added - * to the system while we are checking the uniqueness of addresses. + * Need to atomically check for IP address availability under + * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new + * ills or new ipifs can be added while we are checking availability. */ rw_enter(&ipst->ips_ill_g_lock, RW_READER); mutex_enter(&ipst->ips_ip_addr_avail_lock); @@ -20227,13 +16544,6 @@ ipif_up_done(ipif_t *ipif) /* * Add in all newly created IREs. ire_create_bcast() has * already checked for duplicates of the IRE_BROADCAST type. - * We want to add before we call ifgrp_insert which wants - * to know whether IRE_IF_RESOLVER exists or not. - * - * NOTE : We refrele the ire though we may branch to "bad" - * later on where we do ire_delete. This is okay - * because nobody can delete it as we are running - * exclusively. */ for (irep1 = irep; irep1 > ire_array; ) { irep1--; @@ -20243,44 +16553,6 @@ ipif_up_done(ipif_t *ipif) */ (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); } - ire_added = B_TRUE; - /* - * Form groups if possible. - * - * If we are supposed to be in a ill_group with a name, insert it - * now as we know that at least one ipif is UP. Otherwise form - * nameless groups. - * - * If ip_enable_group_ifs is set and ipif address is not 0, insert - * this ipif into the appropriate interface group, or create a - * new one. If this is already in a nameless group, we try to form - * a bigger group looking at other ills potentially sharing this - * ipif's prefix. 
- */ - phyi = ill->ill_phyint; - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - if (ill->ill_ipif_up_count == 1) { - ASSERT(ill->ill_group == NULL); - err = illgrp_insert(&ipst->ips_illgrp_head_v4, ill, - phyi->phyint_groupname, NULL, B_TRUE); - if (err != 0) { - ip1dbg(("ipif_up_done: illgrp allocation " - "failed, error %d\n", err)); - goto bad; - } - } - ASSERT(ill->ill_group != NULL); - } - - /* - * When this is part of group, we need to make sure that - * any broadcast ires created because of this ipif coming - * UP gets marked/cleared with IRE_MARK_NORECV appropriately - * so that we don't receive duplicate broadcast packets. - */ - if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0) - ipif_renominate_bcast(ipif); /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; @@ -20331,19 +16603,30 @@ ipif_up_done(ipif_t *ipif) */ ill_recover_multicast(ill); } - /* Join the allhosts multicast address */ - ipif_multicast_up(ipif); - if (!loopback) { + if (ill->ill_ipif_up_count == 1) { + /* + * Since the interface is now up, it may now be active. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + /* - * See whether anybody else would benefit from the - * new ipif that we added. We call this always rather - * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST - * ipif is for the benefit of illgrp_insert (done above) - * which does not do source address selection as it does - * not want to re-create interface routes that we are - * having reference to it here. + * If this is an IPMP interface, we may now be able to + * establish ARP entries. */ + if (IS_IPMP(ill)) + ipmp_illgrp_refresh_arpent(ill->ill_grp); + } + + /* Join the allhosts multicast address */ + ipif_multicast_up(ipif); + + /* + * See if anybody else would benefit from our new ipif. 
+ */ + if (!loopback && + !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { ill_update_source_selection(ill); } @@ -20386,27 +16669,11 @@ ipif_up_done(ipif_t *ipif) bad: ip1dbg(("ipif_up_done: FAILED \n")); - /* - * We don't have to bother removing from ill groups because - * - * 1) For groups with names, we insert only when the first ipif - * comes up. In that case if it fails, it will not be in any - * group. So, we need not try to remove for that case. - * - * 2) For groups without names, either we tried to insert ipif_ill - * in a group as singleton or found some other group to become - * a bigger group. For the former, if it fails we don't have - * anything to do as ipif_ill is not in the group and for the - * latter, there are no failures in illgrp_insert/illgrp_delete - * (ENOMEM can't occur for this. Check ifgrp_insert). - */ + while (irep > ire_array) { irep--; - if (*irep != NULL) { + if (*irep != NULL) ire_delete(*irep); - if (ire_added) - ire_refrele(*irep); - } } (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); @@ -20417,7 +16684,7 @@ bad: if (src_ipif_held) ipif_refrele(src_ipif); - ipif_arp_down(ipif); + ipif_resolver_down(ipif); return (err); } @@ -20493,119 +16760,6 @@ ill_arp_on(ill_t *ill) } /* - * Called after either deleting ill from the group or when setting - * FAILED or STANDBY on the interface. - */ -static void -illgrp_reset_schednext(ill_t *ill) -{ - ill_group_t *illgrp; - ill_t *save_ill; - - ASSERT(IAM_WRITER_ILL(ill)); - /* - * When called from illgrp_delete, ill_group will be non-NULL. - * But when called from ip_sioctl_flags, it could be NULL if - * somebody is setting FAILED/INACTIVE on some interface which - * is not part of a group. - */ - illgrp = ill->ill_group; - if (illgrp == NULL) - return; - if (illgrp->illgrp_ill_schednext != ill) - return; - - illgrp->illgrp_ill_schednext = NULL; - save_ill = ill; - /* - * Choose a good ill to be the next one for - * outbound traffic. 
As the flags FAILED/STANDBY is - * not yet marked when called from ip_sioctl_flags, - * we check for ill separately. - */ - for (ill = illgrp->illgrp_ill; ill != NULL; - ill = ill->ill_group_next) { - if ((ill != save_ill) && - !(ill->ill_phyint->phyint_flags & - (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) { - illgrp->illgrp_ill_schednext = ill; - return; - } - } -} - -/* - * Given an ill, find the next ill in the group to be scheduled. - * (This should be called by ip_newroute() before ire_create().) - * The passed in ill may be pulled out of the group, after we have picked - * up a different outgoing ill from the same group. However ire add will - * atomically check this. - */ -ill_t * -illgrp_scheduler(ill_t *ill) -{ - ill_t *retill; - ill_group_t *illgrp; - int illcnt; - int i; - uint64_t flags; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * We don't use a lock to check for the ill_group. If this ill - * is currently being inserted we may end up just returning this - * ill itself. That is ok. - */ - if (ill->ill_group == NULL) { - ill_refhold(ill); - return (ill); - } - - /* - * Grab the ill_g_lock as reader to make sure we are dealing with - * a set of stable ills. No ill can be added or deleted or change - * group while we hold the reader lock. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if ((illgrp = ill->ill_group) == NULL) { - rw_exit(&ipst->ips_ill_g_lock); - ill_refhold(ill); - return (ill); - } - - illcnt = illgrp->illgrp_ill_count; - mutex_enter(&illgrp->illgrp_lock); - retill = illgrp->illgrp_ill_schednext; - - if (retill == NULL) - retill = illgrp->illgrp_ill; - - /* - * We do a circular search beginning at illgrp_ill_schednext - * or illgrp_ill. We don't check the flags against the ill lock - * since it can change anytime. The ire creation will be atomic - * and will fail if the ill is FAILED or OFFLINE. 
- */ - for (i = 0; i < illcnt; i++) { - flags = retill->ill_phyint->phyint_flags; - - if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && - ILL_CAN_LOOKUP(retill)) { - illgrp->illgrp_ill_schednext = retill->ill_group_next; - ill_refhold(retill); - break; - } - retill = retill->ill_group_next; - if (retill == NULL) - retill = illgrp->illgrp_ill; - } - mutex_exit(&illgrp->illgrp_lock); - rw_exit(&ipst->ips_ill_g_lock); - - return (i == illcnt ? NULL : retill); -} - -/* * Checks for availbility of a usable source address (if there is one) when the * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note * this selection is done regardless of the destination. @@ -20654,11 +16808,26 @@ ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) } /* - * Determine the best source address given a destination address and an ill. - * Prefers non-deprecated over deprecated but will return a deprecated - * address if there is no other choice. If there is a usable source address - * on the interface pointed to by ill_usesrc_ifindex then that is given - * first preference. + * IP source address type, sorted from worst to best. For a given type, + * always prefer IP addresses on the same subnet. All-zones addresses are + * suboptimal because they pose problems with unlabeled destinations. + */ +typedef enum { + IPIF_NONE, + IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ + IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ + IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ + IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ + IPIF_DIFFNET, /* normal and different subnet */ + IPIF_SAMENET /* normal and same subnet */ +} ipif_type_t; + +/* + * Pick the optimal ipif on `ill' for sending to destination `dst' from zone + * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t + * enumeration, and return the highest-rated ipif. 
If there's a tie, we pick + * the first one, unless IPMP is used in which case we round-robin among them; + * see below for more. * * Returns NULL if there is no suitable source address for the ill. * This only occurs when there is no valid source address for the ill. @@ -20666,17 +16835,13 @@ ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) ipif_t * ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) { - ipif_t *ipif; - ipif_t *ipif_dep = NULL; /* Fallback to deprecated */ - ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE]; - int index = 0; - boolean_t wrapped = B_FALSE; - boolean_t same_subnet_only = B_FALSE; - boolean_t ipif_same_found, ipif_other_found; - boolean_t specific_found; - ill_t *till, *usill = NULL; + ill_t *usill = NULL; + ill_t *ipmp_ill = NULL; + ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; + ipif_type_t type, best_type; tsol_tpc_t *src_rhtp, *dst_rhtp; - ip_stack_t *ipst = ill->ill_ipst; + ip_stack_t *ipst = ill->ill_ipst; + boolean_t samenet; if (ill->ill_usesrc_ifindex != 0) { usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, @@ -20688,6 +16853,17 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) } /* + * Test addresses should never be used for source address selection, + * so if we were passed one, switch to the IPMP meta-interface. + */ + if (IS_UNDER_IPMP(ill)) { + if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) + ill = ipmp_ill; /* Select source from IPMP ill */ + else + return (NULL); + } + + /* * If we're dealing with an unlabeled destination on a labeled system, * make sure that we ignore source addresses that are incompatible with * the destination's default label. That destination's default label @@ -20705,7 +16881,7 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) } /* - * Holds the ill_g_lock as reader. This makes sure that no ipif/ill + * Hold the ill_g_lock as reader. This makes sure that no ipif/ill * can be deleted. But an ipif/ill can get CONDEMNED any time. 
* After selecting the right ipif, under ill_lock make sure ipif is * not condemned, and increment refcnt. If ipif is CONDEMNED, @@ -20713,190 +16889,117 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) * but not under a lock. */ rw_enter(&ipst->ips_ill_g_lock, RW_READER); - retry: - till = ill; - ipif_arr[0] = NULL; + /* + * For source address selection, we treat the ipif list as circular + * and continue until we get back to where we started. This allows + * IPMP to vary source address selection (which improves inbound load + * spreading) by caching its last ending point and starting from + * there. NOTE: we don't have to worry about ill_src_ipif changing + * ills since that can't happen on the IPMP ill. + */ + start_ipif = ill->ill_ipif; + if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) + start_ipif = ill->ill_src_ipif; - if (till->ill_group != NULL) - till = till->ill_group->illgrp_ill; + ipif = start_ipif; + best_ipif = NULL; + best_type = IPIF_NONE; + do { + if ((next_ipif = ipif->ipif_next) == NULL) + next_ipif = ill->ill_ipif; - /* - * Choose one good source address from each ill across the group. - * If possible choose a source address in the same subnet as - * the destination address. - * - * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE - * This is okay because of the following. - * - * If PHYI_FAILED is set and we still have non-deprecated - * addresses, it means the addresses have not yet been - * failed over to a different interface. We potentially - * select them to create IRE_CACHES, which will be later - * flushed when the addresses move over. - * - * If PHYI_INACTIVE is set and we still have non-deprecated - * addresses, it means either the user has configured them - * or PHYI_INACTIVE has not been cleared after the addresses - * been moved over. For the former, in.mpathd does a failover - * when the interface becomes INACTIVE and hence we should - * not find them. 
Once INACTIVE is set, we don't allow them - * to create logical interfaces anymore. For the latter, a - * flush will happen when INACTIVE is cleared which will - * flush the IRE_CACHES. - * - * If PHYI_OFFLINE is set, all the addresses will be failed - * over soon. We potentially select them to create IRE_CACHEs, - * which will be later flushed when the addresses move over. - * - * NOTE : As ipif_select_source is called to borrow source address - * for an ipif that is part of a group, source address selection - * will be re-done whenever the group changes i.e either an - * insertion/deletion in the group. - * - * Fill ipif_arr[] with source addresses, using these rules: - * - * 1. At most one source address from a given ill ends up - * in ipif_arr[] -- that is, at most one of the ipif's - * associated with a given ill ends up in ipif_arr[]. - * - * 2. If there is at least one non-deprecated ipif in the - * IPMP group with a source address on the same subnet as - * our destination, then fill ipif_arr[] only with - * source addresses on the same subnet as our destination. - * Note that because of (1), only the first - * non-deprecated ipif found with a source address - * matching the destination ends up in ipif_arr[]. - * - * 3. Otherwise, fill ipif_arr[] with non-deprecated source - * addresses not in the same subnet as our destination. - * Again, because of (1), only the first off-subnet source - * address will be chosen. - * - * 4. If there are no non-deprecated ipifs, then just use - * the source address associated with the last deprecated - * one we find that happens to be on the same subnet, - * otherwise the first one not in the same subnet. 
- */ - specific_found = B_FALSE; - for (; till != NULL; till = till->ill_group_next) { - ipif_same_found = B_FALSE; - ipif_other_found = B_FALSE; - for (ipif = till->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) - continue; - /* Always skip NOLOCAL and ANYCAST interfaces */ - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) - continue; - if (!(ipif->ipif_flags & IPIF_UP) || - !ipif->ipif_addr_ready) - continue; - if (ipif->ipif_zoneid != zoneid && - ipif->ipif_zoneid != ALL_ZONES) - continue; - /* - * Interfaces with 0.0.0.0 address are allowed to be UP, - * but are not valid as source addresses. - */ - if (ipif->ipif_lcl_addr == INADDR_ANY) - continue; + if (!IPIF_CAN_LOOKUP(ipif)) + continue; + /* Always skip NOLOCAL and ANYCAST interfaces */ + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) + continue; + if (!(ipif->ipif_flags & IPIF_UP) || !ipif->ipif_addr_ready) + continue; + if (ipif->ipif_zoneid != zoneid && + ipif->ipif_zoneid != ALL_ZONES) + continue; - /* - * Check compatibility of local address for - * destination's default label if we're on a labeled - * system. Incompatible addresses can't be used at - * all. - */ - if (dst_rhtp != NULL) { - boolean_t incompat; + /* + * Interfaces with 0.0.0.0 address are allowed to be UP, but + * are not valid as source addresses. + */ + if (ipif->ipif_lcl_addr == INADDR_ANY) + continue; - src_rhtp = find_tpc(&ipif->ipif_lcl_addr, - IPV4_VERSION, B_FALSE); - if (src_rhtp == NULL) - continue; - incompat = - src_rhtp->tpc_tp.host_type != SUN_CIPSO || - src_rhtp->tpc_tp.tp_doi != - dst_rhtp->tpc_tp.tp_doi || - (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, - &src_rhtp->tpc_tp.tp_sl_range_cipso) && - !blinlset(&dst_rhtp->tpc_tp.tp_def_label, - src_rhtp->tpc_tp.tp_sl_set_cipso)); - TPC_RELE(src_rhtp); - if (incompat) - continue; - } + /* + * Check compatibility of local address for destination's + * default label if we're on a labeled system. 
Incompatible + * addresses can't be used at all. + */ + if (dst_rhtp != NULL) { + boolean_t incompat; - /* - * We prefer not to use all all-zones addresses, if we - * can avoid it, as they pose problems with unlabeled - * destinations. - */ - if (ipif->ipif_zoneid != ALL_ZONES) { - if (!specific_found && - (!same_subnet_only || - (ipif->ipif_net_mask & dst) == - ipif->ipif_subnet)) { - index = 0; - specific_found = B_TRUE; - ipif_other_found = B_FALSE; - } - } else { - if (specific_found) - continue; - } - if (ipif->ipif_flags & IPIF_DEPRECATED) { - if (ipif_dep == NULL || - (ipif->ipif_net_mask & dst) == - ipif->ipif_subnet) - ipif_dep = ipif; + src_rhtp = find_tpc(&ipif->ipif_lcl_addr, + IPV4_VERSION, B_FALSE); + if (src_rhtp == NULL) + continue; + incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO || + src_rhtp->tpc_tp.tp_doi != + dst_rhtp->tpc_tp.tp_doi || + (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, + &src_rhtp->tpc_tp.tp_sl_range_cipso) && + !blinlset(&dst_rhtp->tpc_tp.tp_def_label, + src_rhtp->tpc_tp.tp_sl_set_cipso)); + TPC_RELE(src_rhtp); + if (incompat) continue; - } - if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) { - /* found a source address in the same subnet */ - if (!same_subnet_only) { - same_subnet_only = B_TRUE; - index = 0; - } - ipif_same_found = B_TRUE; - } else { - if (same_subnet_only || ipif_other_found) - continue; - ipif_other_found = B_TRUE; - } - ipif_arr[index++] = ipif; - if (index == MAX_IPIF_SELECT_SOURCE) { - wrapped = B_TRUE; - index = 0; - } - if (ipif_same_found) - break; } - } - if (ipif_arr[0] == NULL) { - ipif = ipif_dep; - } else { - if (wrapped) - index = MAX_IPIF_SELECT_SOURCE; - ipif = ipif_arr[ipif_rand(ipst) % index]; - ASSERT(ipif != NULL); - } + samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet); - if (ipif != NULL) { + if (ipif->ipif_flags & IPIF_DEPRECATED) { + type = samenet ? IPIF_SAMENET_DEPRECATED : + IPIF_DIFFNET_DEPRECATED; + } else if (ipif->ipif_zoneid == ALL_ZONES) { + type = samenet ? 
IPIF_SAMENET_ALLZONES : + IPIF_DIFFNET_ALLZONES; + } else { + type = samenet ? IPIF_SAMENET : IPIF_DIFFNET; + } + + if (type > best_type) { + best_type = type; + best_ipif = ipif; + if (best_type == IPIF_SAMENET) + break; /* can't get better */ + } + } while ((ipif = next_ipif) != start_ipif); + + if ((ipif = best_ipif) != NULL) { mutex_enter(&ipif->ipif_ill->ill_lock); if (!IPIF_CAN_LOOKUP(ipif)) { mutex_exit(&ipif->ipif_ill->ill_lock); goto retry; } ipif_refhold_locked(ipif); + + /* + * For IPMP, update the source ipif rotor to the next ipif, + * provided we can look it up. (We must not use it if it's + * IPIF_CONDEMNED since we may have grabbed ill_g_lock after + * ipif_free() checked ill_src_ipif.) + */ + if (IS_IPMP(ill) && ipif != NULL) { + next_ipif = ipif->ipif_next; + if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif)) + ill->ill_src_ipif = next_ipif; + else + ill->ill_src_ipif = NULL; + } mutex_exit(&ipif->ipif_ill->ill_lock); } rw_exit(&ipst->ips_ill_g_lock); if (usill != NULL) ill_refrele(usill); + if (ipmp_ill != NULL) + ill_refrele(ipmp_ill); if (dst_rhtp != NULL) TPC_RELE(dst_rhtp); @@ -20929,8 +17032,7 @@ retry: * ipif_update_other_ipifs calls us. * * If old_ipif is NULL, just redo the source address selection - * if needed. This happens when illgrp_insert or ipif_up_done - * calls us. + * if needed. This happens when ipif_up_done calls us. */ static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) @@ -21064,49 +17166,31 @@ ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) /* * This old_ipif is going away. * - * Determine if any other ipif's is using our address as + * Determine if any other ipif's are using our address as * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or * IPIF_DEPRECATED). * Find the IRE_INTERFACE for such ipifs and recreate them * to use an different source address following the rules in * ipif_up_done. 
- * - * This function takes an illgrp as an argument so that illgrp_delete - * can call this to update source address even after deleting the - * old_ipif->ipif_ill from the ill group. */ static void -ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp) +ipif_update_other_ipifs(ipif_t *old_ipif) { - ipif_t *ipif; - ill_t *ill; + ipif_t *ipif; + ill_t *ill; char buf[INET6_ADDRSTRLEN]; ASSERT(IAM_WRITER_IPIF(old_ipif)); - ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif)); ill = old_ipif->ipif_ill; - ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", - ill->ill_name, - inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, - buf, sizeof (buf)))); - /* - * If this part of a group, look at all ills as ipif_select_source - * borrows source address across all the ills in the group. - */ - if (illgrp != NULL) - ill = illgrp->illgrp_ill; - - for (; ill != NULL; ill = ill->ill_group_next) { - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (ipif == old_ipif) - continue; + ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", ill->ill_name, + inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, buf, sizeof (buf)))); - ipif_recreate_interface_routes(old_ipif, ipif); - } + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if (ipif == old_ipif) + continue; + ipif_recreate_interface_routes(old_ipif, ipif); } } @@ -21117,8 +17201,7 @@ if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, { /* * ill_phyint_reinit merged the v4 and v6 into a single - * ipsq. Could also have become part of a ipmp group in the - * process, and we might not have been able to complete the + * ipsq. We might not have been able to complete the * operation in ipif_set_values, if we could not become * exclusive. If so restart it here. */ @@ -21171,6 +17254,48 @@ ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, } /* + * Refresh all IRE_BROADCAST entries associated with `ill' to ensure the + * minimum (but complete) set exist. 
This is necessary when adding or + * removing an interface to/from an IPMP group, since interfaces in an + * IPMP group use the IRE_BROADCAST entries for the IPMP group (whenever + * its test address subnets overlap with IPMP data addresses). It's also + * used to refresh the IRE_BROADCAST entries associated with the IPMP + * interface when the nominated broadcast interface changes. + */ +void +ill_refresh_bcast(ill_t *ill) +{ + ire_t *ire_array[12]; /* max ipif_create_bcast_ires() can create */ + ire_t **irep; + ipif_t *ipif; + + ASSERT(!ill->ill_isv6); + ASSERT(IAM_WRITER_ILL(ill)); + + /* + * Remove any old broadcast IREs. + */ + ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_BROADCAST, + ill_broadcast_delete, ill, ill); + + /* + * Create new ones for any ipifs that are up and broadcast-capable. + */ + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if ((ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST)) != + (IPIF_UP|IPIF_BROADCAST)) + continue; + + irep = ipif_create_bcast_ires(ipif, ire_array); + while (irep-- > ire_array) { + (void) ire_add(irep, NULL, NULL, NULL, B_FALSE); + if (*irep != NULL) + ire_refrele(*irep); + } + } +} + +/* * Create any IRE_BROADCAST entries for `ipif', and store those entries in * `irep'. Returns a pointer to the next free `irep' entry (just like * ire_check_and_create_bcast()). @@ -21433,10 +17558,33 @@ ipif_check_bcast_ires(ipif_t *test_ipif) /* * Walk through all the ipifs that will be affected by the dying IREs, - * and recreate the IREs as necessary. + * and recreate the IREs as necessary. Note that all interfaces in an + * IPMP illgrp share the same broadcast IREs, and thus the entire + * illgrp must be walked, starting with the IPMP meta-interface (so + * that broadcast IREs end up on it whenever possible). 
*/ + if (IS_UNDER_IPMP(ill)) + ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); + irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); + if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { + ipmp_illgrp_t *illg = ill->ill_grp; + + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { + for (i = 0; i < BCAST_COUNT; i++) { + if (bireinfo[i].bi_willdie && + !bireinfo[i].bi_haverep) + break; + } + if (i == BCAST_COUNT) + break; + + irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); + } + } + /* * Scan through the set of broadcast IREs and see if there are any * that we need to replace that have not yet been replaced. If so, @@ -21528,7 +17676,7 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* * If there's another ill already with the requested name, ensure - * that it's of the same type. Otherwise, ill_phyint_reinit() will + * that it's of the same type. Otherwise, ill_phyint_reinit() will * fuse together two unrelated ills, which will cause chaos. */ ipst = ill->ill_ipst; @@ -21620,8 +17768,7 @@ ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, { /* * ill_phyint_reinit merged the v4 and v6 into a single - * ipsq. Could also have become part of a ipmp group in the - * process, and we might not have been able to complete the + * ipsq. We might not have been able to complete the * slifname in ipif_set_values, if we could not become * exclusive. If so restart it here */ @@ -21665,85 +17812,6 @@ ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, return (ipif); } -typedef struct conn_change_s { - uint_t cc_old_ifindex; - uint_t cc_new_ifindex; -} conn_change_t; - -/* - * ipcl_walk function for changing interface index. 
- */ -static void -conn_change_ifindex(conn_t *connp, caddr_t arg) -{ - conn_change_t *connc; - uint_t old_ifindex; - uint_t new_ifindex; - int i; - ilg_t *ilg; - - connc = (conn_change_t *)arg; - old_ifindex = connc->cc_old_ifindex; - new_ifindex = connc->cc_new_ifindex; - - if (connp->conn_orig_bound_ifindex == old_ifindex) - connp->conn_orig_bound_ifindex = new_ifindex; - - if (connp->conn_orig_multicast_ifindex == old_ifindex) - connp->conn_orig_multicast_ifindex = new_ifindex; - - for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { - ilg = &connp->conn_ilg[i]; - if (ilg->ilg_orig_ifindex == old_ifindex) - ilg->ilg_orig_ifindex = new_ifindex; - } -} - -/* - * Walk all the ipifs and ilms on this ill and change the orig_ifindex - * to new_index if it matches the old_index. - * - * Failovers typically happen within a group of ills. But somebody - * can remove an ill from the group after a failover happened. If - * we are setting the ifindex after this, we potentially need to - * look at all the ills rather than just the ones in the group. - * We cut down the work by looking at matching ill_net_types - * and ill_types as we could not possibly grouped them together. 
- */ -static void -ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc) -{ - ill_t *ill; - ipif_t *ipif; - uint_t old_ifindex; - uint_t new_ifindex; - ilm_t *ilm; - ill_walk_context_t ctx; - ip_stack_t *ipst = ill_orig->ill_ipst; - - old_ifindex = connc->cc_old_ifindex; - new_ifindex = connc->cc_new_ifindex; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill = ILL_START_WALK_ALL(&ctx, ipst); - for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if ((ill_orig->ill_net_type != ill->ill_net_type) || - (ill_orig->ill_type != ill->ill_type)) { - continue; - } - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_orig_ifindex == old_ifindex) - ipif->ipif_orig_ifindex = new_ifindex; - } - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (ilm->ilm_orig_ifindex == old_ifindex) - ilm->ilm_orig_ifindex = new_ifindex; - } - } - rw_exit(&ipst->ips_ill_g_lock); -} - /* * We first need to ensure that the new index is unique, and * then carry the change across both v4 and v6 ill representation @@ -21755,13 +17823,10 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) { ill_t *ill; - ill_t *ill_other; phyint_t *phyi; - int old_index; - conn_change_t connc; struct ifreq *ifr = (struct ifreq *)ifreq; struct lifreq *lifr = (struct lifreq *)ifreq; - uint_t index; + uint_t old_index, index; ill_t *ill_v4; ill_t *ill_v6; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; @@ -21773,31 +17838,15 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* * Only allow on physical interface. Also, index zero is illegal. - * - * Need to check for PHYI_FAILED and PHYI_INACTIVE - * - * 1) If PHYI_FAILED is set, a failover could have happened which - * implies a possible failback might have to happen. As failback - * depends on the old index, we should fail setting the index. 
- * - * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that - * any addresses or multicast memberships are failed over to - * a non-STANDBY interface. As failback depends on the old - * index, we should fail setting the index for this case also. - * - * 3) If PHYI_OFFLINE is set, a possible failover has happened. - * Be consistent with PHYI_FAILED and fail the ioctl. */ ill = ipif->ipif_ill; phyi = ill->ill_phyint; - if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) || - ipif->ipif_id != 0 || index == 0) { + if (ipif->ipif_id != 0 || index == 0) { return (EINVAL); } - old_index = phyi->phyint_ifindex; /* If the index is not changing, no work to do */ - if (old_index == index) + if (phyi->phyint_ifindex == index) return (0); /* @@ -21816,31 +17865,17 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (EBUSY); } - /* - * The new index is unused. Set it in the phyint. - * Locate the other ill so that we can send a routing - * sockets message. - */ - if (ill->ill_isv6) { - ill_other = phyi->phyint_illv4; - } else { - ill_other = phyi->phyint_illv6; - } - + /* The new index is unused. Set it in the phyint. */ + old_index = phyi->phyint_ifindex; phyi->phyint_ifindex = index; /* Update SCTP's ILL list */ sctp_ill_reindex(ill, old_index); - connc.cc_old_ifindex = old_index; - connc.cc_new_ifindex = index; - ip_change_ifindex(ill, &connc); - ipcl_walk(conn_change_ifindex, (caddr_t)&connc, ipst); - /* Send the routing sockets message */ - ip_rts_ifmsg(ipif); - if (ill_other != NULL) - ip_rts_ifmsg(ill_other->ill_ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); + if (ILL_OTHER(ill)) + ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); return (0); } @@ -22038,6 +18073,45 @@ ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, B_TRUE)); } +/* + * Return the number of addresses on `ill' with one or more of the values + * in `set' set and all of the values in `clear' clear. 
+ */ +static uint_t +ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear) +{ + ipif_t *ipif; + uint_t cnt = 0; + + ASSERT(IAM_WRITER_ILL(ill)); + + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) + if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear)) + cnt++; + + return (cnt); +} + +/* + * Return the number of migratable addresses on `ill' that are under + * application control. + */ +uint_t +ill_appaddr_cnt(const ill_t *ill) +{ + return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF, + IPIF_NOFAILOVER)); +} + +/* + * Return the number of point-to-point addresses on `ill'. + */ +uint_t +ill_ptpaddr_cnt(const ill_t *ill) +{ + return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0)); +} + /* ARGSUSED */ int ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, @@ -22158,7 +18232,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; int err = 0, ret; uint_t ifindex; - phyint_t *us_phyint, *us_cli_phyint; ipsq_t *ipsq = NULL; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; @@ -22167,19 +18240,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ASSERT(CONN_Q(q)); isv6 = (Q_TO_CONN(q))->conn_af_isv6; - us_cli_phyint = usesrc_cli_ill->ill_phyint; - - ASSERT(us_cli_phyint != NULL); - - /* - * If the client ILL is being used for IPMP, abort. - * Note, this can be done before ipsq_try_enter since we are already - * exclusive on this ILL - */ - if ((us_cli_phyint->phyint_groupname != NULL) || - (us_cli_phyint->phyint_flags & PHYI_STANDBY)) { - return (EINVAL); - } ifindex = lifr->lifr_index; if (ifindex == 0) { @@ -22198,15 +18258,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (err); } - /* - * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP - * group nor can either of the interfaces be used for standy. 
So - * to guarantee mutual exclusion with ip_sioctl_flags (which sets - * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname) - * we need to be exclusive on the ipsq belonging to the usesrc_ill. - * We are already exlusive on this ipsq i.e ipsq corresponding to - * the usesrc_cli_ill - */ ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, NEW_OP, B_TRUE); if (ipsq == NULL) { @@ -22215,11 +18266,19 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, goto done; } - /* Check if the usesrc_ill is used for IPMP */ - us_phyint = usesrc_ill->ill_phyint; - if ((us_phyint->phyint_groupname != NULL) || - (us_phyint->phyint_flags & PHYI_STANDBY)) { - err = EINVAL; + /* USESRC isn't currently supported with IPMP */ + if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { + err = ENOTSUP; + goto done; + } + + /* + * USESRC isn't compatible with the STANDBY flag. (STANDBY is only + * used by IPMP underlying interfaces, but someone might think it's + * more general and try to use it independently with VNI.) + */ + if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { + err = ENOTSUP; goto done; } @@ -22372,79 +18431,45 @@ ill_phyint_compare_name(const void *name_ptr, const void *phyip) return (-1); return (0); } + /* - * This function is called from ill_delete when the ill is being - * unplumbed. We remove the reference from the phyint and we also - * free the phyint when there are no more references to it. + * This function is called on the unplumb path via ill_glist_delete() when + * there are no ills left on the phyint and thus the phyint can be freed. 
*/ static void -ill_phyint_free(ill_t *ill) +phyint_free(phyint_t *phyi) { - phyint_t *phyi; - phyint_t *next_phyint; - ipsq_t *cur_ipsq; - ip_stack_t *ipst = ill->ill_ipst; + ip_stack_t *ipst = PHYINT_TO_IPST(phyi); - ASSERT(ill->ill_phyint != NULL); + ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - phyi = ill->ill_phyint; - ill->ill_phyint = NULL; /* - * ill_init allocates a phyint always to store the copy - * of flags relevant to phyint. At that point in time, we could - * not assign the name and hence phyint_illv4/v6 could not be - * initialized. Later in ipif_set_values, we assign the name to - * the ill, at which point in time we assign phyint_illv4/v6. - * Thus we don't rely on phyint_illv6 to be initialized always. + * If this phyint was an IPMP meta-interface, blow away the group. + * This is safe to do because all of the illgrps have already been + * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us. + * If we're cleaning up as a result of failed initialization, + * phyint_grp may be NULL. */ - if (ill->ill_flags & ILLF_IPV6) { - phyi->phyint_illv6 = NULL; - } else { - phyi->phyint_illv4 = NULL; + if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + ipmp_grp_destroy(phyi->phyint_grp); + phyi->phyint_grp = NULL; + rw_exit(&ipst->ips_ipmp_lock); } - /* - * ipif_down removes it from the group when the last ipif goes - * down. - */ - ASSERT(ill->ill_group == NULL); - - if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) - return; /* - * Make sure this phyint was put in the list. + * If this interface was under IPMP, take it out of the group. 
*/ - if (phyi->phyint_ifindex > 0) { - avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi); - avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, - phyi); - } + if (phyi->phyint_grp != NULL) + ipmp_phyint_leave_grp(phyi); + /* - * remove phyint from the ipsq list. + * Delete the phyint and disassociate its ipsq. The ipsq itself + * will be freed in ipsq_exit(). */ - cur_ipsq = phyi->phyint_ipsq; - if (phyi == cur_ipsq->ipsq_phyint_list) { - cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next; - } else { - next_phyint = cur_ipsq->ipsq_phyint_list; - while (next_phyint != NULL) { - if (next_phyint->phyint_ipsq_next == phyi) { - next_phyint->phyint_ipsq_next = - phyi->phyint_ipsq_next; - break; - } - next_phyint = next_phyint->phyint_ipsq_next; - } - ASSERT(next_phyint != NULL); - } - IPSQ_DEC_REF(cur_ipsq, ipst); + phyi->phyint_ipsq->ipsq_phyint = NULL; + phyi->phyint_name[0] = '\0'; - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - mi_free(phyi->phyint_groupname); - } mi_free(phyi); } @@ -22464,7 +18489,6 @@ ill_phyint_reinit(ill_t *ill) phyint_t *phyi; avl_index_t where = 0; ill_t *ill_other = NULL; - ipsq_t *ipsq; ip_stack_t *ipst = ill->ill_ipst; ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); @@ -22476,6 +18500,11 @@ ill_phyint_reinit(ill_t *ill) phyi_old->phyint_illv4 == NULL)); ASSERT(phyi_old->phyint_ifindex == 0); + /* + * Now that our ill has a name, set it in the phyint. + */ + (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ); + phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, ill->ill_name, &where); @@ -22497,8 +18526,7 @@ ill_phyint_reinit(ill_t *ill) * we are initializing IPv4. */ if (phyi != NULL) { - ill_other = (isv6) ? phyi->phyint_illv4 : - phyi->phyint_illv6; + ill_other = (isv6) ? 
phyi->phyint_illv4 : phyi->phyint_illv6; ASSERT(ill_other->ill_phyint != NULL); ASSERT((isv6 && !ill_other->ill_isv6) || (!isv6 && ill_other->ill_isv6)); @@ -22517,26 +18545,15 @@ ill_phyint_reinit(ill_t *ill) ASSERT(phyi->phyint_illv4 == NULL); phyi->phyint_illv4 = ill; } - /* - * This is a new ill, currently undergoing SLIFNAME - * So we could not have joined an IPMP group until now. - */ - ASSERT(phyi_old->phyint_ipsq_next == NULL && - phyi_old->phyint_groupname == NULL); /* - * This phyi_old is going away. Decref ipsq_refs and - * assert it is zero. The ipsq itself will be freed in - * ipsq_exit + * Delete the old phyint and make its ipsq eligible + * to be freed in ipsq_exit(). */ - ipsq = phyi_old->phyint_ipsq; - IPSQ_DEC_REF(ipsq, ipst); - ASSERT(ipsq->ipsq_refs == 0); - /* Get the singleton phyint out of the ipsq list */ - ASSERT(phyi_old->phyint_ipsq_next == NULL); - ipsq->ipsq_phyint_list = NULL; phyi_old->phyint_illv4 = NULL; phyi_old->phyint_illv6 = NULL; + phyi_old->phyint_ipsq->ipsq_phyint = NULL; + phyi_old->phyint_name[0] = '\0'; mi_free(phyi_old); } else { mutex_enter(&ill->ill_lock); @@ -22551,9 +18568,6 @@ ill_phyint_reinit(ill_t *ill) if (!phyint_assign_ifindex(phyi, ipst)) cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); - /* No IPMP group yet, thus the hook uses the ifindex */ - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, (void *)phyi, where); @@ -22571,13 +18585,6 @@ ill_phyint_reinit(ill_t *ill) ill->ill_phyint = phyi; /* - * Keep the index on ipif_orig_index to be used by FAILOVER. - * We do this here as when the first ipif was allocated, - * ipif_allocate does not know the right interface index. 
- */ - - ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex; - /* * Now that the phyint's ifindex has been assigned, complete the * remaining */ @@ -22606,45 +18613,14 @@ ill_phyint_reinit(ill_t *ill) */ if (ill->ill_name_length <= 2 || ill->ill_name[0] != 'l' || ill->ill_name[1] != 'o') { - /* - * Generate nic plumb event for ill_name even if - * ipmp_hook_emulation is set. That avoids generating events - * for the ill_names should ipmp_hook_emulation be turned on - * later. - */ - ill_nic_event_plumb(ill, B_FALSE); + ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name, + ill->ill_name_length); } RELEASE_ILL_LOCKS(ill, ill_other); mutex_exit(&phyi->phyint_lock); } /* - * Allocate a NE_PLUMB nic info event and store in the ill. - * If 'group' is set we do it for the group name, otherwise the ill name. - * It will be sent when we leave the ipsq. - */ -void -ill_nic_event_plumb(ill_t *ill, boolean_t group) -{ - phyint_t *phyi = ill->ill_phyint; - char *name; - int namelen; - - ASSERT(MUTEX_HELD(&ill->ill_lock)); - - if (group) { - ASSERT(phyi->phyint_groupname_len != 0); - namelen = phyi->phyint_groupname_len; - name = phyi->phyint_groupname; - } else { - namelen = ill->ill_name_length; - name = ill->ill_name; - } - - ill_nic_event_dispatch(ill, 0, NE_PLUMB, name, namelen); -} - -/* * Notify any downstream modules of the name of this interface. * An M_IOCTL is used even though we don't expect a successful reply. * Any reply message from the driver (presumably an M_IOCNAK) will @@ -22686,8 +18662,9 @@ ip_ifname_notify(ill_t *ill, queue_t *q) static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) { - int err; + int err; ip_stack_t *ipst = ill->ill_ipst; + phyint_t *phyi = ill->ill_phyint; /* Set the obsolete NDD per-interface forwarding name. 
*/ err = ill_set_ndd_name(ill); @@ -22696,6 +18673,34 @@ ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) err); } + /* + * Now that ill_name is set, the configuration for the IPMP + * meta-interface can be performed. + */ + if (IS_IPMP(ill)) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + /* + * If phyi->phyint_grp is NULL, then this is the first IPMP + * meta-interface and we need to create the IPMP group. + */ + if (phyi->phyint_grp == NULL) { + /* + * If someone has renamed another IPMP group to have + * the same name as our interface, bail. + */ + if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) { + rw_exit(&ipst->ips_ipmp_lock); + return (EEXIST); + } + phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi); + if (phyi->phyint_grp == NULL) { + rw_exit(&ipst->ips_ipmp_lock); + return (ENOMEM); + } + } + rw_exit(&ipst->ips_ipmp_lock); + } + /* Tell downstream modules where they are. */ ip_ifname_notify(ill, q); @@ -22966,10 +18971,10 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) /* * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. */ - if (ipsq->ipsq_current_ipif == NULL) + if (ipsq->ipsq_xop->ipx_current_ipif == NULL) ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); else - ASSERT(ipsq->ipsq_current_ipif == ipif); + ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif); error = ipif_set_values_tail(ill, ipif, mp, q); ipsq_exit(ipsq); @@ -22986,18 +18991,8 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) void ipif_init(ip_stack_t *ipst) { - hrtime_t hrt; int i; - /* - * Can't call drv_getparm here as it is too early in the boot. - * As we use ipif_src_random just for picking a different - * source address everytime, this need not be really random. 
- */ - hrt = gethrtime(); - ipst->ips_ipif_src_random = - ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff); - for (i = 0; i < MAX_G_HEADS; i++) { ipst->ips_ill_g_heads[i].ill_g_list_head = (ill_if_t *)&ipst->ips_ill_g_heads[i]; @@ -23023,7 +19018,11 @@ ipif_init(ip_stack_t *ipst) * match is found to take care of such rare network configurations like - * le0: 129.146.1.1/16 * le1: 129.146.2.2/24 - * It is used only by SO_DONTROUTE at the moment. + * + * This is used by SO_DONTROUTE and IP_NEXTHOP. Since neither of those are + * supported on underlying interfaces in an IPMP group, underlying interfaces + * are ignored when looking up a match. (If we didn't ignore them, we'd + * risk using a test address as a source for outgoing traffic.) */ ipif_t * ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) @@ -23038,6 +19037,8 @@ ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill)) + continue; mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -23660,30 +19661,76 @@ ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa, * Knows about IEEE 802 and IEEE EUI-64 mappings. 
*/ static boolean_t -ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) +ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr) { char *addr; - if (phys_length != ETHERADDRL) + if (ill->ill_phys_addr_length != ETHERADDRL) return (B_FALSE); /* Form EUI-64 like address */ addr = (char *)&v6addr->s6_addr32[2]; - bcopy((char *)phys_addr, addr, 3); + bcopy(ill->ill_phys_addr, addr, 3); addr[0] ^= 0x2; /* Toggle Universal/Local bit */ addr[3] = (char)0xff; addr[4] = (char)0xfe; - bcopy((char *)phys_addr + 3, addr + 5, 3); + bcopy(ill->ill_phys_addr + 3, addr + 5, 3); return (B_TRUE); } /* ARGSUSED */ static boolean_t -ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) +ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr) { return (B_FALSE); } +typedef struct ipmp_ifcookie { + uint32_t ic_hostid; + char ic_ifname[LIFNAMSIZ]; + char ic_zonename[ZONENAME_MAX]; +} ipmp_ifcookie_t; + +/* + * Construct a pseudo-random interface ID for the IPMP interface that's both + * predictable and (almost) guaranteed to be unique. + */ +static boolean_t +ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) +{ + zone_t *zp; + uint8_t *addr; + uchar_t hash[16]; + ulong_t hostid; + MD5_CTX ctx; + ipmp_ifcookie_t ic = { 0 }; + + ASSERT(IS_IPMP(ill)); + + (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); + ic.ic_hostid = htonl((uint32_t)hostid); + + (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ); + + if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) { + (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX); + zone_rele(zp); + } + + MD5Init(&ctx); + MD5Update(&ctx, &ic, sizeof (ic)); + MD5Final(hash, &ctx); + + /* + * Map the hash to an interface ID per the basic approach in RFC3041. 
+ */ + addr = &v6addr->s6_addr8[8]; + bcopy(hash + 8, addr, sizeof (uint64_t)); + addr[0] &= ~0x2; /* set local bit */ + + return (B_TRUE); +} + /* ARGSUSED */ static boolean_t ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, @@ -23739,14 +19786,14 @@ ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, * Derive IPoIB interface id from the link layer address. */ static boolean_t -ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) +ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) { char *addr; - if (phys_length != 20) + if (ill->ill_phys_addr_length != 20) return (B_FALSE); addr = (char *)&v6addr->s6_addr32[2]; - bcopy(phys_addr + 12, addr, 8); + bcopy(ill->ill_phys_addr + 12, addr, 8); /* * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit * in the globally assigned EUI-64 GUID to 1, in violation of IEEE @@ -23863,6 +19910,7 @@ ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) *ipifp = NULL; return (B_FALSE); } + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (!IPIF_CAN_LOOKUP(ipif)) continue; @@ -23897,71 +19945,9 @@ ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) } /* - * Same as ipif_lookup_zoneid() but looks at all the ills in the same group. - */ -boolean_t -ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) -{ - ill_t *illg; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * We look at the passed-in ill first without grabbing ill_g_lock. - */ - if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) { - return (B_TRUE); - } - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if (ill->ill_group == NULL) { - /* ill not in a group */ - rw_exit(&ipst->ips_ill_g_lock); - return (B_FALSE); - } - - /* - * There's no ipif in the zone on ill, however ill is part of an IPMP - * group. We need to look for an ipif in the zone on all the ills in the - * group. 
- */ - illg = ill->ill_group->illgrp_ill; - do { - /* - * We don't call ipif_lookup_zoneid() on ill as we already know - * that it's not there. - */ - if (illg != ill && - ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) { - break; - } - } while ((illg = illg->ill_group_next) != NULL); - rw_exit(&ipst->ips_ill_g_lock); - return (illg != NULL); -} - -/* - * Check if this ill is only being used to send ICMP probes for IPMP - */ -boolean_t -ill_is_probeonly(ill_t *ill) -{ - /* - * Check if the interface is FAILED, or INACTIVE - */ - if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE)) - return (B_TRUE); - - return (B_FALSE); -} - -/* * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id) * If a pointer to an ipif_t is returned then the caller will need to do * an ill_refrele(). - * - * If there is no real interface which matches the ifindex, then it looks - * for a group that has a matching index. In the case of a group match the - * lifidx must be zero. We don't need emulate the logical interfaces - * since IP Filter's use of netinfo doesn't use that. */ ipif_t * ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, @@ -23972,18 +19958,8 @@ ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, ipst); - - if (ill == NULL) { - /* Fallback to group names only if hook_emulation set */ - if (!ipst->ips_ipmp_hook_emulation) - return (NULL); - - if (lifidx != 0) - return (NULL); - ill = ill_group_lookup_on_ifindex(ifindex, isv6, ipst); - if (ill == NULL) - return (NULL); - } + if (ill == NULL) + return (NULL); mutex_enter(&ill->ill_lock); if (ill->ill_state_flags & ILL_CONDEMNED) { @@ -24059,7 +20035,7 @@ ill_set_phys_addr(ill_t *ill, mblk_t *mp) * If we can quiesce the ill, then set the address. If not, then * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 
*/ - ill_down_ipifs(ill, NULL, 0, B_FALSE); + ill_down_ipifs(ill); mutex_enter(&ill->ill_lock); if (!ill_is_quiescent(ill)) { /* call cannot fail since `conn_t *' argument is NULL */ @@ -24283,10 +20259,7 @@ ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event, if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL) goto fail; - if (event == NE_UNPLUMB) - info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex; - else - info->hnei_event.hne_nic = ill->ill_phyint->phyint_hook_ifindex; + info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex; info->hnei_event.hne_lif = lif; info->hnei_event.hne_event = event; info->hnei_event.hne_protocol = ill->ill_isv6 ? @@ -24323,8 +20296,8 @@ fail: void ipif_up_notify(ipif_t *ipif) { - ip_rts_ifmsg(ipif); - ip_rts_newaddrmsg(RTM_ADD, 0, ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); + ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT); sctp_update_ipif(ipif, SCTP_IPIF_UP); ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id), NE_LIF_UP, NULL, 0); |