summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/inet/ip/ipmp.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/inet/ip/ipmp.c')
-rw-r--r--usr/src/uts/common/inet/ip/ipmp.c2201
1 files changed, 2201 insertions, 0 deletions
diff --git a/usr/src/uts/common/inet/ip/ipmp.c b/usr/src/uts/common/inet/ip/ipmp.c
new file mode 100644
index 0000000000..b8f3768834
--- /dev/null
+++ b/usr/src/uts/common/inet/ip/ipmp.c
@@ -0,0 +1,2201 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <inet/arp.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_multi.h>
+#include <inet/ip_rts.h>
+#include <inet/mi.h>
+#include <net/if_types.h>
+#include <sys/dlpi.h>
+#include <sys/kmem.h>
+#include <sys/modhash.h>
+#include <sys/sdt.h>
+#include <sys/strsun.h>
+#include <sys/sunddi.h>
+#include <sys/types.h>
+
+/*
+ * Convenience macros for getting the ip_stack_t associated with an
+ * ipmp_illgrp_t or ipmp_grp_t.
+ *
+ * IPMP_GRP_TO_IPST() goes through the group's cached phyint (gr_phyint);
+ * IPMP_ILLGRP_TO_IPST() goes through the illgrp's IPMP meta-interface ill
+ * (ig_ipmp_ill).  Both dereference those pointers unconditionally, so they
+ * must be set on the argument.
+ */
+#define IPMP_GRP_TO_IPST(grp) PHYINT_TO_IPST((grp)->gr_phyint)
+#define IPMP_ILLGRP_TO_IPST(illg) ((illg)->ig_ipmp_ill->ill_ipst)
+
+/*
+ * Assorted constants that aren't important enough to be tunable.
+ *
+ * IPMP_GRP_HASH_SIZE is the size handed to mod_hash_create_extended() when
+ * ipmp_init() creates the per-stack group hash.
+ */
+#define IPMP_GRP_HASH_SIZE 64
+#define IPMP_ILL_REFRESH_TIMEOUT 120 /* seconds */
+
+/*
+ * Templates for IPMP ARP messages.  The two templates differ only in the
+ * command they carry (AR_IPMP_ACTIVATE vs. AR_IPMP_DEACTIVATE).
+ */
+static const arie_t ipmp_aract_template = {
+ AR_IPMP_ACTIVATE,
+ sizeof (arie_t), /* Name offset */
+ sizeof (arie_t) /* Name length (set by ill_arp_alloc) */
+};
+
+static const arie_t ipmp_ardeact_template = {
+ AR_IPMP_DEACTIVATE,
+ sizeof (arie_t), /* Name offset */
+ sizeof (arie_t) /* Name length (set by ill_arp_alloc) */
+};
+
+/*
+ * IPMP meta-interface kstats (based on those in PSARC/1997/198).
+ *
+ * This table is copied wholesale into each group's kstat data area by
+ * ipmp_grp_create_kstats().  ipmp_grp_update_kstats() walks it by index and
+ * uses each entry's data type to decide whether to fill in the 32-bit or
+ * 64-bit value.
+ */
+static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = {
+ { "obytes", KSTAT_DATA_UINT32 },
+ { "obytes64", KSTAT_DATA_UINT64 },
+ { "rbytes", KSTAT_DATA_UINT32 },
+ { "rbytes64", KSTAT_DATA_UINT64 },
+ { "opackets", KSTAT_DATA_UINT32 },
+ { "opackets64", KSTAT_DATA_UINT64 },
+ { "oerrors", KSTAT_DATA_UINT32 },
+ { "ipackets", KSTAT_DATA_UINT32 },
+ { "ipackets64", KSTAT_DATA_UINT64 },
+ { "ierrors", KSTAT_DATA_UINT32 },
+ { "multircv", KSTAT_DATA_UINT32 },
+ { "multixmt", KSTAT_DATA_UINT32 },
+ { "brdcstrcv", KSTAT_DATA_UINT32 },
+ { "brdcstxmt", KSTAT_DATA_UINT32 },
+ { "link_up", KSTAT_DATA_UINT32 }
+};
+
+/*
+ * Forward declarations for file-local (static) functions.
+ */
+static void ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t);
+static int ipmp_grp_create_kstats(ipmp_grp_t *);
+static int ipmp_grp_update_kstats(kstat_t *, int);
+static void ipmp_grp_destroy_kstats(ipmp_grp_t *);
+static ill_t *ipmp_illgrp_min_ill(ipmp_illgrp_t *);
+static ill_t *ipmp_illgrp_max_ill(ipmp_illgrp_t *);
+static void ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
+static void ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t);
+static boolean_t ipmp_ill_activate(ill_t *);
+static void ipmp_ill_deactivate(ill_t *);
+static void ipmp_ill_ire_mark_testhidden(ire_t *, char *);
+static void ipmp_ill_ire_clear_testhidden(ire_t *, char *);
+static void ipmp_ill_refresh_active_timer_start(ill_t *);
+static void ipmp_ill_rtsaddrmsg(ill_t *, int);
+static void ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action);
+static ipif_t *ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t);
+static void ipmp_phyint_get_kstats(phyint_t *, uint64_t *);
+static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *);
+
+/*
+ * Set up the per-stack IPMP state hanging off `ipst': the string-keyed group
+ * hash and the reader/writer lock used alongside it.  Invoked from
+ * ip_stack_init().
+ */
+void
+ipmp_init(ip_stack_t *ipst)
+{
+	mod_hash_t *grphash;
+
+	/* Keys and values are owned elsewhere, hence the null destructors. */
+	grphash = mod_hash_create_extended("ipmp_grp_hash", IPMP_GRP_HASH_SIZE,
+	    mod_hash_null_keydtor, mod_hash_null_valdtor, mod_hash_bystr,
+	    NULL, mod_hash_strkey_cmp, KM_SLEEP);
+	ipst->ips_ipmp_grp_hash = grphash;
+
+	rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0);
+}
+
+/*
+ * Tear down the per-stack IPMP state created by ipmp_init(): the group hash
+ * first, then the lock.  Invoked from ip_stack_fini().
+ */
+void
+ipmp_destroy(ip_stack_t *ipst)
+{
+	mod_hash_t *grphash = ipst->ips_ipmp_grp_hash;
+
+	mod_hash_destroy_hash(grphash);
+	rw_destroy(&ipst->ips_ipmp_lock);
+}
+
+/*
+ * Allocate a new IPMP group called `grname', tie it to IPMP phyint `phyi',
+ * create its kstats, and enter it into the per-stack group hash.  Returns
+ * the new group, or NULL if any of the (non-sleeping) allocations fail.
+ *
+ * The caller holds ips_ipmp_lock as writer and guarantees that `grname' is
+ * not already present in the hash.  The group's interface name starts out
+ * identical to the group name; ipmp_grp_rename() may later change only the
+ * group name.
+ */
+ipmp_grp_t *
+ipmp_grp_create(const char *grname, phyint_t *phyi)
+{
+	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
+	ipmp_grp_t *newgrp;
+	mod_hash_hndl_t hndl;
+
+	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+	newgrp = kmem_zalloc(sizeof (*newgrp), KM_NOSLEEP);
+	if (newgrp == NULL)
+		return (NULL);
+
+	(void) strlcpy(newgrp->gr_name, grname, sizeof (newgrp->gr_name));
+	(void) strlcpy(newgrp->gr_ifname, grname, sizeof (newgrp->gr_ifname));
+
+	/*
+	 * Remember the phyint; this is safe because the phyint_t always
+	 * outlives the ipmp_grp_t that points at it.
+	 */
+	newgrp->gr_phyint = phyi;
+
+	if (ipmp_grp_create_kstats(newgrp) != 0)
+		goto free_grp;
+
+	/*
+	 * Reserve the hash slot up front so that the insert itself cannot
+	 * fail.
+	 */
+	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &hndl) != 0)
+		goto free_kstats;
+
+	ipmp_grp_insert(newgrp, hndl);
+	return (newgrp);
+
+free_kstats:
+	ipmp_grp_destroy_kstats(newgrp);
+free_grp:
+	kmem_free(newgrp, sizeof (*newgrp));
+	return (NULL);
+}
+
+/*
+ * Create and install the named kstat for `grp', seeded from the ipmp_kstats
+ * template and refreshed on demand by ipmp_grp_update_kstats().  Returns 0
+ * on success or ENOMEM if the kstat could not be created.
+ */
+static int
+ipmp_grp_create_kstats(ipmp_grp_t *grp)
+{
+	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+	netstackid_t stackid = ipst->ips_netstack->netstack_stackid;
+	kstat_t *kstatp;
+
+	kstatp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net",
+	    KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, stackid);
+	if (kstatp == NULL)
+		return (ENOMEM);
+
+	/* Finish populating the kstat before it is installed. */
+	bcopy(ipmp_kstats, kstatp->ks_data, sizeof (ipmp_kstats));
+	kstatp->ks_private = grp;
+	kstatp->ks_update = ipmp_grp_update_kstats;
+
+	kstat_install(kstatp);
+	grp->gr_ksp = kstatp;
+	return (0);
+}
+
+/*
+ * Update the IPMP kstats tracked by `ksp'; called by the kstats framework.
+ *
+ * `ksp' is the kstat installed by ipmp_grp_create_kstats(); its ks_private
+ * field points at the owning ipmp_grp_t.  `rw' is the access mode requested
+ * by the framework.  Returns EACCES for KSTAT_WRITE requests (the stats are
+ * read-only) and 0 otherwise.
+ */
+static int
+ipmp_grp_update_kstats(kstat_t *ksp, int rw)
+{
+ uint_t i;
+ kstat_named_t *kn = KSTAT_NAMED_PTR(ksp);
+ ipmp_grp_t *grp = ksp->ks_private;
+ ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+ ipsq_t *ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq;
+ phyint_t *phyi;
+ uint64_t phyi_kstats[IPMP_KSTAT_MAX];
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ /*
+ * Start with the group's baseline values.
+ */
+ for (i = 0; i < IPMP_KSTAT_MAX; i++) {
+ if (kn[i].data_type == KSTAT_DATA_UINT32) {
+ kn[i].value.ui32 = grp->gr_kstats0[i];
+ } else {
+ ASSERT(kn[i].data_type == KSTAT_DATA_UINT64);
+ kn[i].value.ui64 = grp->gr_kstats0[i];
+ }
+ }
+
+ /*
+ * Add in the stats of each phyint currently in the group. Since we
+ * don't directly track the phyints in a group, we cheat by walking
+ * the IPSQ set under ill_g_lock. (The IPSQ list cannot change while
+ * ill_g_lock is held.)
+ *
+ * The walk starts at the IPSQ after the group's own and stops when it
+ * comes back around to the group's IPSQ, so the group's own phyint is
+ * never added in.
+ */
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ ipsq = grp_ipsq->ipsq_next;
+ for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) {
+ phyi = ipsq->ipsq_phyint;
+
+ /*
+ * If a phyint in a group is being unplumbed, it's possible
+ * that ill_glist_delete() -> phyint_free() already freed the
+ * phyint (and set ipsq_phyint to NULL), but the unplumb
+ * operation has yet to complete (and thus ipsq_dq() has yet
+ * to remove the phyint's IPSQ from the group IPSQ's phyint
+ * list). We skip those phyints here (note that their kstats
+ * have already been added to gr_kstats0[]).
+ */
+ if (phyi == NULL)
+ continue;
+
+ ipmp_phyint_get_kstats(phyi, phyi_kstats);
+
+ /*
+ * Subtract the phyint's own phyint_kstats0[] values before
+ * accumulating, then add the remainder at the width given by
+ * the kstat's data type.
+ */
+ for (i = 0; i < IPMP_KSTAT_MAX; i++) {
+ phyi_kstats[i] -= phyi->phyint_kstats0[i];
+ if (kn[i].data_type == KSTAT_DATA_UINT32)
+ kn[i].value.ui32 += phyi_kstats[i];
+ else
+ kn[i].value.ui64 += phyi_kstats[i];
+ }
+ }
+
+ /*
+ * link_up is not a sum: whatever was accumulated above is overwritten
+ * with whether the group's own phyint has PHYI_RUNNING set.
+ */
+ kn[IPMP_KSTAT_LINK_UP].value.ui32 =
+ (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0;
+
+ rw_exit(&ipst->ips_ill_g_lock);
+ return (0);
+}
+
+/*
+ * Delete the kstat belonging to `grp', and reset the group's kstat pointer
+ * and baseline values.
+ */
+static void
+ipmp_grp_destroy_kstats(ipmp_grp_t *grp)
+{
+	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+	netstackid_t stackid = ipst->ips_netstack->netstack_stackid;
+
+	kstat_delete_netstack(grp->gr_ksp, stackid);
+	grp->gr_ksp = NULL;
+	bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0));
+}
+
+/*
+ * Find the IPMP group called `grname' in the group hash of IP stack `ipst'.
+ * Returns the group, or NULL if no group by that name is in the hash.  The
+ * caller must hold ips_ipmp_lock (as reader or writer).
+ */
+ipmp_grp_t *
+ipmp_grp_lookup(const char *grname, ip_stack_t *ipst)
+{
+	mod_hash_val_t val;
+
+	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
+
+	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
+	    &val) != 0)
+		return (NULL);
+
+	return ((ipmp_grp_t *)val);
+}
+
+/*
+ * Fill in `lifgr' with a description of group `grp': which address families
+ * are configured, how many interfaces (including pending ones) each has, the
+ * MAC type, the group interface name, and the names of the interfaces
+ * currently nominated for IPv4 multicast/broadcast and IPv6 multicast.
+ * Names for which there is no nomination are left as empty strings.  The
+ * caller must hold ips_ipmp_lock.
+ */
+void
+ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr)
+{
+	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+	ipmp_illgrp_t *illg4 = grp->gr_v4;
+	ipmp_illgrp_t *illg6 = grp->gr_v6;
+	ill_t *castill;
+
+	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
+
+	lifgr->gi_v4 = (illg4 != NULL);
+	lifgr->gi_v6 = (illg6 != NULL);
+	lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4;
+	lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6;
+
+	/* An empty group has no underlying MAC type to report. */
+	if (grp->gr_nif > 0)
+		lifgr->gi_mactype = grp->gr_mactype;
+	else
+		lifgr->gi_mactype = SUNW_DL_IPMP;
+
+	(void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ);
+	lifgr->gi_m4ifname[0] = '\0';
+	lifgr->gi_m6ifname[0] = '\0';
+	lifgr->gi_bcifname[0] = '\0';
+
+	/* The IPv4 cast ill serves both multicast and broadcast. */
+	if (illg4 != NULL) {
+		castill = illg4->ig_cast_ill;
+		if (castill != NULL) {
+			(void) strlcpy(lifgr->gi_m4ifname, castill->ill_name,
+			    LIFNAMSIZ);
+			(void) strlcpy(lifgr->gi_bcifname, castill->ill_name,
+			    LIFNAMSIZ);
+		}
+	}
+
+	if (illg6 != NULL) {
+		castill = illg6->ig_cast_ill;
+		if (castill != NULL) {
+			(void) strlcpy(lifgr->gi_m6ifname, castill->ill_name,
+			    LIFNAMSIZ);
+		}
+	}
+}
+
+/*
+ * Enter `grp' into the per-stack group hash, consuming the pre-reserved
+ * hash handle `mh'.  The caller holds ips_ipmp_lock as writer and guarantees
+ * `grp' is not already in the hash.  Panics if the insert fails.
+ */
+static void
+ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh)
+{
+	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+	/*
+	 * grp->gr_name lives at least as long as `grp' stays in the hash, so
+	 * it doubles as the hash key without being copied.
+	 */
+	mod_hash_key_t key = (mod_hash_key_t)grp->gr_name;
+	int rc;
+
+	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+	rc = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash, key,
+	    (mod_hash_val_t)grp, mh);
+	if (rc == 0)
+		return;
+
+	/* `mh' was preallocated, so this is not expected to happen. */
+	panic("cannot insert IPMP group \"%s\" (err %d)", grp->gr_name, rc);
+}
+
+/*
+ * Take `grp' out of the per-stack group hash.  The caller holds
+ * ips_ipmp_lock as writer and guarantees `grp' is in the hash.  Panics if
+ * the removal fails or if the entry stored under the group's name is not
+ * `grp' itself.
+ */
+static void
+ipmp_grp_remove(ipmp_grp_t *grp)
+{
+	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+	mod_hash_val_t removed;
+	int rc;
+
+	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+	rc = mod_hash_remove(ipst->ips_ipmp_grp_hash,
+	    (mod_hash_key_t)grp->gr_name, &removed);
+	if (rc == 0 && removed == grp)
+		return;
+
+	panic("cannot remove IPMP group \"%s\" (err %d)", grp->gr_name, rc);
+}
+
+/*
+ * Attempt to rename `grp' to new name `grname'.  The caller must hold
+ * ips_ipmp_lock as writer.  Returns 0 on success, or:
+ *
+ *	EINVAL	`grname' is the empty string
+ *	EEXIST	a group named `grname' is already in the hash
+ *	ENOMEM	no hash entry could be reserved for the re-insert
+ *
+ * On failure, `grp' is left untouched under its old name.
+ */
+int
+ipmp_grp_rename(ipmp_grp_t *grp, const char *grname)
+{
+	mod_hash_hndl_t mh;
+	mod_hash_val_t existing;
+	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+
+	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+	if (grname[0] == '\0')
+		return (EINVAL);
+
+	/*
+	 * Use a dedicated out-parameter for the duplicate check.  Passing
+	 * `&grp' here would make the rest of this function depend on
+	 * mod_hash_find() never writing to its value argument on a miss.
+	 */
+	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
+	    &existing) != MH_ERR_NOTFOUND)
+		return (EEXIST);
+
+	/*
+	 * Before we remove the group from the hash, ensure we'll be able to
+	 * re-insert it by reserving space.
+	 */
+	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0)
+		return (ENOMEM);
+
+	/*
+	 * gr_name is the hash key itself, so the group must be out of the
+	 * hash while the name is being changed.
+	 */
+	ipmp_grp_remove(grp);
+	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
+	ipmp_grp_insert(grp, mh);
+
+	return (0);
+}
+
+/*
+ * Remove `grp' from the hash, delete its kstats, and free it.  The caller
+ * holds ips_ipmp_lock as writer and guarantees that `grp' is in the hash
+ * and has no interfaces left in it.
+ */
+void
+ipmp_grp_destroy(ipmp_grp_t *grp)
+{
+	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+
+	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+	/*
+	 * Freeing a group that interfaces still reference would corrupt
+	 * state badly, so stop right here instead.
+	 */
+	if (grp->gr_nif != 0) {
+		panic("cannot destroy IPMP group \"%s\": in use",
+		    grp->gr_name);
+	}
+
+	ipmp_grp_remove(grp);
+	ipmp_grp_destroy_kstats(grp);
+
+	/* Everything else hanging off the group must already be gone. */
+	ASSERT(grp->gr_v4 == NULL);
+	ASSERT(grp->gr_v6 == NULL);
+	ASSERT(grp->gr_nv4 == 0);
+	ASSERT(grp->gr_nv6 == 0);
+	ASSERT(grp->gr_nactif == 0);
+	ASSERT(grp->gr_linkdownmp == NULL);
+
+	/*
+	 * gr_phyint is cleared only now because the two calls above derive
+	 * the IP stack from it.
+	 */
+	grp->gr_phyint = NULL;
+	kmem_free(grp, sizeof (*grp));
+}
+
+/*
+ * Check whether `ill' is suitable for inclusion into `grp', and return an
+ * errno describing the problem (if any). NOTE: many of these errno values
+ * are interpreted by ifconfig, which will take corrective action and retry
+ * the SIOCSLIFGROUPNAME, so please exercise care when changing them.
+ *
+ * The checks are made in the order below and the first failure wins:
+ *
+ *	EADDRINUSE	`ill' has addresses that are up or down due to DAD
+ *	EADDRNOTAVAIL	ill_appaddr_cnt() reports addresses on `ill'
+ *	EINVAL		ill_ptpaddr_cnt() reports addresses on `ill'
+ *	ENOTSUP		`ill' lacks ILLF_MULTICAST
+ *	ENOTSUP		`ill' has ILLF_NONUD, ILLF_NOARP or ILLF_XRESOLV set
+ *	ENOTSUP		`ill' is part of a usesrc group
+ *	EINVAL		`grp' is non-empty and `ill' has a different mactype
+ *
+ * Returns 0 if `ill' passes every check.  The caller must be writer on
+ * `ill' and must hold ips_ipmp_lock.
+ */
+static int
+ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill)
+{
+ ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
+
+ /*
+ * To sidestep complicated address migration logic in the kernel and
+ * to force the kernel's all-hosts multicast memberships to be blown
+ * away, all addresses that had been brought up must be brought back
+ * down prior to adding an interface to a group. (This includes
+ * addresses currently down due to DAD.) Once the interface has been
+ * added to the group, its addresses can then be brought back up, at
+ * which point they will be moved to the IPMP meta-interface.
+ * NOTE: we do this before ill_appaddr_cnt() since bringing down the
+ * link-local causes in.ndpd to remove its ADDRCONF'd addresses.
+ */
+ if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
+ return (EADDRINUSE);
+
+ /*
+ * To avoid confusing applications by changing addresses that are
+ * under their control, all such control must be removed prior to
+ * adding an interface into a group.
+ */
+ if (ill_appaddr_cnt(ill) != 0)
+ return (EADDRNOTAVAIL);
+
+ /*
+ * Since PTP addresses do not share the same broadcast domain, they
+ * are not allowed to be in an IPMP group.
+ */
+ if (ill_ptpaddr_cnt(ill) != 0)
+ return (EINVAL);
+
+ /*
+ * An ill must support multicast to be allowed into a group.
+ */
+ if (!(ill->ill_flags & ILLF_MULTICAST))
+ return (ENOTSUP);
+
+ /*
+ * An ill must strictly be using ARP and/or ND for address
+ * resolution for it to be allowed into a group.
+ */
+ if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP | ILLF_XRESOLV))
+ return (ENOTSUP);
+
+ /*
+ * An ill cannot also be using usesrc groups. (Although usesrc uses
+ * ill_g_usesrc_lock, we don't need to grab it since usesrc also does
+ * all its modifications as writer.)
+ */
+ if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill))
+ return (ENOTSUP);
+
+ /*
+ * All ills in a group must be the same mactype.
+ */
+ if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype)
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * Decide whether `phyi' may be placed into `grp'.  Returns 0 if so, or an
+ * errno naming the first problem found: EAFNOSUPPORT if `phyi' has an
+ * address family plumbed that `grp' does not have configured, otherwise
+ * whatever ipmp_grp_vet_ill() reports for the IPv4 ill and then the IPv6
+ * ill.  The comment above ipmp_grp_vet_ill() about ifconfig's reliance on
+ * these errno values applies here too.
+ */
+int
+ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi)
+{
+	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+	ill_t *illv4 = phyi->phyint_illv4;
+	ill_t *illv6 = phyi->phyint_illv6;
+	int err;
+
+	ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq));
+	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
+
+	/*
+	 * Every address family plumbed on the interface must also be
+	 * configured on the group.
+	 */
+	if ((illv4 != NULL && grp->gr_v4 == NULL) ||
+	    (illv6 != NULL && grp->gr_v6 == NULL))
+		return (EAFNOSUPPORT);
+
+	if (illv4 != NULL) {
+		err = ipmp_grp_vet_ill(grp, illv4);
+		if (err != 0)
+			return (err);
+	}
+
+	if (illv6 != NULL)
+		return (ipmp_grp_vet_ill(grp, illv6));
+
+	return (0);
+}
+
+/*
+ * Allocate an illgrp for IPMP meta-interface `ill', link the two together,
+ * and start the illgrp out at the minimum MTU for the ill's address family.
+ * Returns the new illgrp, or NULL if memory could not be allocated.  The
+ * caller must be writer on `ill', which must not already have an illgrp.
+ */
+ipmp_illgrp_t *
+ipmp_illgrp_create(ill_t *ill)
+{
+	ipmp_illgrp_t *newillg;
+	uint_t minmtu;
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(IS_IPMP(ill));
+	ASSERT(ill->ill_grp == NULL);
+
+	newillg = kmem_zalloc(sizeof (*newillg), KM_NOSLEEP);
+	if (newillg == NULL)
+		return (NULL);
+
+	/* All interfaces, the active interfaces, and the ARP entries. */
+	list_create(&newillg->ig_if, sizeof (ill_t),
+	    offsetof(ill_t, ill_grpnode));
+	list_create(&newillg->ig_actif, sizeof (ill_t),
+	    offsetof(ill_t, ill_actnode));
+	list_create(&newillg->ig_arpent, sizeof (ipmp_arpent_t),
+	    offsetof(ipmp_arpent_t, ia_node));
+
+	newillg->ig_ipmp_ill = ill;
+	ill->ill_grp = newillg;
+
+	if (ill->ill_isv6)
+		minmtu = IPV6_MIN_MTU;
+	else
+		minmtu = IP_MIN_MTU;
+	ipmp_illgrp_set_mtu(newillg, minmtu);
+
+	return (newillg);
+}
+
+/*
+ * Detach illgrp `illg' from its IPMP meta-interface and free it.  The
+ * illgrp must already be empty, and the caller must be writer on the IPMP
+ * meta-interface.
+ */
+void
+ipmp_illgrp_destroy(ipmp_illgrp_t *illg)
+{
+	ill_t *ipmp_ill = illg->ig_ipmp_ill;
+
+	ASSERT(IAM_WRITER_ILL(ipmp_ill));
+	ASSERT(IS_IPMP(ipmp_ill));
+
+	/* Nothing may still be nominated, listed or counted on `illg'. */
+	ASSERT(illg->ig_next_ill == NULL);
+	ASSERT(illg->ig_cast_ill == NULL);
+	ASSERT(list_is_empty(&illg->ig_arpent));
+	ASSERT(list_is_empty(&illg->ig_if));
+	ASSERT(list_is_empty(&illg->ig_actif));
+	ASSERT(illg->ig_nactif == 0);
+
+	/* Sever the link in both directions before freeing. */
+	ipmp_ill->ill_grp = NULL;
+	illg->ig_ipmp_ill = NULL;
+
+	list_destroy(&illg->ig_if);
+	list_destroy(&illg->ig_actif);
+	list_destroy(&illg->ig_arpent);
+	kmem_free(illg, sizeof (*illg));
+}
+
+/*
+ * Make data address `ipif' available on `illg' and try to bind it to the
+ * active underlying ill that currently has the fewest addresses, so that
+ * addresses stay evenly spread.  Returns the ill that `ipif' ended up bound
+ * to, or NULL if it is not bound.
+ */
+ill_t *
+ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
+{
+	ill_t *targetill;
+
+	ASSERT(IAM_WRITER_IPIF(ipif));
+	ASSERT(ipmp_ipif_is_dataaddr(ipif));
+
+	/*
+	 * IP itself manages the mappings for IPMP data addresses, so any
+	 * IPMP ARP entry already recorded for this IPv4 address has to go.
+	 */
+	if (!ipif->ipif_isv6) {
+		ipmp_arpent_t *stale;
+
+		stale = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr);
+		if (stale != NULL)
+			ipmp_illgrp_destroy_arpent(illg, stale);
+	}
+
+	targetill = ipmp_illgrp_min_ill(illg);
+	if (targetill != NULL)
+		ipmp_ill_bind_ipif(targetill, ipif, Res_act_none);
+
+	if (!ipif->ipif_bound)
+		return (NULL);
+	return (ipif->ipif_bound_ill);
+}
+
+/*
+ * Withdraw `ipif' from the usable data addresses on `illg'.  If `ipif' is
+ * bound to an underlying ill, unbind it and, if that leaves the busiest
+ * active ill with more than one address more than the ill that just lost
+ * `ipif', move one address across to keep the distribution even.
+ */
+void
+ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
+{
+	ill_t *oldill = ipif->ipif_bound_ill;
+	ill_t *busiest;
+	ipif_t *moved;
+
+	ASSERT(IAM_WRITER_IPIF(ipif));
+
+	if (oldill == NULL)
+		return;
+
+	(void) ipmp_ill_unbind_ipif(oldill, ipif, B_FALSE);
+
+	busiest = ipmp_illgrp_max_ill(illg);
+	if (busiest->ill_bound_cnt <= oldill->ill_bound_cnt + 1)
+		return;
+
+	moved = ipmp_ill_unbind_ipif(busiest, NULL, B_TRUE);
+	ipmp_ill_bind_ipif(oldill, moved, Res_act_rebind);
+}
+
+/*
+ * Find the active ill in `illg' that has the most data addresses bound to
+ * it.  Ties go to the ill that appears first on the active list.  Returns
+ * NULL if there are no active ills.
+ */
+static ill_t *
+ipmp_illgrp_max_ill(ipmp_illgrp_t *illg)
+{
+	ill_t *cur, *bestill = NULL;
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+	for (cur = list_head(&illg->ig_actif); cur != NULL;
+	    cur = list_next(&illg->ig_actif, cur)) {
+		if (bestill != NULL &&
+		    cur->ill_bound_cnt <= bestill->ill_bound_cnt)
+			continue;
+		bestill = cur;
+	}
+	return (bestill);
+}
+
+/*
+ * Find the active ill in `illg' that has the fewest data addresses bound to
+ * it.  Ties go to the ill that appears first on the active list.  Returns
+ * NULL if there are no active ills.
+ */
+static ill_t *
+ipmp_illgrp_min_ill(ipmp_illgrp_t *illg)
+{
+	ill_t *cur, *bestill = NULL;
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+	for (cur = list_head(&illg->ig_actif); cur != NULL;
+	    cur = list_next(&illg->ig_actif, cur)) {
+		if (bestill != NULL &&
+		    cur->ill_bound_cnt >= bestill->ill_bound_cnt)
+			continue;
+
+		/* Nothing can beat an ill with no addresses at all. */
+		if (cur->ill_bound_cnt == 0)
+			return (cur);
+		bestill = cur;
+	}
+	return (bestill);
+}
+
+/*
+ * Return the IPMP meta-interface that `illg' (which must exist) belongs to.
+ * No locking is required because an illg's ig_ipmp_ill never changes.
+ */
+ill_t *
+ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg)
+{
+	ill_t *ipmp_ill = illg->ig_ipmp_ill;
+
+	return (ipmp_ill);
+}
+
+/*
+ * Hand out the next underlying ill of `illg' in rotation and advance the
+ * rotation, wrapping to the head of the active list at the end.  Returns
+ * NULL if there is no next ill.  The caller must be inside the IPSQ.
+ */
+ill_t *
+ipmp_illgrp_next_ill(ipmp_illgrp_t *illg)
+{
+	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+	ill_t *curill, *nextill;
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+	curill = illg->ig_next_ill;
+	if (curill != NULL) {
+		nextill = list_next(&illg->ig_actif, curill);
+		if (nextill == NULL)
+			nextill = list_head(&illg->ig_actif);
+		illg->ig_next_ill = nextill;
+	}
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	return (curill);
+}
+
+/*
+ * Like ipmp_illgrp_next_ill(), but usable from outside the IPSQ: walk the
+ * rotation (at most once around the active ills) until an ill that passes
+ * ILL_CAN_LOOKUP() is found, take a reference on it, and return it.  Returns
+ * NULL if no active ill qualifies.
+ */
+ill_t *
+ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
+{
+	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+	ill_t *heldill = NULL;
+	uint_t tries;
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+	for (tries = 0; tries < illg->ig_nactif; tries++) {
+		ill_t *cand = illg->ig_next_ill;
+		ill_t *succ = list_next(&illg->ig_actif, cand);
+
+		/* Advance the rotation whether or not `cand' is usable. */
+		if (succ == NULL)
+			succ = list_head(&illg->ig_actif);
+		illg->ig_next_ill = succ;
+
+		if (ILL_CAN_LOOKUP(cand)) {
+			ill_refhold(cand);
+			heldill = cand;
+			break;
+		}
+	}
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	return (heldill);
+}
+
+/*
+ * Return the ill currently nominated for multicast in `illg', or NULL if
+ * there is no nomination (or no illgrp).  The caller must be inside the
+ * IPSQ.
+ */
+ill_t *
+ipmp_illgrp_cast_ill(ipmp_illgrp_t *illg)
+{
+	/*
+	 * I_PUNLINK clears an IPMP ill's ill_grp, yet callers can still get
+	 * here afterwards, so a NULL illgrp has to be tolerated.
+	 */
+	if (illg != NULL) {
+		ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+		return (illg->ig_cast_ill);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Like ipmp_illgrp_cast_ill(), but usable from outside the IPSQ: if `illg'
+ * has a nominated multicast ill that passes ILL_CAN_LOOKUP(), take a
+ * reference on it and return it.  Otherwise return NULL.
+ */
+ill_t *
+ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
+{
+	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+	ill_t *nominee;
+	ill_t *heldill = NULL;
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+	nominee = illg->ig_cast_ill;
+	if (nominee != NULL && ILL_CAN_LOOKUP(nominee)) {
+		ill_refhold(nominee);
+		heldill = nominee;
+	}
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	return (heldill);
+}
+
+/*
+ * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL,
+ * any existing nomination is removed. Caller must be inside the IPSQ.
+ *
+ * The steps run in a fixed order: the old nominee is disabled, the new
+ * nomination is published under ips_ipmp_lock, IREs tied to the old nominee
+ * are deleted, the new nominee is enabled, and finally (IPv4 only) the
+ * broadcast IREs are refreshed.
+ */
+static void
+ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
+{
+ ill_t *ocastill = illg->ig_cast_ill;
+ ill_t *ipmp_ill = illg->ig_ipmp_ill;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(IAM_WRITER_ILL(ipmp_ill));
+
+ /*
+ * Disable old nominated ill (if any).
+ */
+ if (ocastill != NULL) {
+ DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *,
+ illg, ill_t *, ocastill);
+ ASSERT(ocastill->ill_nom_cast);
+ ocastill->ill_nom_cast = B_FALSE;
+ /*
+ * If the IPMP meta-interface is down, we never did the join,
+ * so we must not try to leave.
+ */
+ if (ipmp_ill->ill_dl_up)
+ ill_leave_multicast(ipmp_ill);
+ }
+
+ /*
+ * Set new nomination.
+ * The write is made under ips_ipmp_lock as writer;
+ * ipmp_illgrp_hold_cast_ill() reads ig_cast_ill under the same lock
+ * as reader.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ illg->ig_cast_ill = castill;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ if (ocastill != NULL) {
+ /*
+ * Delete any IREs tied to the old nomination. We must do
+ * this after the new castill is set and has reached global
+ * visibility since the datapath has not been quiesced.
+ */
+ ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
+ ill_stq_cache_delete, ocastill, ocastill);
+ }
+
+ /*
+ * Enable new nominated ill (if any).
+ */
+ if (castill != NULL) {
+ DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *,
+ illg, ill_t *, castill);
+ ASSERT(!castill->ill_nom_cast);
+ castill->ill_nom_cast = B_TRUE;
+ /*
+ * If the IPMP meta-interface is down, the attempt to recover
+ * will silently fail but ill_need_recover_multicast will be
+ * erroneously cleared -- so check first.
+ */
+ if (ipmp_ill->ill_dl_up)
+ ill_recover_multicast(ipmp_ill);
+ }
+
+ /*
+ * For IPv4, refresh our broadcast IREs. This needs to be done even
+ * if there's no new nomination since ill_refresh_bcast() still must
+ * update the IPMP meta-interface's broadcast IREs to point back at
+ * the IPMP meta-interface itself.
+ */
+ if (!ipmp_ill->ill_isv6)
+ ill_refresh_bcast(ipmp_ill);
+}
+
+/*
+ * Create an IPMP ARP entry and add it to the set tracked on `illg'. If an
+ * entry for the same IP address already exists, destroy it first. Return the
+ * created IPMP ARP entry, or NULL on failure.
+ *
+ * `mp' holds the area_t request describing the mapping; it is copied, so the
+ * caller keeps ownership of the original.  `proxyarp' is recorded on the
+ * entry.  Failure means that memory could not be allocated or that the
+ * protocol address described by the area_t does not lie within the message;
+ * in both cases the existing entries on `illg' are left untouched.
+ */
+ipmp_arpent_t *
+ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, mblk_t *mp, boolean_t proxyarp)
+{
+	uchar_t *addrp;
+	area_t *area = (area_t *)mp->b_rptr;
+	ipmp_arpent_t *entp, *oentp;
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+	ASSERT(area->area_proto_addr_length == sizeof (ipaddr_t));
+
+	if ((entp = kmem_zalloc(sizeof (ipmp_arpent_t), KM_NOSLEEP)) == NULL)
+		return (NULL);
+
+	if ((mp = copyb(mp)) == NULL) {
+		kmem_free(entp, sizeof (ipmp_arpent_t));
+		return (NULL);
+	}
+
+	/*
+	 * mi_offset_paramc() hands back NULL when the requested range is
+	 * not inside the message.  Fail the create in that case instead of
+	 * letting bcopy() dereference a NULL source.
+	 */
+	addrp = mi_offset_paramc(mp, area->area_proto_addr_offset,
+	    sizeof (ipaddr_t));
+	if (addrp == NULL) {
+		freeb(mp);
+		kmem_free(entp, sizeof (ipmp_arpent_t));
+		return (NULL);
+	}
+
+	DB_TYPE(mp) = M_PROTO;
+	entp->ia_area_mp = mp;
+	entp->ia_proxyarp = proxyarp;
+	bcopy(addrp, &entp->ia_ipaddr, sizeof (ipaddr_t));
+
+	/* At most one entry per IP address is kept on the list. */
+	if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL)
+		ipmp_illgrp_destroy_arpent(illg, oentp);
+
+	list_insert_head(&illg->ig_arpent, entp);
+	return (entp);
+}
+
+/*
+ * Take IPMP ARP entry `entp' off the list kept on `illg', then free both its
+ * saved message and the entry itself.
+ */
+void
+ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
+{
+	mblk_t *area_mp;
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+	list_remove(&illg->ig_arpent, entp);
+
+	area_mp = entp->ia_area_mp;
+	freeb(area_mp);
+	kmem_free(entp, sizeof (*entp));
+}
+
+/*
+ * Record on `entp' that ARP has been told about its IP address.  `illg' is
+ * not used by the code; it is accepted purely so that DTrace FBT probes on
+ * this function can see which illgrp was involved.
+ */
+/* ARGSUSED */
+void
+ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
+{
+	entp->ia_notified = B_TRUE;
+}
+
+/*
+ * Find the IPMP ARP entry on `illg' whose IP address equals `*addrp'.  A
+ * NULL `addrp' asks for any entry at all, in which case the first entry on
+ * the list is returned.  Returns NULL if there is no matching entry.
+ */
+ipmp_arpent_t *
+ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp)
+{
+	ipmp_arpent_t *cur;
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+	cur = list_head(&illg->ig_arpent);
+	if (addrp == NULL)
+		return (cur);
+
+	while (cur != NULL && cur->ia_ipaddr != *addrp)
+		cur = list_next(&illg->ig_arpent, cur);
+
+	return (cur);
+}
+
+/*
+ * Refresh ARP entries on `illg' to be distributed across its active
+ * interfaces. Entries that cannot be refreshed (e.g., because there are no
+ * active interfaces) are marked so that subsequent calls can try again.
+ *
+ * "Marked" means ia_notified is cleared on the entry.  The caller must be
+ * writer on the IPMP meta-interface, which must be IPv4.
+ */
+void
+ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
+{
+ ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill;
+ uint_t paddrlen = ipmp_ill->ill_phys_addr_length;
+ area_t *area;
+ mblk_t *area_mp;
+ uchar_t *physaddr;
+ ipmp_arpent_t *entp;
+
+ ASSERT(IAM_WRITER_ILL(ipmp_ill));
+ ASSERT(!ipmp_ill->ill_isv6);
+
+ /*
+ * `ill' is the active interface the next refreshed entry is paired
+ * with; it only advances (round-robin) after an entry has actually
+ * been sent to ARP, so skipped entries do not use up a slot.
+ */
+ ill = list_head(&illg->ig_actif);
+ entp = list_head(&illg->ig_arpent);
+ for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) {
+ /*
+ * With no active interface, or no addresses up on the IPMP
+ * meta-interface, nothing can be refreshed now.
+ */
+ if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) {
+ entp->ia_notified = B_FALSE;
+ continue;
+ }
+
+ area = (area_t *)entp->ia_area_mp->b_rptr;
+ ASSERT(paddrlen == ill->ill_phys_addr_length);
+ ASSERT(paddrlen == area->area_hw_addr_length);
+ physaddr = mi_offset_paramc(entp->ia_area_mp,
+ area->area_hw_addr_offset, paddrlen);
+
+ /*
+ * If this is a proxy ARP entry, we can skip notifying ARP if
+ * the entry is already up-to-date. If it has changed, we
+ * update the entry's hardware address before notifying ARP.
+ * (For non-proxy entries the saved hardware address is sent
+ * as-is.)
+ */
+ if (entp->ia_proxyarp) {
+ if (bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0 &&
+ entp->ia_notified)
+ continue;
+ bcopy(ill->ill_phys_addr, physaddr, paddrlen);
+ }
+
+ /*
+ * Send a copy so that the entry keeps its own message for
+ * later refreshes.
+ */
+ if ((area_mp = copyb(entp->ia_area_mp)) == NULL) {
+ entp->ia_notified = B_FALSE;
+ continue;
+ }
+
+ putnext(ipmp_ill->ill_rq, area_mp);
+ ipmp_illgrp_mark_arpent(illg, entp);
+
+ if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
+ ill = list_head(&illg->ig_actif);
+ }
+}
+
+/*
+ * Search all interfaces in `illg' for one whose physical address is exactly
+ * the `paddrlen' bytes at `physaddr'.  Returns the first match, or NULL if
+ * there is none.  A caller that is not inside the IPSQ must hold
+ * ill_g_lock.
+ */
+ill_t *
+ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen)
+{
+	ill_t *ipmp_ill = illg->ig_ipmp_ill;
+	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+	ill_t *cand;
+
+	ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock));
+
+	for (cand = list_head(&illg->ig_if); cand != NULL;
+	    cand = list_next(&illg->ig_if, cand)) {
+		if (cand->ill_phys_addr_length != paddrlen)
+			continue;
+		if (bcmp(cand->ill_phys_addr, physaddr, paddrlen) == 0)
+			return (cand);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Change the MTU of the IPMP ill behind `illg' to `mtu' by queueing a
+ * DL_NOTE_SDU_SIZE notification on the ill's read queue; the change thus
+ * takes effect asynchronously.  If the notification cannot be allocated,
+ * nothing is changed.  Except during initialization (when ig_mtu is still
+ * zero), the caller must be inside the IPSQ.
+ */
+static void
+ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu)
+{
+	ill_t *ipmp_ill = illg->ig_ipmp_ill;
+	mblk_t *notifymp;
+
+	ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ipmp_ill));
+
+	/*
+	 * An allocation failure here means the system has far worse
+	 * problems than a stale MTU, so just give up quietly.
+	 */
+	notifymp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu);
+	if (notifymp == NULL)
+		return;
+
+	illg->ig_mtu = mtu;
+	put(ipmp_ill->ill_rq, notifymp);
+}
+
+/*
+ * Work out what the group MTU for `illg' should be -- the smallest
+ * ill_max_mtu among its interfaces, but never less than the protocol
+ * minimum -- and push it to the IPMP ill if it differs from the current
+ * value.
+ */
+void
+ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
+{
+	ill_t *ipmp_ill = illg->ig_ipmp_ill;
+	ill_t *cur;
+	uint_t grpmtu = 0;
+	uint_t minmtu;
+
+	ASSERT(IAM_WRITER_ILL(ipmp_ill));
+
+	/*
+	 * ill_max_mtu only changes under ill_lock, so take each ill's lock
+	 * while looking at it.  A change that slips in after we've moved on
+	 * isn't lost: every change to ill_max_mtu triggers another refresh.
+	 */
+	for (cur = list_head(&illg->ig_if); cur != NULL;
+	    cur = list_next(&illg->ig_if, cur)) {
+		mutex_enter(&cur->ill_lock);
+		if (grpmtu == 0 || cur->ill_max_mtu < grpmtu)
+			grpmtu = cur->ill_max_mtu;
+		mutex_exit(&cur->ill_lock);
+	}
+
+	/* Clamp the result to the minimum MTU for the address family. */
+	minmtu = ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
+	if (grpmtu < minmtu)
+		grpmtu = minmtu;
+
+	if (grpmtu != illg->ig_mtu)
+		ipmp_illgrp_set_mtu(illg, grpmtu);
+}
+
+/*
+ * Record `illg' as the IPv4 or IPv6 illgrp of IPMP group `grp', depending
+ * on the address family of the illgrp's IPMP ill.  Establishing the same
+ * link again is harmless, which keeps callers simple.  The caller must hold
+ * ips_ipmp_lock as writer.
+ */
+void
+ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp)
+{
+	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+	ipmp_illgrp_t **slotp;
+
+	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+	if (illg->ig_ipmp_ill->ill_isv6)
+		slotp = &grp->gr_v6;
+	else
+		slotp = &grp->gr_v4;
+
+	/* The slot is either still empty or already points at `illg'. */
+	ASSERT(*slotp == NULL || *slotp == illg);
+	*slotp = illg;
+}
+
+/*
+ * Detach `illg' from the IPMP group of its IPMP ill.  Returns 0 on success,
+ * or EBUSY if the group still has interfaces (joined or pending) of the
+ * illgrp's address family, in which case nothing is changed.  The caller
+ * must hold ips_ipmp_lock as writer.
+ */
+int
+ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg)
+{
+	ill_t *ipmp_ill = illg->ig_ipmp_ill;
+	ipmp_grp_t *grp = ipmp_ill->ill_phyint->phyint_grp;
+	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+	if (!ipmp_ill->ill_isv6) {
+		if (grp->gr_nv4 + grp->gr_pendv4 != 0)
+			return (EBUSY);
+		grp->gr_v4 = NULL;
+		return (0);
+	}
+
+	if (grp->gr_nv6 + grp->gr_pendv6 != 0)
+		return (EBUSY);
+	grp->gr_v6 = NULL;
+	return (0);
+}
+
+/*
+ * Place `ill' into `illg', and rebalance the data addresses on `illg'
+ * to be spread evenly across the ills now in it. Also, adjust the IPMP
+ * ill as necessary to account for `ill' (e.g., MTU).
+ *
+ * The caller must be writer on `ill'; `ill' must not be an IPMP ill, its
+ * phyint must already be in an IPMP group, and it must not yet be in an
+ * illgrp (all of which are asserted below).
+ *
+ * Note that the address rebalancing is not done inline here: it happens
+ * through the ipmp_ill_refresh_active() call at the end, and only if `ill'
+ * is found to be active.
+ */
+void
+ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
+{
+ ill_t *ipmp_ill;
+ ipif_t *ipif;
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ /* IS_UNDER_IPMP() requires ill_grp to be non-NULL */
+ ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL);
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(ill->ill_grp == NULL);
+
+ ipmp_ill = illg->ig_ipmp_ill;
+
+ /*
+ * Account for `ill' joining the illgrp.
+ * The per-version interface counts live in the phyint's IPMP group
+ * and are updated with ipmp_lock held as writer.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ if (ill->ill_isv6)
+ ill->ill_phyint->phyint_grp->gr_nv6++;
+ else
+ ill->ill_phyint->phyint_grp->gr_nv4++;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ /*
+ * Ensure the ILLF_ROUTER flag remains consistent across the group.
+ * The IPMP ill's setting wins: `ill' is made to match it.
+ */
+ mutex_enter(&ill->ill_lock);
+ if (ipmp_ill->ill_flags & ILLF_ROUTER)
+ ill->ill_flags |= ILLF_ROUTER;
+ else
+ ill->ill_flags &= ~ILLF_ROUTER;
+ mutex_exit(&ill->ill_lock);
+
+ /*
+ * Blow away all multicast memberships that currently exist on `ill'.
+ * This may seem odd, but it's consistent with the application view
+ * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
+ * For IPv6 the reset is done once per ill; for IPv4 it is done for
+ * each ipif on the ill.
+ */
+ if (ill->ill_isv6) {
+ reset_conn_ill(ill);
+ reset_mrt_ill(ill);
+ } else {
+ ipif = ill->ill_ipif;
+ for (; ipif != NULL; ipif = ipif->ipif_next) {
+ reset_conn_ipif(ipif);
+ reset_mrt_vif_ipif(ipif);
+ }
+ }
+ ip_purge_allmulti(ill);
+
+ /*
+ * Borrow the first ill's ill_phys_addr_length value for the illgrp's
+ * physical address length. All other ills must have the same value,
+ * since they are required to all be the same mactype. Also update
+ * the IPMP ill's MTU and CoS marking, if necessary.
+ *
+ * For the first ill, the IPMP ill simply inherits the ill's CoS
+ * setting and MTU. For any later ill, the IPMP ill's values can
+ * only be reduced: CoS is cleared if the new ill lacks it, and the
+ * group MTU is lowered if the new ill's MTU is smaller.
+ */
+ if (list_is_empty(&illg->ig_if)) {
+ ASSERT(ipmp_ill->ill_phys_addr_length == 0);
+ /*
+ * NOTE: we leave ill_phys_addr NULL since the IPMP group
+ * doesn't have a physical address. This means that code must
+ * not assume that ill_phys_addr is non-NULL just because
+ * ill_phys_addr_length is non-zero. Likewise for ill_nd_lla.
+ */
+ ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
+ ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
+ ipmp_ill->ill_type = ill->ill_type;
+
+ if (ill->ill_flags & ILLF_COS_ENABLED) {
+ mutex_enter(&ipmp_ill->ill_lock);
+ ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
+ mutex_exit(&ipmp_ill->ill_lock);
+ }
+ ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu);
+ } else {
+ ASSERT(ipmp_ill->ill_phys_addr_length ==
+ ill->ill_phys_addr_length);
+ ASSERT(ipmp_ill->ill_type == ill->ill_type);
+
+ if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
+ mutex_enter(&ipmp_ill->ill_lock);
+ ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
+ mutex_exit(&ipmp_ill->ill_lock);
+ }
+ if (illg->ig_mtu > ill->ill_max_mtu)
+ ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu);
+ }
+
+ rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
+ list_insert_tail(&illg->ig_if, ill);
+ ill->ill_grp = illg;
+ rw_exit(&ipst->ips_ill_g_lock);
+
+ /*
+ * Hide the IREs on `ill' so that we don't accidentally find them when
+ * sending data traffic.
+ */
+ ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);
+
+ /*
+ * Merge any broadcast IREs, if need be.
+ */
+ if (!ill->ill_isv6)
+ ill_refresh_bcast(ill);
+
+ ipmp_ill_refresh_active(ill);
+}
+
+/*
+ * Remove `ill' from its illgrp, and rebalance the data addresses in that
+ * illgrp to be spread evenly across the remaining ills. Also, adjust the
+ * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
+ *
+ * The caller must be writer on `ill', and `ill' must currently be under
+ * IPMP (both are asserted below).
+ *
+ * Note that the rebalancing is not done inline here: if `ill' is on the
+ * active list, it is deactivated via ipmp_ill_deactivate(), which rebinds
+ * its addresses to the remaining active ills.
+ */
+void
+ipmp_ill_leave_illgrp(ill_t *ill)
+{
+ ill_t *ipmp_ill;
+ ipif_t *ipif;
+ ipmp_arpent_t *entp;
+ ipmp_illgrp_t *illg = ill->ill_grp;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(IS_UNDER_IPMP(ill));
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(illg != NULL);
+
+ ipmp_ill = illg->ig_ipmp_ill;
+
+ /*
+ * Cancel IPMP-specific ill timeouts.
+ * (ill_refresh_tid is the timeout id that is set by
+ * ipmp_ill_refresh_active_timer_start().)
+ */
+ (void) untimeout(ill->ill_refresh_tid);
+
+ /*
+ * Expose any previously-hidden IREs on `ill'.
+ */
+ ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);
+
+ /*
+ * Ensure the multicast state for each ipif on `ill' is down so that
+ * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
+ * all eligible groups.
+ */
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
+ if (ipif->ipif_flags & IPIF_UP)
+ ipif_multicast_down(ipif);
+
+ /*
+ * Account for `ill' leaving the illgrp.
+ * The per-version interface counts live in the phyint's IPMP group
+ * and are updated with ipmp_lock held as writer.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ if (ill->ill_isv6)
+ ill->ill_phyint->phyint_grp->gr_nv6--;
+ else
+ ill->ill_phyint->phyint_grp->gr_nv4--;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ /*
+ * Pull `ill' out of the interface lists.
+ * Deactivation is done first, while `ill' still has its ill_grp set;
+ * only then is it taken off ig_if and its ill_grp cleared.
+ */
+ if (list_link_active(&ill->ill_actnode))
+ ipmp_ill_deactivate(ill);
+ rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
+ list_remove(&illg->ig_if, ill);
+ ill->ill_grp = NULL;
+ rw_exit(&ipst->ips_ill_g_lock);
+
+ /*
+ * Recreate any broadcast IREs that had been shared, if need be.
+ */
+ if (!ill->ill_isv6)
+ ill_refresh_bcast(ill);
+
+ /*
+ * Re-establish multicast memberships that were previously being
+ * handled by the IPMP meta-interface.
+ */
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
+ if (ipif->ipif_flags & IPIF_UP)
+ ipif_multicast_up(ipif);
+
+ /*
+ * Refresh the group MTU based on the new interface list.
+ */
+ ipmp_illgrp_refresh_mtu(illg);
+
+ if (list_is_empty(&illg->ig_if)) {
+ /*
+ * No ills left in the illgrp; we no longer have a physical
+ * address length, nor can we support ARP, CoS, or anything
+ * else that depends on knowing the link layer type.
+ * All remaining ARP entries of the illgrp are destroyed and
+ * the IPMP ill is returned to its ill-less state (zero
+ * address lengths, IFT_OTHER, CoS off).
+ */
+ while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
+ ipmp_illgrp_destroy_arpent(illg, entp);
+
+ ipmp_ill->ill_phys_addr_length = 0;
+ ipmp_ill->ill_nd_lla_len = 0;
+ ipmp_ill->ill_type = IFT_OTHER;
+ mutex_enter(&ipmp_ill->ill_lock);
+ ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
+ mutex_exit(&ipmp_ill->ill_lock);
+ } else {
+ /*
+ * If `ill' didn't support CoS, see if it can now be enabled.
+ * CoS is turned on for the IPMP ill only if every ill that
+ * remains in the illgrp has it enabled.
+ *
+ * NOTE: `ill' is reused as the list iterator below, so from
+ * here on it no longer names the ill that left the group.
+ * After the scan it is NULL if and only if no remaining ill
+ * lacked CoS.
+ */
+ if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
+ ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));
+
+ ill = list_head(&illg->ig_if);
+ do {
+ if (!(ill->ill_flags & ILLF_COS_ENABLED))
+ break;
+ } while ((ill = list_next(&illg->ig_if, ill)) != NULL);
+
+ if (ill == NULL) {
+ mutex_enter(&ipmp_ill->ill_lock);
+ ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
+ mutex_exit(&ipmp_ill->ill_lock);
+ }
+ }
+ }
+}
+
+/*
+ * Bring the active-list membership of `ill' in line with whether it should
+ * currently be active: activate it if it should be active but isn't on the
+ * active list, and deactivate it if it shouldn't be but is.  Returns B_FALSE
+ * only if an activation was needed and failed; returns B_TRUE otherwise
+ * (including when nothing needed to change).  Caller must be writer on
+ * `ill', which must be under IPMP.
+ */
+static boolean_t
+ipmp_ill_try_refresh_active(ill_t *ill)
+{
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(IS_UNDER_IPMP(ill));
+
+	if (ipmp_ill_is_active(ill)) {
+		/* Should be active: activate unless it already is. */
+		if (list_link_active(&ill->ill_actnode))
+			return (B_TRUE);
+		return (ipmp_ill_activate(ill));
+	}
+
+	/* Should not be active: deactivation cannot fail. */
+	if (list_link_active(&ill->ill_actnode))
+		ipmp_ill_deactivate(ill);
+
+	return (B_TRUE);
+}
+
+/*
+ * Refresh the active state of `ill' (see ipmp_ill_try_refresh_active()).
+ * If the refresh could not be completed, arm a timer so that it is
+ * attempted again later.
+ */
+void
+ipmp_ill_refresh_active(ill_t *ill)
+{
+	if (ipmp_ill_try_refresh_active(ill))
+		return;
+
+	ipmp_ill_refresh_active_timer_start(ill);
+}
+
+/*
+ * Timeout handler that retries ipmp_ill_try_refresh_active() on the ill
+ * passed as `ill_arg'.  If the IPSQ cannot be entered, or the refresh fails
+ * again, another attempt is scheduled.
+ */
+static void
+ipmp_ill_refresh_active_timer(void *ill_arg)
+{
+	ill_t		*ill = ill_arg;
+	boolean_t	condemned;
+	boolean_t	done = B_FALSE;
+
+	/*
+	 * Zero ill_refresh_tid to record that no timeout is pending any
+	 * longer.  (Another thread may schedule a new timeout while we are
+	 * still running; that is harmless.)  Don't do anything further if
+	 * the ill is on its way out.
+	 */
+	mutex_enter(&ill->ill_lock);
+	ill->ill_refresh_tid = 0;
+	condemned = ((ill->ill_state_flags & ILL_CONDEMNED) != 0);
+	mutex_exit(&ill->ill_lock);
+
+	if (condemned)
+		return;
+
+	if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
+		done = ipmp_ill_try_refresh_active(ill);
+		ipsq_exit(ill->ill_phyint->phyint_ipsq);
+	}
+
+	/*
+	 * Either we couldn't become writer or the refresh itself failed:
+	 * try again later.
+	 */
+	if (!done)
+		ipmp_ill_refresh_active_timer_start(ill);
+}
+
+/*
+ * Arm a timeout that will run ipmp_ill_refresh_active_timer() on `ill'
+ * after IPMP_ILL_REFRESH_TIMEOUT seconds.  Nothing is scheduled if a
+ * refresh timeout is already pending or if `ill' is going away.
+ */
+static void
+ipmp_ill_refresh_active_timer_start(ill_t *ill)
+{
+	mutex_enter(&ill->ill_lock);
+
+	/*
+	 * Only schedule when no timeout is outstanding (ill_refresh_tid is
+	 * zero) and the ill has not been condemned.
+	 */
+	if (ill->ill_refresh_tid == 0 &&
+	    !(ill->ill_state_flags & ILL_CONDEMNED)) {
+		ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer,
+		    ill, SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));
+	}
+
+	mutex_exit(&ill->ill_lock);
+}
+
+/*
+ * Activate `ill' so it will be used to send and receive data traffic. Return
+ * B_FALSE if `ill' cannot be activated. Note that we allocate any messages
+ * needed to deactivate `ill' here as well so that deactivation cannot fail.
+ *
+ * The only failure paths are the message allocations at the top, which are
+ * taken before any illgrp or group state is modified; on failure, every
+ * message allocated so far is freed and B_FALSE is returned.
+ *
+ * On success the messages are consumed as follows: the ARP activate message
+ * is sent with putnext() on the IPMP ill's ill_rq and the link-up message
+ * with put() on the same queue, while the ARP deactivate and link-down
+ * messages are stashed in ill_ardeact_mp and gr_linkdownmp respectively.
+ *
+ * Caller must be writer on `ill', which must be under IPMP.
+ */
+static boolean_t
+ipmp_ill_activate(ill_t *ill)
+{
+ ipif_t *ipif;
+ mblk_t *actmp = NULL, *deactmp = NULL;
+ mblk_t *linkupmp = NULL, *linkdownmp = NULL;
+ ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
+ const char *grifname = grp->gr_ifname;
+ ipmp_illgrp_t *illg = ill->ill_grp;
+ ill_t *maxill;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(IS_UNDER_IPMP(ill));
+
+ /*
+ * If this will be the first active interface in the group, allocate
+ * the link-up and link-down messages.
+ * (gr_nactif is the group-wide count; it is only incremented at the
+ * very end of this function.)
+ */
+ if (grp->gr_nactif == 0) {
+ linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
+ linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
+ if (linkupmp == NULL || linkdownmp == NULL)
+ goto fail;
+ }
+
+ /*
+ * For IPv4, allocate the activate/deactivate messages, and tell ARP.
+ * The deactivate message is saved on the ill before the activate
+ * message is sent.
+ */
+ if (!ill->ill_isv6) {
+ actmp = ill_arie_alloc(ill, grifname, &ipmp_aract_template);
+ deactmp = ill_arie_alloc(ill, grifname, &ipmp_ardeact_template);
+ if (actmp == NULL || deactmp == NULL)
+ goto fail;
+
+ ASSERT(ill->ill_ardeact_mp == NULL);
+ ill->ill_ardeact_mp = deactmp;
+ putnext(illg->ig_ipmp_ill->ill_rq, actmp);
+ }
+
+ if (list_is_empty(&illg->ig_actif)) {
+ /*
+ * Now that we have an active ill, nominate it for multicast
+ * and broadcast duties. Do this before ipmp_ill_bind_ipif()
+ * since that may need to send multicast packets (e.g., IPv6
+ * neighbor discovery probes).
+ */
+ ipmp_illgrp_set_cast(illg, ill);
+
+ /*
+ * This is the first active ill in the illgrp -- add 'em all.
+ * We can access/walk ig_ipmp_ill's ipif list since we're
+ * writer on its IPSQ as well.
+ * Only data addresses that are up are bound.
+ */
+ ipif = illg->ig_ipmp_ill->ill_ipif;
+ for (; ipif != NULL; ipif = ipif->ipif_next)
+ if (ipmp_ipif_is_up_dataaddr(ipif))
+ ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
+ } else {
+ /*
+ * Redistribute the addresses by moving them from the ill with
+ * the most addresses until the ill being activated is at the
+ * same level as the rest of the ills.
+ * The loop stops once `ill' is within one address of the
+ * fullest ill, so each ipmp_ill_unbind_ipif() call below is
+ * made on an ill that has at least two addresses bound.
+ */
+ for (;;) {
+ maxill = ipmp_illgrp_max_ill(illg);
+ ASSERT(maxill != NULL);
+ if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
+ break;
+ ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
+ ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
+ }
+
+ /*
+ * TODO: explore whether it's advantageous to flush IRE_CACHE
+ * bindings to force existing connections to be redistributed
+ * to the new ill.
+ */
+ }
+
+ /*
+ * Put the interface in the active list.
+ * The list, its count, and ig_next_ill are updated together with
+ * ipmp_lock held as writer.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ list_insert_tail(&illg->ig_actif, ill);
+ illg->ig_nactif++;
+ illg->ig_next_ill = ill;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ /*
+ * Refresh ARP entries to use `ill', if need be.
+ */
+ if (!ill->ill_isv6)
+ ipmp_illgrp_refresh_arpent(illg);
+
+ /*
+ * Finally, mark the group link up, if necessary.
+ * The link-down message allocated above is kept in gr_linkdownmp
+ * for use when the last active interface is deactivated.
+ */
+ if (grp->gr_nactif++ == 0) {
+ ASSERT(grp->gr_linkdownmp == NULL);
+ grp->gr_linkdownmp = linkdownmp;
+ put(illg->ig_ipmp_ill->ill_rq, linkupmp);
+ }
+ return (B_TRUE);
+fail:
+ freemsg(actmp);
+ freemsg(deactmp);
+ freemsg(linkupmp);
+ freemsg(linkdownmp);
+ return (B_FALSE);
+}
+
+/*
+ * Deactivate `ill' so it will not be used to send or receive data traffic.
+ *
+ * No messages are allocated here: the ARP deactivate and group link-down
+ * messages that get sent are taken from ill_ardeact_mp and gr_linkdownmp,
+ * where ipmp_ill_activate() stored them.
+ *
+ * Caller must be writer on `ill', which must be under IPMP.
+ */
+static void
+ipmp_ill_deactivate(ill_t *ill)
+{
+ ill_t *minill;
+ ipif_t *ipif, *ubnextipif, *ubheadipif = NULL;
+ mblk_t *mp;
+ ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
+ ipmp_illgrp_t *illg = ill->ill_grp;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(IS_UNDER_IPMP(ill));
+
+ /*
+ * Delete IRE_CACHE entries tied to this ill before they become stale.
+ */
+ ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
+ ill_stq_cache_delete, ill, ill);
+
+ /*
+ * Pull the interface out of the active list.
+ * ig_next_ill is reset to the head of what remains of the list (NULL
+ * if `ill' was the last active ill).
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ list_remove(&illg->ig_actif, ill);
+ illg->ig_nactif--;
+ illg->ig_next_ill = list_head(&illg->ig_actif);
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ /*
+ * If the ill that's being deactivated had been nominated for
+ * multicast/broadcast, nominate a new one.
+ */
+ if (ill == illg->ig_cast_ill)
+ ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));
+
+ /*
+ * Unbind all of the ipifs bound to this ill, and save 'em in a list;
+ * we'll rebind them after we tell the resolver the ill is no longer
+ * active. We must do things in this order or the resolver could
+ * accidentally rebind to the ill we're trying to remove if multiple
+ * ills in the group have the same hardware address (which is
+ * unsupported, but shouldn't lead to a wedged machine).
+ *
+ * The temporary list is threaded through ipif_bound_next, which
+ * ipmp_ill_unbind_ipif() has just cleared on each returned ipif.
+ */
+ while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
+ ipif->ipif_bound_next = ubheadipif;
+ ubheadipif = ipif;
+ }
+
+ if (!ill->ill_isv6) {
+ /*
+ * Tell ARP `ill' is no longer active in the group.
+ * The message was preallocated by ipmp_ill_activate().
+ */
+ mp = ill->ill_ardeact_mp;
+ ill->ill_ardeact_mp = NULL;
+ ASSERT(mp != NULL);
+ putnext(illg->ig_ipmp_ill->ill_rq, mp);
+
+ /*
+ * Refresh any ARP entries that had been using `ill'.
+ */
+ ipmp_illgrp_refresh_arpent(illg);
+ }
+
+ /*
+ * Rebind each ipif from the deactivated ill to the active ill with
+ * the fewest ipifs. If there are no active ills, the ipifs will
+ * remain unbound.
+ * ipif_bound_next is cleared before each rebind because
+ * ipmp_ill_bind_ipif() asserts that it is NULL.
+ */
+ for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
+ ubnextipif = ipif->ipif_bound_next;
+ ipif->ipif_bound_next = NULL;
+
+ if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
+ ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
+ }
+
+ /*
+ * Finally, mark the group link down, if necessary.
+ * This happens when the group-wide active count drops to zero; the
+ * message sent is the one saved in gr_linkdownmp.
+ */
+ if (--grp->gr_nactif == 0) {
+ mp = grp->gr_linkdownmp;
+ grp->gr_linkdownmp = NULL;
+ ASSERT(mp != NULL);
+ put(illg->ig_ipmp_ill->ill_rq, mp);
+ }
+}
+
+/*
+ * Generate the routing socket messages that make `ill' and its up addresses
+ * "appear" (`cmd' is RTM_ADD) or "disappear" (`cmd' is RTM_DELETE) from the
+ * point of view of routing socket listeners that are not IPMP-aware.  For
+ * RTM_ADD the interface-up message precedes the per-address messages; for
+ * RTM_DELETE the interface-down message follows them.  Caller must be
+ * writer on `ill'.
+ */
+static void
+ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
+{
+	ipif_t	*cur;
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);
+
+	/*
+	 * An ill with no up addresses needs no messages at all:
+	 *
+	 *  - For RTM_DELETE we would be hiding the interface and its
+	 *    addresses by making them look down, but they already are.
+	 *
+	 *  - For RTM_ADD we would be announcing that previously-hidden up
+	 *    addresses (and the interface) are back, but none are up.
+	 */
+	if (ill->ill_ipif_up_count == 0)
+		return;
+
+	if (cmd == RTM_ADD)
+		ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);
+
+	cur = ill->ill_ipif;
+	while (cur != NULL) {
+		if (cur->ipif_flags & IPIF_UP)
+			ip_rts_newaddrmsg(cmd, 0, cur, RTSQ_NORMAL);
+		cur = cur->ipif_next;
+	}
+
+	if (cmd == RTM_DELETE)
+		ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
+}
+
+/*
+ * Bind IPMP address `ipif' to underlying ill `ill': push it onto the head
+ * of the ill's bound list, bump the ill's bound count, and point the ipif
+ * at the ill.  `act' selects the resolver notification: Res_act_none means
+ * the resolver is not told; any other value is handed to the resolver to
+ * distinguish an initial bringup of `ipif' from a rebind to another ill.
+ * ipif_bound records whether the resolver step (if any) succeeded.
+ */
+static void
+ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
+{
+	ip_stack_t	*ipst = ill->ill_ipst;
+	int		error = 0;
+
+	ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
+	ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
+	ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
+	ASSERT(ipif->ipif_bound_ill == NULL);
+	ASSERT(ipif->ipif_bound_next == NULL);
+
+	/* Link `ipif' in at the head of the ill's bound list. */
+	ipif->ipif_bound_next = ill->ill_bound_ipif;
+	ill->ill_bound_ipif = ipif;
+	ill->ill_bound_cnt++;
+
+	/* ipif_bound_ill is changed only with ipmp_lock held as writer. */
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+	ipif->ipif_bound_ill = ill;
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	/*
+	 * Let ARP/NDP know about the new mapping, if asked to.  Note that
+	 * ipif_resolver_up() cannot fail for non-XRESOLV IPv6 ills.
+	 */
+	if (act != Res_act_none) {
+		if (!ill->ill_isv6) {
+			error = ipif_resolver_up(ipif, act);
+		} else {
+			VERIFY(ipif_resolver_up(ipif, act) == 0);
+			error = ipif_ndp_up(ipif, act == Res_act_initial);
+		}
+
+		/*
+		 * ipif_ndp_up() never returns EINPROGRESS, and
+		 * ipif_resolver_up() only does so when the associated ill
+		 * is not up, so EINPROGRESS should be impossible here.  The
+		 * design is simpler for being able to rely on that.
+		 */
+		ASSERT(error != EINPROGRESS);
+	}
+
+	/* TODO: retry binding on failure? when? */
+	ipif->ipif_bound = (error == 0);
+}
+
+/*
+ * Unbind IPMP address `ipif' from underlying ill `ill' and return it.  When
+ * `ipif' is NULL, the ipif at the head of the ill's bound list is unbound
+ * instead; if the ill has nothing bound, NULL is returned.  When `notifyres'
+ * is B_TRUE (and the ipif was marked bound), the resolver is told about the
+ * change.  Caller must be writer on `ill', which must be under IPMP.
+ */
+static ipif_t *
+ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
+{
+	ill_t		*ipmp_ill;
+	ipif_t		**linkp;
+	ip_stack_t	*ipst = ill->ill_ipst;
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(IS_UNDER_IPMP(ill));
+
+	ipmp_ill = ill->ill_grp->ig_ipmp_ill;
+
+	/*
+	 * With no ipif specified, pick the head of the bound list.
+	 */
+	if (ipif == NULL) {
+		ipif = ill->ill_bound_ipif;
+		if (ipif == NULL) {
+			ASSERT(ill->ill_bound_cnt == 0);
+			return (NULL);
+		}
+	}
+
+	ASSERT(IAM_WRITER_IPIF(ipif));
+	ASSERT(IS_IPMP(ipif->ipif_ill));
+	ASSERT(ipif->ipif_bound_ill == ill);
+	ASSERT(ill->ill_bound_cnt > 0);
+
+	/*
+	 * Sever the ipif's pointer to the ill (under ipmp_lock) and drop
+	 * the ill's bound count.
+	 */
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+	ipif->ipif_bound_ill = NULL;
+	rw_exit(&ipst->ips_ipmp_lock);
+	ill->ill_bound_cnt--;
+
+	/*
+	 * Walk to the link that points at `ipif' (either the list head or
+	 * some predecessor's ipif_bound_next) and splice `ipif' out.
+	 */
+	linkp = &ill->ill_bound_ipif;
+	while (*linkp != ipif)
+		linkp = &(*linkp)->ipif_bound_next;
+	*linkp = ipif->ipif_bound_next;
+	ipif->ipif_bound_next = NULL;
+
+	/*
+	 * Tell the resolver, if requested and if the binding had actually
+	 * been established with it.
+	 */
+	if (notifyres && ipif->ipif_bound) {
+		if (!ill->ill_isv6) {
+			ASSERT(ipif->ipif_arp_del_mp != NULL);
+			putnext(ipmp_ill->ill_rq, ipif->ipif_arp_del_mp);
+			ipif->ipif_arp_del_mp = NULL;
+		} else {
+			ipif_ndp_down(ipif);
+		}
+	}
+	ipif->ipif_bound = B_FALSE;
+
+	return (ipif);
+}
+
+/*
+ * Report whether `ill' should be considered active: it must have at least
+ * one up ipif, and its phyint must not be offline, inactive, or failed.
+ *
+ * A caller that is not inside the IPSQ must hold both ill_lock and
+ * phyint_lock.  Because ipmp_ill_try_refresh_active() uses this same check
+ * to decide whether to activate or deactivate an ill, other consumers can
+ * race and see the new answer before IPMP has actually carried out the
+ * activation/deactivation.  That should be safe: at worst, something like
+ * ire_atomic_start() prematurely deletes an IRE that ipmp_ill_deactivate()
+ * would have cleaned up anyway.
+ */
+boolean_t
+ipmp_ill_is_active(ill_t *ill)
+{
+	phyint_t	*phyi = ill->ill_phyint;
+
+	ASSERT(IS_UNDER_IPMP(ill));
+	ASSERT(IAM_WRITER_ILL(ill) ||
+	    (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));
+
+	if (ill->ill_ipif_up_count == 0)
+		return (B_FALSE);
+
+	/*
+	 * PHYI_RUNNING is deliberately not consulted: in.mpathd is relied
+	 * upon to set PHYI_FAILED whenever PHYI_RUNNING is cleared.  That
+	 * keeps the link flapping logic solely in in.mpathd and lets us
+	 * ignore changes to PHYI_RUNNING.
+	 */
+	if (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * IRE walker callback: if `ire' belongs to an ipif on the ill passed as
+ * `ill_arg' and is a host, prefix, default, cache, or interface (resolver
+ * or no-resolver) IRE, set IRE_MARK_TESTHIDDEN on it.  IREs of any other
+ * type are left alone.
+ */
+static void
+ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
+{
+	ill_t		*ill = (ill_t *)ill_arg;
+	boolean_t	hide;
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(!IS_IPMP(ill));
+
+	if (ire->ire_ipif->ipif_ill != ill)
+		return;
+
+	switch (ire->ire_type) {
+	case IRE_HOST:
+	case IRE_PREFIX:
+	case IRE_DEFAULT:
+	case IRE_CACHE:
+	case IRE_IF_RESOLVER:
+	case IRE_IF_NORESOLVER:
+		hide = B_TRUE;
+		break;
+	default:
+		hide = B_FALSE;
+		break;
+	}
+
+	if (hide) {
+		DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
+		ire->ire_marks |= IRE_MARK_TESTHIDDEN;
+	}
+}
+
+/*
+ * IRE walker callback: if `ire' belongs to an ipif on the ill passed as
+ * `ill_arg', clear IRE_MARK_TESTHIDDEN on it (regardless of IRE type).
+ */
+static void
+ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
+{
+	ill_t	*ill = (ill_t *)ill_arg;
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(!IS_IPMP(ill));
+
+	if (ire->ire_ipif->ipif_ill != ill)
+		return;
+
+	DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
+	ire->ire_marks &= ~IRE_MARK_TESTHIDDEN;
+}
+
+/*
+ * Return a held pointer to the IPMP ill for underlying interface `ill', or
+ * NULL if one doesn't exist (or it cannot currently be looked up).  The
+ * caller is responsible for releasing the hold.
+ *
+ * (Unfortunately, this function needs to take an underlying ill rather than
+ * an ipmp_illgrp_t because an underlying ill's ill_grp pointer may become
+ * stale when not under an IPSQ and not holding ipmp_lock.)  Caller need not
+ * be inside the IPSQ.
+ */
+ill_t *
+ipmp_ill_hold_ipmp_ill(ill_t *ill)
+{
+	ip_stack_t	*ipst = ill->ill_ipst;
+	ipmp_illgrp_t	*illg;
+	ill_t		*ipmp_ill = NULL;
+
+	ASSERT(!IS_IPMP(ill));
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+	illg = ill->ill_grp;
+	if (illg != NULL && ILL_CAN_LOOKUP(illg->ig_ipmp_ill)) {
+		/*
+		 * Capture the IPMP ill while ipmp_lock is still held so
+		 * that `illg' is never dereferenced after the lock has been
+		 * dropped.
+		 */
+		ipmp_ill = illg->ig_ipmp_ill;
+		ill_refhold(ipmp_ill);
+	}
+	/*
+	 * If nothing was held, assume `ill' was removed from the illgrp in
+	 * the meantime.
+	 */
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	return (ipmp_ill);
+}
+
+/*
+ * Look up the interface index of the IPMP interface that underlying
+ * interface `ill' is grouped under.  Returns zero if the ill's phyint is
+ * not in an IPMP group.  Caller need not be inside the IPSQ; ipmp_lock is
+ * taken as reader for the lookup.
+ */
+uint_t
+ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
+{
+	ip_stack_t	*ipst = ill->ill_ipst;
+	ipmp_grp_t	*grp;
+	uint_t		index;
+
+	ASSERT(!IS_IPMP(ill));
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+	grp = ill->ill_phyint->phyint_grp;
+	index = (grp == NULL) ? 0 : grp->gr_phyint->phyint_ifindex;
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	return (index);
+}
+
+/*
+ * Add phyint `phyi' to IPMP group `grp'.  Routing socket listeners are told
+ * that the phyint's ills and addresses have gone away, a baseline snapshot
+ * of the phyint's kstats is taken, and the phyint's IPSQ is set to switch
+ * to the group's xop at the next ipsq_exit().  Caller must be writer on the
+ * phyint's IPSQ, and the phyint must have at least one ill.
+ */
+void
+ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
+{
+	ipsq_t		*ipsq = phyi->phyint_ipsq;
+	ipsq_t		*grp_ipsq = grp->gr_phyint->phyint_ipsq;
+	ip_stack_t	*ipst = PHYINT_TO_IPST(phyi);
+	ill_t		*ill;
+
+	ASSERT(IAM_WRITER_IPSQ(ipsq));
+	ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL);
+
+	/*
+	 * Announce on the routing socket that the ills and ipifs of this
+	 * phyint have vanished.  `ill' ends up naming the last ill visited;
+	 * it supplies the mactype below.
+	 */
+	if ((ill = phyi->phyint_illv4) != NULL)
+		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
+
+	if (phyi->phyint_illv6 != NULL) {
+		ill = phyi->phyint_illv6;
+		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
+	}
+
+	/*
+	 * Record the phyint's kstats as they stand now, to serve as the
+	 * baseline for its time in the group.
+	 */
+	ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+
+	/*
+	 * The first phyint to join determines the group's mactype; every
+	 * later one is expected to match it.
+	 */
+	phyi->phyint_grp = grp;
+	grp->gr_nif++;
+	if (grp->gr_nif == 1)
+		grp->gr_mactype = ill->ill_mactype;
+	else
+		ASSERT(grp->gr_mactype == ill->ill_mactype);
+
+	/*
+	 * We're in the group now, so ask for a switch to the group's xop
+	 * when we ipsq_exit().  From then on, all operations are exclusive
+	 * on the group xop until ipmp_phyint_leave_grp() is called.
+	 */
+	ASSERT(ipsq->ipsq_swxop == NULL);
+	ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
+	ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;
+
+	rw_exit(&ipst->ips_ipmp_lock);
+}
+
+/*
+ * Take phyint `phyi' out of the IPMP group it is currently in.  Any of its
+ * ills still in an illgrp are removed first, routing socket listeners are
+ * told that the phyint's ills and addresses are back, the kstats the phyint
+ * accumulated while in the group are folded into the group's baseline, and
+ * the phyint's IPSQ is set to switch back to its own xop at the next
+ * ipsq_exit().  Caller must be writer on the phyint's IPSQ.
+ */
+void
+ipmp_phyint_leave_grp(phyint_t *phyi)
+{
+	uint_t		stat;
+	ill_t		*ill;
+	ipsq_t		*ipsq = phyi->phyint_ipsq;
+	ip_stack_t	*ipst = PHYINT_TO_IPST(phyi);
+	uint64_t	delta;
+	uint64_t	nowstats[IPMP_KSTAT_MAX];
+
+	ASSERT(IAM_WRITER_IPSQ(ipsq));
+
+	/*
+	 * Evict whichever of the phyint's ills are still in an illgrp.
+	 */
+	ill = phyi->phyint_illv4;
+	if (ill != NULL && IS_UNDER_IPMP(ill))
+		ipmp_ill_leave_illgrp(ill);
+	ill = phyi->phyint_illv6;
+	if (ill != NULL && IS_UNDER_IPMP(ill))
+		ipmp_ill_leave_illgrp(ill);
+
+	/*
+	 * Announce on the routing socket that the ills and ipifs of this
+	 * phyint have reappeared.
+	 */
+	if ((ill = phyi->phyint_illv4) != NULL)
+		ipmp_ill_rtsaddrmsg(ill, RTM_ADD);
+	if ((ill = phyi->phyint_illv6) != NULL)
+		ipmp_ill_rtsaddrmsg(ill, RTM_ADD);
+
+	/*
+	 * Work out how much each kstat grew while the phyint was in the
+	 * group (current value minus the baseline taken at join time), and
+	 * credit that to the group's baseline.
+	 */
+	ipmp_phyint_get_kstats(phyi, nowstats);
+	for (stat = 0; stat < IPMP_KSTAT_MAX; stat++) {
+		delta = nowstats[stat] - phyi->phyint_kstats0[stat];
+		atomic_add_64(&phyi->phyint_grp->gr_kstats0[stat], delta);
+	}
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+
+	phyi->phyint_grp->gr_nif--;
+	phyi->phyint_grp = NULL;
+
+	/*
+	 * Last step of leaving: ask for a switch back to our IPSQ's own xop
+	 * when we ipsq_exit().
+	 */
+	ASSERT(ipsq->ipsq_swxop == NULL);
+	ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;
+
+	rw_exit(&ipst->ips_ipmp_lock);
+}
+
+/*
+ * Fill `kstats' with the current values of the IPMP-related link kstats for
+ * `phyi', in the order given by the ipmp_kstats table.  `kstats' must have
+ * room for at least IPMP_KSTAT_MAX elements.  Any statistic that cannot be
+ * found (including when the link's kstat itself is missing or is not a
+ * named kstat) is reported as zero.
+ */
+static void
+ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
+{
+	uint_t		stat, idx;
+	const char	*wanted;
+	kstat_t		*ksp;
+	kstat_named_t	*named;
+
+	bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
+
+	/*
+	 * NOTE: using ALL_ZONES here assumes that a given link name occurs
+	 * at most once on a given system (safe for now).
+	 */
+	ksp = kstat_hold_byname("link", 0, phyi->phyint_name, ALL_ZONES);
+	if (ksp == NULL)
+		return;
+
+	KSTAT_ENTER(ksp);
+
+	if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
+		/*
+		 * Have the provider refresh the kstat before we read it.
+		 */
+		(void) KSTAT_UPDATE(ksp, KSTAT_READ);
+
+		named = KSTAT_NAMED_PTR(ksp);
+		for (stat = 0; stat < IPMP_KSTAT_MAX; stat++) {
+			wanted = ipmp_kstats[stat].name;
+			kstats[stat] = 0;
+
+			/*
+			 * Find the first named entry called `wanted'.
+			 */
+			idx = 0;
+			while (idx < ksp->ks_ndata &&
+			    strcmp(named[idx].name, wanted) != 0)
+				idx++;
+			if (idx >= ksp->ks_ndata)
+				continue;
+
+			/*
+			 * Widen the value according to its declared type;
+			 * entries of any other type leave the zero in place.
+			 */
+			switch (named[idx].data_type) {
+			case KSTAT_DATA_INT32:
+			case KSTAT_DATA_UINT32:
+				kstats[stat] = named[idx].value.ui32;
+				break;
+#ifdef	_LP64
+			case KSTAT_DATA_LONG:
+			case KSTAT_DATA_ULONG:
+				kstats[stat] = named[idx].value.ul;
+				break;
+#endif
+			case KSTAT_DATA_INT64:
+			case KSTAT_DATA_UINT64:
+				kstats[stat] = named[idx].value.ui64;
+				break;
+			}
+		}
+	}
+
+	KSTAT_EXIT(ksp);
+	kstat_rele(ksp);
+}
+
+/*
+ * Refresh the active state of each ill (IPv4 first, then IPv6) that exists
+ * on `phyi'.
+ */
+void
+ipmp_phyint_refresh_active(phyint_t *phyi)
+{
+	ill_t	*ill;
+
+	if ((ill = phyi->phyint_illv4) != NULL)
+		ipmp_ill_refresh_active(ill);
+	if ((ill = phyi->phyint_illv6) != NULL)
+		ipmp_ill_refresh_active(ill);
+}
+
+/*
+ * Return a held pointer to the underlying ill that IPMP address `ipif' is
+ * bound to, or NULL if it is not bound to one (or that ill cannot currently
+ * be looked up).  Caller need not be inside the IPSQ; ipmp_lock is taken as
+ * reader for the lookup.
+ */
+ill_t *
+ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
+{
+	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
+	ill_t		*boundill;
+	ill_t		*held = NULL;
+
+	ASSERT(IS_IPMP(ipif->ipif_ill));
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+	boundill = ipif->ipif_bound_ill;
+	if (boundill != NULL && ILL_CAN_LOOKUP(boundill)) {
+		ill_refhold(boundill);
+		held = boundill;
+	}
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	return (held);
+}
+
+/*
+ * Return the underlying ill that IPMP address `ipif' is bound to, or NULL
+ * if it is not bound.  No hold is taken on the ill and no lock is acquired:
+ * the caller must be inside the IPSQ (writer on the ipif's ill).
+ */
+ill_t *
+ipmp_ipif_bound_ill(const ipif_t *ipif)
+{
+	ill_t	*boundill;
+
+	ASSERT(IAM_WRITER_ILL(ipif->ipif_ill));
+	ASSERT(IS_IPMP(ipif->ipif_ill));
+
+	boundill = ipif->ipif_bound_ill;
+	return (boundill);
+}
+
+/*
+ * Check if `ipif' is a "stub": a placeholder address that is not in use.
+ * That is the case when the ipif is not IPIF_UP and its local address is
+ * the unspecified address (IPv6) or INADDR_ANY (IPv4).
+ */
+boolean_t
+ipmp_ipif_is_stubaddr(const ipif_t *ipif)
+{
+	boolean_t	unset;
+
+	if (ipif->ipif_flags & IPIF_UP)
+		return (B_FALSE);
+
+	if (ipif->ipif_ill->ill_isv6)
+		unset = IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr);
+	else
+		unset = (ipif->ipif_lcl_addr == INADDR_ANY);
+
+	return (unset);
+}
+
+/*
+ * Check if `ipif' is an IPMP data address.  That is the case when the ipif
+ * is not marked IPIF_NOFAILOVER and its local address is something other
+ * than the unspecified address (IPv6) or INADDR_ANY (IPv4).
+ */
+boolean_t
+ipmp_ipif_is_dataaddr(const ipif_t *ipif)
+{
+	boolean_t	isset;
+
+	if (ipif->ipif_flags & IPIF_NOFAILOVER)
+		return (B_FALSE);
+
+	if (ipif->ipif_ill->ill_isv6)
+		isset = !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr);
+	else
+		isset = (ipif->ipif_lcl_addr != INADDR_ANY);
+
+	return (isset);
+}
+
+/*
+ * Check if `ipif' is an IPMP data address (see ipmp_ipif_is_dataaddr())
+ * that is also IPIF_UP.
+ */
+static boolean_t
+ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
+{
+	if (!ipmp_ipif_is_dataaddr(ipif))
+		return (B_FALSE);
+
+	return ((ipif->ipif_flags & IPIF_UP) ? B_TRUE : B_FALSE);
+}