diff options
Diffstat (limited to 'usr/src/uts/common/inet/ip/ipmp.c')
-rw-r--r-- | usr/src/uts/common/inet/ip/ipmp.c | 2201 |
1 files changed, 2201 insertions, 0 deletions
diff --git a/usr/src/uts/common/inet/ip/ipmp.c b/usr/src/uts/common/inet/ip/ipmp.c new file mode 100644 index 0000000000..b8f3768834 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ipmp.c @@ -0,0 +1,2201 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <inet/arp.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_multi.h> +#include <inet/ip_rts.h> +#include <inet/mi.h> +#include <net/if_types.h> +#include <sys/dlpi.h> +#include <sys/kmem.h> +#include <sys/modhash.h> +#include <sys/sdt.h> +#include <sys/strsun.h> +#include <sys/sunddi.h> +#include <sys/types.h> + +/* + * Convenience macros for getting the ip_stack_t associated with an + * ipmp_illgrp_t or ipmp_grp_t. + */ +#define IPMP_GRP_TO_IPST(grp) PHYINT_TO_IPST((grp)->gr_phyint) +#define IPMP_ILLGRP_TO_IPST(illg) ((illg)->ig_ipmp_ill->ill_ipst) + +/* + * Assorted constants that aren't important enough to be tunable. + */ +#define IPMP_GRP_HASH_SIZE 64 +#define IPMP_ILL_REFRESH_TIMEOUT 120 /* seconds */ + +/* + * Templates for IPMP ARP messages. 
+ */ +static const arie_t ipmp_aract_template = { + AR_IPMP_ACTIVATE, + sizeof (arie_t), /* Name offset */ + sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ +}; + +static const arie_t ipmp_ardeact_template = { + AR_IPMP_DEACTIVATE, + sizeof (arie_t), /* Name offset */ + sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ +}; + +/* + * IPMP meta-interface kstats (based on those in PSARC/1997/198). + */ +static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = { + { "obytes", KSTAT_DATA_UINT32 }, + { "obytes64", KSTAT_DATA_UINT64 }, + { "rbytes", KSTAT_DATA_UINT32 }, + { "rbytes64", KSTAT_DATA_UINT64 }, + { "opackets", KSTAT_DATA_UINT32 }, + { "opackets64", KSTAT_DATA_UINT64 }, + { "oerrors", KSTAT_DATA_UINT32 }, + { "ipackets", KSTAT_DATA_UINT32 }, + { "ipackets64", KSTAT_DATA_UINT64 }, + { "ierrors", KSTAT_DATA_UINT32 }, + { "multircv", KSTAT_DATA_UINT32 }, + { "multixmt", KSTAT_DATA_UINT32 }, + { "brdcstrcv", KSTAT_DATA_UINT32 }, + { "brdcstxmt", KSTAT_DATA_UINT32 }, + { "link_up", KSTAT_DATA_UINT32 } +}; + +static void ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t); +static int ipmp_grp_create_kstats(ipmp_grp_t *); +static int ipmp_grp_update_kstats(kstat_t *, int); +static void ipmp_grp_destroy_kstats(ipmp_grp_t *); +static ill_t *ipmp_illgrp_min_ill(ipmp_illgrp_t *); +static ill_t *ipmp_illgrp_max_ill(ipmp_illgrp_t *); +static void ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *); +static void ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t); +static boolean_t ipmp_ill_activate(ill_t *); +static void ipmp_ill_deactivate(ill_t *); +static void ipmp_ill_ire_mark_testhidden(ire_t *, char *); +static void ipmp_ill_ire_clear_testhidden(ire_t *, char *); +static void ipmp_ill_refresh_active_timer_start(ill_t *); +static void ipmp_ill_rtsaddrmsg(ill_t *, int); +static void ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action); +static ipif_t *ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t); +static void ipmp_phyint_get_kstats(phyint_t *, uint64_t 
*); +static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *); + +/* + * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init(). + */ +void +ipmp_init(ip_stack_t *ipst) +{ + ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash", + IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, + mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); + rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0); +} + +/* + * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini(). + */ +void +ipmp_destroy(ip_stack_t *ipst) +{ + mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash); + rw_destroy(&ipst->ips_ipmp_lock); +} + +/* + * Create an IPMP group named `grname', associate it with IPMP phyint `phyi', + * and add it to the hash. On success, return a pointer to the created group. + * Caller must ensure `grname' is not yet in the hash. Assumes that the IPMP + * meta-interface associated with the group also has the same name (but they + * may differ later via ipmp_grp_rename()). + */ +ipmp_grp_t * +ipmp_grp_create(const char *grname, phyint_t *phyi) +{ + ipmp_grp_t *grp; + ip_stack_t *ipst = PHYINT_TO_IPST(phyi); + mod_hash_hndl_t mh; + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL) + return (NULL); + + (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); + (void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname)); + + /* + * Cache the group's phyint. This is safe since a phyint_t will + * outlive its ipmp_grp_t. + */ + grp->gr_phyint = phyi; + + /* + * Create IPMP group kstats. + */ + if (ipmp_grp_create_kstats(grp) != 0) { + kmem_free(grp, sizeof (ipmp_grp_t)); + return (NULL); + } + + /* + * Insert the group into the hash. 
+ */ + if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) { + ipmp_grp_destroy_kstats(grp); + kmem_free(grp, sizeof (ipmp_grp_t)); + return (NULL); + } + ipmp_grp_insert(grp, mh); + + return (grp); +} + +/* + * Create IPMP kstat structures for `grp'. Return an errno upon failure. + */ +static int +ipmp_grp_create_kstats(ipmp_grp_t *grp) +{ + kstat_t *ksp; + netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; + + ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net", + KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id); + if (ksp == NULL) + return (ENOMEM); + + ksp->ks_update = ipmp_grp_update_kstats; + ksp->ks_private = grp; + bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats)); + + kstat_install(ksp); + grp->gr_ksp = ksp; + return (0); +} + +/* + * Update the IPMP kstats tracked by `ksp'; called by the kstats framework. + */ +static int +ipmp_grp_update_kstats(kstat_t *ksp, int rw) +{ + uint_t i; + kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); + ipmp_grp_t *grp = ksp->ks_private; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + ipsq_t *ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq; + phyint_t *phyi; + uint64_t phyi_kstats[IPMP_KSTAT_MAX]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Start with the group's baseline values. + */ + for (i = 0; i < IPMP_KSTAT_MAX; i++) { + if (kn[i].data_type == KSTAT_DATA_UINT32) { + kn[i].value.ui32 = grp->gr_kstats0[i]; + } else { + ASSERT(kn[i].data_type == KSTAT_DATA_UINT64); + kn[i].value.ui64 = grp->gr_kstats0[i]; + } + } + + /* + * Add in the stats of each phyint currently in the group. Since we + * don't directly track the phyints in a group, we cheat by walking + * the IPSQ set under ill_g_lock. (The IPSQ list cannot change while + * ill_g_lock is held.) 
+ */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + ipsq = grp_ipsq->ipsq_next; + for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) { + phyi = ipsq->ipsq_phyint; + + /* + * If a phyint in a group is being unplumbed, it's possible + * that ill_glist_delete() -> phyint_free() already freed the + * phyint (and set ipsq_phyint to NULL), but the unplumb + * operation has yet to complete (and thus ipsq_dq() has yet + * to remove the phyint's IPSQ from the group IPSQ's phyint + * list). We skip those phyints here (note that their kstats + * have already been added to gr_kstats0[]). + */ + if (phyi == NULL) + continue; + + ipmp_phyint_get_kstats(phyi, phyi_kstats); + + for (i = 0; i < IPMP_KSTAT_MAX; i++) { + phyi_kstats[i] -= phyi->phyint_kstats0[i]; + if (kn[i].data_type == KSTAT_DATA_UINT32) + kn[i].value.ui32 += phyi_kstats[i]; + else + kn[i].value.ui64 += phyi_kstats[i]; + } + } + + kn[IPMP_KSTAT_LINK_UP].value.ui32 = + (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0; + + rw_exit(&ipst->ips_ill_g_lock); + return (0); +} + +/* + * Destroy IPMP kstat structures for `grp'. + */ +static void +ipmp_grp_destroy_kstats(ipmp_grp_t *grp) +{ + netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; + + kstat_delete_netstack(grp->gr_ksp, id); + bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0)); + grp->gr_ksp = NULL; +} + +/* + * Look up an IPMP group named `grname' on IP stack `ipst'. Return NULL if it + * does not exist. + */ +ipmp_grp_t * +ipmp_grp_lookup(const char *grname, ip_stack_t *ipst) +{ + ipmp_grp_t *grp; + + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, + (mod_hash_val_t *)&grp) == 0) + return (grp); + + return (NULL); +} + +/* + * Place information about group `grp' into `lifgr'. 
+ */ +void +ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr) +{ + ill_t *ill; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + lifgr->gi_v4 = (grp->gr_v4 != NULL); + lifgr->gi_v6 = (grp->gr_v6 != NULL); + lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4; + lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6; + lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP; + (void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ); + lifgr->gi_m4ifname[0] = '\0'; + lifgr->gi_m6ifname[0] = '\0'; + lifgr->gi_bcifname[0] = '\0'; + + if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) { + (void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ); + (void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ); + } + + if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL) + (void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ); +} + +/* + * Insert `grp' into the hash using the reserved hash entry `mh'. + * Caller must ensure `grp' is not yet in the hash. + */ +static void +ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh) +{ + int err; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + /* + * Since grp->gr_name will exist at least as long as `grp' is in the + * hash, we use it directly as the key. + */ + err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash, + (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh); + if (err != 0) { + /* + * This should never happen since `mh' was preallocated. + */ + panic("cannot insert IPMP group \"%s\" (err %d)", + grp->gr_name, err); + } +} + +/* + * Remove `grp' from the hash. Caller must ensure `grp' is in it. 
+ */ +static void +ipmp_grp_remove(ipmp_grp_t *grp) +{ + int err; + mod_hash_val_t val; + mod_hash_key_t key = (mod_hash_key_t)grp->gr_name; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val); + if (err != 0 || val != grp) { + panic("cannot remove IPMP group \"%s\" (err %d)", + grp->gr_name, err); + } +} + +/* + * Attempt to rename `grp' to new name `grname'. Return an errno if the new + * group name already exists or is invalid, or if there isn't enough memory. + */ +int +ipmp_grp_rename(ipmp_grp_t *grp, const char *grname) +{ + mod_hash_hndl_t mh; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + if (grname[0] == '\0') + return (EINVAL); + + if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, + (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND) + return (EEXIST); + + /* + * Before we remove the group from the hash, ensure we'll be able to + * re-insert it by reserving space. + */ + if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) + return (ENOMEM); + + ipmp_grp_remove(grp); + (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); + ipmp_grp_insert(grp, mh); + + return (0); +} + +/* + * Destroy `grp' and remove it from the hash. Caller must ensure `grp' is in + * the hash, and that there are no interfaces on it. + */ +void +ipmp_grp_destroy(ipmp_grp_t *grp) +{ + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + /* + * If there are still interfaces using this group, panic before things + * go really off the rails. 
+ */ + if (grp->gr_nif != 0) + panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name); + + ipmp_grp_remove(grp); + ipmp_grp_destroy_kstats(grp); + + ASSERT(grp->gr_v4 == NULL); + ASSERT(grp->gr_v6 == NULL); + ASSERT(grp->gr_nv4 == 0); + ASSERT(grp->gr_nv6 == 0); + ASSERT(grp->gr_nactif == 0); + ASSERT(grp->gr_linkdownmp == NULL); + grp->gr_phyint = NULL; + + kmem_free(grp, sizeof (ipmp_grp_t)); +} + +/* + * Check whether `ill' is suitable for inclusion into `grp', and return an + * errno describing the problem (if any). NOTE: many of these errno values + * are interpreted by ifconfig, which will take corrective action and retry + * the SIOCSLIFGROUPNAME, so please exercise care when changing them. + */ +static int +ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill) +{ + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + /* + * To sidestep complicated address migration logic in the kernel and + * to force the kernel's all-hosts multicast memberships to be blown + * away, all addresses that had been brought up must be brought back + * down prior to adding an interface to a group. (This includes + * addresses currently down due to DAD.) Once the interface has been + * added to the group, its addresses can then be brought back up, at + * which point they will be moved to the IPMP meta-interface. + * NOTE: we do this before ill_appaddr_cnt() since bringing down the + * link-local causes in.ndpd to remove its ADDRCONF'd addresses. + */ + if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) + return (EADDRINUSE); + + /* + * To avoid confusing applications by changing addresses that are + * under their control, all such control must be removed prior to + * adding an interface into a group. + */ + if (ill_appaddr_cnt(ill) != 0) + return (EADDRNOTAVAIL); + + /* + * Since PTP addresses do not share the same broadcast domain, they + * are not allowed to be in an IPMP group. 
+ */ + if (ill_ptpaddr_cnt(ill) != 0) + return (EINVAL); + + /* + * An ill must support multicast to be allowed into a group. + */ + if (!(ill->ill_flags & ILLF_MULTICAST)) + return (ENOTSUP); + + /* + * An ill must strictly be using ARP and/or ND for address + * resolution for it to be allowed into a group. + */ + if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP | ILLF_XRESOLV)) + return (ENOTSUP); + + /* + * An ill cannot also be using usesrc groups. (Although usesrc uses + * ill_g_usesrc_lock, we don't need to grab it since usesrc also does + * all its modifications as writer.) + */ + if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill)) + return (ENOTSUP); + + /* + * All ills in a group must be the same mactype. + */ + if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype) + return (EINVAL); + + return (0); +} + +/* + * Check whether `phyi' is suitable for inclusion into `grp', and return an + * errno describing the problem (if any). See comment above ipmp_grp_vet_ill() + * regarding errno values. + */ +int +ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi) +{ + int err = 0; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq)); + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + /* + * An interface cannot have address families plumbed that are not + * configured in the group. + */ + if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL || + phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL) + return (EAFNOSUPPORT); + + if (phyi->phyint_illv4 != NULL) + err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4); + if (err == 0 && phyi->phyint_illv6 != NULL) + err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6); + + return (err); +} + +/* + * Create a new illgrp on IPMP meta-interface `ill'. + */ +ipmp_illgrp_t * +ipmp_illgrp_create(ill_t *ill) +{ + uint_t mtu = ill->ill_isv6 ? 
IPV6_MIN_MTU : IP_MIN_MTU; + ipmp_illgrp_t *illg; + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(IS_IPMP(ill)); + ASSERT(ill->ill_grp == NULL); + + if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL) + return (NULL); + + list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode)); + list_create(&illg->ig_actif, sizeof (ill_t), + offsetof(ill_t, ill_actnode)); + list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t), + offsetof(ipmp_arpent_t, ia_node)); + + illg->ig_ipmp_ill = ill; + ill->ill_grp = illg; + ipmp_illgrp_set_mtu(illg, mtu); + + return (illg); +} + +/* + * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface. + */ +void +ipmp_illgrp_destroy(ipmp_illgrp_t *illg) +{ + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + ASSERT(IS_IPMP(illg->ig_ipmp_ill)); + + /* + * Verify `illg' is empty. + */ + ASSERT(illg->ig_next_ill == NULL); + ASSERT(illg->ig_cast_ill == NULL); + ASSERT(list_is_empty(&illg->ig_arpent)); + ASSERT(list_is_empty(&illg->ig_if)); + ASSERT(list_is_empty(&illg->ig_actif)); + ASSERT(illg->ig_nactif == 0); + + /* + * Destroy `illg'. + */ + illg->ig_ipmp_ill->ill_grp = NULL; + illg->ig_ipmp_ill = NULL; + list_destroy(&illg->ig_if); + list_destroy(&illg->ig_actif); + list_destroy(&illg->ig_arpent); + kmem_free(illg, sizeof (ipmp_illgrp_t)); +} + +/* + * Add `ipif' to the pool of usable data addresses on `illg' and attempt to + * bind it to an underlying ill, while keeping an even address distribution. + * If the bind is successful, return a pointer to the bound ill. + */ +ill_t * +ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) +{ + ill_t *minill; + ipmp_arpent_t *entp; + + ASSERT(IAM_WRITER_IPIF(ipif)); + ASSERT(ipmp_ipif_is_dataaddr(ipif)); + + /* + * IPMP data address mappings are internally managed by IP itself, so + * delete any existing ARP entries associated with the address. 
+ */ + if (!ipif->ipif_isv6) { + entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr); + if (entp != NULL) + ipmp_illgrp_destroy_arpent(illg, entp); + } + + if ((minill = ipmp_illgrp_min_ill(illg)) != NULL) + ipmp_ill_bind_ipif(minill, ipif, Res_act_none); + + return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL); +} + +/* + * Delete `ipif' from the pool of usable data addresses on `illg'. If it's + * bound, unbind it from the underlying ill while keeping an even address + * distribution. + */ +void +ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) +{ + ill_t *maxill, *boundill = ipif->ipif_bound_ill; + + ASSERT(IAM_WRITER_IPIF(ipif)); + + if (boundill != NULL) { + (void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE); + + maxill = ipmp_illgrp_max_ill(illg); + if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) { + ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); + ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind); + } + } +} + +/* + * Return the active ill with the greatest number of data addresses in `illg'. + */ +static ill_t * +ipmp_illgrp_max_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill, *bestill = NULL; + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + ill = list_head(&illg->ig_actif); + for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { + if (bestill == NULL || + ill->ill_bound_cnt > bestill->ill_bound_cnt) { + bestill = ill; + } + } + return (bestill); +} + +/* + * Return the active ill with the fewest number of data addresses in `illg'. 
+ */ +static ill_t * +ipmp_illgrp_min_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill, *bestill = NULL; + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + ill = list_head(&illg->ig_actif); + for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { + if (bestill == NULL || + ill->ill_bound_cnt < bestill->ill_bound_cnt) { + if (ill->ill_bound_cnt == 0) + return (ill); /* can't get better */ + bestill = ill; + } + } + return (bestill); +} + +/* + * Return a pointer to IPMP meta-interface for `illg' (which must exist). + * Since ig_ipmp_ill never changes for a given illg, no locks are needed. + */ +ill_t * +ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg) +{ + return (illg->ig_ipmp_ill); +} + +/* + * Return a pointer to the next available underlying ill in `illg', or NULL if + * one doesn't exist. Caller must be inside the IPSQ. + */ +ill_t * +ipmp_illgrp_next_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + if ((ill = illg->ig_next_ill) != NULL) { + illg->ig_next_ill = list_next(&illg->ig_actif, ill); + if (illg->ig_next_ill == NULL) + illg->ig_next_ill = list_head(&illg->ig_actif); + } + rw_exit(&ipst->ips_ipmp_lock); + + return (ill); +} + +/* + * Return a held pointer to the next available underlying ill in `illg', or + * NULL if one doesn't exist. Caller need not be inside the IPSQ. 
+ */ +ill_t * +ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill; + uint_t i; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + for (i = 0; i < illg->ig_nactif; i++) { + ill = illg->ig_next_ill; + illg->ig_next_ill = list_next(&illg->ig_actif, ill); + if (illg->ig_next_ill == NULL) + illg->ig_next_ill = list_head(&illg->ig_actif); + + if (ILL_CAN_LOOKUP(ill)) { + ill_refhold(ill); + rw_exit(&ipst->ips_ipmp_lock); + return (ill); + } + } + rw_exit(&ipst->ips_ipmp_lock); + + return (NULL); +} + +/* + * Return a pointer to the nominated multicast ill in `illg', or NULL if one + * doesn't exist. Caller must be inside the IPSQ. + */ +ill_t * +ipmp_illgrp_cast_ill(ipmp_illgrp_t *illg) +{ + /* + * Since an IPMP ill's ill_grp gets cleared during I_PUNLINK but + * this function can get called after that point, handle NULL. + */ + if (illg == NULL) + return (NULL); + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + return (illg->ig_cast_ill); +} + +/* + * Return a held pointer to the nominated multicast ill in `illg', or NULL if + * one doesn't exist. Caller need not be inside the IPSQ. + */ +ill_t * +ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg) +{ + ill_t *castill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + castill = illg->ig_cast_ill; + if (castill != NULL && ILL_CAN_LOOKUP(castill)) { + ill_refhold(castill); + rw_exit(&ipst->ips_ipmp_lock); + return (castill); + } + rw_exit(&ipst->ips_ipmp_lock); + return (NULL); +} + +/* + * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL, + * any existing nomination is removed. Caller must be inside the IPSQ. + */ +static void +ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) +{ + ill_t *ocastill = illg->ig_cast_ill; + ill_t *ipmp_ill = illg->ig_ipmp_ill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IAM_WRITER_ILL(ipmp_ill)); + + /* + * Disable old nominated ill (if any). 
+ */ + if (ocastill != NULL) { + DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *, + illg, ill_t *, ocastill); + ASSERT(ocastill->ill_nom_cast); + ocastill->ill_nom_cast = B_FALSE; + /* + * If the IPMP meta-interface is down, we never did the join, + * so we must not try to leave. + */ + if (ipmp_ill->ill_dl_up) + ill_leave_multicast(ipmp_ill); + } + + /* + * Set new nomination. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + illg->ig_cast_ill = castill; + rw_exit(&ipst->ips_ipmp_lock); + + if (ocastill != NULL) { + /* + * Delete any IREs tied to the old nomination. We must do + * this after the new castill is set and has reached global + * visibility since the datapath has not been quiesced. + */ + ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, + ill_stq_cache_delete, ocastill, ocastill); + } + + /* + * Enable new nominated ill (if any). + */ + if (castill != NULL) { + DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *, + illg, ill_t *, castill); + ASSERT(!castill->ill_nom_cast); + castill->ill_nom_cast = B_TRUE; + /* + * If the IPMP meta-interface is down, the attempt to recover + * will silently fail but ill_need_recover_multicast will be + * erroneously cleared -- so check first. + */ + if (ipmp_ill->ill_dl_up) + ill_recover_multicast(ipmp_ill); + } + + /* + * For IPv4, refresh our broadcast IREs. This needs to be done even + * if there's no new nomination since ill_refresh_bcast() still must + * update the IPMP meta-interface's broadcast IREs to point back at + * the IPMP meta-interface itself. + */ + if (!ipmp_ill->ill_isv6) + ill_refresh_bcast(ipmp_ill); +} + +/* + * Create an IPMP ARP entry and add it to the set tracked on `illg'. If an + * entry for the same IP address already exists, destroy it first. Return the + * created IPMP ARP entry, or NULL on failure. 
+ */ +ipmp_arpent_t * +ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, mblk_t *mp, boolean_t proxyarp) +{ + uchar_t *addrp; + area_t *area = (area_t *)mp->b_rptr; + ipmp_arpent_t *entp, *oentp; + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + ASSERT(area->area_proto_addr_length == sizeof (ipaddr_t)); + + if ((entp = kmem_zalloc(sizeof (ipmp_arpent_t), KM_NOSLEEP)) == NULL) + return (NULL); + + if ((mp = copyb(mp)) == NULL) { + kmem_free(entp, sizeof (ipmp_arpent_t)); + return (NULL); + } + + DB_TYPE(mp) = M_PROTO; + entp->ia_area_mp = mp; + entp->ia_proxyarp = proxyarp; + addrp = mi_offset_paramc(mp, area->area_proto_addr_offset, + sizeof (ipaddr_t)); + bcopy(addrp, &entp->ia_ipaddr, sizeof (ipaddr_t)); + + if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL) + ipmp_illgrp_destroy_arpent(illg, oentp); + + list_insert_head(&illg->ig_arpent, entp); + return (entp); +} + +/* + * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it. + */ +void +ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) +{ + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + list_remove(&illg->ig_arpent, entp); + freeb(entp->ia_area_mp); + kmem_free(entp, sizeof (ipmp_arpent_t)); +} + +/* + * Mark that ARP has been notified about the IP address on `entp'; `illg' is + * taken as a debugging aid for DTrace FBT probes. + */ +/* ARGSUSED */ +void +ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) +{ + entp->ia_notified = B_TRUE; +} + +/* + * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is + * NULL, any IPMP ARP entry is requested. Return NULL if it does not exist. 
+ */ +ipmp_arpent_t * +ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp) +{ + ipmp_arpent_t *entp = list_head(&illg->ig_arpent); + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + if (addrp == NULL) + return (entp); + + for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) + if (entp->ia_ipaddr == *addrp) + break; + return (entp); +} + +/* + * Refresh ARP entries on `illg' to be distributed across its active + * interfaces. Entries that cannot be refreshed (e.g., because there are no + * active interfaces) are marked so that subsequent calls can try again. + */ +void +ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg) +{ + ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill; + uint_t paddrlen = ipmp_ill->ill_phys_addr_length; + area_t *area; + mblk_t *area_mp; + uchar_t *physaddr; + ipmp_arpent_t *entp; + + ASSERT(IAM_WRITER_ILL(ipmp_ill)); + ASSERT(!ipmp_ill->ill_isv6); + + ill = list_head(&illg->ig_actif); + entp = list_head(&illg->ig_arpent); + for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) { + if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) { + entp->ia_notified = B_FALSE; + continue; + } + + area = (area_t *)entp->ia_area_mp->b_rptr; + ASSERT(paddrlen == ill->ill_phys_addr_length); + ASSERT(paddrlen == area->area_hw_addr_length); + physaddr = mi_offset_paramc(entp->ia_area_mp, + area->area_hw_addr_offset, paddrlen); + + /* + * If this is a proxy ARP entry, we can skip notifying ARP if + * the entry is already up-to-date. If it has changed, we + * update the entry's hardware address before notifying ARP. 
+ */ + if (entp->ia_proxyarp) { + if (bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0 && + entp->ia_notified) + continue; + bcopy(ill->ill_phys_addr, physaddr, paddrlen); + } + + if ((area_mp = copyb(entp->ia_area_mp)) == NULL) { + entp->ia_notified = B_FALSE; + continue; + } + + putnext(ipmp_ill->ill_rq, area_mp); + ipmp_illgrp_mark_arpent(illg, entp); + + if ((ill = list_next(&illg->ig_actif, ill)) == NULL) + ill = list_head(&illg->ig_actif); + } +} + +/* + * Return an interface in `illg' with the specified `physaddr', or NULL if one + * doesn't exist. Caller must hold ill_g_lock if it's not inside the IPSQ. + */ +ill_t * +ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen) +{ + ill_t *ill; + ill_t *ipmp_ill = illg->ig_ipmp_ill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock)); + + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { + if (ill->ill_phys_addr_length == paddrlen && + bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0) + return (ill); + } + return (NULL); +} + +/* + * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND. + * Caller must be inside the IPSQ unless this is initialization. + */ +static void +ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu) +{ + ill_t *ill = illg->ig_ipmp_ill; + mblk_t *mp; + + ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill)); + + /* + * If allocation fails, we have bigger problems than MTU. + */ + if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) { + illg->ig_mtu = mtu; + put(ill->ill_rq, mp); + } +} + +/* + * Recalculate the IPMP group MTU for `illg', and update its associated IPMP + * ill MTU if necessary. 
+ */ +void +ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg) +{ + ill_t *ill; + ill_t *ipmp_ill = illg->ig_ipmp_ill; + uint_t mtu = 0; + + ASSERT(IAM_WRITER_ILL(ipmp_ill)); + + /* + * Since ill_max_mtu can only change under ill_lock, we hold ill_lock + * for each ill as we iterate through the list. Any changes to the + * ill_max_mtu will also trigger an update, so even if we missed it + * this time around, the update will catch it. + */ + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { + mutex_enter(&ill->ill_lock); + if (mtu == 0 || ill->ill_max_mtu < mtu) + mtu = ill->ill_max_mtu; + mutex_exit(&ill->ill_lock); + } + + /* + * MTU must be at least the minimum MTU. + */ + mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU); + + if (illg->ig_mtu != mtu) + ipmp_illgrp_set_mtu(illg, mtu); +} + +/* + * Link illgrp `illg' to IPMP group `grp'. To simplify the caller, silently + * allow the same link to be established more than once. + */ +void +ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp) +{ + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + if (illg->ig_ipmp_ill->ill_isv6) { + ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg); + grp->gr_v6 = illg; + } else { + ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg); + grp->gr_v4 = illg; + } +} + +/* + * Unlink illgrp `illg' from its IPMP group. Return an errno if the illgrp + * cannot be unlinked (e.g., because there are still interfaces using it). 
+ */ +int +ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg) +{ + ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + if (illg->ig_ipmp_ill->ill_isv6) { + if (grp->gr_nv6 + grp->gr_pendv6 != 0) + return (EBUSY); + grp->gr_v6 = NULL; + } else { + if (grp->gr_nv4 + grp->gr_pendv4 != 0) + return (EBUSY); + grp->gr_v4 = NULL; + } + return (0); +} + +/* + * Place `ill' into `illg', and rebalance the data addresses on `illg' + * to be spread evenly across the ills now in it. Also, adjust the IPMP + * ill as necessary to account for `ill' (e.g., MTU). + */ +void +ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) +{ + ill_t *ipmp_ill; + ipif_t *ipif; + ip_stack_t *ipst = ill->ill_ipst; + + /* IS_UNDER_IPMP() requires ill_grp to be non-NULL */ + ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL); + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(ill->ill_grp == NULL); + + ipmp_ill = illg->ig_ipmp_ill; + + /* + * Account for `ill' joining the illgrp. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + if (ill->ill_isv6) + ill->ill_phyint->phyint_grp->gr_nv6++; + else + ill->ill_phyint->phyint_grp->gr_nv4++; + rw_exit(&ipst->ips_ipmp_lock); + + /* + * Ensure the ILLF_ROUTER flag remains consistent across the group. + */ + mutex_enter(&ill->ill_lock); + if (ipmp_ill->ill_flags & ILLF_ROUTER) + ill->ill_flags |= ILLF_ROUTER; + else + ill->ill_flags &= ~ILLF_ROUTER; + mutex_exit(&ill->ill_lock); + + /* + * Blow away all multicast memberships that currently exist on `ill'. + * This may seem odd, but it's consistent with the application view + * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()). 
	 */
	if (ill->ill_isv6) {
		reset_conn_ill(ill);
		reset_mrt_ill(ill);
	} else {
		/* IPv4 conn/mrt state is tracked per-ipif, not per-ill */
		ipif = ill->ill_ipif;
		for (; ipif != NULL; ipif = ipif->ipif_next) {
			reset_conn_ipif(ipif);
			reset_mrt_vif_ipif(ipif);
		}
	}
	ip_purge_allmulti(ill);

	/*
	 * Borrow the first ill's ill_phys_addr_length value for the illgrp's
	 * physical address length.  All other ills must have the same value,
	 * since they are required to all be the same mactype.  Also update
	 * the IPMP ill's MTU and CoS marking, if necessary.
	 */
	if (list_is_empty(&illg->ig_if)) {
		ASSERT(ipmp_ill->ill_phys_addr_length == 0);
		/*
		 * NOTE: we leave ill_phys_addr NULL since the IPMP group
		 * doesn't have a physical address.  This means that code must
		 * not assume that ill_phys_addr is non-NULL just because
		 * ill_phys_addr_length is non-zero.  Likewise for ill_nd_lla.
		 */
		ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
		ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
		ipmp_ill->ill_type = ill->ill_type;

		if (ill->ill_flags & ILLF_COS_ENABLED) {
			mutex_enter(&ipmp_ill->ill_lock);
			ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
			mutex_exit(&ipmp_ill->ill_lock);
		}
		ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu);
	} else {
		ASSERT(ipmp_ill->ill_phys_addr_length ==
		    ill->ill_phys_addr_length);
		ASSERT(ipmp_ill->ill_type == ill->ill_type);

		/*
		 * CoS can only stay enabled if *every* member supports it;
		 * a new non-CoS member disables it for the whole group.
		 */
		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
			mutex_enter(&ipmp_ill->ill_lock);
			ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
			mutex_exit(&ipmp_ill->ill_lock);
		}
		if (illg->ig_mtu > ill->ill_max_mtu)
			ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu);
	}

	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	list_insert_tail(&illg->ig_if, ill);
	ill->ill_grp = illg;
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * Hide the IREs on `ill' so that we don't accidentally find them when
	 * sending data traffic.
	 */
	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);

	/*
	 * Merge any broadcast IREs, if need be.
	 */
	if (!ill->ill_isv6)
		ill_refresh_bcast(ill);

	ipmp_ill_refresh_active(ill);
}

/*
 * Remove `ill' from its illgrp, and rebalance the data addresses in that
 * illgrp to be spread evenly across the remaining ills.  Also, adjust the
 * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
 */
void
ipmp_ill_leave_illgrp(ill_t *ill)
{
	ill_t *ipmp_ill;
	ipif_t *ipif;
	ipmp_arpent_t *entp;
	ipmp_illgrp_t *illg = ill->ill_grp;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IS_UNDER_IPMP(ill));
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(illg != NULL);

	ipmp_ill = illg->ig_ipmp_ill;

	/*
	 * Cancel IPMP-specific ill timeouts.  (untimeout() of an expired
	 * or zero id is harmless.)
	 */
	(void) untimeout(ill->ill_refresh_tid);

	/*
	 * Expose any previously-hidden IREs on `ill'.
	 */
	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);

	/*
	 * Ensure the multicast state for each ipif on `ill' is down so that
	 * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
	 * all eligible groups.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if (ipif->ipif_flags & IPIF_UP)
			ipif_multicast_down(ipif);

	/*
	 * Account for `ill' leaving the illgrp.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	if (ill->ill_isv6)
		ill->ill_phyint->phyint_grp->gr_nv6--;
	else
		ill->ill_phyint->phyint_grp->gr_nv4--;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Pull `ill' out of the interface lists.  It must be deactivated
	 * (if active) before it can be removed from the group list.
	 */
	if (list_link_active(&ill->ill_actnode))
		ipmp_ill_deactivate(ill);
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	list_remove(&illg->ig_if, ill);
	ill->ill_grp = NULL;
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * Recreate any broadcast IREs that had been shared, if need be.
	 */
	if (!ill->ill_isv6)
		ill_refresh_bcast(ill);

	/*
	 * Re-establish multicast memberships that were previously being
	 * handled by the IPMP meta-interface.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if (ipif->ipif_flags & IPIF_UP)
			ipif_multicast_up(ipif);

	/*
	 * Refresh the group MTU based on the new interface list.
	 */
	ipmp_illgrp_refresh_mtu(illg);

	if (list_is_empty(&illg->ig_if)) {
		/*
		 * No ills left in the illgrp; we no longer have a physical
		 * address length, nor can we support ARP, CoS, or anything
		 * else that depends on knowing the link layer type.
		 */
		while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
			ipmp_illgrp_destroy_arpent(illg, entp);

		ipmp_ill->ill_phys_addr_length = 0;
		ipmp_ill->ill_nd_lla_len = 0;
		ipmp_ill->ill_type = IFT_OTHER;
		mutex_enter(&ipmp_ill->ill_lock);
		ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
		mutex_exit(&ipmp_ill->ill_lock);
	} else {
		/*
		 * If `ill' didn't support CoS, see if it can now be enabled.
		 * (NOTE: from here on, `ill' is reused as the list iterator;
		 * the departed ill's flags were checked above.)
		 */
		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
			ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));

			ill = list_head(&illg->ig_if);
			do {
				if (!(ill->ill_flags & ILLF_COS_ENABLED))
					break;
			} while ((ill = list_next(&illg->ig_if, ill)) != NULL);

			/* NULL iterator means every remaining ill has CoS */
			if (ill == NULL) {
				mutex_enter(&ipmp_ill->ill_lock);
				ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
				mutex_exit(&ipmp_ill->ill_lock);
			}
		}
	}
}

/*
 * Check if `ill' should be active, and activate or deactivate if need be.
 * Return B_FALSE if a refresh was necessary but could not be performed.
 */
static boolean_t
ipmp_ill_try_refresh_active(ill_t *ill)
{
	boolean_t refreshed = B_TRUE;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	if (ipmp_ill_is_active(ill)) {
		if (!list_link_active(&ill->ill_actnode))
			refreshed = ipmp_ill_activate(ill);
	} else {
		if (list_link_active(&ill->ill_actnode))
			ipmp_ill_deactivate(ill);
	}

	return (refreshed);
}

/*
 * Check if `ill' should be active, and activate or deactivate if need be.
 * If the refresh fails, schedule a timer to try again later.
 */
void
ipmp_ill_refresh_active(ill_t *ill)
{
	if (!ipmp_ill_try_refresh_active(ill))
		ipmp_ill_refresh_active_timer_start(ill);
}

/*
 * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'.
 * Runs as a timeout(9F) callback, so we are not inside the IPSQ on entry.
 */
static void
ipmp_ill_refresh_active_timer(void *ill_arg)
{
	ill_t *ill = ill_arg;
	boolean_t refreshed = B_FALSE;

	/*
	 * Clear ill_refresh_tid to indicate that no timeout is pending
	 * (another thread could schedule a new timeout while we're still
	 * running, but that's harmless).  If the ill is going away, bail.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_refresh_tid = 0;
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		return;
	}
	mutex_exit(&ill->ill_lock);

	/*
	 * We must not block for the IPSQ in timeout context; if we cannot
	 * enter it now, `refreshed' stays B_FALSE and we reschedule below.
	 */
	if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
		refreshed = ipmp_ill_try_refresh_active(ill);
		ipsq_exit(ill->ill_phyint->phyint_ipsq);
	}

	/*
	 * If the refresh failed, schedule another attempt.
	 */
	if (!refreshed)
		ipmp_ill_refresh_active_timer_start(ill);
}

/*
 * Schedule a timer to retry ipmp_ill_try_refresh_active() on `ill'.
 */
static void
ipmp_ill_refresh_active_timer_start(ill_t *ill)
{
	mutex_enter(&ill->ill_lock);

	/*
	 * If the ill is going away or a refresh is already scheduled, bail.
	 */
	if (ill->ill_refresh_tid != 0 ||
	    (ill->ill_state_flags & ILL_CONDEMNED)) {
		mutex_exit(&ill->ill_lock);
		return;
	}

	ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill,
	    SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));

	mutex_exit(&ill->ill_lock);
}

/*
 * Activate `ill' so it will be used to send and receive data traffic.  Return
 * B_FALSE if `ill' cannot be activated.  Note that we allocate any messages
 * needed to deactivate `ill' here as well so that deactivation cannot fail.
 */
static boolean_t
ipmp_ill_activate(ill_t *ill)
{
	ipif_t *ipif;
	mblk_t *actmp = NULL, *deactmp = NULL;
	mblk_t *linkupmp = NULL, *linkdownmp = NULL;
	ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
	const char *grifname = grp->gr_ifname;
	ipmp_illgrp_t *illg = ill->ill_grp;
	ill_t *maxill;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	/*
	 * If this will be the first active interface in the group, allocate
	 * the link-up and link-down messages.  Allocating the link-down
	 * message now guarantees ipmp_ill_deactivate() cannot fail later.
	 */
	if (grp->gr_nactif == 0) {
		linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
		linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
		if (linkupmp == NULL || linkdownmp == NULL)
			goto fail;
	}

	/*
	 * For IPv4, allocate the activate/deactivate messages, and tell ARP.
	 */
	if (!ill->ill_isv6) {
		actmp = ill_arie_alloc(ill, grifname, &ipmp_aract_template);
		deactmp = ill_arie_alloc(ill, grifname, &ipmp_ardeact_template);
		if (actmp == NULL || deactmp == NULL)
			goto fail;

		ASSERT(ill->ill_ardeact_mp == NULL);
		ill->ill_ardeact_mp = deactmp;
		putnext(illg->ig_ipmp_ill->ill_rq, actmp);
	}

	if (list_is_empty(&illg->ig_actif)) {
		/*
		 * Now that we have an active ill, nominate it for multicast
		 * and broadcast duties.  Do this before ipmp_ill_bind_ipif()
		 * since that may need to send multicast packets (e.g., IPv6
		 * neighbor discovery probes).
		 */
		ipmp_illgrp_set_cast(illg, ill);

		/*
		 * This is the first active ill in the illgrp -- add 'em all.
		 * We can access/walk ig_ipmp_ill's ipif list since we're
		 * writer on its IPSQ as well.
		 */
		ipif = illg->ig_ipmp_ill->ill_ipif;
		for (; ipif != NULL; ipif = ipif->ipif_next)
			if (ipmp_ipif_is_up_dataaddr(ipif))
				ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
	} else {
		/*
		 * Redistribute the addresses by moving them from the ill with
		 * the most addresses until the ill being activated is at the
		 * same level as the rest of the ills.
		 */
		for (;;) {
			maxill = ipmp_illgrp_max_ill(illg);
			ASSERT(maxill != NULL);
			if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
				break;
			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
			ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
		}

		/*
		 * TODO: explore whether it's advantageous to flush IRE_CACHE
		 * bindings to force existing connections to be redistributed
		 * to the new ill.
		 */
	}

	/*
	 * Put the interface in the active list.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	list_insert_tail(&illg->ig_actif, ill);
	illg->ig_nactif++;
	illg->ig_next_ill = ill;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Refresh ARP entries to use `ill', if need be.
	 */
	if (!ill->ill_isv6)
		ipmp_illgrp_refresh_arpent(illg);

	/*
	 * Finally, mark the group link up, if necessary.
	 * NOTE(review): put() rather than putnext() appears deliberate here
	 * (deliver to the IPMP ill's own read queue) -- confirm before
	 * changing.
	 */
	if (grp->gr_nactif++ == 0) {
		ASSERT(grp->gr_linkdownmp == NULL);
		grp->gr_linkdownmp = linkdownmp;
		put(illg->ig_ipmp_ill->ill_rq, linkupmp);
	}
	return (B_TRUE);
fail:
	/* freemsg(NULL) is a no-op, so unallocated messages are fine */
	freemsg(actmp);
	freemsg(deactmp);
	freemsg(linkupmp);
	freemsg(linkdownmp);
	return (B_FALSE);
}

/*
 * Deactivate `ill' so it will not be used to send or receive data traffic.
 */
static void
ipmp_ill_deactivate(ill_t *ill)
{
	ill_t *minill;
	ipif_t *ipif, *ubnextipif, *ubheadipif = NULL;
	mblk_t *mp;
	ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
	ipmp_illgrp_t *illg = ill->ill_grp;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	/*
	 * Delete IRE_CACHE entries tied to this ill before they become stale.
	 */
	ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
	    ill_stq_cache_delete, ill, ill);

	/*
	 * Pull the interface out of the active list.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	list_remove(&illg->ig_actif, ill);
	illg->ig_nactif--;
	illg->ig_next_ill = list_head(&illg->ig_actif);
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * If the ill that's being deactivated had been nominated for
	 * multicast/broadcast, nominate a new one.
	 */
	if (ill == illg->ig_cast_ill)
		ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));

	/*
	 * Unbind all of the ipifs bound to this ill, and save 'em in a list;
	 * we'll rebind them after we tell the resolver the ill is no longer
	 * active.  We must do things in this order or the resolver could
	 * accidentally rebind to the ill we're trying to remove if multiple
	 * ills in the group have the same hardware address (which is
	 * unsupported, but shouldn't lead to a wedged machine).
	 */
	while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
		/* build the save list via ipif_bound_next (LIFO order) */
		ipif->ipif_bound_next = ubheadipif;
		ubheadipif = ipif;
	}

	if (!ill->ill_isv6) {
		/*
		 * Tell ARP `ill' is no longer active in the group, using the
		 * message preallocated by ipmp_ill_activate().
		 */
		mp = ill->ill_ardeact_mp;
		ill->ill_ardeact_mp = NULL;
		ASSERT(mp != NULL);
		putnext(illg->ig_ipmp_ill->ill_rq, mp);

		/*
		 * Refresh any ARP entries that had been using `ill'.
		 */
		ipmp_illgrp_refresh_arpent(illg);
	}

	/*
	 * Rebind each ipif from the deactivated ill to the active ill with
	 * the fewest ipifs.  If there are no active ills, the ipifs will
	 * remain unbound.
	 */
	for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
		ubnextipif = ipif->ipif_bound_next;
		ipif->ipif_bound_next = NULL;

		if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
			ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
	}

	/*
	 * Finally, mark the group link down, if necessary.
	 */
	if (--grp->gr_nactif == 0) {
		/* consume the link-down message preallocated at activation */
		mp = grp->gr_linkdownmp;
		grp->gr_linkdownmp = NULL;
		ASSERT(mp != NULL);
		put(illg->ig_ipmp_ill->ill_rq, mp);
	}
}

/*
 * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD)
 * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners.
 */
static void
ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
{
	ipif_t *ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);

	/*
	 * If `ill' is truly down, there are no messages to generate since:
	 *
	 * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface
	 *    and its addresses by bringing them down.  But that's already
	 *    true, so there's nothing to hide.
	 *
	 * 2. If cmd == RTM_ADD, then we're supposed to generate messages
	 *    indicating that any previously-hidden up addresses are again
	 *    back up (along with the interface).  But they aren't, so
	 *    there's nothing to expose.
	 */
	if (ill->ill_ipif_up_count == 0)
		return;

	/* interface-up message precedes the address messages ... */
	if (cmd == RTM_ADD)
		ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if (ipif->ipif_flags & IPIF_UP)
			ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL);

	/* ... and interface-down follows them */
	if (cmd == RTM_DELETE)
		ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
}

/*
 * Bind the address named by `ipif' to the underlying ill named by `ill'.
 * If `act' is Res_act_none, don't notify the resolver.  Otherwise, `act'
 * will indicate to the resolver whether this is an initial bringup of
 * `ipif', or just a rebind to another ill.
 */
static void
ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
{
	int err = 0;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
	ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
	ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
	ASSERT(ipif->ipif_bound_ill == NULL);
	ASSERT(ipif->ipif_bound_next == NULL);

	/* push onto the ill's bound list; ipif_bound_ill set under the lock */
	ipif->ipif_bound_next = ill->ill_bound_ipif;
	ill->ill_bound_ipif = ipif;
	ill->ill_bound_cnt++;
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	ipif->ipif_bound_ill = ill;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * If necessary, tell ARP/NDP about the new mapping.  Note that
	 * ipif_resolver_up() cannot fail for non-XRESOLV IPv6 ills.
	 */
	if (act != Res_act_none) {
		if (ill->ill_isv6) {
			VERIFY(ipif_resolver_up(ipif, act) == 0);
			err = ipif_ndp_up(ipif, act == Res_act_initial);
		} else {
			err = ipif_resolver_up(ipif, act);
		}

		/*
		 * Since ipif_ndp_up() never returns EINPROGRESS and
		 * ipif_resolver_up() only returns EINPROGRESS when the
		 * associated ill is not up, we should never be here with
		 * EINPROGRESS.  We rely on this to simplify the design.
		 */
		ASSERT(err != EINPROGRESS);
	}
	/* TODO: retry binding on failure? when? */
	ipif->ipif_bound = (err == 0);
}

/*
 * Unbind the address named by `ipif' from the underlying ill named by `ill'.
 * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned.
 * If no ipifs are bound to `ill', NULL is returned.  If `notifyres' is
 * B_TRUE, notify the resolver about the change.
 */
static ipif_t *
ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
{
	ill_t *ipmp_ill;
	ipif_t *previpif;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	ipmp_ill = ill->ill_grp->ig_ipmp_ill;

	/*
	 * If necessary, find an ipif to unbind.
	 */
	if (ipif == NULL) {
		if ((ipif = ill->ill_bound_ipif) == NULL) {
			ASSERT(ill->ill_bound_cnt == 0);
			return (NULL);
		}
	}

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(IS_IPMP(ipif->ipif_ill));
	ASSERT(ipif->ipif_bound_ill == ill);
	ASSERT(ill->ill_bound_cnt > 0);

	/*
	 * Unbind it.  ipif_bound_ill is cleared under ipmp_lock so lockless
	 * readers (e.g. ipmp_ipif_hold_bound_ill()) see a consistent value.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	ipif->ipif_bound_ill = NULL;
	rw_exit(&ipst->ips_ipmp_lock);
	ill->ill_bound_cnt--;

	/* unlink `ipif' from the ill's singly-linked bound list */
	if (ill->ill_bound_ipif == ipif) {
		ill->ill_bound_ipif = ipif->ipif_bound_next;
	} else {
		previpif = ill->ill_bound_ipif;
		while (previpif->ipif_bound_next != ipif)
			previpif = previpif->ipif_bound_next;

		previpif->ipif_bound_next = ipif->ipif_bound_next;
	}
	ipif->ipif_bound_next = NULL;

	/*
	 * If requested, notify the resolvers (provided we're bound).
	 */
	if (notifyres && ipif->ipif_bound) {
		if (ill->ill_isv6) {
			ipif_ndp_down(ipif);
		} else {
			ASSERT(ipif->ipif_arp_del_mp != NULL);
			putnext(ipmp_ill->ill_rq, ipif->ipif_arp_del_mp);
			ipif->ipif_arp_del_mp = NULL;
		}
	}
	ipif->ipif_bound = B_FALSE;

	return (ipif);
}

/*
 * Check if `ill' is active.  Caller must hold ill_lock and phyint_lock if
 * it's not inside the IPSQ.  Since ipmp_ill_try_refresh_active() calls this
 * to determine whether an ill should be considered active, other consumers
 * may race and learn about an ill that should be deactivated/activated before
 * IPMP has performed the activation/deactivation.  This should be safe though
 * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that
 * would've been cleaned up by ipmp_ill_deactivate().
 */
boolean_t
ipmp_ill_is_active(ill_t *ill)
{
	phyint_t *phyi = ill->ill_phyint;

	ASSERT(IS_UNDER_IPMP(ill));
	ASSERT(IAM_WRITER_ILL(ill) ||
	    (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));

	/*
	 * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to
	 * set PHYI_FAILED whenever PHYI_RUNNING is cleared.
	 * This allows the
	 * link flapping logic to be just in in.mpathd and allows us to ignore
	 * changes to PHYI_RUNNING.
	 */
	return (!(ill->ill_ipif_up_count == 0 ||
	    (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED))));
}

/*
 * IRE walker callback: set IRE_MARK_TESTHIDDEN on cache/interface/offsubnet
 * IREs with a source address on `ill_arg'.  (The `char *' argument type is
 * dictated by the ire_walk_ill() callback signature.)
 */
static void
ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
{
	ill_t *ill = (ill_t *)ill_arg;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(!IS_IPMP(ill));

	if (ire->ire_ipif->ipif_ill != ill)
		return;

	switch (ire->ire_type) {
	case IRE_HOST:
	case IRE_PREFIX:
	case IRE_DEFAULT:
	case IRE_CACHE:
	case IRE_IF_RESOLVER:
	case IRE_IF_NORESOLVER:
		DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
		ire->ire_marks |= IRE_MARK_TESTHIDDEN;
		break;
	default:
		break;
	}
}

/*
 * IRE walker callback: clear IRE_MARK_TESTHIDDEN if the IRE has a source
 * address on `ill_arg'.
 */
static void
ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
{
	ill_t *ill = (ill_t *)ill_arg;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(!IS_IPMP(ill));

	if (ire->ire_ipif->ipif_ill == ill) {
		DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
		ire->ire_marks &= ~IRE_MARK_TESTHIDDEN;
	}
}

/*
 * Return a held pointer to the IPMP ill for underlying interface `ill', or
 * NULL if one doesn't exist.  (Unfortunately, this function needs to take an
 * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
 * ill_grp pointer may become stale when not under an IPSQ and not holding
 * ipmp_lock.)  Caller need not be inside the IPSQ.
+ */ +ill_t * +ipmp_ill_hold_ipmp_ill(ill_t *ill) +{ + ip_stack_t *ipst = ill->ill_ipst; + ipmp_illgrp_t *illg; + + ASSERT(!IS_IPMP(ill)); + + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + illg = ill->ill_grp; + if (illg != NULL && ILL_CAN_LOOKUP(illg->ig_ipmp_ill)) { + ill_refhold(illg->ig_ipmp_ill); + rw_exit(&ipst->ips_ipmp_lock); + return (illg->ig_ipmp_ill); + } + /* + * Assume `ill' was removed from the illgrp in the meantime. + */ + rw_exit(&ill->ill_ipst->ips_ipmp_lock); + return (NULL); +} + +/* + * Return the interface index for the IPMP ill tied to underlying interface + * `ill', or zero if one doesn't exist. Caller need not be inside the IPSQ. + */ +uint_t +ipmp_ill_get_ipmp_ifindex(const ill_t *ill) +{ + uint_t ifindex = 0; + ip_stack_t *ipst = ill->ill_ipst; + ipmp_grp_t *grp; + + ASSERT(!IS_IPMP(ill)); + + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + if ((grp = ill->ill_phyint->phyint_grp) != NULL) + ifindex = grp->gr_phyint->phyint_ifindex; + rw_exit(&ipst->ips_ipmp_lock); + return (ifindex); +} + +/* + * Place phyint `phyi' into IPMP group `grp'. + */ +void +ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp) +{ + ill_t *ill; + ipsq_t *ipsq = phyi->phyint_ipsq; + ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq; + ip_stack_t *ipst = PHYINT_TO_IPST(phyi); + + ASSERT(IAM_WRITER_IPSQ(ipsq)); + ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL); + + /* + * Send routing socket messages indicating that the phyint's ills + * and ipifs vanished. + */ + if (phyi->phyint_illv4 != NULL) { + ill = phyi->phyint_illv4; + ipmp_ill_rtsaddrmsg(ill, RTM_DELETE); + } + + if (phyi->phyint_illv6 != NULL) { + ill = phyi->phyint_illv6; + ipmp_ill_rtsaddrmsg(ill, RTM_DELETE); + } + + /* + * Snapshot the phyint's initial kstats as a baseline. 
	 */
	ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

	phyi->phyint_grp = grp;
	/* first member establishes the group mactype; others must match */
	if (++grp->gr_nif == 1)
		grp->gr_mactype = ill->ill_mactype;
	else
		ASSERT(grp->gr_mactype == ill->ill_mactype);

	/*
	 * Now that we're in the group, request a switch to the group's xop
	 * when we ipsq_exit().  All future operations will be exclusive on
	 * the group xop until ipmp_phyint_leave_grp() is called.
	 */
	ASSERT(ipsq->ipsq_swxop == NULL);
	ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
	ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;

	rw_exit(&ipst->ips_ipmp_lock);
}

/*
 * Remove phyint `phyi' from its current IPMP group.
 */
void
ipmp_phyint_leave_grp(phyint_t *phyi)
{
	uint_t i;
	ipsq_t *ipsq = phyi->phyint_ipsq;
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
	uint64_t phyi_kstats[IPMP_KSTAT_MAX];

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	/*
	 * If any of the phyint's ills are still in an illgrp, kick 'em out.
	 */
	if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
		ipmp_ill_leave_illgrp(phyi->phyint_illv4);
	if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
		ipmp_ill_leave_illgrp(phyi->phyint_illv6);

	/*
	 * Send routing socket messages indicating that the phyint's ills
	 * and ipifs have reappeared.
	 */
	if (phyi->phyint_illv4 != NULL)
		ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD);
	if (phyi->phyint_illv6 != NULL)
		ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD);

	/*
	 * Calculate the phyint's cumulative kstats while it was in the group,
	 * and add that to the group's baseline.
	 */
	ipmp_phyint_get_kstats(phyi, phyi_kstats);
	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
		/* delta since join; kstats0 was snapshotted at join time */
		phyi_kstats[i] -= phyi->phyint_kstats0[i];
		atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]);
	}

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

	phyi->phyint_grp->gr_nif--;
	phyi->phyint_grp = NULL;

	/*
	 * As our final act in leaving the group, request a switch back to our
	 * IPSQ's own xop when we ipsq_exit().
	 */
	ASSERT(ipsq->ipsq_swxop == NULL);
	ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;

	rw_exit(&ipst->ips_ipmp_lock);
}

/*
 * Store the IPMP-related kstats for `phyi' into the array named by `kstats'.
 * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements.
 */
static void
ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
{
	uint_t i, j;
	const char *name;
	kstat_t *ksp;
	kstat_named_t *kn;

	bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);

	/*
	 * NOTE: ALL_ZONES here assumes that there's at most one link
	 * with a given name on a given system (safe for now).
	 */
	ksp = kstat_hold_byname("link", 0, phyi->phyint_name, ALL_ZONES);
	if (ksp == NULL)
		return;

	KSTAT_ENTER(ksp);

	if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
		/*
		 * Bring kstats up-to-date before recording.
		 */
		(void) KSTAT_UPDATE(ksp, KSTAT_READ);

		kn = KSTAT_NAMED_PTR(ksp);
		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
			name = ipmp_kstats[i].name;
			kstats[i] = 0;
			for (j = 0; j < ksp->ks_ndata; j++) {
				if (strcmp(kn[j].name, name) != 0)
					continue;

				/* widen whatever type the driver exports */
				switch (kn[j].data_type) {
				case KSTAT_DATA_INT32:
				case KSTAT_DATA_UINT32:
					kstats[i] = kn[j].value.ui32;
					break;
#ifdef	_LP64
				case KSTAT_DATA_LONG:
				case KSTAT_DATA_ULONG:
					kstats[i] = kn[j].value.ul;
					break;
#endif
				case KSTAT_DATA_INT64:
				case KSTAT_DATA_UINT64:
					kstats[i] = kn[j].value.ui64;
					break;
				}
				break;
			}
		}
	}

	KSTAT_EXIT(ksp);
	kstat_rele(ksp);
}

/*
 * Refresh the active state of all ills on `phyi'.
 */
void
ipmp_phyint_refresh_active(phyint_t *phyi)
{
	if (phyi->phyint_illv4 != NULL)
		ipmp_ill_refresh_active(phyi->phyint_illv4);
	if (phyi->phyint_illv6 != NULL)
		ipmp_ill_refresh_active(phyi->phyint_illv6);
}

/*
 * Return a held pointer to the underlying ill bound to `ipif', or NULL if one
 * doesn't exist.  Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
{
	ill_t *boundill;
	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;

	ASSERT(IS_IPMP(ipif->ipif_ill));

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	boundill = ipif->ipif_bound_ill;
	if (boundill != NULL && ILL_CAN_LOOKUP(boundill)) {
		/* hold must be taken before dropping ipmp_lock */
		ill_refhold(boundill);
		rw_exit(&ipst->ips_ipmp_lock);
		return (boundill);
	}
	rw_exit(&ipst->ips_ipmp_lock);
	return (NULL);
}

/*
 * Return a pointer to the underlying ill bound to `ipif', or NULL if one
 * doesn't exist.  Caller must be inside the IPSQ.
 */
ill_t *
ipmp_ipif_bound_ill(const ipif_t *ipif)
{
	ASSERT(IAM_WRITER_ILL(ipif->ipif_ill));
	ASSERT(IS_IPMP(ipif->ipif_ill));

	return (ipif->ipif_bound_ill);
}

/*
 * Check if `ipif' is a "stub" (placeholder address not being used):
 * a down ipif whose local address is unspecified.
 */
boolean_t
ipmp_ipif_is_stubaddr(const ipif_t *ipif)
{
	if (ipif->ipif_flags & IPIF_UP)
		return (B_FALSE);
	if (ipif->ipif_ill->ill_isv6)
		return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
	else
		return (ipif->ipif_lcl_addr == INADDR_ANY);
}

/*
 * Check if `ipif' is an IPMP data address: a non-IPIF_NOFAILOVER ipif with
 * a specified local address.
 */
boolean_t
ipmp_ipif_is_dataaddr(const ipif_t *ipif)
{
	if (ipif->ipif_flags & IPIF_NOFAILOVER)
		return (B_FALSE);
	if (ipif->ipif_ill->ill_isv6)
		return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
	else
		return (ipif->ipif_lcl_addr != INADDR_ANY);
}

/*
 * Check if `ipif' is an IPIF_UP IPMP data address.
 */
static boolean_t
ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
{
	return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP));
}