author     meem <Peter.Memishian@Sun.COM>    2009-01-06 20:16:25 -0500
committer  meem <Peter.Memishian@Sun.COM>    2009-01-06 20:16:25 -0500
commit     e11c3f44f531fdff80941ce57c065d2ae861cefc (patch)
tree       e921e957d727a9596275a1119fd627ef2ecca47d /usr/src/uts/common/inet
parent     732675dd38771d280fdc276731344e9652071753 (diff)
download   illumos-gate-e11c3f44f531fdff80941ce57c065d2ae861cefc.tar.gz
PSARC/2007/272 Project Clearview: IPMP Rearchitecture
PSARC/2008/773 IPQoS if_groupname Selector Removal
PSARC/2009/001 Move in.mpathd into /lib/inet
6783149 Clearview IPMP Rearchitecture
4472956 libipmp should provide administrative interfaces
4494577 ipmp is opaque - there's no way to get current status
4509788 IPMP's usage of interface flags is not backward compatible
4509869 IPMP's address move mechanism needs to be transparent to applications
4531232 "in.rdiscd: sendto: Bad file number" seen during IPMP DR
4533876 new instances of interfaces under ipmp are generated with each dr/op
4699003 in.mpathd should squawk if interfaces in a group have the same hwaddr
4704937 SUNW_ip_rcm.so is sloppy with strings
4713308 IPMP shouldn't failover unconfigured logical interfaces
4785694 non-homogeneous IPMP group does not do failback
4850407 if_mpadm and IPMP DR failure
5015757 ip can panic with ASSERT(attach_ill == ipif->ipif_ill) failure
5086201 in.ndpd's phyint_reach_random() spews "SIOCSLIFLNKINFO Invalid argument"
6184000 routes cannot be created on failed interfaces
6246564 if_mpadm -r <ifname> doesn't bring up IPv6 link-local data address
6359058 SIOCLIFFAILBACK repeatedly fails with EAGAIN; in.mpathd fills logs
6359536 enabling STANDBY on an interface with no test address acts oddly
6378487 in.dhcpd doesn't work well in an IPMP setup
6462335 cannot offline to IPMP interfaces that have no probe targets
6516992 in.routed spews "Address already in use" during IPMP address move
6518460 ip_rcm`update_pif() must remain calm when logical interfaces disappear
6549957 failed IP interfaces at boot may go unreported
6591186 rpcbind can't deal with indirect calls if all addresses are deprecated
6667143 NCE_F_UNSOL_ADV broken
6698480 IGMP version not retained during IPMP failover
6726235 IPMP group failure can sometimes lead to an extra failover
6726645 in.routed skips DEPRECATED addresses even when no others exist
6738310 ip_ndp_recover() checks IPIF_CONDEMNED on the wrong ipif flags field
6739454 system panics at sdpib`sdp_rts_announce
6739531 IPv6 DAD doesn't work well with IPMP
6740719 in.mpathd may fail to switch to router target mode
6743260 ipif_resolver_up() can fail and leave ARP bringup pending
6746613 ip's DL_NOTE_SDU_SIZE logic mishandles ill_max_frag < ill_max_mtu
6748145 in.ndpd's IPv6 link-local hardware address mappings can go stale
6753560 ilg_delete_all() can race with ill_delete_tail() when ilg_ill changes
6755987 stillborn IFF_POINTOPOINT in.mpathd logic should be hauled out
6775126 SUBDIRS ipsecutils element does not in order be
6775811 NCEs can get stuck in ND_INCOMPLETE if ARP fails when IPMP is in-use
6777496 receive-side ILL_HCKSUM_CAPABLE checks look at the wrong ill
6781488 IPSQ timer restart logic can deadlock under stress
6781883 ip_ndp_find_solicitation() can be passed adverts, and has API issues
6784852 RPCIB, SDP, and RDS all break when vanity naming is used
6786048 IPv6 ND probes create IREs with incorrect source addresses
6786091 I_PLINK handling in IP must not request reentry via ipsq_try_enter()
6786711 IPQoS if_groupname selector needs to go
6787091 assertion failure in ipcl_conn_cleanup() due to non-NULL conn_ilg
6789235 INADDR_ANY ILMs can trigger an assertion failure in IPMP environments
6789502 ipif_resolver_up() calls after ipif_ndp_up() clobber ipif_addr_ready
6789718 ip6.tun0 cannot be plumbed in a non-global-zone post-6745288
6789732 libdlpi may get stuck in i_dlpi_strgetmsg()
6789870 ipif6_dup_recovery() may operate on a freed ipif, corrupting memory
6789874 ipnet_nicevent_cb() may call taskq_dispatch() on a bogus taskq
6790310 in.mpathd may core with "phyint_inst_timer: invalid state 4"

--HG--
rename : usr/src/lib/libinetutil/common/inetutil4.c => usr/src/lib/libinetutil/common/inetutil.c
rename : usr/src/uts/common/inet/vni/vni.c => usr/src/uts/common/inet/dlpistub/dlpistub.c
rename : usr/src/uts/common/inet/vni/vni.conf => usr/src/uts/common/inet/dlpistub/dlpistub.conf
rename : usr/src/uts/common/inet/vni/vni_impl.h => usr/src/uts/common/inet/dlpistub/dlpistub_impl.h
rename : usr/src/uts/intel/vni/Makefile => usr/src/uts/intel/dlpistub/Makefile
rename : usr/src/uts/sparc/vni/Makefile => usr/src/uts/sparc/dlpistub/Makefile
Diffstat (limited to 'usr/src/uts/common/inet')
-rw-r--r--  usr/src/uts/common/inet/arp.h | 13
-rw-r--r--  usr/src/uts/common/inet/arp/arp.c | 607
-rw-r--r--  usr/src/uts/common/inet/arp_impl.h | 7
-rw-r--r--  usr/src/uts/common/inet/dlpistub/dlpistub.c | 370
-rw-r--r--  usr/src/uts/common/inet/dlpistub/dlpistub.conf (renamed from usr/src/uts/common/inet/vni/vni.conf) | 12
-rw-r--r--  usr/src/uts/common/inet/dlpistub/dlpistub_impl.h | 49
-rw-r--r--  usr/src/uts/common/inet/ip.h | 688
-rw-r--r--  usr/src/uts/common/inet/ip/icmp.c | 4
-rw-r--r--  usr/src/uts/common/inet/ip/icmp_opt_data.c | 11
-rw-r--r--  usr/src/uts/common/inet/ip/igmp.c | 246
-rw-r--r--  usr/src/uts/common/inet/ip/ip.c | 2367
-rw-r--r--  usr/src/uts/common/inet/ip/ip6.c | 1170
-rw-r--r--  usr/src/uts/common/inet/ip/ip6_if.c | 535
-rw-r--r--  usr/src/uts/common/inet/ip/ip6_ire.c | 206
-rw-r--r--  usr/src/uts/common/inet/ip/ip6_rts.c | 6
-rw-r--r--  usr/src/uts/common/inet/ip/ip_ftable.c | 136
-rw-r--r--  usr/src/uts/common/inet/ip/ip_if.c | 8887
-rw-r--r--  usr/src/uts/common/inet/ip/ip_ire.c | 548
-rw-r--r--  usr/src/uts/common/inet/ip/ip_mroute.c | 21
-rw-r--r--  usr/src/uts/common/inet/ip/ip_multi.c | 787
-rw-r--r--  usr/src/uts/common/inet/ip/ip_ndp.c | 1040
-rw-r--r--  usr/src/uts/common/inet/ip/ip_netinfo.c | 132
-rw-r--r--  usr/src/uts/common/inet/ip/ip_opt_data.c | 11
-rw-r--r--  usr/src/uts/common/inet/ip/ip_rts.c | 134
-rw-r--r--  usr/src/uts/common/inet/ip/ipclassifier.c | 5
-rw-r--r--  usr/src/uts/common/inet/ip/ipmp.c | 2201
-rw-r--r--  usr/src/uts/common/inet/ip/rts.c | 24
-rw-r--r--  usr/src/uts/common/inet/ip/rts_opt_data.c | 3
-rw-r--r--  usr/src/uts/common/inet/ip/spd.c | 7
-rw-r--r--  usr/src/uts/common/inet/ip6.h | 12
-rw-r--r--  usr/src/uts/common/inet/ip_if.h | 77
-rw-r--r--  usr/src/uts/common/inet/ip_impl.h | 6
-rw-r--r--  usr/src/uts/common/inet/ip_ire.h | 28
-rw-r--r--  usr/src/uts/common/inet/ip_multi.h | 25
-rw-r--r--  usr/src/uts/common/inet/ip_ndp.h | 9
-rw-r--r--  usr/src/uts/common/inet/ip_rts.h | 23
-rw-r--r--  usr/src/uts/common/inet/ip_stack.h | 36
-rw-r--r--  usr/src/uts/common/inet/ipclassifier.h | 7
-rw-r--r--  usr/src/uts/common/inet/ipnet/ipnet.c | 33
-rw-r--r--  usr/src/uts/common/inet/ipsec_info.h | 12
-rw-r--r--  usr/src/uts/common/inet/mib2.h | 20
-rw-r--r--  usr/src/uts/common/inet/sctp/sctp_addr.c | 4
-rw-r--r--  usr/src/uts/common/inet/sctp_ip.h | 4
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp.c | 26
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_fusion.c | 3
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_opt_data.c | 5
-rw-r--r--  usr/src/uts/common/inet/udp/udp.c | 17
-rw-r--r--  usr/src/uts/common/inet/udp/udp_opt_data.c | 11
-rw-r--r--  usr/src/uts/common/inet/vni/vni.c | 359
-rw-r--r--  usr/src/uts/common/inet/vni/vni_impl.h | 59
50 files changed, 9123 insertions, 11880 deletions
diff --git a/usr/src/uts/common/inet/arp.h b/usr/src/uts/common/inet/arp.h
index 0bca52e9ae..4351c91666 100644
--- a/usr/src/uts/common/inet/arp.h
+++ b/usr/src/uts/common/inet/arp.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -28,6 +28,7 @@
#define _INET_ARP_H
#include <sys/types.h>
+#include <net/if.h>
#ifdef __cplusplus
extern "C" {
@@ -64,6 +65,8 @@ extern "C" {
*/
#define AR_ARP_CLOSING (AR_IOCTL + 16)
#define AR_ARP_EXTEND (AR_IOCTL + 17)
+#define AR_IPMP_ACTIVATE (AR_IOCTL + 18)
+#define AR_IPMP_DEACTIVATE (AR_IOCTL + 19)
/* Both ace_flags and area_flags; must also modify arp.c in mdb */
#define ACE_F_PERMANENT 0x0001
@@ -182,6 +185,14 @@ typedef struct ar_mapping_add_s {
/* the mask&proto_addr */
} arma_t;
+/* Structure used to notify ARP of changes to IPMP group topology */
+typedef struct ar_ipmp_event_s {
+ uint32_t arie_cmd;
+ uint32_t arie_name_offset;
+ uint32_t arie_name_length;
+ char arie_grifname[LIFNAMSIZ];
+} arie_t;
+
/* Structure used to notify clients of interesting conditions. */
typedef struct ar_client_notify_s {
uint32_t arcn_cmd;
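
To make the new arie_t above concrete, the sketch below shows how a client such as IP might build an AR_IPMP_ACTIVATE message that places the underlying interface `ifname' into the ARP state of the IPMP group interface `grifname'. This is a hypothetical illustration, not the actual ip_if.c code: the helper name, the M_PROTO message type, and the placement of the underlying interface's name immediately after the arie_t (where ar_ll_lookup_from_mp() in arp.c expects to find it) are assumptions modeled on the other AR_* commands.

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/systm.h>
#include <net/if.h>
#include <inet/arp.h>

/*
 * Hypothetical helper: allocate and fill an AR_IPMP_ACTIVATE message.
 * The underlying interface's name follows the arie_t and is described
 * by arie_name_offset/arie_name_length; arie_grifname carries the
 * IPMP group interface name that ARP resolves to the group arl_t.
 */
static mblk_t *
ipmp_arp_activate_msg(const char *ifname, const char *grifname)
{
	size_t	namelen = strlen(ifname) + 1;
	mblk_t	*mp;
	arie_t	*arie;

	if ((mp = allocb(sizeof (arie_t) + namelen, BPRI_MED)) == NULL)
		return (NULL);

	DB_TYPE(mp) = M_PROTO;		/* assumed message type */
	mp->b_wptr = mp->b_rptr + sizeof (arie_t) + namelen;

	arie = (arie_t *)mp->b_rptr;
	bzero(arie, sizeof (arie_t));
	arie->arie_cmd = AR_IPMP_ACTIVATE;
	arie->arie_name_offset = sizeof (arie_t);
	arie->arie_name_length = (uint32_t)namelen;
	(void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ);
	bcopy(ifname, mp->b_rptr + sizeof (arie_t), namelen);

	return (mp);
}

The complementary AR_IPMP_DEACTIVATE message would be built the same way; ar_ipmp_activate() and ar_ipmp_deactivate() in arp.c (below) then resolve both names via ar_ipmp_lookup() and set or clear arl_ipmp_arl accordingly.
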
diff --git a/usr/src/uts/common/inet/arp/arp.c b/usr/src/uts/common/inet/arp/arp.c
index 815dfd19d3..06c499ced9 100644
--- a/usr/src/uts/common/inet/arp/arp.c
+++ b/usr/src/uts/common/inet/arp/arp.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -85,6 +85,30 @@
* talking to a given peer, then it doesn't matter if we have the right mapping
* for that peer. It would be possible to send queries on aging entries that
* are active, but this isn't done.
+ *
+ * IPMP Notes
+ * ----------
+ *
+ * ARP is aware of IPMP. In particular, IP notifies ARP about all "active"
+ * (able to transmit data packets) interfaces in a given group via
+ * AR_IPMP_ACTIVATE and AR_IPMP_DEACTIVATE messages. These messages, combined
+ * with the "IPMP arl_t" that ARP creates over the IPMP DLPI stub driver,
+ * enable ARP to track all the arl_t's that are in the same group and thus
+ * ensure that ACEs are shared across each group and the arl_t that ARP
+ * chooses to transmit on for a given ACE is optimal.
+ *
+ * ARP relies on IP for hardware address updates. In particular, if the
+ * hardware address of an interface changes (DL_NOTE_PHYS_ADDR), then IP will
+ * bring the interface down and back up -- and as part of bringing it back
+ * up, will send messages to ARP that allow it to update the affected arl's
+ * with new hardware addresses.
+ *
+ * N.B.: One side-effect of this approach is that when an interface fails and
+ * then starts to repair, it will temporarily populate the ARP cache with
+ * addresses that are owned by it rather than the group's arl_t. To address
+ * this, we could add more messages (e.g., AR_IPMP_JOIN and AR_IPMP_LEAVE),
+ * but as the issue appears to be only cosmetic (redundant entries in the ARP
+ * cache during interface repair), we've kept things simple for now.
*/
/*
@@ -134,6 +158,12 @@ typedef struct {
#define ARH_FIXED_LEN 8
/*
+ * Macro used when creating ACEs to determine the arl that should own it.
+ */
+#define OWNING_ARL(arl) \
+ ((arl)->arl_ipmp_arl != NULL ? (arl)->arl_ipmp_arl : arl)
+
+/*
* MAC-specific intelligence. Shouldn't be needed, but the DL_INFO_ACK
* doesn't quite do it for us.
*/
@@ -154,7 +184,7 @@ static int ar_ce_create(arl_t *arl, uint32_t proto, uchar_t *hw_addr,
uint32_t hw_addr_len, uchar_t *proto_addr,
uint32_t proto_addr_len, uchar_t *proto_mask,
uchar_t *proto_extract_mask, uint32_t hw_extract_start,
- uint32_t flags);
+ uchar_t *sender_addr, uint32_t flags);
static void ar_ce_delete(ace_t *ace);
static void ar_ce_delete_per_arl(ace_t *ace, void *arg);
static ace_t **ar_ce_hash(arp_stack_t *as, uint32_t proto,
@@ -167,6 +197,8 @@ static ace_t *ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp,
ace_t *matchfn());
static ace_t *ar_ce_lookup_mapping(arl_t *arl, uint32_t proto,
const uchar_t *proto_addr, uint32_t proto_addr_length);
+static ace_t *ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto,
+ uchar_t *proto_addr, uint32_t proto_addr_length);
static boolean_t ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr,
uint32_t hw_addr_length);
static void ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *),
@@ -187,6 +219,8 @@ static int ar_interface_up(queue_t *q, mblk_t *mp);
static int ar_interface_down(queue_t *q, mblk_t *mp);
static int ar_interface_on(queue_t *q, mblk_t *mp);
static int ar_interface_off(queue_t *q, mblk_t *mp);
+static int ar_ipmp_activate(queue_t *q, mblk_t *mp);
+static int ar_ipmp_deactivate(queue_t *q, mblk_t *mp);
static void ar_ll_cleanup_arl_queue(queue_t *q);
static void ar_ll_down(arl_t *arl);
static arl_t *ar_ll_lookup_by_name(arp_stack_t *as, const char *name);
@@ -208,7 +242,7 @@ static int ar_param_set(queue_t *q, mblk_t *mp, char *value,
static void ar_query_delete(ace_t *ace, void *ar);
static void ar_query_reply(ace_t *ace, int ret_val,
uchar_t *proto_addr, uint32_t proto_addr_len);
-static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace);
+static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace);
static void ar_rput(queue_t *q, mblk_t *mp_orig);
static void ar_rput_dlpi(queue_t *q, mblk_t *mp);
static void ar_set_address(ace_t *ace, uchar_t *addrpos,
@@ -344,6 +378,10 @@ static arct_t ar_cmd_tbl[] = {
ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_ON" },
{ ar_interface_off, AR_INTERFACE_OFF, sizeof (arc_t),
ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_OFF" },
+ { ar_ipmp_activate, AR_IPMP_ACTIVATE, sizeof (arie_t),
+ ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_ACTIVATE" },
+ { ar_ipmp_deactivate, AR_IPMP_DEACTIVATE, sizeof (arie_t),
+ ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_DEACTIVATE" },
{ ar_set_ppa, (uint32_t)IF_UNITSEL, sizeof (int),
ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "IF_UNITSEL" },
{ ar_nd_ioctl, ND_GET, 1,
@@ -358,6 +396,65 @@ static arct_t ar_cmd_tbl[] = {
};
/*
+ * Lookup and return an arl appropriate for sending packets with either source
+ * hardware address `hw_addr' or source protocol address `ip_addr', in that
+ * order. If neither was specified or neither match, return any arl in the
+ * same group as `arl'.
+ */
+static arl_t *
+ar_ipmp_lookup_xmit_arl(arl_t *arl, uchar_t *hw_addr, uint_t hw_addrlen,
+ uchar_t *ip_addr)
+{
+ arlphy_t *ap;
+ ace_t *src_ace;
+ arl_t *xmit_arl = NULL;
+ arp_stack_t *as = ARL_TO_ARPSTACK(arl);
+
+ ASSERT(arl->arl_flags & ARL_F_IPMP);
+
+ if (hw_addr != NULL && hw_addrlen != 0) {
+ xmit_arl = as->as_arl_head;
+ for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next) {
+ /*
+ * There may be arls with the same HW address that are
+ * not in our IPMP group; we don't want those.
+ */
+ if (xmit_arl->arl_ipmp_arl != arl)
+ continue;
+
+ ap = xmit_arl->arl_phy;
+ if (ap != NULL && ap->ap_hw_addrlen == hw_addrlen &&
+ bcmp(ap->ap_hw_addr, hw_addr, hw_addrlen) == 0)
+ break;
+ }
+
+ DTRACE_PROBE4(xmit_arl_hwsrc, arl_t *, arl, arl_t *,
+ xmit_arl, uchar_t *, hw_addr, uint_t, hw_addrlen);
+ }
+
+ if (xmit_arl == NULL && ip_addr != NULL) {
+ src_ace = ar_ce_lookup_permanent(as, IP_ARP_PROTO_TYPE, ip_addr,
+ IP_ADDR_LEN);
+ if (src_ace != NULL)
+ xmit_arl = src_ace->ace_xmit_arl;
+
+ DTRACE_PROBE4(xmit_arl_ipsrc, arl_t *, arl, arl_t *,
+ xmit_arl, uchar_t *, ip_addr, uint_t, IP_ADDR_LEN);
+ }
+
+ if (xmit_arl == NULL) {
+ xmit_arl = as->as_arl_head;
+ for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next)
+ if (xmit_arl->arl_ipmp_arl == arl && xmit_arl != arl)
+ break;
+
+ DTRACE_PROBE2(xmit_arl_any, arl_t *, arl, arl_t *, xmit_arl);
+ }
+
+ return (xmit_arl);
+}
+
+/*
* ARP Cache Entry creation routine.
* Cache entries are allocated within timer messages and inserted into
* the global hash list based on protocol and protocol address.
@@ -365,7 +462,8 @@ static arct_t ar_cmd_tbl[] = {
static int
ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len,
uchar_t *proto_addr, uint_t proto_addr_len, uchar_t *proto_mask,
- uchar_t *proto_extract_mask, uint_t hw_extract_start, uint_t flags)
+ uchar_t *proto_extract_mask, uint_t hw_extract_start, uchar_t *sender_addr,
+ uint_t flags)
{
static ace_t ace_null;
ace_t *ace;
@@ -373,17 +471,35 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len,
uchar_t *dst;
mblk_t *mp;
arp_stack_t *as = ARL_TO_ARPSTACK(arl);
+ arl_t *xmit_arl;
arlphy_t *ap;
if ((flags & ~ACE_EXTERNAL_FLAGS_MASK) || arl == NULL)
return (EINVAL);
- if ((ap = arl->arl_phy) == NULL)
+ if (proto_addr == NULL || proto_addr_len == 0 ||
+ (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN))
return (EINVAL);
if (flags & ACE_F_MYADDR)
flags |= ACE_F_PUBLISH | ACE_F_AUTHORITY;
+ /*
+ * Latch a transmit arl for this ace.
+ */
+ if (arl->arl_flags & ARL_F_IPMP) {
+ ASSERT(proto == IP_ARP_PROTO_TYPE);
+ xmit_arl = ar_ipmp_lookup_xmit_arl(arl, hw_addr, hw_addr_len,
+ sender_addr);
+ } else {
+ xmit_arl = arl;
+ }
+
+ if (xmit_arl == NULL || xmit_arl->arl_phy == NULL)
+ return (EINVAL);
+
+ ap = xmit_arl->arl_phy;
+
if (!hw_addr && hw_addr_len == 0) {
if (flags == ACE_F_PERMANENT) { /* Not publish */
/* 224.0.0.0 to zero length address */
@@ -398,9 +514,6 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len,
flags |= ACE_F_RESOLVED;
}
- if (proto_addr == NULL || proto_addr_len == 0 ||
- (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN))
- return (EINVAL);
/* Handle hw_addr_len == 0 for DL_ENABMULTI_REQ etc. */
if (hw_addr_len != 0 && hw_addr == NULL)
return (EINVAL);
@@ -432,6 +545,7 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len,
ace->ace_proto = proto;
ace->ace_mp = mp;
ace->ace_arl = arl;
+ ace->ace_xmit_arl = xmit_arl;
dst = (uchar_t *)&ace[1];
@@ -510,12 +624,73 @@ ar_ce_delete(ace_t *ace)
static void
ar_ce_delete_per_arl(ace_t *ace, void *arl)
{
- if (ace->ace_arl == arl) {
+ if (ace->ace_arl == arl || ace->ace_xmit_arl == arl) {
ace->ace_flags &= ~ACE_F_PERMANENT;
ar_ce_delete(ace);
}
}
+/*
+ * ar_ce_walk routine used when deactivating an `arl' in a group. Deletes
+ * `ace' if it was using `arl_arg' as its output interface.
+ */
+static void
+ar_ce_ipmp_deactivate(ace_t *ace, void *arl_arg)
+{
+ arl_t *arl = arl_arg;
+
+ ASSERT(!(arl->arl_flags & ARL_F_IPMP));
+
+ if (ace->ace_arl == arl) {
+ ASSERT(ace->ace_xmit_arl == arl);
+ /*
+ * This ACE is tied to the arl leaving the group (e.g., an
+ * ACE_F_PERMANENT for a test address) and is not used by the
+ * group, so we can leave it be.
+ */
+ return;
+ }
+
+ if (ace->ace_xmit_arl != arl)
+ return;
+
+ ASSERT(ace->ace_arl == arl->arl_ipmp_arl);
+
+ /*
+ * IP should've already sent us messages asking us to move any
+ * ACE_F_MYADDR entries to another arl, but there are two exceptions:
+ *
+ * 1. The group was misconfigured with interfaces that have duplicate
+ * hardware addresses, but in.mpathd was unable to offline those
+ * duplicate interfaces.
+ *
+ * 2. The messages from IP were lost or never created (e.g. due to
+ * memory pressure).
+ *
+ * We handle the first case by just quietly deleting the ACE. Since
+ * the second case cannot be distinguished from a more serious bug in
+ * the IPMP framework, we ASSERT() that this can't happen on DEBUG
+ * systems, but quietly delete the ACE on production systems (the
+ * deleted ACE will render the IP address unreachable).
+ */
+ if (ace->ace_flags & ACE_F_MYADDR) {
+ arlphy_t *ap = arl->arl_phy;
+ uint_t hw_addrlen = ap->ap_hw_addrlen;
+
+ ASSERT(hw_addrlen == ace->ace_hw_addr_length &&
+ bcmp(ap->ap_hw_addr, ace->ace_hw_addr, hw_addrlen) == 0);
+ }
+
+ /*
+ * NOTE: it's possible this arl got selected as the ace_xmit_arl when
+ * creating an ACE_F_PERMANENT ACE on behalf of an SIOCS*ARP ioctl for
+ * an IPMP IP interface. But it's still OK for us to delete such an
+ * ACE since ipmp_illgrp_refresh_arpent() will ask us to recreate it
+ * and we'll pick another arl then.
+ */
+ ar_ce_delete(ace);
+}
+
/* Cache entry hash routine, based on protocol and protocol address. */
static ace_t **
ar_ce_hash(arp_stack_t *as, uint32_t proto, const uchar_t *proto_addr,
@@ -559,7 +734,8 @@ ar_ce_lookup_entry(arl_t *arl, uint32_t proto, const uchar_t *proto_addr,
return (NULL);
ace = *ar_ce_hash(as, proto, proto_addr, proto_addr_length);
for (; ace; ace = ace->ace_next) {
- if (ace->ace_arl == arl &&
+ if ((ace->ace_arl == arl ||
+ ace->ace_arl == arl->arl_ipmp_arl) &&
ace->ace_proto_addr_length == proto_addr_length &&
ace->ace_proto == proto) {
int i1 = proto_addr_length;
@@ -632,13 +808,6 @@ ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, const uchar_t *proto_addr,
/*
* Look for a permanent entry for proto_addr across all interfaces.
- * This is used for sending ARP requests out. Requests may come from
- * IP on le0 with the source address of le1 and we need to send out
- * the request on le1 so that ARP does not think that somebody else
- * is using its PERMANENT address. If le0 and le1 are sitting on
- * the same wire, the same IP -> ethernet mapping might exist on
- * both the interfaces. But we should look for the permanent
- * mapping to avoid arp interpreting it as a duplicate.
*/
static ace_t *
ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr,
@@ -653,8 +822,8 @@ ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr,
if (ace->ace_proto_addr_length == proto_addr_length &&
ace->ace_proto == proto) {
int i1 = proto_addr_length;
- uchar_t *ace_addr = ace->ace_proto_addr;
- uchar_t *mask = ace->ace_proto_mask;
+ uchar_t *ace_addr = ace->ace_proto_addr;
+ uchar_t *mask = ace->ace_proto_mask;
/*
* Note that the ace_proto_mask is applied to the
@@ -703,12 +872,8 @@ ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length)
* 1. Resolution of unresolved entries and update of resolved entries.
* 2. Detection of nodes with our own IP address (duplicates).
*
- * This is complicated by ill groups. We don't currently have knowledge of ill
- * groups, so we can't distinguish between a packet that comes in on one of the
- * arls that's part of the group versus one that's on an unrelated arl. Thus,
- * we take a conservative approach. If the arls match, then we update resolved
- * and unresolved entries alike. If they don't match, then we update only
- * unresolved entries.
+ * If the resolving ARL is in the same group as a matching ACE's ARL, then
+ * update the ACE. Otherwise, make no updates.
*
* For all entries, we first check to see if this is a duplicate (probable
* loopback) message. If so, then just ignore it.
@@ -741,7 +906,7 @@ ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length)
static int
ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr,
- uint32_t hlen, const uchar_t *src_paddr, uint32_t plen)
+ uint32_t hlen, const uchar_t *src_paddr, uint32_t plen, arl_t **ace_arlp)
{
ace_t *ace;
ace_t *ace_next;
@@ -778,31 +943,35 @@ ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr,
if (i1 >= 0)
continue;
+ *ace_arlp = ace->ace_arl;
+
/*
- * If both IP addr and hardware address match what we already
- * have, then this is a broadcast packet emitted by one of our
- * interfaces, reflected by the switch and received on another
- * interface. We return AR_LOOPBACK.
+ * If the IP address is ours, and the hardware address matches
+ * one of our own arls, then this is a broadcast packet
+ * emitted by one of our interfaces, reflected by the switch
+ * and received on another interface. We return AR_LOOPBACK.
*/
- if ((ace->ace_flags & ACE_F_MYADDR) &&
- hlen == ace->ace_hw_addr_length &&
- bcmp(ace->ace_hw_addr, src_haddr,
- ace->ace_hw_addr_length) == 0) {
- return (AR_LOOPBACK);
+ if (ace->ace_flags & ACE_F_MYADDR) {
+ arl_t *hw_arl = as->as_arl_head;
+ arlphy_t *ap;
+
+ for (; hw_arl != NULL; hw_arl = hw_arl->arl_next) {
+ ap = hw_arl->arl_phy;
+ if (ap != NULL && ap->ap_hw_addrlen == hlen &&
+ bcmp(ap->ap_hw_addr, src_haddr, hlen) == 0)
+ return (AR_LOOPBACK);
+ }
}
/*
* If the entry is unverified, then we've just verified that
* someone else already owns this address, because this is a
* message with the same protocol address but different
- * hardware address. Conflicts received via an interface which
- * doesn't own the conflict address are not actioned. Multiple
- * interfaces on the same segment imply any conflict will also
- * be seen via the correct interface, so we can ignore anything
- * not matching the arl from the ace.
+ * hardware address. NOTE: the ace_xmit_arl check ensures we
+ * don't send duplicate AR_FAILEDs if arl is in an IPMP group.
*/
if ((ace->ace_flags & ACE_F_UNVERIFIED) &&
- arl == ace->ace_arl) {
+ arl == ace->ace_xmit_arl) {
ar_ce_delete(ace);
return (AR_FAILED);
}
@@ -814,30 +983,29 @@ ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr,
* that, if we're currently in initial announcement mode, we
* switch back to the lazier defense mode. Knowing that
* there's at least one duplicate out there, we ought not
- * blindly announce. Conflicts received via an interface which
- * doesn't own the conflict address are not actioned. Multiple
- * interfaces on the same segment imply the conflict will also
- * be seen via the correct interface, so we can ignore anything
- * not matching the arl from the ace.
+ * blindly announce. NOTE: the ace_xmit_arl check ensures we
+ * don't send duplicate AR_BOGONs if arl is in an IPMP group.
*/
if ((ace->ace_flags & ACE_F_AUTHORITY) &&
- arl == ace->ace_arl) {
+ arl == ace->ace_xmit_arl) {
ace->ace_xmit_count = 0;
return (AR_BOGON);
}
/*
- * Limit updating across other ills to unresolved
- * entries only. We don't want to inadvertently update
- * published entries.
+ * Only update this ACE if it's on the same network -- i.e.,
+ * it's for our ARL or another ARL in the same IPMP group.
*/
- if (ace->ace_arl == arl || !ACE_RESOLVED(ace)) {
+ if (ace->ace_arl == arl || ace->ace_arl == arl->arl_ipmp_arl) {
if (ar_ce_resolve(ace, src_haddr, hlen))
retv = AR_CHANGED;
else if (retv == AR_NOTFOUND)
retv = AR_MERGED;
}
}
+
+ if (retv == AR_NOTFOUND)
+ *ace_arlp = NULL;
return (retv);
}
@@ -917,7 +1085,7 @@ static void
ar_delete_notify(const ace_t *ace)
{
const arl_t *arl = ace->ace_arl;
- const arlphy_t *ap = arl->arl_phy;
+ const arlphy_t *ap = ace->ace_xmit_arl->arl_phy;
mblk_t *mp;
size_t len;
arh_t *arh;
@@ -945,7 +1113,7 @@ ar_close(queue_t *q)
{
ar_t *ar = (ar_t *)q->q_ptr;
char name[LIFNAMSIZ];
- arl_t *arl;
+ arl_t *arl, *xarl;
arl_t **arlp;
cred_t *cr;
arc_t *arc;
@@ -999,6 +1167,21 @@ ar_close(queue_t *q)
while (arl->arl_state != ARL_S_DOWN)
qwait(arl->arl_rq);
+ if (arl->arl_flags & ARL_F_IPMP) {
+ /*
+ * Though rude, someone could force the IPMP arl
+ * closed without removing the underlying interfaces.
+ * In that case, force the ARLs out of the group.
+ */
+ xarl = as->as_arl_head;
+ for (; xarl != NULL; xarl = xarl->arl_next) {
+ if (xarl->arl_ipmp_arl != arl || xarl == arl)
+ continue;
+ ar_ce_walk(as, ar_ce_ipmp_deactivate, xarl);
+ xarl->arl_ipmp_arl = NULL;
+ }
+ }
+
ar_ll_clear_defaults(arl);
/*
* If this is the control stream for an arl, delete anything
@@ -1417,9 +1600,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig)
area_t *area;
ace_t *ace;
uchar_t *hw_addr;
- uint32_t hw_addr_len;
+ uint32_t hw_addr_len;
uchar_t *proto_addr;
- uint32_t proto_addr_len;
+ uint32_t proto_addr_len;
uchar_t *proto_mask;
arl_t *arl;
mblk_t *mp = mp_orig;
@@ -1494,6 +1677,7 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig)
proto_mask,
NULL,
(uint32_t)0,
+ NULL,
aflags & ~ACE_F_MAPPING & ~ACE_F_UNVERIFIED & ~ACE_F_DEFEND);
if (err != 0) {
DTRACE_PROBE3(eadd_create_failed, arl_t *, arl, area_t *, area,
@@ -1502,7 +1686,13 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig)
}
if (aflags & ACE_F_PUBLISH) {
- arlphy_t *ap = arl->arl_phy;
+ arlphy_t *ap;
+
+ ace = ar_ce_lookup(arl, area->area_proto, proto_addr,
+ proto_addr_len);
+ ASSERT(ace != NULL);
+
+ ap = ace->ace_xmit_arl->arl_phy;
if (hw_addr == NULL || hw_addr_len == 0) {
hw_addr = ap->ap_hw_addr;
@@ -1519,10 +1709,6 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig)
ap->ap_hw_addrlen = hw_addr_len;
}
- ace = ar_ce_lookup(arl, area->area_proto, proto_addr,
- proto_addr_len);
- ASSERT(ace != NULL);
-
if (ace->ace_flags & ACE_F_FAST) {
ace->ace_xmit_count = as->as_fastprobe_count;
ace->ace_xmit_interval = as->as_fastprobe_delay;
@@ -1555,9 +1741,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig)
*/
DTRACE_PROBE2(eadd_probe, ace_t *, ace,
area_t *, area);
- ar_xmit(arl, ARP_REQUEST, area->area_proto,
- proto_addr_len, hw_addr, NULL, NULL,
- proto_addr, NULL, as);
+ ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
+ area->area_proto, proto_addr_len,
+ hw_addr, NULL, NULL, proto_addr, NULL, as);
ace->ace_xmit_count--;
ace->ace_xmit_interval =
(ace->ace_flags & ACE_F_FAST) ?
@@ -1573,9 +1759,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig)
} else {
DTRACE_PROBE2(eadd_announce, ace_t *, ace,
area_t *, area);
- ar_xmit(arl, ARP_REQUEST, area->area_proto,
- proto_addr_len, hw_addr, proto_addr,
- ap->ap_arp_addr, proto_addr, NULL, as);
+ ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
+ area->area_proto, proto_addr_len, hw_addr,
+ proto_addr, ap->ap_arp_addr, proto_addr, NULL, as);
ace->ace_last_bcast = ddi_get_lbolt();
/*
@@ -1583,9 +1769,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig)
* entry; we believe we're the authority for this
* entry. In that case, and if we're not just doing
* one-off defense of the address, we send more than
- * one copy, so that if this is an IPMP failover, we'll
- * still have a good chance of updating everyone even
- * when there's a packet loss or two.
+ * one copy, so we'll still have a good chance of
+ * updating everyone even when there's a packet loss
+ * or two.
*/
if ((aflags & ACE_F_AUTHORITY) &&
!(aflags & ACE_F_DEFEND) &&
@@ -1667,7 +1853,6 @@ static int
ar_entry_query(queue_t *q, mblk_t *mp_orig)
{
ace_t *ace;
- ace_t *src_ace = NULL;
areq_t *areq;
arl_t *arl;
int err;
@@ -1782,20 +1967,12 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig)
err = ENXIO;
goto err_ret;
}
- if (arl->arl_phy == NULL) {
- /* Can't get help if we don't know how. */
- DTRACE_PROBE2(query_no_phy, ace_t *, ace,
- areq_t *, areq);
- mpp[0] = NULL;
- mp->b_prev = NULL;
- err = ENXIO;
- goto err_ret;
- }
DTRACE_PROBE2(query_unresolved, ace_t, ace, areq_t *, areq);
} else {
/* No ace yet. Make one now. (This is the common case.) */
- if (areq->areq_xmit_count == 0 || arl->arl_phy == NULL) {
- DTRACE_PROBE2(query_phy, arl_t *, arl, areq_t *, areq);
+ if (areq->areq_xmit_count == 0) {
+ DTRACE_PROBE2(query_template, arl_t *, arl,
+ areq_t *, areq);
mp->b_prev = NULL;
err = ENXIO;
goto err_ret;
@@ -1814,9 +1991,9 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig)
err = EINVAL;
goto err_ret;
}
- err = ar_ce_create(arl, areq->areq_proto, NULL, 0,
+ err = ar_ce_create(OWNING_ARL(arl), areq->areq_proto, NULL, 0,
proto_addr, proto_addr_len, NULL,
- NULL, (uint32_t)0,
+ NULL, (uint32_t)0, sender_addr,
areq->areq_flags);
if (err != 0) {
DTRACE_PROBE3(query_create_failed, arl_t *, arl,
@@ -1835,49 +2012,13 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig)
goto err_ret;
}
ace->ace_query_mp = mp;
- /*
- * We don't have group information here. But if the sender
- * address belongs to a different arl, we might as well
- * search the other arl for a resolved ACE. If we find one,
- * we resolve it rather than sending out a ARP request.
- */
- src_ace = ar_ce_lookup_permanent(as, areq->areq_proto,
- sender_addr, areq->areq_sender_addr_length);
- if (src_ace == NULL) {
- DTRACE_PROBE3(query_source_missing, arl_t *, arl,
- areq_t *, areq, ace_t *, ace);
- ar_query_reply(ace, ENXIO, NULL, (uint32_t)0);
- /*
- * ar_query_reply has already freed the mp.
- * Return EINPROGRESS, so that caller won't attempt
- * to free the 'mp' again.
- */
- return (EINPROGRESS);
- }
- if (src_ace->ace_arl != ace->ace_arl) {
- ace_t *dst_ace;
-
- /*
- * Check for a resolved entry in the src_ace->ace_arl.
- */
- dst_ace = ar_ce_lookup_entry(src_ace->ace_arl,
- areq->areq_proto, proto_addr, proto_addr_len);
-
- if (dst_ace != NULL && ACE_RESOLVED(dst_ace)) {
- DTRACE_PROBE3(query_other_arl, arl_t *, arl,
- areq_t *, areq, ace_t *, dst_ace);
- (void) ar_ce_resolve(ace, dst_ace->ace_hw_addr,
- dst_ace->ace_hw_addr_length);
- return (EINPROGRESS);
- }
- }
}
- ms = ar_query_xmit(as, ace, src_ace);
+ ms = ar_query_xmit(as, ace);
if (ms == 0) {
/* Immediate reply requested. */
ar_query_reply(ace, ENXIO, NULL, (uint32_t)0);
} else {
- mi_timer(arl->arl_wq, ace->ace_mp, ms);
+ mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, ms);
}
return (EINPROGRESS);
err_ret:
@@ -2073,6 +2214,80 @@ done:
}
/*
+ * Given an arie_t `mp', find the arl_t's that it names and return them
+ * in `*arlp' and `*ipmp_arlp'. If they cannot be found, return B_FALSE.
+ */
+static boolean_t
+ar_ipmp_lookup(arp_stack_t *as, mblk_t *mp, arl_t **arlp, arl_t **ipmp_arlp)
+{
+ arie_t *arie = (arie_t *)mp->b_rptr;
+
+ *arlp = ar_ll_lookup_from_mp(as, mp);
+ if (*arlp == NULL) {
+ DTRACE_PROBE1(ipmp_lookup_no_arl, mblk_t *, mp);
+ return (B_FALSE);
+ }
+
+ arie->arie_grifname[LIFNAMSIZ - 1] = '\0';
+ *ipmp_arlp = ar_ll_lookup_by_name(as, arie->arie_grifname);
+ if (*ipmp_arlp == NULL) {
+ DTRACE_PROBE1(ipmp_lookup_no_ipmp_arl, mblk_t *, mp);
+ return (B_FALSE);
+ }
+
+ DTRACE_PROBE2(ipmp_lookup, arl_t *, *arlp, arl_t *, *ipmp_arlp);
+ return (B_TRUE);
+}
+
+/*
+ * Bind an arl_t to an IPMP group arl_t.
+ */
+static int
+ar_ipmp_activate(queue_t *q, mblk_t *mp)
+{
+ arl_t *arl, *ipmp_arl;
+ arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
+
+ if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl))
+ return (EINVAL);
+
+ if (arl->arl_ipmp_arl != NULL) {
+ DTRACE_PROBE1(ipmp_activated_already, arl_t *, arl);
+ return (EALREADY);
+ }
+
+ DTRACE_PROBE2(ipmp_activate, arl_t *, arl, arl_t *, ipmp_arl);
+ arl->arl_ipmp_arl = ipmp_arl;
+ return (0);
+}
+
+/*
+ * Unbind an arl_t from an IPMP group arl_t and update the ace_t's so
+ * that it is no longer part of the group.
+ */
+static int
+ar_ipmp_deactivate(queue_t *q, mblk_t *mp)
+{
+ arl_t *arl, *ipmp_arl;
+ arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
+
+ if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl))
+ return (EINVAL);
+
+ if (ipmp_arl != arl->arl_ipmp_arl) {
+ DTRACE_PROBE2(ipmp_deactivate_notactive, arl_t *, arl, arl_t *,
+ ipmp_arl);
+ return (EINVAL);
+ }
+
+ DTRACE_PROBE2(ipmp_deactivate, arl_t *, arl, arl_t *,
+ arl->arl_ipmp_arl);
+ ar_ce_walk(as, ar_ce_ipmp_deactivate, arl);
+ arl->arl_ipmp_arl = NULL;
+ return (0);
+}
+
+/*
* Enable an interface to process ARP_REQUEST and ARP_RESPONSE messages.
*/
/* ARGSUSED */
@@ -2199,6 +2414,11 @@ ar_ll_init(arp_stack_t *as, ar_t *ar, mblk_t *mp)
if ((arl = (arl_t *)mi_zalloc(sizeof (arl_t))) == NULL)
return;
+ if (dlia->dl_mac_type == SUNW_DL_IPMP) {
+ arl->arl_flags |= ARL_F_IPMP;
+ arl->arl_ipmp_arl = arl;
+ }
+
arl->arl_provider_style = dlia->dl_provider_style;
arl->arl_rq = ar->ar_rq;
arl->arl_wq = ar->ar_wq;
@@ -2261,7 +2481,7 @@ ar_ll_set_defaults(arl_t *arl, mblk_t *mp)
dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr;
dl_unitdata_req_t *dlur;
uchar_t *up;
- arlphy_t *ap;
+ arlphy_t *ap;
ASSERT(arl != NULL);
@@ -2270,6 +2490,14 @@ ar_ll_set_defaults(arl_t *arl, mblk_t *mp)
*/
ar_ll_clear_defaults(arl);
+ if (arl->arl_flags & ARL_F_IPMP) {
+ /*
+ * If this is an IPMP arl_t, we have nothing to do,
+ * since we will never transmit or receive.
+ */
+ return;
+ }
+
ap = kmem_zalloc(sizeof (arlphy_t), KM_NOSLEEP);
if (ap == NULL)
goto bad;
@@ -2470,12 +2698,12 @@ ar_mapping_add(queue_t *q, mblk_t *mp_orig)
mblk_t *mp = mp_orig;
ace_t *ace;
uchar_t *hw_addr;
- uint32_t hw_addr_len;
+ uint32_t hw_addr_len;
uchar_t *proto_addr;
- uint32_t proto_addr_len;
+ uint32_t proto_addr_len;
uchar_t *proto_mask;
uchar_t *proto_extract_mask;
- uint32_t hw_extract_start;
+ uint32_t hw_extract_start;
arl_t *arl;
arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
@@ -2524,6 +2752,7 @@ ar_mapping_add(queue_t *q, mblk_t *mp_orig)
proto_mask,
proto_extract_mask,
hw_extract_start,
+ NULL,
arma->arma_flags | ACE_F_MAPPING));
}
@@ -2857,12 +3086,12 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr,
uint32_t proto_addr_len)
{
mblk_t *areq_mp;
- arl_t *arl = ace->ace_arl;
mblk_t *mp;
mblk_t *xmit_mp;
- arp_stack_t *as = ARL_TO_ARPSTACK(arl);
+ queue_t *arl_wq = ace->ace_arl->arl_wq;
+ arp_stack_t *as = ARL_TO_ARPSTACK(ace->ace_arl);
ip_stack_t *ipst = as->as_netstack->netstack_ip;
- arlphy_t *ap = arl->arl_phy;
+ arlphy_t *ap = ace->ace_xmit_arl->arl_phy;
/*
* On error or completion for a query, we need to shut down the timer.
@@ -2870,7 +3099,8 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr,
* Duplicate Address Detection, or it will never finish that phase.
*/
if (!(ace->ace_flags & (ACE_F_UNVERIFIED | ACE_F_AUTHORITY)))
- mi_timer(arl->arl_wq, ace->ace_mp, -1L);
+ mi_timer(arl_wq, ace->ace_mp, -1L);
+
/* Establish the return value appropriate. */
if (ret_val == 0) {
if (!ACE_RESOLVED(ace) || ap == NULL)
@@ -2973,25 +3203,24 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr,
*/
ar_ce_delete(ace);
} else {
- mi_timer(arl->arl_wq, ace->ace_mp,
- as->as_cleanup_interval);
+ mi_timer(arl_wq, ace->ace_mp, as->as_cleanup_interval);
}
}
}
/*
* Returns number of milliseconds after which we should either rexmit or abort.
- * Return of zero means we should abort. src_ace is the ace corresponding
- * to the source address in the areq sent by IP.
+ * Return of zero means we should abort.
*/
static clock_t
-ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace)
+ar_query_xmit(arp_stack_t *as, ace_t *ace)
{
areq_t *areq;
mblk_t *mp;
uchar_t *proto_addr;
uchar_t *sender_addr;
- arl_t *src_arl;
+ ace_t *src_ace;
+ arl_t *xmit_arl = ace->ace_xmit_arl;
mp = ace->ace_query_mp;
/*
@@ -3016,18 +3245,15 @@ ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace)
areq->areq_sender_addr_length);
/*
- * Get the source h/w address for the sender addr. With interface
- * groups, IP sends us source address belonging to a different
- * interface.
+ * Get the ace for the sender address, so that we can verify that
+ * we have one and that DAD has completed.
*/
+ src_ace = ar_ce_lookup(xmit_arl, areq->areq_proto, sender_addr,
+ areq->areq_sender_addr_length);
if (src_ace == NULL) {
- src_ace = ar_ce_lookup_permanent(as, areq->areq_proto,
- sender_addr, areq->areq_sender_addr_length);
- if (src_ace == NULL) {
- DTRACE_PROBE3(xmit_no_source, ace_t *, ace,
- areq_t *, areq, uchar_t *, sender_addr);
- return (0);
- }
+ DTRACE_PROBE3(xmit_no_source, ace_t *, ace, areq_t *, areq,
+ uchar_t *, sender_addr);
+ return (0);
}
/*
@@ -3044,18 +3270,12 @@ ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace)
return (areq->areq_xmit_interval);
}
- /*
- * Transmit on src_arl. We should transmit on src_arl. Otherwise
- * the switch will send back a copy on other interfaces of the
- * same group and as we could be using somebody else's source
- * address + hardware address, ARP will treat this as a bogon.
- */
- src_arl = src_ace->ace_arl;
DTRACE_PROBE3(xmit_send, ace_t *, ace, ace_t *, src_ace,
areq_t *, areq);
- ar_xmit(src_arl, ARP_REQUEST, areq->areq_proto,
- areq->areq_sender_addr_length, src_arl->arl_phy->ap_hw_addr,
- sender_addr, src_arl->arl_phy->ap_arp_addr, proto_addr, NULL, as);
+
+ ar_xmit(xmit_arl, ARP_REQUEST, areq->areq_proto,
+ areq->areq_sender_addr_length, xmit_arl->arl_phy->ap_hw_addr,
+ sender_addr, xmit_arl->arl_phy->ap_arp_addr, proto_addr, NULL, as);
src_ace->ace_last_bcast = ddi_get_lbolt();
return (areq->areq_xmit_interval);
}
@@ -3066,6 +3286,7 @@ ar_rput(queue_t *q, mblk_t *mp)
{
arh_t *arh;
arl_t *arl;
+ arl_t *client_arl;
ace_t *dst_ace;
uchar_t *dst_paddr;
int err;
@@ -3079,6 +3300,8 @@ ar_rput(queue_t *q, mblk_t *mp)
uchar_t *src_paddr;
uchar_t *dst_haddr;
boolean_t is_probe;
+ boolean_t is_unicast = B_FALSE;
+ dl_unitdata_ind_t *dlindp;
int i;
arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
@@ -3135,9 +3358,10 @@ ar_rput(queue_t *q, mblk_t *mp)
return;
case M_PCPROTO:
case M_PROTO:
+ dlindp = (dl_unitdata_ind_t *)mp->b_rptr;
if (MBLKL(mp) >= sizeof (dl_unitdata_ind_t) &&
- ((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive ==
- DL_UNITDATA_IND) {
+ dlindp->dl_primitive == DL_UNITDATA_IND) {
+ is_unicast = (dlindp->dl_group_address == 0);
arl = ((ar_t *)q->q_ptr)->ar_arl;
if (arl != NULL && arl->arl_phy != NULL) {
/* Real messages from the wire! */
@@ -3261,19 +3485,24 @@ ar_rput(queue_t *q, mblk_t *mp)
* RFC 826: first check if the <protocol, sender protocol address> is
* in the cache, if there is a sender protocol address. Note that this
* step also handles resolutions based on source.
+ *
+ * Note that IP expects that each notification it receives will be
+ * tied to the ill it received it on. Thus, we must talk to it over
+ * the arl tied to the resolved IP address (if any), hence client_arl.
*/
if (is_probe)
err = AR_NOTFOUND;
else
err = ar_ce_resolve_all(arl, proto, src_haddr, hlen, src_paddr,
- plen);
+ plen, &client_arl);
+
switch (err) {
case AR_BOGON:
- ar_client_notify(arl, mp1, AR_CN_BOGON);
+ ar_client_notify(client_arl, mp1, AR_CN_BOGON);
mp1 = NULL;
break;
case AR_FAILED:
- ar_client_notify(arl, mp1, AR_CN_FAILED);
+ ar_client_notify(client_arl, mp1, AR_CN_FAILED);
mp1 = NULL;
break;
case AR_LOOPBACK:
@@ -3293,7 +3522,9 @@ ar_rput(queue_t *q, mblk_t *mp)
* Now look up the destination address. By RFC 826, we ignore the
* packet at this step if the target isn't one of our addresses. This
* is true even if the target is something we're trying to resolve and
- * the packet is a response.
+ * the packet is a response. To avoid duplicate responses, we also
+ * ignore the packet if it was multicast/broadcast to an arl that's in
+ * an IPMP group but was not the designated xmit_arl for the ACE.
*
* Note that in order to do this correctly, we need to know when to
* notify IP of a change implied by the source address of the ARP
@@ -3304,6 +3535,7 @@ ar_rput(queue_t *q, mblk_t *mp)
*/
dst_ace = ar_ce_lookup_entry(arl, proto, dst_paddr, plen);
if (dst_ace == NULL || !ACE_RESOLVED(dst_ace) ||
+ (dst_ace->ace_xmit_arl != arl && !is_unicast) ||
!(dst_ace->ace_flags & ACE_F_PUBLISH)) {
/*
* Let the client know if the source mapping has changed, even
@@ -3311,7 +3543,7 @@ ar_rput(queue_t *q, mblk_t *mp)
* client.
*/
if (err == AR_CHANGED)
- ar_client_notify(arl, mp1, AR_CN_ANNOUNCE);
+ ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE);
else
freemsg(mp1);
freeb(mp);
@@ -3341,6 +3573,7 @@ ar_rput(queue_t *q, mblk_t *mp)
"arp_rput_end: q %p (%S)", q, "reflection");
return;
}
+
/*
* Conflicts seen via the wrong interface may be bogus.
* Multiple interfaces on the same segment imply any conflict
@@ -3378,12 +3611,21 @@ ar_rput(queue_t *q, mblk_t *mp)
* the src_paddr field before sending it to IP. The same is
* required for probes, where src_paddr will be INADDR_ANY.
*/
- if (is_probe || op == ARP_RESPONSE) {
+ if (is_probe) {
+ /*
+ * In this case, client_arl will be invalid (e.g.,
+ * since probes don't have a valid sender address).
+ * But dst_ace has the appropriate arl.
+ */
bcopy(dst_paddr, src_paddr, plen);
- ar_client_notify(arl, mp1, AR_CN_FAILED);
+ ar_client_notify(dst_ace->ace_arl, mp1, AR_CN_FAILED);
+ ar_ce_delete(dst_ace);
+ } else if (op == ARP_RESPONSE) {
+ bcopy(dst_paddr, src_paddr, plen);
+ ar_client_notify(client_arl, mp1, AR_CN_FAILED);
ar_ce_delete(dst_ace);
} else if (err == AR_CHANGED) {
- ar_client_notify(arl, mp1, AR_CN_ANNOUNCE);
+ ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE);
} else {
DTRACE_PROBE3(rput_request_unverified, arl_t *, arl,
arh_t *, arh, ace_t *, dst_ace);
@@ -3431,19 +3673,19 @@ ar_rput(queue_t *q, mblk_t *mp)
dst_ace->ace_hw_addr, dst_ace->ace_proto_addr,
src_haddr, src_paddr, dstaddr, as);
if (!is_probe && err == AR_NOTFOUND &&
- ar_ce_create(arl, proto, src_haddr, hlen, src_paddr, plen,
- NULL, NULL, 0, 0) == 0) {
+ ar_ce_create(OWNING_ARL(arl), proto, src_haddr, hlen,
+ src_paddr, plen, NULL, NULL, 0, NULL, 0) == 0) {
ace_t *ace;
ace = ar_ce_lookup(arl, proto, src_paddr, plen);
ASSERT(ace != NULL);
- mi_timer(arl->arl_wq, ace->ace_mp,
+ mi_timer(ace->ace_arl->arl_wq, ace->ace_mp,
as->as_cleanup_interval);
}
}
if (err == AR_CHANGED) {
freeb(mp);
- ar_client_notify(arl, mp1, AR_CN_ANNOUNCE);
+ ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "reqchange");
} else {
@@ -3459,7 +3701,7 @@ ar_ce_restart_dad(ace_t *ace, void *arl_arg)
arl_t *arl = arl_arg;
arp_stack_t *as = ARL_TO_ARPSTACK(arl);
- if ((ace->ace_arl == arl) &&
+ if ((ace->ace_xmit_arl == arl) &&
(ace->ace_flags & (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) ==
(ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) {
/*
@@ -4060,9 +4302,9 @@ ar_wput(queue_t *q, mblk_t *mp)
static boolean_t
arp_say_ready(ace_t *ace)
{
- mblk_t *mp;
+ mblk_t *mp;
arl_t *arl = ace->ace_arl;
- arlphy_t *ap = arl->arl_phy;
+ arlphy_t *ap = ace->ace_xmit_arl->arl_phy;
arh_t *arh;
uchar_t *cp;
@@ -4107,7 +4349,7 @@ ace_reschedule(ace_t *ace, void *arg)
ace_t **acemax;
ace_t *atemp;
- if (ace->ace_arl != art->art_arl)
+ if (ace->ace_xmit_arl != art->art_arl)
return;
/*
* Only published entries that are ready for announcement are eligible.
@@ -4179,7 +4421,6 @@ static void
ar_wsrv(queue_t *q)
{
ace_t *ace;
- arl_t *arl;
arlphy_t *ap;
mblk_t *mp;
clock_t ms;
@@ -4196,8 +4437,7 @@ ar_wsrv(queue_t *q)
ace = (ace_t *)mp->b_rptr;
if (ace->ace_flags & ACE_F_DYING)
continue;
- arl = ace->ace_arl;
- ap = arl->arl_phy;
+ ap = ace->ace_xmit_arl->arl_phy;
if (ace->ace_flags & ACE_F_UNVERIFIED) {
ASSERT(ace->ace_flags & ACE_F_PUBLISH);
ASSERT(ace->ace_query_mp == NULL);
@@ -4216,7 +4456,7 @@ ar_wsrv(queue_t *q)
DTRACE_PROBE1(timer_probe,
ace_t *, ace);
ace->ace_xmit_count--;
- ar_xmit(arl, ARP_REQUEST,
+ ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
ace->ace_proto,
ace->ace_proto_addr_length,
ace->ace_hw_addr, NULL, NULL,
@@ -4247,7 +4487,7 @@ ar_wsrv(queue_t *q)
now - ap->ap_defend_start >
SEC_TO_TICK(as->as_defend_period)) {
ap->ap_defend_start = now;
- arl_reschedule(arl);
+ arl_reschedule(ace->ace_xmit_arl);
}
/*
* Finish the job that we started in
@@ -4288,12 +4528,12 @@ ar_wsrv(queue_t *q)
DTRACE_PROBE1(timer_defend,
ace_t *, ace);
}
- ar_xmit(arl, ARP_REQUEST,
+ ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
ace->ace_proto,
ace->ace_proto_addr_length,
ace->ace_hw_addr,
ace->ace_proto_addr,
- ap->ap_arp_addr,
+ ace->ace_xmit_arl->arl_phy->ap_arp_addr,
ace->ace_proto_addr, NULL, as);
ace->ace_last_bcast = now;
if (ace->ace_xmit_count == 0)
@@ -4316,7 +4556,8 @@ ar_wsrv(queue_t *q)
ndp_lookup_ipaddr(*(ipaddr_t *)
ace->ace_proto_addr, as->as_netstack)) {
ace->ace_flags |= ACE_F_OLD;
- mi_timer(arl->arl_wq, ace->ace_mp,
+ mi_timer(ace->ace_arl->arl_wq,
+ ace->ace_mp,
as->as_cleanup_interval);
} else {
ar_delete_notify(ace);
@@ -4333,7 +4574,7 @@ ar_wsrv(queue_t *q)
* we complete the operation with a failure indication.
* Otherwise, we restart the timer.
*/
- ms = ar_query_xmit(as, ace, NULL);
+ ms = ar_query_xmit(as, ace);
if (ms == 0)
ar_query_reply(ace, ENXIO, NULL, (uint32_t)0);
else
@@ -4360,6 +4601,8 @@ ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, uint32_t plen,
mblk_t *mp;
arlphy_t *ap = arl->arl_phy;
+ ASSERT(!(arl->arl_flags & ARL_F_IPMP));
+
if (ap == NULL) {
DTRACE_PROBE1(xmit_no_arl_phy, arl_t *, arl);
return;
diff --git a/usr/src/uts/common/inet/arp_impl.h b/usr/src/uts/common/inet/arp_impl.h
index a2564d5602..f16fdc97a0 100644
--- a/usr/src/uts/common/inet/arp_impl.h
+++ b/usr/src/uts/common/inet/arp_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -67,6 +67,7 @@ typedef struct arl_s {
uint_t arl_closing : 1; /* stream is closing */
uint32_t arl_index; /* instance number */
struct arlphy_s *arl_phy; /* physical info, if any */
+ struct arl_s *arl_ipmp_arl; /* pointer to group arl_t */
} arl_t;
/*
@@ -75,7 +76,7 @@ typedef struct arl_s {
*/
#define ARL_TO_ARPSTACK(_arl) (((ar_t *)(_arl)->arl_rq->q_ptr)->ar_as)
-/* ARL physical info structure for a link level device */
+/* ARL physical info structure, one per physical link level device */
typedef struct arlphy_s {
uint32_t ap_arp_hw_type; /* hardware type */
uchar_t *ap_arp_addr; /* multicast address to use */
@@ -110,6 +111,7 @@ typedef struct ace_s {
clock_t ace_last_bcast; /* last broadcast Response */
clock_t ace_xmit_interval;
int ace_xmit_count;
+ arl_t *ace_xmit_arl; /* xmit on this arl */
} ace_t;
#define ARPHOOK_INTERESTED_PHYSICAL_IN(as) \
@@ -216,6 +218,7 @@ struct arp_stack {
typedef struct arp_stack arp_stack_t;
#define ARL_F_NOARP 0x01
+#define ARL_F_IPMP 0x02
#define ARL_S_DOWN 0x00
#define ARL_S_PENDING 0x01
diff --git a/usr/src/uts/common/inet/dlpistub/dlpistub.c b/usr/src/uts/common/inet/dlpistub/dlpistub.c
new file mode 100644
index 0000000000..961876ac47
--- /dev/null
+++ b/usr/src/uts/common/inet/dlpistub/dlpistub.c
@@ -0,0 +1,370 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * DLPI stub driver; currently supports VNI and IPMP stub devices.
+ */
+
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/dlpi.h>
+#include <sys/stat.h>
+#include <sys/strsun.h>
+#include <sys/stropts.h>
+#include <sys/types.h>
+#include <sys/id_space.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/modctl.h>
+#include <sys/mkdev.h>
+#include <sys/sdt.h>
+
+#include "dlpistub_impl.h"
+
+static id_space_t *ds_minors;
+static dev_info_t *ds_dip;
+
+/*
+ * DL_INFO_ACK template.
+ */
+static dl_info_ack_t ds_infoack = {
+ DL_INFO_ACK, /* dl_primitive */
+ 0, /* dl_max_sdu */
+ 0, /* dl_min_sdu */
+ 0, /* dl_addr_length */
+ 0, /* dl_mac_type */
+ 0, /* dl_reserved */
+ 0, /* dl_current_state */
+ 0, /* dl_sap_length */
+ DL_CLDLS, /* dl_service_mode */
+ 0, /* dl_qos_length */
+ 0, /* dl_qos_offset */
+ 0, /* dl_qos_range_length */
+ 0, /* dl_qos_range_offset */
+ DL_STYLE2, /* dl_provider_style */
+ 0, /* dl_addr_offset */
+ DL_VERSION_2, /* dl_version */
+ 0, /* dl_brdcst_addr_length */
+ 0, /* dl_brdcst_addr_offset */
+ 0 /* dl_growth */
+};
+
+static int
+ds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ if (ddi_create_minor_node(dip, "vni", S_IFCHR, DS_MINOR_VNI,
+ DDI_PSEUDO, 0) == DDI_FAILURE ||
+ ddi_create_minor_node(dip, "ipmpstub", S_IFCHR, DS_MINOR_IPMP,
+ DDI_PSEUDO, 0) == DDI_FAILURE) {
+ ddi_remove_minor_node(dip, NULL);
+ cmn_err(CE_NOTE, "ds_attach: cannot create minor nodes");
+ return (DDI_FAILURE);
+ }
+
+ ds_dip = dip;
+ ds_minors = id_space_create("ds_minors", DS_MINOR_START, MAXMIN32);
+ return (DDI_SUCCESS);
+}
+
+static int
+ds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ id_space_destroy(ds_minors);
+ ds_minors = NULL;
+ ASSERT(dip == ds_dip);
+ ddi_remove_minor_node(dip, NULL);
+ ds_dip = NULL;
+ return (DDI_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+ds_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+ int error = DDI_FAILURE;
+
+ switch (infocmd) {
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)0;
+ error = DDI_SUCCESS;
+ break;
+ case DDI_INFO_DEVT2DEVINFO:
+ if (ds_dip != NULL) {
+ *result = ds_dip;
+ error = DDI_SUCCESS;
+ }
+ break;
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+ds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
+{
+ int type;
+ dlpistub_t *dsp;
+
+ if (sflag == CLONEOPEN || sflag == MODOPEN)
+ return (EINVAL);
+
+ if (q->q_ptr != NULL)
+ return (0);
+
+ switch (getminor(*devp)) {
+ case DS_MINOR_VNI:
+ type = SUNW_DL_VNI;
+ break;
+ case DS_MINOR_IPMP:
+ type = SUNW_DL_IPMP;
+ break;
+ default:
+ return (ENXIO);
+ }
+
+ dsp = kmem_zalloc(sizeof (dlpistub_t), KM_SLEEP);
+ dsp->ds_type = type;
+ dsp->ds_minor = (minor_t)id_alloc(ds_minors);
+ dsp->ds_state = DL_UNATTACHED;
+ *devp = makedevice(getmajor(*devp), dsp->ds_minor);
+ q->q_ptr = WR(q)->q_ptr = dsp;
+ qprocson(q);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+ds_close(queue_t *q, int flag, cred_t *credp)
+{
+ dlpistub_t *dsp = q->q_ptr;
+
+ qprocsoff(q);
+ q->q_ptr = WR(q)->q_ptr = NULL;
+
+ id_free(ds_minors, dsp->ds_minor);
+ kmem_free(dsp, sizeof (dlpistub_t));
+
+ return (0);
+}
+
+static int
+ds_badprim(queue_t *q, mblk_t *mp, t_scalar_t prim)
+{
+ dlerrorack(q, mp, prim, DL_BADPRIM, 0);
+ return (0);
+}
+
+static int
+ds_outstate(queue_t *q, mblk_t *mp, t_scalar_t prim)
+{
+ dlerrorack(q, mp, prim, DL_OUTSTATE, 0);
+ return (0);
+}
+
+static int
+ds_wput(queue_t *q, mblk_t *mp)
+{
+ union DL_primitives *dlp;
+ dl_info_ack_t *dlip;
+ dlpistub_t *dsp = q->q_ptr;
+ t_scalar_t prim;
+
+ switch (DB_TYPE(mp)) {
+ case M_PROTO:
+ case M_PCPROTO:
+ if (MBLKL(mp) < sizeof (t_scalar_t)) {
+ dlerrorack(q, mp, DL_PRIM_INVAL, DL_UNSUPPORTED, 0);
+ return (0);
+ }
+
+ dlp = (void *)mp->b_rptr;
+ prim = dlp->dl_primitive;
+ switch (prim) {
+ case DL_ATTACH_REQ:
+ if (MBLKL(mp) < DL_ATTACH_REQ_SIZE)
+ return (ds_badprim(q, mp, prim));
+
+ if (dsp->ds_state != DL_UNATTACHED)
+ return (ds_outstate(q, mp, prim));
+
+ dsp->ds_state = DL_UNBOUND;
+ dlokack(q, mp, DL_ATTACH_REQ);
+ break;
+
+ case DL_BIND_REQ:
+ if (MBLKL(mp) < DL_BIND_REQ_SIZE)
+ return (ds_badprim(q, mp, prim));
+
+ if (dsp->ds_state != DL_UNBOUND)
+ return (ds_outstate(q, mp, prim));
+
+ dsp->ds_state = DL_IDLE;
+ dlbindack(q, mp, dlp->bind_req.dl_sap, NULL, 0, 0, 0);
+ break;
+
+ case DL_INFO_REQ:
+ if (MBLKL(mp) < DL_INFO_REQ_SIZE)
+ return (ds_badprim(q, mp, prim));
+
+ mp = mexchange(q, mp, sizeof (dl_info_ack_t),
+ M_PCPROTO, DL_INFO_ACK);
+ if (mp != NULL) {
+ dlip = (void *)mp->b_rptr;
+ *dlip = ds_infoack;
+ dlip->dl_mac_type = dsp->ds_type;
+ dlip->dl_current_state = dsp->ds_state;
+ qreply(q, mp);
+ }
+ break;
+
+ case DL_PHYS_ADDR_REQ:
+ if (MBLKL(mp) < DL_PHYS_ADDR_REQ_SIZE)
+ return (ds_badprim(q, mp, prim));
+
+ dlphysaddrack(q, mp, NULL, 0);
+ break;
+
+ case DL_UNBIND_REQ:
+ if (MBLKL(mp) < DL_UNBIND_REQ_SIZE)
+ return (ds_badprim(q, mp, prim));
+
+ if (dsp->ds_state != DL_IDLE)
+ return (ds_outstate(q, mp, prim));
+
+ dsp->ds_state = DL_UNBOUND;
+ dlokack(q, mp, DL_UNBIND_REQ);
+ break;
+
+ case DL_DETACH_REQ:
+ if (MBLKL(mp) < DL_DETACH_REQ_SIZE)
+ return (ds_badprim(q, mp, prim));
+
+ if (dsp->ds_state != DL_UNBOUND)
+ return (ds_outstate(q, mp, prim));
+
+ dsp->ds_state = DL_UNATTACHED;
+ dlokack(q, mp, DL_DETACH_REQ);
+ break;
+
+ case DL_UNITDATA_REQ:
+ DTRACE_PROBE2(dlpistub__data, dlpistub_t *, dsp,
+ mblk_t *, mp);
+ freemsg(mp);
+ break;
+
+ default:
+ dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
+ }
+ break;
+
+ case M_IOCTL:
+ miocnak(q, mp, 0, EINVAL);
+ break;
+
+ case M_FLUSH:
+ *mp->b_rptr &= ~FLUSHW;
+ if (*mp->b_rptr & FLUSHR)
+ qreply(q, mp);
+ else
+ freemsg(mp);
+ break;
+ default:
+ freemsg(mp);
+ break;
+ }
+
+ return (0);
+}
+
+static struct module_info ds_minfo = {
+ DS_IDNUM, /* mi_idnum */
+ "dlpistub", /* mi_idname */
+ 0, /* mi_minpsz */
+ INFPSZ, /* mi_maxpsz */
+ 0, /* mi_hiwat */
+ 0, /* mi_lowat */
+};
+
+static struct qinit ds_rinit = {
+ NULL, /* qi_putp */
+ NULL, /* qi_srvp */
+ ds_open, /* qi_qopen */
+ ds_close, /* qi_qclose */
+ NULL, /* qi_qadmin */
+ &ds_minfo, /* qi_minfo */
+};
+
+static struct qinit ds_winit = {
+ ds_wput, /* qi_putp */
+ NULL, /* qi_srvp */
+ NULL, /* qi_qopen */
+ NULL, /* qi_qclose */
+ NULL, /* qi_qadmin */
+ &ds_minfo, /* qi_minfo */
+};
+
+static struct streamtab ds_info = {
+ &ds_rinit, /* st_rdinit */
+ &ds_winit /* st_wrinit */
+};
+
+DDI_DEFINE_STREAM_OPS(ds_ops, nulldev, nulldev, ds_attach, ds_detach,
+ nodev, ds_devinfo, D_MP|D_MTPERMOD, &ds_info, ddi_quiesce_not_supported);
+
+static struct modldrv modldrv = {
+ &mod_driverops,
+ "DLPI stub driver",
+ &ds_ops,
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, &modldrv, NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_fini(void)
+{
+ return (mod_remove(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
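
As a usage sketch, the DLPI state machine that ds_wput() implements can be exercised over the stub's character node with putmsg()/getmsg(). The program below is hypothetical: the /dev/ipmpstub path and the expectation that DL_INFO_ACK reports dl_mac_type == SUNW_DL_IPMP are assumptions drawn from ds_attach() and ds_infoack above; in practice IP and ARP open this device internally rather than a userland consumer.

#include <sys/types.h>
#include <sys/stropts.h>
#include <sys/dlpi.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int		fd, flags = 0;
	char		buf[512];
	struct strbuf	ctl = { sizeof (buf), 0, buf };
	dl_attach_req_t	attach = { DL_ATTACH_REQ, 0 };
	dl_info_req_t	info = { DL_INFO_REQ };
	dl_info_ack_t	*ack;

	if ((fd = open("/dev/ipmpstub", O_RDWR)) == -1) {
		perror("open /dev/ipmpstub");
		return (1);
	}

	/* DL_ATTACH_REQ for PPA 0; the stub is DL_STYLE2, so attach first */
	ctl.len = DL_ATTACH_REQ_SIZE;
	(void) memcpy(buf, &attach, DL_ATTACH_REQ_SIZE);
	(void) putmsg(fd, &ctl, NULL, 0);
	ctl.maxlen = sizeof (buf);
	(void) getmsg(fd, &ctl, NULL, &flags);	/* expect DL_OK_ACK */

	/* DL_INFO_REQ; the stub answers with its ds_infoack template */
	ctl.len = DL_INFO_REQ_SIZE;
	(void) memcpy(buf, &info, DL_INFO_REQ_SIZE);
	(void) putmsg(fd, &ctl, NULL, 0);
	ctl.maxlen = sizeof (buf);
	(void) getmsg(fd, &ctl, NULL, &flags);

	ack = (dl_info_ack_t *)(void *)buf;
	(void) printf("dl_mac_type = %u\n", ack->dl_mac_type);

	(void) close(fd);
	return (0);
}
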
diff --git a/usr/src/uts/common/inet/vni/vni.conf b/usr/src/uts/common/inet/dlpistub/dlpistub.conf
index d79915e01c..72264ca466 100644
--- a/usr/src/uts/common/inet/vni/vni.conf
+++ b/usr/src/uts/common/inet/dlpistub/dlpistub.conf
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -20,10 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-
-#ident "%Z%%M% %I% %E% SMI"
-#
-name="vni" parent="pseudo" instance=0;
+name="dlpistub" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h b/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h
new file mode 100644
index 0000000000..ece15320ee
--- /dev/null
+++ b/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h
@@ -0,0 +1,49 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _INET_DLPISTUB_IMPL_H
+#define _INET_DLPISTUB_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+
+typedef struct dlpistub {
+ int ds_type; /* DLPI MAC type */
+ t_uscalar_t ds_state; /* DLPI state */
+ minor_t ds_minor; /* corresponding minor */
+} dlpistub_t;
+
+#define DS_IDNUM 0x2a84
+
+enum { DS_MINOR_VNI = 1, DS_MINOR_IPMP, DS_MINOR_START };
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _INET_DLPISTUB_IMPL_H */
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index 323c8fd0de..41595280cb 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -56,6 +56,7 @@ extern "C" {
#include <net/route.h>
#include <sys/systm.h>
#include <sys/multidata.h>
+#include <sys/list.h>
#include <net/radix.h>
#include <sys/modhash.h>
@@ -565,15 +566,21 @@ typedef struct ipha_s {
#define IPH_ECN_ECT0 0x2 /* ECN-Capable Transport, ECT(0) */
#define IPH_ECN_CE 0x3 /* ECN-Congestion Experienced (CE) */
+struct ill_s;
+
+typedef boolean_t ip_v6intfid_func_t(struct ill_s *, in6_addr_t *);
+typedef boolean_t ip_v6mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *,
+ in6_addr_t *);
+typedef boolean_t ip_v4mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *,
+ ipaddr_t *);
+
/* IP Mac info structure */
typedef struct ip_m_s {
- t_uscalar_t ip_m_mac_type; /* From <sys/dlpi.h> */
- int ip_m_type; /* From <net/if_types.h> */
- boolean_t (*ip_m_v4mapinfo)(uint_t, uint8_t *, uint8_t *,
- uint32_t *, ipaddr_t *);
- boolean_t (*ip_m_v6mapinfo)(uint_t, uint8_t *, uint8_t *,
- uint32_t *, in6_addr_t *);
- boolean_t (*ip_m_v6intfid)(uint_t, uint8_t *, in6_addr_t *);
+ t_uscalar_t ip_m_mac_type; /* From <sys/dlpi.h> */
+ int ip_m_type; /* From <net/if_types.h> */
+ ip_v4mapinfo_func_t *ip_m_v4mapinfo;
+ ip_v6mapinfo_func_t *ip_m_v6mapinfo;
+ ip_v6intfid_func_t *ip_m_v6intfid;
} ip_m_t;
/*
@@ -583,18 +590,22 @@ typedef struct ip_m_s {
* layer multicast address range.
* b. map from IPv6 multicast address range (ff00::/8) to the link
* layer multicast address range.
- * c. derive the default IPv6 interface identifier from the link layer
- * address.
+ * c. derive the default IPv6 interface identifier from the interface.
+ * d. derive the default IPv6 destination interface identifier from
+ * the interface (point-to-point only).
*/
#define MEDIA_V4MINFO(ip_m, plen, bphys, maddr, hwxp, v4ptr) \
(((ip_m)->ip_m_v4mapinfo != NULL) && \
(*(ip_m)->ip_m_v4mapinfo)(plen, bphys, maddr, hwxp, v4ptr))
-#define MEDIA_V6INTFID(ip_m, plen, phys, v6ptr) \
- (((ip_m)->ip_m_v6intfid != NULL) && \
- (*(ip_m)->ip_m_v6intfid)(plen, phys, v6ptr))
#define MEDIA_V6MINFO(ip_m, plen, bphys, maddr, hwxp, v6ptr) \
(((ip_m)->ip_m_v6mapinfo != NULL) && \
(*(ip_m)->ip_m_v6mapinfo)(plen, bphys, maddr, hwxp, v6ptr))
+#define MEDIA_V6INTFID(ip_m, ill, v6ptr) \
+ (((ip_m)->ip_m_v6intfid != NULL) && \
+ (*(ip_m)->ip_m_v6intfid)(ill, v6ptr))
+#define MEDIA_V6DESTINTFID(ip_m, ill, v6ptr) \
+ (((ip_m)->ip_m_v6destintfid != NULL) && \
+ (*(ip_m)->ip_m_v6destintfid)(ill, v6ptr))
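
The mapping hooks in ip_m_t are now named function types, and the interface-id hooks take the ill itself rather than a raw link-layer address, so a media module can consult per-ill state. A minimal sketch of a conforming hook follows; the function name is invented for illustration, and the real hooks are registered in the ip_m_t tables in ip_if.c.

    /*
     * Sketch only: "media_v6intfid" is a hypothetical hook matching the
     * ip_v6intfid_func_t typedef above.  MEDIA_V6INTFID() checks the
     * ip_m_v6intfid pointer for NULL before invoking it, so media types
     * without a usable id simply leave the hook unset.
     */
    static boolean_t
    media_v6intfid(struct ill_s *ill, in6_addr_t *v6addr)
    {
        /*
         * Derive the default interface id from per-ill state (for
         * example an EUI-64 built from the link-layer address) and
         * store it in v6addr.  Returning B_FALSE tells the caller no
         * media-specific id is available, so it can fall back to
         * another source such as a random id.
         */
        return (B_FALSE);
    }
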
/* Router entry types */
#define IRE_BROADCAST 0x0001 /* Route entry for broadcast address */
@@ -621,18 +632,12 @@ typedef struct ip_m_s {
* the bucket should delete this IRE from this bucket.
*/
#define IRE_MARK_CONDEMNED 0x0001
+
/*
- * If a broadcast IRE is marked with IRE_MARK_NORECV, ip_rput will drop the
- * broadcast packets received on that interface. This is marked only
- * on broadcast ires. Employed by IPMP, where we have multiple NICs on the
- * same subnet receiving the same broadcast packet.
- */
-#define IRE_MARK_NORECV 0x0002
-/*
- * IRE_CACHE marked this way won't be returned by ire_cache_lookup. Need
- * to look specifically using MATCH_IRE_MARK_HIDDEN. Used by IPMP.
+ * An IRE with IRE_MARK_TESTHIDDEN is used by in.mpathd for test traffic. It
+ * can only be looked up by requesting MATCH_IRE_MARK_TESTHIDDEN.
*/
-#define IRE_MARK_HIDDEN 0x0004 /* Typically Used by in.mpathd */
+#define IRE_MARK_TESTHIDDEN 0x0004
/*
* An IRE with IRE_MARK_NOADD is created in ip_newroute_ipif when the outgoing
@@ -788,45 +793,18 @@ typedef struct mrec_s {
* ilm records the state of multicast memberships with the driver and is
* maintained per interface.
*
- * Notes :
- *
- * 1) There is no direct link between a given ilg and ilm. If the
- * application has joined a group G with ifindex I, we will have
- * an ilg with ilg_v6group and ilg_ill. There will be a corresponding
- * ilm with ilm_ill/ilm_v6addr recording the multicast membership.
- * To delete the membership,
- *
- * a) Search for ilg matching on G and I with ilg_v6group
- * and ilg_ill. Delete ilg_ill.
- * b) Search the corresponding ilm matching on G and I with
- * ilm_v6addr and ilm_ill. Delete ilm.
- *
- * In IPv4, the only difference is, we look using ipifs instead of
- * ills.
- *
- * 2) With IP multipathing, we want to keep receiving even after the
- * interface has failed. We do this by moving multicast memberships
- * to a new_ill within the group. This is achieved by sending
- * DL_DISABMULTI_REQS on ilg_ill/ilm_ill and sending DL_ENABMULTIREQS
- * on the new_ill and changing ilg_ill/ilm_ill to new_ill. But, we
- * need to be able to delete memberships which will still come down
- * with the ifindex of the old ill which is what the application
- * knows of. Thus we store the ilm_/ilg_orig_ifindex to keep track
- * of where we joined initially so that we can lookup even after we
- * moved the membership. It is also used for moving back the membership
- * when the old ill has been repaired. This is done by looking up for
- * ilms with ilm_orig_ifindex matching on the old ill's ifindex. Only
- * ilms actually move from old ill to new ill. ilgs don't move (just
- * the ilg_ill is changed when it moves) as it just records the state
- * of the application that has joined a group G where as ilm records
- * the state joined with the driver. Thus when we send DL_XXXMULTI_REQs
- * we also need to keep the ilm in the right ill.
- *
- * In IPv4, as ipifs move from old ill to new_ill, ilgs and ilms move
- * implicitly as we use only ipifs in IPv4. Thus, one can always lookup
- * a given ilm/ilg even after it fails without the support of
- * orig_ifindex. We move ilms still to record the driver state as
- * mentioned above.
+ * There is no direct link between a given ilg and ilm. If the
+ * application has joined a group G with ifindex I, we will have
+ * an ilg with ilg_v6group and ilg_ill. There will be a corresponding
+ * ilm with ilm_ill/ilm_v6addr recording the multicast membership.
+ * To delete the membership:
+ *
+ * a) Search for ilg matching on G and I with ilg_v6group
+ * and ilg_ill. Delete ilg_ill.
+ * b) Search the corresponding ilm matching on G and I with
+ * ilm_v6addr and ilm_ill. Delete ilm.
+ *
+ * For IPv4 the only difference is that we look using ipifs, not ills.
*/
/*
@@ -839,7 +817,6 @@ typedef struct ilg_s {
in6_addr_t ilg_v6group;
struct ipif_s *ilg_ipif; /* Logical interface we are member on */
struct ill_s *ilg_ill; /* Used by IPv6 */
- int ilg_orig_ifindex; /* Interface originally joined on */
uint_t ilg_flags;
mcast_record_t ilg_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */
slist_t *ilg_filter;
@@ -866,9 +843,7 @@ typedef struct ilm_s {
struct ilm_s *ilm_next; /* Linked list for each ill */
uint_t ilm_state; /* state of the membership */
struct ill_s *ilm_ill; /* Back pointer to ill for IPv6 */
- int ilm_orig_ifindex; /* V6_MULTICAST_IF/ilm_ipif index */
uint_t ilm_flags;
- boolean_t ilm_is_new; /* new ilm */
boolean_t ilm_notify_driver; /* Need to notify the driver */
zoneid_t ilm_zoneid;
int ilm_no_ilg_cnt; /* number of joins w/ no ilg */
@@ -881,28 +856,11 @@ typedef struct ilm_s {
#define ilm_addr V4_PART_OF_V6(ilm_v6addr)
-/*
- * ilm_walker_cleanup needs to execute when the ilm_walker_cnt goes down to
- * zero. In addition it needs to block new walkers while it is unlinking ilm's
- * from the list. Thus simple atomics for the ill_ilm_walker_cnt don't suffice.
- */
-#define ILM_WALKER_HOLD(ill) { \
- mutex_enter(&(ill)->ill_lock); \
- ill->ill_ilm_walker_cnt++; \
- mutex_exit(&(ill)->ill_lock); \
-}
-
-/*
- * ilm_walker_cleanup releases ill_lock
- */
-#define ILM_WALKER_RELE(ill) { \
- mutex_enter(&(ill)->ill_lock); \
- (ill)->ill_ilm_walker_cnt--; \
- if ((ill)->ill_ilm_walker_cnt == 0 && (ill)->ill_ilm_cleanup_reqd) \
- ilm_walker_cleanup(ill); \
- else \
- mutex_exit(&(ill)->ill_lock); \
-}
+typedef struct ilm_walker {
+ struct ill_s *ilw_ill; /* associated ill */
+ struct ill_s *ilw_ipmp_ill; /* associated ipmp ill (if any) */
+ struct ill_s *ilw_walk_ill; /* current ill being walked */
+} ilm_walker_t;
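
The walker hides whether the starting ill is an IPMP underlying or meta-interface and replaces the old hold/release macros. The traversal pattern, mirroring the converted loops in igmp.c later in this patch, looks roughly like this:

    /* sketch of the ilm walker pattern used by the igmp.c/mld code below */
    ilm_walker_t ilw;
    ilm_t *ilm;

    for (ilm = ilm_walker_start(&ilw, ill); ilm != NULL;
        ilm = ilm_walker_step(&ilw, ilm)) {
        /* examine ilm->ilm_v6addr, ilm->ilm_state, ... */
    }
    ilm_walker_finish(&ilw);
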
/*
* Soft reference to an IPsec SA.
@@ -1047,11 +1005,8 @@ typedef struct conn_s conn_t;
* ipc_acking_unbind conn_acking_unbind
* ipc_pad_to_bit_31 conn_pad_to_bit_31
*
- * ipc_nofailover_ill conn_nofailover_ill
- *
* ipc_proto conn_proto
* ipc_incoming_ill conn_incoming_ill
- * ipc_outgoing_pill conn_outgoing_pill
* ipc_pending_ill conn_pending_ill
* ipc_unbind_mp conn_unbind_mp
* ipc_ilg conn_ilg
@@ -1061,8 +1016,6 @@ typedef struct conn_s conn_t;
* ipc_refcv conn_refcv
* ipc_multicast_ipif conn_multicast_ipif
* ipc_multicast_ill conn_multicast_ill
- * ipc_orig_bound_ifindex conn_orig_bound_ifindex
- * ipc_orig_multicast_ifindex conn_orig_multicast_ifindex
* ipc_drain_next conn_drain_next
* ipc_drain_prev conn_drain_prev
* ipc_idl conn_idl
@@ -1263,7 +1216,6 @@ typedef struct th_hash_s {
/* The following are ipif_state_flags */
#define IPIF_CONDEMNED 0x1 /* The ipif is being removed */
#define IPIF_CHANGING 0x2 /* A critical ipif field is changing */
-#define IPIF_MOVING 0x8 /* The ipif is being moved */
#define IPIF_SET_LINKLOCAL 0x10 /* transient flag during bringup */
#define IPIF_ZERO_SOURCE 0x20 /* transient flag during bringup */
@@ -1273,7 +1225,6 @@ typedef struct ipif_s {
struct ill_s *ipif_ill; /* Back pointer to our ill */
int ipif_id; /* Logical unit number */
uint_t ipif_mtu; /* Starts at ipif_ill->ill_max_frag */
- uint_t ipif_saved_mtu; /* Save of mtu during ipif_move() */
in6_addr_t ipif_v6lcl_addr; /* Local IP address for this if. */
in6_addr_t ipif_v6src_addr; /* Source IP address for this if. */
in6_addr_t ipif_v6subnet; /* Subnet prefix for this if. */
@@ -1306,17 +1257,15 @@ typedef struct ipif_s {
uint_t ipif_ob_pkt_count; /* Outbound packets to our dead IREs */
/* Exclusive bit fields, protected by ipsq_t */
unsigned int
- ipif_multicast_up : 1, /* We have joined the allhosts group */
- ipif_replace_zero : 1, /* Replacement for zero */
+ ipif_multicast_up : 1, /* ipif_multicast_up() successful */
ipif_was_up : 1, /* ipif was up before */
ipif_addr_ready : 1, /* DAD is done */
-
ipif_was_dup : 1, /* DAD had failed */
+
+ ipif_joined_allhosts : 1, /* allhosts joined */
ipif_pad_to_31 : 27;
- int ipif_orig_ifindex; /* ifindex before SLIFFAILOVER */
uint_t ipif_seqid; /* unique index across all ills */
- uint_t ipif_orig_ipifid; /* ipif_id before SLIFFAILOVER */
uint_t ipif_state_flags; /* See IPIF_* flag defs above */
uint_t ipif_refcnt; /* active consistent reader cnt */
@@ -1328,6 +1277,16 @@ typedef struct ipif_s {
zoneid_t ipif_zoneid; /* zone ID number */
timeout_id_t ipif_recovery_id; /* Timer for DAD recovery */
boolean_t ipif_trace_disable; /* True when alloc fails */
+ /*
+ * For an IPMP interface, ipif_bound_ill tracks the ill whose hardware
+ * information this ipif is associated with via ARP/NDP. We can use
+ * an ill pointer (rather than an index) because only ills that are
+ * part of a group will be pointed to, and an ill cannot disappear
+ * while it's in a group.
+ */
+ struct ill_s *ipif_bound_ill;
+ struct ipif_s *ipif_bound_next; /* bound ipif chain */
+ boolean_t ipif_bound; /* B_TRUE if we successfully bound */
} ipif_t;
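
Since ipif_bound_ill is only stable under the IPSQ or ipmp_lock, consumers normally reach it through the ipmp_ipif_*bound_ill() accessors declared later in this header. A rough sketch of the lookup (the ill_refrele() pairing is assumed here for illustration):

    /* sketch: find the underlying ill answering ARP/NDP for this ipif */
    ill_t *bound_ill;

    if (IS_IPMP(ipif->ipif_ill) &&
        (bound_ill = ipmp_ipif_hold_bound_ill(ipif)) != NULL) {
        /* use bound_ill's hardware address, then drop the hold */
        ill_refrele(bound_ill);
    }
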
/*
@@ -1405,8 +1364,6 @@ typedef struct ipif_s {
*
* bit fields ill_lock ill_lock
*
- * ipif_orig_ifindex ipsq None
- * ipif_orig_ipifid ipsq None
* ipif_seqid ipsq Write once
*
* ipif_state_flags ill_lock ill_lock
@@ -1414,6 +1371,10 @@ typedef struct ipif_s {
* ipif_ire_cnt ill_lock ill_lock
* ipif_ilm_cnt ill_lock ill_lock
* ipif_saved_ire_cnt
+ *
+ * ipif_bound_ill ipsq + ipmp_lock ipsq OR ipmp_lock
+ * ipif_bound_next ipsq ipsq
+ * ipif_bound ipsq ipsq
*/
#define IP_TR_HASH(tid) ((((uintptr_t)tid) >> 6) & (IP_TR_HASH_MAX - 1))
@@ -1457,103 +1418,154 @@ typedef struct ipif_s {
#define IPI2MODE(ipi) ((ipi)->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT)
/*
- * The IP-MT design revolves around the serialization object ipsq_t.
- * It is associated with an IPMP group. If IPMP is not enabled, there is
- * 1 ipsq_t per phyint. Eg. an ipsq_t would cover both hme0's IPv4 stream
- *
- * ipsq_lock protects
- * ipsq_reentry_cnt, ipsq_writer, ipsq_xopq_mphead, ipsq_xopq_mptail,
- * ipsq_mphead, ipsq_mptail, ipsq_split
- *
- * ipsq_pending_ipif, ipsq_current_ipif, ipsq_pending_mp, ipsq_flags,
- * ipsq_waitfor
- *
- * The fields in the last line above below are set mostly by a writer thread
- * But there is an exception in the last call to ipif_ill_refrele_tail which
- * could also race with a conn close which could be cleaning up the
- * fields. So we choose to protect using ipsq_lock instead of depending on
- * the property of the writer.
- * ill_g_lock protects
- * ipsq_refs, ipsq_phyint_list
- */
-typedef struct ipsq_s {
- kmutex_t ipsq_lock;
- int ipsq_reentry_cnt;
- kthread_t *ipsq_writer; /* current owner (thread id) */
- int ipsq_flags;
- mblk_t *ipsq_xopq_mphead; /* list of excl ops mostly ioctls */
- mblk_t *ipsq_xopq_mptail;
- mblk_t *ipsq_mphead; /* msgs on ipsq linked thru b_next */
- mblk_t *ipsq_mptail; /* msgs on ipsq linked thru b_next */
- int ipsq_current_ioctl; /* current ioctl, or 0 if no ioctl */
- boolean_t ipsq_current_done; /* is the current op done? */
- ipif_t *ipsq_current_ipif; /* ipif associated with current op */
- ipif_t *ipsq_pending_ipif; /* ipif associated w. ipsq_pending_mp */
- mblk_t *ipsq_pending_mp; /* current ioctl mp while waiting for */
- /* response from another module */
- struct ipsq_s *ipsq_next; /* list of all syncq's (ipsq_g_list) */
- uint_t ipsq_refs; /* Number of phyints on this ipsq */
- struct phyint *ipsq_phyint_list; /* List of phyints on this ipsq */
- boolean_t ipsq_split; /* ipsq may need to be split */
- int ipsq_waitfor; /* Values encoded below */
- char ipsq_name[LIFNAMSIZ+1]; /* same as phyint_groupname */
- ip_stack_t *ipsq_ipst; /* Does not have a netstack_hold */
-
+ * The IP-MT design revolves around the serialization objects ipsq_t (IPSQ)
+ * and ipxop_t (exclusive operation or "xop"). Becoming "writer" on an IPSQ
+ * ensures that no other threads can become "writer" on any IPSQs sharing that
+ * IPSQ's xop until the writer thread is done.
+ *
+ * Each phyint points to one IPSQ that remains fixed over the phyint's life.
+ * Each IPSQ points to one xop that can change over the IPSQ's life. If a
+ * phyint is *not* in an IPMP group, then its IPSQ will refer to the IPSQ's
+ * "own" xop (ipsq_ownxop). If a phyint *is* part of an IPMP group, then its
+ * IPSQ will refer to the "group" xop, which is shorthand for the xop of the
+ * IPSQ of the IPMP meta-interface's phyint. Thus, all phyints that are part
+ * of the same IPMP group will have their IPSQ's point to the group xop, and
+ * thus becoming "writer" on any phyint in the group will prevent any other
+ * writer on any other phyint in the group. All IPSQs sharing the same xop
+ * are chained together through ipsq_next (in the degenerate common case,
+ * ipsq_next simply refers to itself). Note that the group xop is guaranteed
+ * to exist at least as long as there are members in the group, since the IPMP
+ * meta-interface can only be destroyed if the group is empty.
+ *
+ * Incoming exclusive operation requests are enqueued on the IPSQ they arrived
+ * on rather than the xop. This makes switching xop's (as would happen when a
+ * phyint leaves an IPMP group) simple, because after the phyint leaves the
+ * group, any operations enqueued on its IPSQ can be safely processed with
+ * respect to its new xop, and any operations enqueued on the IPSQs of its
+ * former group can be processed with respect to their existing group xop.
+ * Even so, switching xops is a subtle dance; see ipsq_dq() for details.
+ *
+ * An IPSQ's "own" xop is embedded within the IPSQ itself since they have
+ * identical lifetimes, and because doing so simplifies pointer management.
+ * While each phyint and IPSQ point to each other, it is not possible to free
+ * the IPSQ when the phyint is freed, since we may still be *inside* the IPSQ
+ * when the phyint is being freed. Thus, ipsq_phyint is set to NULL when the
+ * phyint is freed, and the IPSQ free is later done in ipsq_exit().
+ *
+ * ipsq_t synchronization: read write
+ *
+ * ipsq_xopq_mphead ipx_lock ipx_lock
+ * ipsq_xopq_mptail ipx_lock ipx_lock
+ * ipsq_switch_mp ipsq_lock ipsq_lock
+ * ipsq_phyint write once write once
+ * ipsq_next RW_READER ill_g_lock RW_WRITER ill_g_lock
+ * ipsq_xop ipsq_lock or ipsq ipsq_lock + ipsq
+ * ipsq_swxop ipsq ipsq
+ * ipsq_ownxop see ipxop_t see ipxop_t
+ * ipsq_ipst write once write once
+ *
+ * ipxop_t synchronization: read write
+ *
+ * ipx_writer ipx_lock ipx_lock
+ * ipx_xop_queued ipx_lock ipx_lock
+ * ipx_mphead ipx_lock ipx_lock
+ * ipx_mptail ipx_lock ipx_lock
+ * ipx_ipsq write once write once
+ * ipx_ipsq_queued ipx_lock ipx_lock
+ * ipx_waitfor ipsq or ipx_lock ipsq + ipx_lock
+ * ipx_reentry_cnt ipsq or ipx_lock ipsq + ipx_lock
+ * ipx_current_done ipsq ipsq
+ * ipx_current_ioctl ipsq ipsq
+ * ipx_current_ipif ipsq or ipx_lock ipsq + ipx_lock
+ * ipx_pending_ipif ipsq or ipx_lock ipsq + ipx_lock
+ * ipx_pending_mp ipsq or ipx_lock ipsq + ipx_lock
+ * ipx_forced ipsq ipsq
+ * ipx_depth ipsq ipsq
+ * ipx_stack ipsq ipsq
+ */
+typedef struct ipxop_s {
+ kmutex_t ipx_lock; /* see above */
+ kthread_t *ipx_writer; /* current owner */
+ mblk_t *ipx_mphead; /* messages tied to this op */
+ mblk_t *ipx_mptail;
+ struct ipsq_s *ipx_ipsq; /* associated ipsq */
+ boolean_t ipx_ipsq_queued; /* ipsq using xop has queued op */
+ int ipx_waitfor; /* waiting; values encoded below */
+ int ipx_reentry_cnt;
+ boolean_t ipx_current_done; /* is the current operation done? */
+ int ipx_current_ioctl; /* current ioctl, or 0 if no ioctl */
+ ipif_t *ipx_current_ipif; /* ipif for current op */
+ ipif_t *ipx_pending_ipif; /* ipif for ipsq_pending_mp */
+ mblk_t *ipx_pending_mp; /* current ioctl mp while waiting */
+ boolean_t ipx_forced; /* debugging aid */
#ifdef DEBUG
- int ipsq_depth; /* debugging aid */
-#define IPSQ_STACK_DEPTH 15
- pc_t ipsq_stack[IPSQ_STACK_DEPTH]; /* debugging aid */
+ int ipx_depth; /* debugging aid */
+#define IPX_STACK_DEPTH 15
+ pc_t ipx_stack[IPX_STACK_DEPTH]; /* debugging aid */
#endif
-} ipsq_t;
+} ipxop_t;
-/* ipsq_flags */
-#define IPSQ_GROUP 0x1 /* This ipsq belongs to an IPMP group */
+typedef struct ipsq_s {
+ kmutex_t ipsq_lock; /* see above */
+ mblk_t *ipsq_switch_mp; /* op to handle right after switch */
+ mblk_t *ipsq_xopq_mphead; /* list of excl ops (mostly ioctls) */
+ mblk_t *ipsq_xopq_mptail;
+ struct phyint *ipsq_phyint; /* associated phyint */
+ struct ipsq_s *ipsq_next; /* next ipsq sharing ipsq_xop */
+ struct ipxop_s *ipsq_xop; /* current xop synchronization info */
+ struct ipxop_s *ipsq_swxop; /* switch xop to on ipsq_exit() */
+ struct ipxop_s ipsq_ownxop; /* our own xop (may not be in-use) */
+ ip_stack_t *ipsq_ipst; /* does not have a netstack_hold */
+} ipsq_t;
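
In other words, the writer identity always lives in the xop: a standalone phyint's IPSQ uses its embedded ipsq_ownxop, while every member of an IPMP group shares the meta-interface's xop, so checking ipx_writer through ipsq_xop (as the IAM_WRITER_* macros later in this header now do) answers the question for the whole group. A condensed sketch:

    /* sketch: writer checks resolve through the (possibly shared) xop */
    ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

    if (ipx->ipx_writer == curthread) {
        /* we are writer for every IPSQ chained to this xop */
    }
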
/*
- * ipsq_waitfor:
- *
- * IPIF_DOWN 1 ipif_down waiting for refcnts to drop
- * ILL_DOWN 2 ill_down waiting for refcnts to drop
- * IPIF_FREE 3 ipif_free waiting for refcnts to drop
- * ILL_FREE 4 ill unplumb waiting for refcnts to drop
- * ILL_MOVE_OK 5 failover waiting for refcnts to drop
+ * ipx_waitfor values:
*/
+enum {
+ IPIF_DOWN = 1, /* ipif_down() waiting for refcnts to drop */
+ ILL_DOWN, /* ill_down() waiting for refcnts to drop */
+ IPIF_FREE, /* ipif_free() waiting for refcnts to drop */
+ ILL_FREE /* ill unplumb waiting for refcnts to drop */
+};
-enum { IPIF_DOWN = 1, ILL_DOWN, IPIF_FREE, ILL_FREE, ILL_MOVE_OK };
+/* Operation types for ipsq_try_enter() */
+#define CUR_OP 0 /* request writer within current operation */
+#define NEW_OP 1 /* request writer for a new operation */
+#define SWITCH_OP 2 /* request writer once IPSQ XOP switches */
-/* Flags passed to ipsq_try_enter */
-#define CUR_OP 0 /* Current ioctl continuing again */
-#define NEW_OP 1 /* New ioctl starting afresh */
+/*
+ * Kstats tracked on each IPMP meta-interface. Order here must match
+ * ipmp_kstats[] in ip/ipmp.c.
+ */
+enum {
+ IPMP_KSTAT_OBYTES, IPMP_KSTAT_OBYTES64, IPMP_KSTAT_RBYTES,
+ IPMP_KSTAT_RBYTES64, IPMP_KSTAT_OPACKETS, IPMP_KSTAT_OPACKETS64,
+ IPMP_KSTAT_OERRORS, IPMP_KSTAT_IPACKETS, IPMP_KSTAT_IPACKETS64,
+ IPMP_KSTAT_IERRORS, IPMP_KSTAT_MULTIRCV, IPMP_KSTAT_MULTIXMT,
+ IPMP_KSTAT_BRDCSTRCV, IPMP_KSTAT_BRDCSTXMT, IPMP_KSTAT_LINK_UP,
+ IPMP_KSTAT_MAX /* keep last */
+};
/*
* phyint represents state that is common to both IPv4 and IPv6 interfaces.
* There is a separate ill_t representing IPv4 and IPv6 which has a
* backpointer to the phyint structure for accessing common state.
- *
- * NOTE : It just stores the group name as there is only one name for
- * IPv4 and IPv6 i.e it is a underlying link property. Actually
- * IPv4 and IPv6 ill are grouped together when their phyints have
- * the same name.
*/
typedef struct phyint {
struct ill_s *phyint_illv4;
struct ill_s *phyint_illv6;
- uint_t phyint_ifindex; /* SIOCLSLIFINDEX */
- char *phyint_groupname; /* SIOCSLIFGROUPNAME */
- uint_t phyint_groupname_len;
+ uint_t phyint_ifindex; /* SIOCSLIFINDEX */
uint64_t phyint_flags;
avl_node_t phyint_avl_by_index; /* avl tree by index */
avl_node_t phyint_avl_by_name; /* avl tree by name */
kmutex_t phyint_lock;
struct ipsq_s *phyint_ipsq; /* back pointer to ipsq */
- struct phyint *phyint_ipsq_next; /* phyint list on this ipsq */
- /* Once Clearview IPMP is added the follow two fields can be removed */
- uint_t phyint_group_ifindex; /* index assigned to group */
- uint_t phyint_hook_ifindex; /* index used with neti/hook */
+ struct ipmp_grp_s *phyint_grp; /* associated IPMP group */
+ char phyint_name[LIFNAMSIZ]; /* physical interface name */
+ uint64_t phyint_kstats0[IPMP_KSTAT_MAX]; /* baseline kstats */
} phyint_t;
#define CACHE_ALIGN_SIZE 64
-
#define CACHE_ALIGN(align_struct) P2ROUNDUP(sizeof (struct align_struct),\
CACHE_ALIGN_SIZE)
struct _phyint_list_s_ {
@@ -1568,34 +1580,6 @@ typedef union phyint_list_u {
#define phyint_list_avl_by_index phyint_list_s.phyint_list_avl_by_index
#define phyint_list_avl_by_name phyint_list_s.phyint_list_avl_by_name
-/*
- * ILL groups. We group ills,
- *
- * - if the ills have the same group name. (New way)
- *
- * ill_group locking notes:
- *
- * illgrp_lock protects ill_grp_ill_schednext.
- *
- * ill_g_lock protects ill_grp_next, illgrp_ill, illgrp_ill_count.
- * Holding ill_g_lock freezes the memberships of ills in IPMP groups.
- * It also freezes the global list of ills and all ipifs in all ills.
- *
- * To remove an ipif from the linked list of ipifs of that ill ipif_free_tail
- * holds both ill_g_lock, and ill_lock. Similarly to remove an ill from the
- * global list of ills, ill_glist_delete() holds ill_g_lock as writer.
- * This simplifies things for ipif_select_source, illgrp_scheduler etc.
- * that need to walk the members of an illgrp. They just hold ill_g_lock
- * as reader to do the walk.
- *
- */
-typedef struct ill_group {
- kmutex_t illgrp_lock;
- struct ill_group *illgrp_next; /* Next ill_group */
- struct ill_s *illgrp_ill_schednext; /* Next ill to be scheduled */
- struct ill_s *illgrp_ill; /* First ill in the group */
- int illgrp_ill_count;
-} ill_group_t;
/*
* Fragmentation hash bucket
@@ -1792,6 +1776,108 @@ typedef struct ill_lso_capab_s ill_lso_capab_t;
#define IS_LOOPBACK(ill) \
((ill)->ill_phyint->phyint_flags & PHYI_LOOPBACK)
+/* Is this an IPMP meta-interface ILL? */
+#define IS_IPMP(ill) \
+ ((ill)->ill_phyint->phyint_flags & PHYI_IPMP)
+
+/* Is this ILL under an IPMP meta-interface? (aka "in a group?") */
+#define IS_UNDER_IPMP(ill) \
+ ((ill)->ill_grp != NULL && !IS_IPMP(ill))
+
+/* Is ill1 in the same illgrp as ill2? */
+#define IS_IN_SAME_ILLGRP(ill1, ill2) \
+ ((ill1)->ill_grp != NULL && ((ill1)->ill_grp == (ill2)->ill_grp))
+
+/* Is ill1 on the same LAN as ill2? */
+#define IS_ON_SAME_LAN(ill1, ill2) \
+ ((ill1) == (ill2) || IS_IN_SAME_ILLGRP(ill1, ill2))
+
+#define ILL_OTHER(ill) \
+ ((ill)->ill_isv6 ? (ill)->ill_phyint->phyint_illv4 : \
+ (ill)->ill_phyint->phyint_illv6)
+
+/*
+ * IPMP group ILL state structure -- up to two per IPMP group (V4 and V6).
+ * Created when the V4 and/or V6 IPMP meta-interface is I_PLINK'd. It is
+ * guaranteed to persist while there are interfaces of that type in the group.
+ * In general, most fields are accessed outside of the IPSQ (e.g., in the
+ * datapath), and thus use locks in addition to the IPSQ for protection.
+ *
+ * synchronization: read write
+ *
+ * ig_if ipsq or ill_g_lock ipsq and ill_g_lock
+ * ig_actif ipsq or ipmp_lock ipsq and ipmp_lock
+ * ig_nactif ipsq or ipmp_lock ipsq and ipmp_lock
+ * ig_next_ill ipsq or ipmp_lock ipsq and ipmp_lock
+ * ig_ipmp_ill write once write once
+ * ig_cast_ill ipsq or ipmp_lock ipsq and ipmp_lock
+ * ig_arpent ipsq ipsq
+ * ig_mtu ipsq ipsq
+ */
+typedef struct ipmp_illgrp_s {
+ list_t ig_if; /* list of all interfaces */
+ list_t ig_actif; /* list of active interfaces */
+ uint_t ig_nactif; /* number of active interfaces */
+ struct ill_s *ig_next_ill; /* next active interface to use */
+ struct ill_s *ig_ipmp_ill; /* backpointer to IPMP meta-interface */
+ struct ill_s *ig_cast_ill; /* nominated ill for multi/broadcast */
+ list_t ig_arpent; /* list of ARP entries */
+ uint_t ig_mtu; /* ig_ipmp_ill->ill_max_mtu */
+} ipmp_illgrp_t;
+
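
The ig_actif list and the ig_next_ill rotor are what spread outbound traffic and bindings across the active interfaces. A sketch of consuming the rotor through the accessor declared later in this header (the ill_refrele() pairing is assumed for illustration):

    /* sketch: pick the next active underlying ill for transmit */
    ill_t *xmit_ill;

    if ((xmit_ill = ipmp_illgrp_hold_next_ill(illgrp)) != NULL) {
        /* transmit via xmit_ill, then drop the hold */
        ill_refrele(xmit_ill);
    }
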
+/*
+ * IPMP group state structure -- one per IPMP group. Created when the
+ * IPMP meta-interface is plumbed; it is guaranteed to persist while there
+ * are interfaces in it.
+ *
+ * ipmp_grp_t synchronization: read write
+ *
+ * gr_name ipmp_lock ipmp_lock
+ * gr_ifname write once write once
+ * gr_mactype ipmp_lock ipmp_lock
+ * gr_phyint write once write once
+ * gr_nif ipmp_lock ipmp_lock
+ * gr_nactif ipsq ipsq
+ * gr_v4 ipmp_lock ipmp_lock
+ * gr_v6 ipmp_lock ipmp_lock
+ * gr_nv4 ipmp_lock ipmp_lock
+ * gr_nv6 ipmp_lock ipmp_lock
+ * gr_pendv4 ipmp_lock ipmp_lock
+ * gr_pendv6 ipmp_lock ipmp_lock
+ * gr_linkdownmp ipsq ipsq
+ * gr_ksp ipmp_lock ipmp_lock
+ * gr_kstats0 atomic atomic
+ */
+typedef struct ipmp_grp_s {
+ char gr_name[LIFGRNAMSIZ]; /* group name */
+ char gr_ifname[LIFNAMSIZ]; /* interface name */
+ t_uscalar_t gr_mactype; /* DLPI mactype of group */
+ phyint_t *gr_phyint; /* IPMP group phyint */
+ uint_t gr_nif; /* number of interfaces in group */
+ uint_t gr_nactif; /* number of active interfaces */
+ ipmp_illgrp_t *gr_v4; /* V4 group information */
+ ipmp_illgrp_t *gr_v6; /* V6 group information */
+ uint_t gr_nv4; /* number of ills in V4 group */
+ uint_t gr_nv6; /* number of ills in V6 group */
+ uint_t gr_pendv4; /* number of pending ills in V4 group */
+ uint_t gr_pendv6; /* number of pending ills in V6 group */
+ mblk_t *gr_linkdownmp; /* message used to bring link down */
+ kstat_t *gr_ksp; /* group kstat pointer */
+ uint64_t gr_kstats0[IPMP_KSTAT_MAX]; /* baseline group kstats */
+} ipmp_grp_t;
+
+/*
+ * IPMP ARP entry -- one per SIOCS*ARP entry tied to the group. Used to keep
+ * ARP up-to-date as the active set of interfaces in the group changes.
+ */
+typedef struct ipmp_arpent_s {
+ mblk_t *ia_area_mp; /* AR_ENTRY_ADD pointer */
+ ipaddr_t ia_ipaddr; /* IP address for this entry */
+ boolean_t ia_proxyarp; /* proxy ARP entry? */
+ boolean_t ia_notified; /* ARP notified about this entry? */
+ list_node_t ia_node; /* next ARP entry in list */
+} ipmp_arpent_t;
+
/*
* IP Lower level Structure.
* Instance data structure in ip_open when there is a device below us.
@@ -1851,6 +1937,7 @@ typedef struct ill_s {
mblk_t *ill_unbind_mp; /* unbind mp from ill_dl_up() */
mblk_t *ill_promiscoff_mp; /* for ill_leave_allmulti() */
mblk_t *ill_dlpi_deferred; /* b_next chain of control messages */
+ mblk_t *ill_ardeact_mp; /* deact mp from ipmp_ill_activate() */
mblk_t *ill_phys_addr_mp; /* mblk which holds ill_phys_addr */
#define ill_last_mp_to_free ill_phys_addr_mp
@@ -1867,21 +1954,19 @@ typedef struct ill_s {
ill_dlpi_style_set : 1,
ill_ifname_pending : 1,
- ill_move_in_progress : 1, /* FAILOVER/FAILBACK in progress */
ill_join_allmulti : 1,
ill_logical_down : 1,
-
ill_is_6to4tun : 1, /* Interface is a 6to4 tunnel */
+
ill_promisc_on_phys : 1, /* phys interface in promisc mode */
ill_dl_up : 1,
ill_up_ipifs : 1,
-
ill_note_link : 1, /* supports link-up notification */
+
ill_capab_reneg : 1, /* capability renegotiation to be done */
ill_dld_capab_inprog : 1, /* direct dld capab call in prog */
ill_need_recover_multicast : 1,
-
- ill_pad_to_bit_31 : 16;
+ ill_pad_to_bit_31 : 17;
/* Following bit fields protected by ill_lock */
uint_t
@@ -1891,10 +1976,8 @@ typedef struct ill_s {
ill_arp_closing : 1,
ill_arp_bringup_pending : 1,
- ill_mtu_userspecified : 1, /* SIOCSLIFLNKINFO has set the mtu */
ill_arp_extend : 1, /* ARP has DAD extensions */
-
- ill_pad_bit_31 : 25;
+ ill_pad_bit_31 : 26;
/*
* Used in SIOCSIFMUXID and SIOCGIFMUXID for 'ifconfig unplumb'.
@@ -1931,6 +2014,7 @@ typedef struct ill_s {
*/
uint8_t ill_max_hops; /* Maximum hops for any logical interface */
uint_t ill_max_mtu; /* Maximum MTU for any logical interface */
+ uint_t ill_user_mtu; /* User-specified MTU via SIOCSLIFLNKINFO */
uint32_t ill_reachable_time; /* Value for ND algorithm in msec */
uint32_t ill_reachable_retrans_time; /* Value for ND algorithm msec */
uint_t ill_max_buf; /* Max # of req to buffer for ND */
@@ -1953,13 +2037,9 @@ typedef struct ill_s {
* of the ipif.
*/
mblk_t *ill_arp_on_mp;
- /* Peer ill of an IPMP move operation */
- struct ill_s *ill_move_peer;
phyint_t *ill_phyint;
uint64_t ill_flags;
- ill_group_t *ill_group;
- struct ill_s *ill_group_next;
kmutex_t ill_lock; /* Please see table below */
/*
@@ -2005,6 +2085,18 @@ typedef struct ill_s {
void *ill_flownotify_mh; /* Tx flow ctl, mac cb handle */
uint_t ill_ilm_cnt; /* ilms referencing this ill */
uint_t ill_ipallmulti_cnt; /* ip_join_allmulti() calls */
+ /*
+ * IPMP fields.
+ */
+ ipmp_illgrp_t *ill_grp; /* IPMP group information */
+ list_node_t ill_actnode; /* next active ill in group */
+ list_node_t ill_grpnode; /* next ill in group */
+ ipif_t *ill_src_ipif; /* source address selection rotor */
+ ipif_t *ill_move_ipif; /* ipif awaiting move to new ill */
+ boolean_t ill_nom_cast; /* nominated for mcast/bcast */
+ uint_t ill_bound_cnt; /* # of data addresses bound to ill */
+ ipif_t *ill_bound_ipif; /* ipif chain bound to ill */
+ timeout_id_t ill_refresh_tid; /* ill refresh retry timeout id */
} ill_t;
/*
@@ -2088,6 +2180,7 @@ typedef struct ill_s {
*
* ill_max_mtu
*
+ * ill_user_mtu ipsq + ill_lock ill_lock
* ill_reachable_time ipsq + ill_lock ill_lock
* ill_reachable_retrans_time ipsq + ill_lock ill_lock
* ill_max_buf ipsq + ill_lock ill_lock
@@ -2102,12 +2195,9 @@ typedef struct ill_s {
* ill_arp_down_mp ipsq ipsq
* ill_arp_del_mapping_mp ipsq ipsq
* ill_arp_on_mp ipsq ipsq
- * ill_move_peer ipsq ipsq
*
* ill_phyint ipsq, ill_g_lock, ill_lock Any of them
* ill_flags ill_lock ill_lock
- * ill_group ipsq, ill_g_lock, ill_lock Any of them
- * ill_group_next ipsq, ill_g_lock, ill_lock Any of them
* ill_nd_lla_mp ipsq + down ill only when ill is up
* ill_nd_lla ipsq + down ill only when ill is up
* ill_nd_lla_len ipsq + down ill only when ill is up
@@ -2122,11 +2212,26 @@ typedef struct ill_s {
* ill_ilm_walker_cnt ill_lock ill_lock
* ill_nce_cnt ill_lock ill_lock
* ill_ilm_cnt ill_lock ill_lock
+ * ill_src_ipif ill_g_lock ill_g_lock
* ill_trace ill_lock ill_lock
* ill_usesrc_grp_next ill_g_usesrc_lock ill_g_usesrc_lock
* ill_dhcpinit atomics atomics
* ill_flownotify_mh write once write once
* ill_capab_pending_cnt ipsq ipsq
+ *
+ * ill_bound_cnt ipsq ipsq
+ * ill_bound_ipif ipsq ipsq
+ * ill_actnode ipsq + ipmp_lock ipsq OR ipmp_lock
+ * ill_grpnode ipsq + ill_g_lock ipsq OR ill_g_lock
+ * ill_src_ipif ill_g_lock ill_g_lock
+ * ill_move_ipif ipsq ipsq
+ * ill_nom_cast ipsq ipsq OR advisory
+ * ill_refresh_tid ill_lock ill_lock
+ * ill_grp (for IPMP ill) write once write once
+ * ill_grp (for underlying ill) ipsq + ill_g_lock ipsq OR ill_g_lock
+ *
+ * NOTE: It's OK to make heuristic decisions on an underlying interface
+ * by using IS_UNDER_IPMP() or comparing ill_grp's raw pointer value.
*/
/*
@@ -2167,7 +2272,7 @@ enum { IF_CMD = 1, LIF_CMD, TUN_CMD, ARP_CMD, XARP_CMD, MSFILT_CMD, MISC_CMD };
#define IPI_MODOK 0x2 /* Permitted on mod instance of IP */
#define IPI_WR 0x4 /* Need to grab writer access */
#define IPI_GET_CMD 0x8 /* branch to mi_copyout on success */
-#define IPI_REPL 0x10 /* valid for replacement ipif created in MOVE */
+/* unused 0x10 */
#define IPI_NULL_BCONT 0x20 /* ioctl has not data and hence no b_cont */
#define IPI_PASS_DOWN 0x40 /* pass this ioctl down when a module only */
@@ -2176,17 +2281,6 @@ extern ip_ioctl_cmd_t ip_misc_ioctl_table[];
extern int ip_ndx_ioctl_count;
extern int ip_misc_ioctl_count;
-#define ILL_CLEAR_MOVE(ill) { \
- ill_t *peer_ill; \
- \
- peer_ill = (ill)->ill_move_peer; \
- ASSERT(peer_ill != NULL); \
- (ill)->ill_move_in_progress = B_FALSE; \
- peer_ill->ill_move_in_progress = B_FALSE; \
- (ill)->ill_move_peer = NULL; \
- peer_ill->ill_move_peer = NULL; \
-}
-
/* Passed down by ARP to IP during I_PLINK/I_PUNLINK */
typedef struct ipmx_s {
char ipmx_name[LIFNAMSIZ]; /* if name */
@@ -2799,19 +2893,11 @@ typedef struct ip_pktinfo {
(!((ipif)->ipif_state_flags & (IPIF_CONDEMNED)) || \
IAM_WRITER_IPIF(ipif))
-/*
- * These macros are used by critical set ioctls and failover ioctls to
- * mark the ipif appropriately before starting the operation and to clear the
- * marks after completing the operation.
- */
-#define IPIF_UNMARK_MOVING(ipif) \
- (ipif)->ipif_state_flags &= ~IPIF_MOVING & ~IPIF_CHANGING;
-
#define ILL_UNMARK_CHANGING(ill) \
(ill)->ill_state_flags &= ~ILL_CHANGING;
/* Macros used to assert that this thread is a writer */
-#define IAM_WRITER_IPSQ(ipsq) ((ipsq)->ipsq_writer == curthread)
+#define IAM_WRITER_IPSQ(ipsq) ((ipsq)->ipsq_xop->ipx_writer == curthread)
#define IAM_WRITER_ILL(ill) IAM_WRITER_IPSQ((ill)->ill_phyint->phyint_ipsq)
#define IAM_WRITER_IPIF(ipif) IAM_WRITER_ILL((ipif)->ipif_ill)
@@ -2837,9 +2923,9 @@ typedef struct ip_pktinfo {
#define RELEASE_ILL_LOCKS(ill_1, ill_2) \
{ \
if (ill_1 != NULL) \
- mutex_exit(&(ill_1)->ill_lock); \
+ mutex_exit(&(ill_1)->ill_lock); \
if (ill_2 != NULL && ill_2 != ill_1) \
- mutex_exit(&(ill_2)->ill_lock); \
+ mutex_exit(&(ill_2)->ill_lock); \
}
/* Get the other protocol instance ill */
@@ -2847,14 +2933,9 @@ typedef struct ip_pktinfo {
((ill)->ill_isv6 ? (ill)->ill_phyint->phyint_illv4 : \
(ill)->ill_phyint->phyint_illv6)
-#define MATCH_V4_ONLY 0x1
-#define MATCH_V6_ONLY 0x2
-#define MATCH_ILL_ONLY 0x4
-
/* ioctl command info: Ioctl properties extracted and stored in here */
typedef struct cmd_info_s
{
- char ci_groupname[LIFNAMSIZ + 1]; /* SIOCSLIFGROUPNAME */
ipif_t *ci_ipif; /* ipif associated with [l]ifreq ioctl's */
sin_t *ci_sin; /* the sin struct passed down */
sin6_t *ci_sin6; /* the sin6_t struct passed down */
@@ -2990,10 +3071,8 @@ extern struct module_info ip_mod_info;
((ipst)->ips_ip6_loopback_out_event.he_interested)
/*
- * Hooks marcos used inside of ip
+ * Hooks macros used inside of ip
*/
-#define IPHA_VHL ipha_version_and_hdr_length
-
#define FW_HOOKS(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst) \
\
if ((_hook).he_interested) { \
@@ -3002,21 +3081,8 @@ extern struct module_info ip_mod_info;
_NOTE(CONSTCOND) \
ASSERT((_ilp != NULL) || (_olp != NULL)); \
\
- _NOTE(CONSTCOND) \
- if ((_ilp != NULL) && \
- (((ill_t *)(_ilp))->ill_phyint != NULL)) \
- info.hpe_ifp = (phy_if_t)((ill_t *) \
- (_ilp))->ill_phyint->phyint_hook_ifindex; \
- else \
- info.hpe_ifp = 0; \
- \
- _NOTE(CONSTCOND) \
- if ((_olp != NULL) && \
- (((ill_t *)(_olp))->ill_phyint != NULL)) \
- info.hpe_ofp = (phy_if_t)((ill_t *) \
- (_olp))->ill_phyint->phyint_hook_ifindex; \
- else \
- info.hpe_ofp = 0; \
+ FW_SET_ILL_INDEX(info.hpe_ifp, (ill_t *)_ilp); \
+ FW_SET_ILL_INDEX(info.hpe_ofp, (ill_t *)_olp); \
info.hpe_protocol = ipst->ips_ipv4_net_data; \
info.hpe_hdr = _iph; \
info.hpe_mp = &(_fm); \
@@ -3026,10 +3092,8 @@ extern struct module_info ip_mod_info;
_event, (hook_data_t)&info) != 0) { \
ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\
(_hook).he_name, (void *)_fm, (void *)_m)); \
- if (_fm != NULL) { \
- freemsg(_fm); \
- _fm = NULL; \
- } \
+ freemsg(_fm); \
+ _fm = NULL; \
_iph = NULL; \
_m = NULL; \
} else { \
@@ -3046,21 +3110,8 @@ extern struct module_info ip_mod_info;
_NOTE(CONSTCOND) \
ASSERT((_ilp != NULL) || (_olp != NULL)); \
\
- _NOTE(CONSTCOND) \
- if ((_ilp != NULL) && \
- (((ill_t *)(_ilp))->ill_phyint != NULL)) \
- info.hpe_ifp = (phy_if_t)((ill_t *) \
- (_ilp))->ill_phyint->phyint_hook_ifindex; \
- else \
- info.hpe_ifp = 0; \
- \
- _NOTE(CONSTCOND) \
- if ((_olp != NULL) && \
- (((ill_t *)(_olp))->ill_phyint != NULL)) \
- info.hpe_ofp = (phy_if_t)((ill_t *) \
- (_olp))->ill_phyint->phyint_hook_ifindex; \
- else \
- info.hpe_ofp = 0; \
+ FW_SET_ILL_INDEX(info.hpe_ifp, (ill_t *)_ilp); \
+ FW_SET_ILL_INDEX(info.hpe_ofp, (ill_t *)_olp); \
info.hpe_protocol = ipst->ips_ipv6_net_data; \
info.hpe_hdr = _iph; \
info.hpe_mp = &(_fm); \
@@ -3070,10 +3121,8 @@ extern struct module_info ip_mod_info;
_event, (hook_data_t)&info) != 0) { \
ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\
(_hook).he_name, (void *)_fm, (void *)_m)); \
- if (_fm != NULL) { \
- freemsg(_fm); \
- _fm = NULL; \
- } \
+ freemsg(_fm); \
+ _fm = NULL; \
_iph = NULL; \
_m = NULL; \
} else { \
@@ -3082,6 +3131,17 @@ extern struct module_info ip_mod_info;
} \
}
+#define FW_SET_ILL_INDEX(fp, ill) \
+ _NOTE(CONSTCOND) \
+ if ((ill) == NULL || (ill)->ill_phyint == NULL) { \
+ (fp) = 0; \
+ _NOTE(CONSTCOND) \
+ } else if (IS_UNDER_IPMP(ill)) { \
+ (fp) = ipmp_ill_get_ipmp_ifindex(ill); \
+ } else { \
+ (fp) = (ill)->ill_phyint->phyint_ifindex; \
+ }
+
/*
* Network byte order macros
*/
@@ -3146,16 +3206,15 @@ struct ipsec_out_s;
struct mac_header_info_s;
-extern boolean_t ip_assign_ifindex(uint_t *, ip_stack_t *);
extern void ill_frag_timer(void *);
extern ill_t *ill_first(int, int, ill_walk_context_t *, ip_stack_t *);
extern ill_t *ill_next(ill_walk_context_t *, ill_t *);
extern void ill_frag_timer_start(ill_t *);
extern void ill_nic_event_dispatch(ill_t *, lif_if_t, nic_event_t,
nic_event_data_t, size_t);
-extern void ill_nic_event_plumb(ill_t *, boolean_t);
extern mblk_t *ip_carve_mp(mblk_t **, ssize_t);
extern mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t);
+extern mblk_t *ip_dlnotify_alloc(uint_t, uint_t);
extern char *ip_dot_addr(ipaddr_t, char *);
extern const char *mac_colon_addr(const uint8_t *, size_t, char *, size_t);
extern void ip_lwput(queue_t *, mblk_t *);
@@ -3239,8 +3298,49 @@ extern int ip_hdr_complete(ipha_t *, zoneid_t, ip_stack_t *);
extern struct qinit iprinitv6;
extern struct qinit ipwinitv6;
-extern void conn_drain_insert(conn_t *connp);
-extern int conn_ipsec_length(conn_t *connp);
+extern void ipmp_init(ip_stack_t *);
+extern void ipmp_destroy(ip_stack_t *);
+extern ipmp_grp_t *ipmp_grp_create(const char *, phyint_t *);
+extern void ipmp_grp_destroy(ipmp_grp_t *);
+extern void ipmp_grp_info(const ipmp_grp_t *, lifgroupinfo_t *);
+extern int ipmp_grp_rename(ipmp_grp_t *, const char *);
+extern ipmp_grp_t *ipmp_grp_lookup(const char *, ip_stack_t *);
+extern int ipmp_grp_vet_phyint(ipmp_grp_t *, phyint_t *);
+extern ipmp_illgrp_t *ipmp_illgrp_create(ill_t *);
+extern void ipmp_illgrp_destroy(ipmp_illgrp_t *);
+extern ill_t *ipmp_illgrp_add_ipif(ipmp_illgrp_t *, ipif_t *);
+extern void ipmp_illgrp_del_ipif(ipmp_illgrp_t *, ipif_t *);
+extern ill_t *ipmp_illgrp_next_ill(ipmp_illgrp_t *);
+extern ill_t *ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *);
+extern ill_t *ipmp_illgrp_cast_ill(ipmp_illgrp_t *);
+extern ill_t *ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *);
+extern ill_t *ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *);
+extern void ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *);
+extern ipmp_arpent_t *ipmp_illgrp_create_arpent(ipmp_illgrp_t *, mblk_t *,
+ boolean_t);
+extern void ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *, ipmp_arpent_t *);
+extern ipmp_arpent_t *ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *, ipaddr_t *);
+extern void ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *);
+extern void ipmp_illgrp_mark_arpent(ipmp_illgrp_t *, ipmp_arpent_t *);
+extern ill_t *ipmp_illgrp_find_ill(ipmp_illgrp_t *, uchar_t *, uint_t);
+extern void ipmp_illgrp_link_grp(ipmp_illgrp_t *, ipmp_grp_t *);
+extern int ipmp_illgrp_unlink_grp(ipmp_illgrp_t *);
+extern uint_t ipmp_ill_get_ipmp_ifindex(const ill_t *);
+extern void ipmp_ill_join_illgrp(ill_t *, ipmp_illgrp_t *);
+extern void ipmp_ill_leave_illgrp(ill_t *);
+extern ill_t *ipmp_ill_hold_ipmp_ill(ill_t *);
+extern boolean_t ipmp_ill_is_active(ill_t *);
+extern void ipmp_ill_refresh_active(ill_t *);
+extern void ipmp_phyint_join_grp(phyint_t *, ipmp_grp_t *);
+extern void ipmp_phyint_leave_grp(phyint_t *);
+extern void ipmp_phyint_refresh_active(phyint_t *);
+extern ill_t *ipmp_ipif_bound_ill(const ipif_t *);
+extern ill_t *ipmp_ipif_hold_bound_ill(const ipif_t *);
+extern boolean_t ipmp_ipif_is_dataaddr(const ipif_t *);
+extern boolean_t ipmp_ipif_is_stubaddr(const ipif_t *);
+
+extern void conn_drain_insert(conn_t *connp);
+extern int conn_ipsec_length(conn_t *connp);
extern void ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *,
ire_t *);
extern ipaddr_t ip_get_dst(ipha_t *);
@@ -3274,9 +3374,6 @@ extern int ip_srcid_report(queue_t *, mblk_t *, caddr_t, cred_t *);
extern uint8_t ipoptp_next(ipoptp_t *);
extern uint8_t ipoptp_first(ipoptp_t *, ipha_t *);
extern int ip_opt_get_user(const ipha_t *, uchar_t *);
-extern ill_t *ip_grab_attach_ill(ill_t *, mblk_t *, int, boolean_t,
- ip_stack_t *);
-extern ire_t *conn_set_outgoing_ill(conn_t *, ire_t *, ill_t **);
extern int ipsec_req_from_conn(conn_t *, ipsec_req_t *, int);
extern int ip_snmp_get(queue_t *q, mblk_t *mctl, int level);
extern int ip_snmp_set(queue_t *q, int, int, uchar_t *, int);
@@ -3295,7 +3392,6 @@ extern void ip_savebuf(void **, uint_t *, boolean_t, const void *, uint_t);
extern boolean_t ipsq_pending_mp_cleanup(ill_t *, conn_t *);
extern void conn_ioctl_cleanup(conn_t *);
extern ill_t *conn_get_held_ill(conn_t *, ill_t **, int *);
-extern ill_t *ip_newroute_get_dst_ill(ill_t *);
struct multidata_s;
struct pdesc_s;
@@ -3314,9 +3410,6 @@ extern boolean_t ip_md_zcopy_attr(struct multidata_s *, struct pdesc_s *,
uint_t);
extern void ip_unbind(conn_t *connp);
-extern phyint_t *phyint_lookup_group(char *, boolean_t, ip_stack_t *);
-extern phyint_t *phyint_lookup_group_ifindex(uint_t, ip_stack_t *);
-
extern void tnet_init(void);
extern void tnet_fini(void);
@@ -3434,6 +3527,8 @@ typedef struct ipobs_cb {
* ihd_ifindex Interface index that the packet was received/sent over.
* For local packets, this is the index of the interface
* associated with the local destination address.
+ * ihd_grifindex IPMP group interface index (zero unless ihd_ifindex
+ * is an IPMP underlying interface).
* ihd_stack Netstack the packet is from.
*/
typedef struct ipobs_hook_data {
@@ -3443,6 +3538,7 @@ typedef struct ipobs_hook_data {
ipobs_hook_type_t ihd_htype;
uint16_t ihd_ipver;
uint64_t ihd_ifindex;
+ uint64_t ihd_grifindex;
netstack_t *ihd_stack;
} ipobs_hook_data_t;
diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c
index 3f967ea183..d484831a3c 100644
--- a/usr/src/uts/common/inet/ip/icmp.c
+++ b/usr/src/uts/common/inet/ip/icmp.c
@@ -1892,7 +1892,6 @@ icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
* case MRT_VERSION:
* case MRT_ASSERT:
* case IP_SEC_OPT:
- * case IP_DONTFAILOVER_IF:
* case IP_NEXTHOP:
*/
default:
@@ -2481,7 +2480,6 @@ icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
case MRT_VERSION:
case MRT_ASSERT:
case IP_SEC_OPT:
- case IP_DONTFAILOVER_IF:
case IP_NEXTHOP:
/*
* "soft" error (negative)
@@ -3014,9 +3012,7 @@ icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
case IPV6_PATHMTU:
return (EINVAL);
- case IPV6_BOUND_PIF:
case IPV6_SEC_OPT:
- case IPV6_DONTFAILOVER_IF:
case IPV6_SRC_PREFERENCES:
case IPV6_V6ONLY:
/* Handled at IP level */
diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c
index 4f15801dfb..24ba9d689c 100644
--- a/usr/src/uts/common/inet/ip/icmp_opt_data.c
+++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -138,9 +138,6 @@ opdes_t icmp_opt_arr[] = {
{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
sizeof (int), 0 /* no ifindex */ },
-{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
- sizeof (struct in_addr), 0 /* not initialized */ },
-
{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
sizeof (int), 0 },
@@ -222,12 +219,6 @@ opdes_t icmp_opt_arr[] = {
{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
sizeof (int), 0 /* no ifindex */ },
-{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
- sizeof (int), 0 /* no ifindex */ },
-
-{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
- sizeof (int), 0 /* no ifindex */ },
-
{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
sizeof (int), 0 },
diff --git a/usr/src/uts/common/inet/ip/igmp.c b/usr/src/uts/common/inet/ip/igmp.c
index 091509c71e..681f198aa7 100644
--- a/usr/src/uts/common/inet/ip/igmp.c
+++ b/usr/src/uts/common/inet/ip/igmp.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -46,7 +46,7 @@
#include <sys/cmn_err.h>
#include <sys/atomic.h>
#include <sys/zone.h>
-
+#include <sys/callb.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <inet/ipclassifier.h>
@@ -83,7 +83,7 @@ static mrec_t *mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
static void mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
mcast_record_t rtype, slist_t *flist);
static mrec_t *mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
-
+static void mcast_signal_restart_thread(ip_stack_t *ipst);
/*
* Macros used to do timer len conversions. Timer values are always
@@ -122,7 +122,7 @@ static mrec_t *mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
* The first multicast join will trigger the igmp timers / mld timers
* The unit for next is milliseconds.
*/
-void
+static void
igmp_start_timers(unsigned next, ip_stack_t *ipst)
{
int time_left;
@@ -207,7 +207,7 @@ igmp_start_timers(unsigned next, ip_stack_t *ipst)
* mld_start_timers:
* The unit for next is milliseconds.
*/
-void
+static void
mld_start_timers(unsigned next, ip_stack_t *ipst)
{
int time_left;
@@ -306,7 +306,8 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
uint32_t group;
uint_t next;
ipif_t *ipif;
- ip_stack_t *ipst;
+ ip_stack_t *ipst;
+ ilm_walker_t ilw;
ASSERT(ill != NULL);
ASSERT(!ill->ill_isv6);
@@ -401,8 +402,7 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
"igmp_input: we are only "
"member src 0x%x ipif_local 0x%x",
(int)ntohl(src),
- (int)
- ntohl(ipif->ipif_lcl_addr));
+ (int)ntohl(ipif->ipif_lcl_addr));
}
mutex_exit(&ill->ill_lock);
return (mp);
@@ -440,23 +440,20 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
}
/*
- * If we belong to the group being reported, and
- * we are a 'Delaying member' in the RFC terminology,
- * stop our timer for that group and 'clear flag' i.e.
- * mark as IGMP_OTHERMEMBER. Do this for all logical
- * interfaces on the given physical interface.
+ * If our ill has ILMs that belong to the group being
+ * reported, and we are a 'Delaying Member' in the RFC
+ * terminology, stop our timer for that group and 'clear
+ * flag' i.e. mark as IGMP_OTHERMEMBER.
*/
- mutex_enter(&ill->ill_lock);
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- ilm = ilm_lookup_ipif(ipif, group);
- if (ilm != NULL) {
+ ilm = ilm_walker_start(&ilw, ill);
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
+ if (ilm->ilm_addr == group) {
++ipst->ips_igmpstat.igps_rcv_ourreports;
ilm->ilm_timer = INFINITY;
ilm->ilm_state = IGMP_OTHERMEMBER;
}
- } /* for */
- mutex_exit(&ill->ill_lock);
+ }
+ ilm_walker_finish(&ilw);
break;
case IGMP_V3_MEMBERSHIP_REPORT:
@@ -485,6 +482,7 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
int timer;
uint_t next, current;
ip_stack_t *ipst;
+ ilm_walker_t ilw;
ipst = ill->ill_ipst;
++ipst->ips_igmpstat.igps_rcv_queries;
@@ -583,11 +581,12 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
* the maximum timeout.
*/
next = (unsigned)INFINITY;
- mutex_enter(&ill->ill_lock);
+ ilm = ilm_walker_start(&ilw, ill);
+ mutex_enter(&ill->ill_lock);
current = CURRENT_MSTIME;
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
/*
* A multicast router joins INADDR_ANY address
* to enable promiscuous reception of all
@@ -610,6 +609,7 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
}
}
mutex_exit(&ill->ill_lock);
+ ilm_walker_finish(&ilw);
return (next);
}
@@ -623,6 +623,7 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
ipaddr_t *src_array;
uint8_t qrv;
ip_stack_t *ipst;
+ ilm_walker_t ilw;
ipst = ill->ill_ipst;
/* make sure numsrc matches packet size */
@@ -693,8 +694,9 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
} else {
/* group or group/source specific query */
+ ilm = ilm_walker_start(&ilw, ill);
mutex_enter(&ill->ill_lock);
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
(ilm->ilm_addr == htonl(INADDR_ANY)) ||
(ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
@@ -749,6 +751,7 @@ group_query:
ilm->ilm_timer += current;
}
mutex_exit(&ill->ill_lock);
+ ilm_walker_finish(&ilw);
}
return (next);
@@ -819,13 +822,22 @@ igmp_joingroup(ilm_t *ilm)
mutex_exit(&ill->ill_lock);
/*
- * To avoid deadlock, we defer igmp_start_timers() to
- * ipsq_exit(). See the comment in ipsq_exit() for details.
+ * We need to restart the IGMP timers, but we can't do it here
+ * since we're inside the IPSQ and thus igmp_start_timers() ->
+ * untimeout() (inside the IPSQ, waiting for a running timeout
+ * to finish) could deadlock with igmp_timeout_handler() ->
+ * ipsq_enter() (running the timeout, waiting to get inside
+ * the IPSQ). We also can't just delay it until after we
+ * ipsq_exit() since we could be inside more than one IPSQ and
+ * thus still have the other IPSQs pinned after we exit -- and
+ * igmp_start_timers() may be trying to enter one of those.
+ * Instead, signal a dedicated thread that will do it for us.
*/
mutex_enter(&ipst->ips_igmp_timer_lock);
ipst->ips_igmp_deferred_next = MIN(timer,
ipst->ips_igmp_deferred_next);
mutex_exit(&ipst->ips_igmp_timer_lock);
+ mcast_signal_restart_thread(ipst);
}
if (ip_debug > 1) {
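
The same deferral pattern appears in mld_joingroup() and the retransmit paths below: rather than calling igmp_start_timers() or mld_start_timers() while IPSQs are held, the code records the desired deadline and pokes a dedicated restart thread. A hedged sketch of the wakeup half of that scheme follows; the ips_mrt_* names and the flag are assumptions for illustration, since the helper's body lies outside this hunk.

    /*
     * Sketch only: how a signal helper for the timer-restart thread
     * might look.  The ips_mrt_lock/ips_mrt_cv/ips_mrt_flags names and
     * the IP_MRT_RUN flag are assumed here, not taken from this hunk.
     */
    static void
    mcast_signal_restart_thread_sketch(ip_stack_t *ipst)
    {
        mutex_enter(&ipst->ips_mrt_lock);
        ipst->ips_mrt_flags |= IP_MRT_RUN;
        cv_signal(&ipst->ips_mrt_cv);
        mutex_exit(&ipst->ips_mrt_lock);
    }
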
@@ -897,13 +909,14 @@ mld_joingroup(ilm_t *ilm)
mutex_exit(&ill->ill_lock);
/*
- * To avoid deadlock, we defer mld_start_timers() to
- * ipsq_exit(). See the comment in ipsq_exit() for details.
+ * Signal another thread to restart the timers. See the
+ * comment in igmp_joingroup() for details.
*/
mutex_enter(&ipst->ips_mld_timer_lock);
ipst->ips_mld_deferred_next = MIN(timer,
ipst->ips_mld_deferred_next);
mutex_exit(&ipst->ips_mld_timer_lock);
+ mcast_signal_restart_thread(ipst);
}
if (ip_debug > 1) {
@@ -1073,8 +1086,8 @@ send_to_in:
/*
* Need to set up retransmission state; merge the new info with the
* current state (which may be null). If the timer is not currently
- * running, start it (need to do a delayed start of the timer as
- * we're currently in the sq).
+ * running, signal a thread to restart it -- see the comment in
+ * igmp_joingroup() for details.
*/
rp = mcast_merge_rtx(ilm, rp, flist);
if (ilm->ilm_rtx.rtx_timer == INFINITY) {
@@ -1085,6 +1098,7 @@ send_to_in:
ilm->ilm_rtx.rtx_timer);
ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
mutex_exit(&ipst->ips_igmp_timer_lock);
+ mcast_signal_restart_thread(ipst);
}
mutex_exit(&ill->ill_lock);
@@ -1161,8 +1175,8 @@ send_to_in:
/*
* Need to set up retransmission state; merge the new info with the
* current state (which may be null). If the timer is not currently
- * running, start it (need to do a deferred start of the timer as
- * we're currently in the sq).
+ * running, signal a thread to restart it -- see the comment in
+ * igmp_joingroup() for details.
*/
rp = mcast_merge_rtx(ilm, rp, flist);
ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
@@ -1174,6 +1188,7 @@ send_to_in:
MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer);
ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
mutex_exit(&ipst->ips_mld_timer_lock);
+ mcast_signal_restart_thread(ipst);
}
mutex_exit(&ill->ill_lock);
@@ -1397,12 +1412,10 @@ per_ilm_rtxtimer:
*
* igmp_input() receives igmp queries and responds to the queries
* in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
- * Later the igmp_timer fires, the timeout handler igmp_timerout_handler()
+ * Later, when the igmp_timer fires, the timeout handler igmp_timeout_handler()
* performs the action exclusively after entering each ill's ipsq as writer.
- * The actual igmp timeout handler needs to run in the ipsq since it has to
- * access the ilm's and we don't want another exclusive operation like
- * say an IPMP failover to be simultaneously moving the ilms from one ill to
- * another.
+ * (The need to enter the IPSQ is largely historical but there are still some
+ * fields like ilm_filter that rely on it.)
*
* The igmp_slowtimeo() function is called thru another timer.
* igmp_slowtimeout_lock protects the igmp_slowtimeout_id
@@ -1420,7 +1433,6 @@ igmp_timeout_handler(void *arg)
ASSERT(arg != NULL);
mutex_enter(&ipst->ips_igmp_timer_lock);
ASSERT(ipst->ips_igmp_timeout_id != 0);
- ipst->ips_igmp_timer_thread = curthread;
ipst->ips_igmp_timer_scheduled_last = 0;
ipst->ips_igmp_time_to_next = 0;
mutex_exit(&ipst->ips_igmp_timer_lock);
@@ -1452,7 +1464,6 @@ igmp_timeout_handler(void *arg)
mutex_enter(&ipst->ips_igmp_timer_lock);
ASSERT(ipst->ips_igmp_timeout_id != 0);
ipst->ips_igmp_timeout_id = 0;
- ipst->ips_igmp_timer_thread = NULL;
mutex_exit(&ipst->ips_igmp_timer_lock);
if (global_next != INFINITY)
@@ -1663,7 +1674,6 @@ mld_timeout_handler(void *arg)
ASSERT(arg != NULL);
mutex_enter(&ipst->ips_mld_timer_lock);
ASSERT(ipst->ips_mld_timeout_id != 0);
- ipst->ips_mld_timer_thread = curthread;
ipst->ips_mld_timer_scheduled_last = 0;
ipst->ips_mld_time_to_next = 0;
mutex_exit(&ipst->ips_mld_timer_lock);
@@ -1695,7 +1705,6 @@ mld_timeout_handler(void *arg)
mutex_enter(&ipst->ips_mld_timer_lock);
ASSERT(ipst->ips_mld_timeout_id != 0);
ipst->ips_mld_timeout_id = 0;
- ipst->ips_mld_timer_thread = NULL;
mutex_exit(&ipst->ips_mld_timer_lock);
if (global_next != INFINITY)
@@ -1871,7 +1880,7 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
int hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
size_t size = hdrlen + sizeof (igmpa_t);
ipif_t *ipif = ilm->ilm_ipif;
- ill_t *ill = ipif->ipif_ill; /* Will be the "lower" ill */
+ ill_t *ill = ipif->ipif_ill;
mblk_t *first_mp;
ipsec_out_t *io;
zoneid_t zoneid;
@@ -1887,14 +1896,6 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
* not get forwarded on other interfaces or looped back, we
* set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop
* to B_FALSE.
- *
- * We also need to make sure that this does not get load balanced
- * if it hits ip_newroute_ipif. So, we initialize ipsec_out_attach_if
- * here. If it gets load balanced, switches supporting igmp snooping
- * will send the packet that it receives for this multicast group
- * to the interface that we are sending on. As we have joined the
- * multicast group on this ill, by sending the packet out on this
- * ill, we receive all the packets back on this ill.
*/
first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
if (first_mp == NULL)
@@ -1909,7 +1910,6 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
io->ipsec_out_len = sizeof (ipsec_out_t);
io->ipsec_out_use_global_policy = B_TRUE;
io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
- io->ipsec_out_attach_if = B_TRUE;
io->ipsec_out_multicast_loop = B_FALSE;
io->ipsec_out_dontroute = B_TRUE;
if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES)
@@ -1995,6 +1995,8 @@ igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist)
zoneid_t zoneid;
ip_stack_t *ipst = ill->ill_ipst;
+ ASSERT(IAM_WRITER_IPIF(ipif));
+
/* if there aren't any records, there's nothing to send */
if (reclist == NULL)
return;
@@ -2022,6 +2024,14 @@ nextpkt:
int srcspace, srcsperpkt;
srcspace = ill->ill_max_frag - (size +
sizeof (grphdra_t));
+
+ /*
+ * Skip if there's not even enough room in
+ * a single packet to send something useful.
+ */
+ if (srcspace <= sizeof (ipaddr_t))
+ continue;
+
srcsperpkt = srcspace / sizeof (ipaddr_t);
/*
* Increment size and numrec, because we will
@@ -2082,7 +2092,6 @@ nextpkt:
io->ipsec_out_len = sizeof (ipsec_out_t);
io->ipsec_out_use_global_policy = B_TRUE;
io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
- io->ipsec_out_attach_if = B_TRUE;
io->ipsec_out_multicast_loop = B_FALSE;
io->ipsec_out_dontroute = B_TRUE;
if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES)
@@ -2188,6 +2197,7 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
uint_t next;
int mldlen;
ip_stack_t *ipst = ill->ill_ipst;
+ ilm_walker_t ilw;
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
@@ -2294,7 +2304,6 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
return;
}
-
/*
* If we belong to the group being reported, and we are a
* 'Delaying member' per the RFC terminology, stop our timer
@@ -2303,8 +2312,8 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
* membership entries for the same group address (one per zone)
* so we need to walk the ill_ilm list.
*/
- mutex_enter(&ill->ill_lock);
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
+ ilm = ilm_walker_start(&ilw, ill);
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
continue;
BUMP_MIB(ill->ill_icmp6_mib,
@@ -2313,7 +2322,7 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
ilm->ilm_timer = INFINITY;
ilm->ilm_state = IGMP_OTHERMEMBER;
}
- mutex_exit(&ill->ill_lock);
+ ilm_walker_finish(&ilw);
break;
}
case MLD_LISTENER_REDUCTION:
@@ -2343,6 +2352,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill)
int timer;
uint_t next, current;
in6_addr_t *v6group;
+ ilm_walker_t ilw;
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
@@ -2397,10 +2407,12 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill)
* maximum timeout.
*/
next = INFINITY;
- mutex_enter(&ill->ill_lock);
+ ilm = ilm_walker_start(&ilw, ill);
+ mutex_enter(&ill->ill_lock);
current = CURRENT_MSTIME;
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
+
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
@@ -2430,6 +2442,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill)
}
}
mutex_exit(&ill->ill_lock);
+ ilm_walker_finish(&ilw);
return (next);
}
@@ -2446,6 +2459,7 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
in6_addr_t *v6group, *src_array;
uint_t next, numsrc, i, mrd, delay, qqi, current;
uint8_t qrv;
+ ilm_walker_t ilw;
v6group = &mld2q->mld2q_addr;
numsrc = ntohs(mld2q->mld2q_numsrc);
@@ -2518,8 +2532,9 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
} else {
/* group or group/source specific query */
+ ilm = ilm_walker_start(&ilw, ill);
mutex_enter(&ill->ill_lock);
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
@@ -2574,6 +2589,7 @@ group_query:
break;
}
mutex_exit(&ill->ill_lock);
+ ilm_walker_finish(&ilw);
}
	return (next);
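The ilm walks converted above all follow one idiom: instead of holding ill_lock and chasing the ill_ilm list directly, the caller brackets the traversal with ilm_walker_start() and ilm_walker_finish() and advances with ilm_walker_step(). A minimal skeleton of that idiom, built only from the calls visible in this patch (the loop body is a placeholder):

	ilm_walker_t	ilw;
	ilm_t		*ilm;

	ilm = ilm_walker_start(&ilw, ill);
	for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
		/* inspect or update one ilm, e.g. ilm->ilm_v6addr */
	}
	ilm_walker_finish(&ilw);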
@@ -2591,9 +2607,8 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
ip6_hbh_t *ip6hbh;
struct ip6_opt_router *ip6router;
size_t size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
- ill_t *ill = ilm->ilm_ill; /* Will be the "lower" ill */
+ ill_t *ill = ilm->ilm_ill;
ipif_t *ipif;
- ip6i_t *ip6i;
/*
* We need to place a router alert option in this packet. The length
@@ -2605,30 +2620,14 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
ASSERT(ill->ill_isv6);
- /*
- * We need to make sure that this packet does not get load balanced.
- * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
- * ip_newroute_ipif_v6 knows how to handle such packets.
- * If it gets load balanced, switches supporting MLD snooping
- * (in the future) will send the packet that it receives for this
- * multicast group to the interface that we are sending on. As we have
- * joined the multicast group on this ill, by sending the packet out
- * on this ill, we receive all the packets back on this ill.
- */
- size += sizeof (ip6i_t) + router_alert_length;
+ size += router_alert_length;
mp = allocb(size, BPRI_HI);
if (mp == NULL)
return;
bzero(mp->b_rptr, size);
mp->b_wptr = mp->b_rptr + size;
- ip6i = (ip6i_t *)mp->b_rptr;
- ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
- ip6i->ip6i_nxt = IPPROTO_RAW;
- ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
- ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
-
- ip6h = (ip6_t *)&ip6i[1];
+ ip6h = (ip6_t *)mp->b_rptr;
ip6hbh = (struct ip6_hbh *)&ip6h[1];
ip6router = (struct ip6_opt_router *)&ip6hbh[1];
/*
@@ -2698,7 +2697,6 @@ mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
in6_addr_t *srcarray;
ip6_t *ip6h;
ip6_hbh_t *ip6hbh;
- ip6i_t *ip6i;
struct ip6_opt_router *ip6router;
size_t size, optlen, padlen, icmpsize, rsize;
ipif_t *ipif;
@@ -2707,6 +2705,8 @@ mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
mrec_t *next_reclist = reclist;
boolean_t morepkts;
+ ASSERT(IAM_WRITER_ILL(ill));
+
/* If there aren't any records, there's nothing to send */
if (reclist == NULL)
return;
@@ -2743,6 +2743,14 @@ nextpkt:
int srcspace, srcsperpkt;
srcspace = ill->ill_max_frag -
(size + sizeof (mld2mar_t));
+
+ /*
+ * Skip if there's not even enough room in
+ * a single packet to send something useful.
+ */
+ if (srcspace <= sizeof (in6_addr_t))
+ continue;
+
srcsperpkt = srcspace / sizeof (in6_addr_t);
/*
* Increment icmpsize and size, because we will
@@ -2787,30 +2795,13 @@ nextpkt:
size += rsize;
}
- /*
- * We need to make sure that this packet does not get load balanced.
- * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
- * ip_newroute_ipif_v6 know how to handle such packets.
- * If it gets load balanced, switches supporting MLD snooping
- * (in the future) will send the packet that it receives for this
- * multicast group to the interface that we are sending on. As we have
- * joined the multicast group on this ill, by sending the packet out
- * on this ill, we receive all the packets back on this ill.
- */
- size += sizeof (ip6i_t);
mp = allocb(size, BPRI_HI);
if (mp == NULL)
goto free_reclist;
bzero(mp->b_rptr, size);
mp->b_wptr = mp->b_rptr + size;
- ip6i = (ip6i_t *)mp->b_rptr;
- ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
- ip6i->ip6i_nxt = IPPROTO_RAW;
- ip6i->ip6i_flags = IP6I_ATTACH_IF;
- ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
-
- ip6h = (ip6_t *)&(ip6i[1]);
+ ip6h = (ip6_t *)mp->b_rptr;
ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
@@ -3102,3 +3093,64 @@ mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
return (rtnmrec);
}
+
+/*
+ * Convenience routine to signal the restart-timer thread.
+ */
+static void
+mcast_signal_restart_thread(ip_stack_t *ipst)
+{
+ mutex_enter(&ipst->ips_mrt_lock);
+ ipst->ips_mrt_flags |= IP_MRT_RUN;
+ cv_signal(&ipst->ips_mrt_cv);
+ mutex_exit(&ipst->ips_mrt_lock);
+}
+
+/*
+ * Thread to restart IGMP/MLD timers. See the comment in igmp_joingroup() for
+ * the story behind this unfortunate thread.
+ */
+void
+mcast_restart_timers_thread(ip_stack_t *ipst)
+{
+ int next;
+ char name[64];
+ callb_cpr_t cprinfo;
+
+ (void) snprintf(name, sizeof (name), "mcast_restart_timers_thread_%d",
+ ipst->ips_netstack->netstack_stackid);
+ CALLB_CPR_INIT(&cprinfo, &ipst->ips_mrt_lock, callb_generic_cpr, name);
+
+ for (;;) {
+ mutex_enter(&ipst->ips_mrt_lock);
+ while (!(ipst->ips_mrt_flags & (IP_MRT_STOP|IP_MRT_RUN))) {
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(&ipst->ips_mrt_cv, &ipst->ips_mrt_lock);
+ CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_mrt_lock);
+ }
+ if (ipst->ips_mrt_flags & IP_MRT_STOP)
+ break;
+ ipst->ips_mrt_flags &= ~IP_MRT_RUN;
+ mutex_exit(&ipst->ips_mrt_lock);
+
+ mutex_enter(&ipst->ips_igmp_timer_lock);
+ next = ipst->ips_igmp_deferred_next;
+ ipst->ips_igmp_deferred_next = INFINITY;
+ mutex_exit(&ipst->ips_igmp_timer_lock);
+
+ if (next != INFINITY)
+ igmp_start_timers(next, ipst);
+
+ mutex_enter(&ipst->ips_mld_timer_lock);
+ next = ipst->ips_mld_deferred_next;
+ ipst->ips_mld_deferred_next = INFINITY;
+ mutex_exit(&ipst->ips_mld_timer_lock);
+ if (next != INFINITY)
+ mld_start_timers(next, ipst);
+ }
+
+ ipst->ips_mrt_flags |= IP_MRT_DONE;
+ cv_signal(&ipst->ips_mrt_done_cv);
+ CALLB_CPR_EXIT(&cprinfo); /* drops ips_mrt_lock */
+ thread_exit();
+}
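The thread above is a plain flag-and-condition-variable worker: callers set IP_MRT_RUN and signal ips_mrt_cv, shutdown sets IP_MRT_STOP, and the thread acknowledges with IP_MRT_DONE before exiting (the CALLB_CPR_* calls additionally make the wait safe across suspend/resume). A rough user-space sketch of the same pattern, with pthreads standing in for the kernel cv_t and all names invented for illustration:

#include <pthread.h>
#include <stdio.h>

#define	MRT_RUN		0x1	/* work has been queued */
#define	MRT_STOP	0x2	/* worker should exit */
#define	MRT_DONE	0x4	/* worker has exited */

static pthread_mutex_t	mrt_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t	mrt_cv = PTHREAD_COND_INITIALIZER;
static pthread_cond_t	mrt_done_cv = PTHREAD_COND_INITIALIZER;
static unsigned int	mrt_flags;

static void
mrt_signal(unsigned int flag)
{
	pthread_mutex_lock(&mrt_lock);
	mrt_flags |= flag;
	pthread_cond_signal(&mrt_cv);
	pthread_mutex_unlock(&mrt_lock);
}

static void *
mrt_worker(void *arg)
{
	for (;;) {
		pthread_mutex_lock(&mrt_lock);
		while (!(mrt_flags & (MRT_STOP | MRT_RUN)))
			pthread_cond_wait(&mrt_cv, &mrt_lock);
		if (mrt_flags & MRT_STOP)
			break;			/* exit with mrt_lock held */
		mrt_flags &= ~MRT_RUN;
		pthread_mutex_unlock(&mrt_lock);

		/* do the deferred work outside the lock */
		(void) printf("worker: restarting timers\n");
	}
	mrt_flags |= MRT_DONE;
	pthread_cond_signal(&mrt_done_cv);
	pthread_mutex_unlock(&mrt_lock);
	return (arg);
}

int
main(void)
{
	pthread_t tid;

	(void) pthread_create(&tid, NULL, mrt_worker, NULL);
	mrt_signal(MRT_RUN);	/* like mcast_signal_restart_thread() */
	mrt_signal(MRT_STOP);	/* like ip_stack_shutdown() */

	/* like ip_stack_fini(): wait for the worker to acknowledge */
	pthread_mutex_lock(&mrt_lock);
	while (!(mrt_flags & MRT_DONE))
		pthread_cond_wait(&mrt_done_cv, &mrt_lock);
	pthread_mutex_unlock(&mrt_lock);
	return (0);
}

Depending on scheduling the worker may consume the run request or only see the stop request; either way it terminates and sets the DONE flag while holding the lock, which is how the kernel thread hands off to ip_stack_fini().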
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index 1d0bcf37de..dd87a09974 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -170,11 +170,14 @@ typedef struct listptr_s listptr_t;
*/
typedef struct iproutedata_s {
uint_t ird_idx;
+ uint_t ird_flags; /* see below */
listptr_t ird_route; /* ipRouteEntryTable */
listptr_t ird_netmedia; /* ipNetToMediaEntryTable */
listptr_t ird_attrs; /* ipRouteAttributeTable */
} iproutedata_t;
+#define IRD_REPORT_TESTHIDDEN 0x01 /* include IRE_MARK_TESTHIDDEN routes */
+
/*
* Cluster specific hooks. These should be NULL when booted as a non-cluster
*/
@@ -228,31 +231,27 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
* MT level protection given by STREAMS. IP uses a combination of its own
* internal serialization mechanism and standard Solaris locking techniques.
- * The internal serialization is per phyint (no IPMP) or per IPMP group.
- * This is used to serialize plumbing operations, IPMP operations, certain
- * multicast operations, most set ioctls, igmp/mld timers etc.
+ * The internal serialization is per phyint. This is used to serialize
+ * plumbing operations, certain multicast operations, most set ioctls,
+ * igmp/mld timers etc.
*
* Plumbing is a long sequence of operations involving message
* exchanges between IP, ARP and device drivers. Many set ioctls are typically
* involved in plumbing operations. A natural model is to serialize these
* ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
* parallel without any interference. But various set ioctls on hme0 are best
- * serialized. However if the system uses IPMP, the operations are easier if
- * they are serialized on a per IPMP group basis since IPMP operations
- * happen across ill's of a group. Thus the lowest common denominator is to
- * serialize most set ioctls, multicast join/leave operations, IPMP operations
- * igmp/mld timer operations, and processing of DLPI control messages received
- * from drivers on a per IPMP group basis. If the system does not employ
- * IPMP the serialization is on a per phyint basis. This serialization is
- * provided by the ipsq_t and primitives operating on this. Details can
- * be found in ip_if.c above the core primitives operating on ipsq_t.
+ * serialized, along with multicast join/leave operations, igmp/mld timer
+ * operations, and processing of DLPI control messages received from drivers
+ * on a per phyint basis. This serialization is provided by the ipsq_t and
+ * primitives operating on this. Details can be found in ip_if.c above the
+ * core primitives operating on ipsq_t.
*
* Lookups of an ipif or ill by a thread return a refheld ipif / ill.
 * Similarly, lookup of an ire by a thread also returns a refheld ire.
* In addition ipif's and ill's referenced by the ire are also indirectly
* refheld. Thus no ipif or ill can vanish nor can critical parameters like
* the ipif's address or netmask change as long as an ipif is refheld
- * directly or indirectly. For example an SIOCLIFADDR ioctl that changes the
+ * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the
* address of an ipif has to go through the ipsq_t. This ensures that only
* 1 such exclusive operation proceeds at any time on the ipif. It then
* deletes all ires associated with this ipif, and waits for all refcnts
@@ -281,33 +280,24 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* - ill_g_lock: This is a global reader/writer lock. Protects the following
* * The AVL tree based global multi list of all ills.
* * The linked list of all ipifs of an ill
- * * The <ill-ipsq> mapping
- * * The ipsq->ipsq_phyint_list threaded by phyint_ipsq_next
- * * The illgroup list threaded by ill_group_next.
+ * * The <ipsq-xop> mapping
* * <ill-phyint> association
* Insertion/deletion of an ill in the system, insertion/deletion of an ipif
- * into an ill, changing the <ill-ipsq> mapping of an ill, insertion/deletion
- * of an ill into the illgrp list, changing the <ill-phyint> assoc of an ill
- * will all have to hold the ill_g_lock as writer for the actual duration
- * of the insertion/deletion/change. More details about the <ill-ipsq> mapping
- * may be found in the IPMP section.
+ * into an ill, changing the <ipsq-xop> mapping of an ill, changing the
+ * <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as
+ * writer for the actual duration of the insertion/deletion/change.
*
* - ill_lock: This is a per ill mutex.
- * It protects some members of the ill and is documented below.
- * It also protects the <ill-ipsq> mapping
- * It also protects the illgroup list threaded by ill_group_next.
+ * It protects some members of the ill_t struct; see ip.h for details.
* It also protects the <ill-phyint> assoc.
* It also protects the list of ipifs hanging off the ill.
*
* - ipsq_lock: This is a per ipsq_t mutex lock.
- * This protects all the other members of the ipsq struct except
- * ipsq_refs and ipsq_phyint_list which are protected by ill_g_lock
+ * This protects some members of the ipsq_t struct; see ip.h for details.
+ * It also protects the <ipsq-ipxop> mapping.
*
- * - illgrp_lock: This is a per ill_group mutex lock.
- * The only thing it protects is the illgrp_ill_schednext member of ill_group
- * which dictates which is the next ill in an ill_group that is to be chosen
- * for sending outgoing packets, through creation of an IRE_CACHE that
- * references this ill.
+ * - ipx_lock: This is a per ipxop_t mutex lock.
+ * This protects some members of the ipxop_t struct; see ip.h for details.
*
* - phyint_lock: This is a per phyint mutex lock. Protects just the
* phyint_flags
@@ -335,27 +325,24 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* Note, it is only necessary to take this lock if the ill_usesrc_grp_next
* field is changing state i.e from NULL to non-NULL or vice-versa. For
* example, it is not necessary to take this lock in the initial portion
- * of ip_sioctl_slifusesrc or at all in ip_sioctl_groupname and
- * ip_sioctl_flags since the these operations are executed exclusively and
- * that ensures that the "usesrc group state" cannot change. The "usesrc
- * group state" change can happen only in the latter part of
- * ip_sioctl_slifusesrc and in ill_delete.
+ * of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these
+ * operations are executed exclusively and that ensures that the "usesrc
+ * group state" cannot change. The "usesrc group state" change can happen
+ * only in the latter part of ip_sioctl_slifusesrc and in ill_delete.
*
- * Changing <ill-phyint>, <ill-ipsq>, <ill-illgroup> assocications.
+ * Changing <ill-phyint>, <ipsq-xop> associations:
*
* To change the <ill-phyint> association, the ill_g_lock must be held
* as writer, and the ill_locks of both the v4 and v6 instance of the ill
* must be held.
*
- * To change the <ill-ipsq> association the ill_g_lock must be held as writer
- * and the ill_lock of the ill in question must be held.
- *
- * To change the <ill-illgroup> association the ill_g_lock must be held as
- * writer and the ill_lock of the ill in question must be held.
+ * To change the <ipsq-xop> association, the ill_g_lock must be held as
+ * writer, the ipsq_lock must be held, and one must be writer on the ipsq.
+ * This is only done when ills are added or removed from IPMP groups.
*
* To add or delete an ipif from the list of ipifs hanging off the ill,
* ill_g_lock (writer) and ill_lock must be held and the thread must be
- * a writer on the associated ipsq,.
+ * a writer on the associated ipsq.
*
* To add or delete an ill to the system, the ill_g_lock must be held as
* writer and the thread must be a writer on the associated ipsq.
@@ -367,8 +354,7 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
*
* Some lock hierarchy scenarios are listed below.
*
- * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock
- * ill_g_lock -> illgrp_lock -> ill_lock
+ * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
* ill_g_lock -> ill_lock(s) -> phyint_lock
* ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock
* ill_g_lock -> ip_addr_avail_lock
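The ordering above is a lock-ordering (deadlock-avoidance) rule: a thread that needs several of these locks must take them left to right and drop them in the reverse order. A hedged sketch of the second chain; the exact field spellings (ips_ill_g_lock, ill_lock, phyint_lock) are assumptions about the surrounding structures, and ipst, ill and phyi are assumed to already be valid in the caller:

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);	/* ill_g_lock first */
	mutex_enter(&ill->ill_lock);			/* then ill_lock */
	mutex_enter(&phyi->phyint_lock);		/* then phyint_lock */

	/* ... examine or update phyint_flags ... */

	mutex_exit(&phyi->phyint_lock);
	mutex_exit(&ill->ill_lock);
	rw_exit(&ipst->ips_ill_g_lock);

Taking ill_lock before ill_g_lock, by contrast, could deadlock against a thread following the documented order.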
@@ -587,8 +573,7 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* back, i.e. the loopback which is required since neither Ethernet drivers
* nor Ethernet hardware loops them back. This is the case when the normal
* routes (ignoring IREs with different zoneids) would send out the packet on
- * the same ill (or ill group) as the ill with which is IRE_LOCAL is
- * associated.
+ * the same ill as the ill with which the IRE_LOCAL is associated.
*
* Multiple zones can share a common broadcast address; typically all zones
* share the 255.255.255.255 address. Incoming as well as locally originated
@@ -695,8 +680,8 @@ static boolean_t ip_rput_multimblk_ipoptions(queue_t *, ill_t *,
mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *);
static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *,
ip_stack_t *);
-static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *,
- uint16_t *);
+static boolean_t ip_rput_fragment(ill_t *, ill_t *, mblk_t **, ipha_t *,
+ uint32_t *, uint16_t *);
int ip_snmp_get(queue_t *, mblk_t *, int);
static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
mib2_ipIfStatsEntry_t *, ip_stack_t *);
@@ -723,9 +708,9 @@ static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
ip_stack_t *ipst);
-static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *,
+static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
ip_stack_t *ipst);
-static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *,
+static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
ip_stack_t *ipst);
static void ip_snmp_get2_v4(ire_t *, iproutedata_t *);
static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
@@ -775,8 +760,6 @@ static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
cred_t *);
-static int ipmp_hook_emulation_set(queue_t *, mblk_t *, char *, caddr_t,
- cred_t *);
static int ip_squeue_switch(int);
static void *ip_kstat_init(netstackid_t, ip_stack_t *);
@@ -946,8 +929,6 @@ static ipndp_t lcl_ndp_arr[] = {
{ ip_cgtp_filter_get, ip_cgtp_filter_set, NULL,
"ip_cgtp_filter" },
#define IPNDP_IPMP_HOOK_OFFSET 10
- { ip_param_generic_get, ipmp_hook_emulation_set, NULL,
- "ipmp_hook_emulation" },
{ ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug,
"ip_debug" },
};
@@ -984,20 +965,19 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
- /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
+ /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD,
IF_CMD, ip_sioctl_get_addr, NULL },
/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
- IPI_GET_CMD | IPI_REPL,
- IF_CMD, ip_sioctl_get_dstaddr, NULL },
+ IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL },
/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
+ IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
/* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
- IPI_MODOK | IPI_GET_CMD | IPI_REPL,
+ IPI_MODOK | IPI_GET_CMD,
IF_CMD, ip_sioctl_get_flags, NULL },
/* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
@@ -1009,31 +989,28 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
/* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_mtu, NULL },
- /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
+ /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD,
IF_CMD, ip_sioctl_get_mtu, NULL },
/* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
- IPI_GET_CMD | IPI_REPL,
- IF_CMD, ip_sioctl_get_brdaddr, NULL },
+ IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL },
/* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_brdaddr, NULL },
/* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
- IPI_GET_CMD | IPI_REPL,
- IF_CMD, ip_sioctl_get_netmask, NULL },
+ IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL },
/* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
/* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
- IPI_GET_CMD | IPI_REPL,
- IF_CMD, ip_sioctl_get_metric, NULL },
+ IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL },
/* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
IF_CMD, ip_sioctl_metric, NULL },
/* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* See 166-168 below for extended SIOC*XARP ioctls */
- /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV,
+ /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
ARP_CMD, ip_sioctl_arp, NULL },
- /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD | IPI_REPL,
+ /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD,
ARP_CMD, ip_sioctl_arp, NULL },
- /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV,
+ /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
ARP_CMD, ip_sioctl_arp, NULL },
/* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
@@ -1098,21 +1075,19 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
/* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
- /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD | IPI_REPL,
+ /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD,
MISC_CMD, ip_sioctl_get_ifnum, NULL },
- /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
+ /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD,
IF_CMD, ip_sioctl_get_muxid, NULL },
/* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- IF_CMD, ip_sioctl_muxid, NULL },
+ IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL },
/* Both if and lif variants share same func */
- /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
+ /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD,
IF_CMD, ip_sioctl_get_lifindex, NULL },
/* Both if and lif variants share same func */
/* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- IF_CMD, ip_sioctl_slifindex, NULL },
+ IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL },
/* copyin size cannot be coded for SIOCGIFCONF */
/* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
@@ -1136,28 +1111,25 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
/* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- LIF_CMD, ip_sioctl_removeif,
+ IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif,
ip_sioctl_removeif_restart },
/* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_PRIV | IPI_WR | IPI_REPL,
+ IPI_GET_CMD | IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_addif, NULL },
#define SIOCLIFADDR_NDX 112
/* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
/* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_addr, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL },
/* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
/* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_dstaddr, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL },
/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
+ IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
/* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_MODOK | IPI_REPL,
+ IPI_GET_CMD | IPI_MODOK,
LIF_CMD, ip_sioctl_get_flags, NULL },
/* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
@@ -1167,58 +1139,48 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
ip_sioctl_get_lifconf, NULL },
/* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_mtu, NULL },
- /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD | IPI_REPL,
+ /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD,
LIF_CMD, ip_sioctl_get_mtu, NULL },
/* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_brdaddr, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL },
/* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_brdaddr, NULL },
/* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_netmask, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL },
/* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
/* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_metric, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL },
/* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_metric, NULL },
/* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_MODOK | IPI_REPL,
+ IPI_PRIV | IPI_WR | IPI_MODOK,
LIF_CMD, ip_sioctl_slifname,
ip_sioctl_slifname_restart },
- /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD | IPI_REPL,
+ /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD,
MISC_CMD, ip_sioctl_get_lifnum, NULL },
/* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_muxid, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL },
/* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- LIF_CMD, ip_sioctl_muxid, NULL },
+ IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL },
/* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_lifindex, 0 },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 },
/* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- LIF_CMD, ip_sioctl_slifindex, 0 },
+ IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 },
/* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_token, NULL },
/* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_token, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL },
/* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
/* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_subnet, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL },
/* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_lnkinfo, NULL },
/* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
/* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
LIF_CMD, ip_siocdelndp_v6, NULL },
/* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
@@ -1231,8 +1193,8 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
MISC_CMD, ip_sioctl_tonlink, NULL },
/* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
MISC_CMD, ip_sioctl_tmysite, NULL },
- /* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), IPI_REPL,
- TUN_CMD, ip_sioctl_tunparam, NULL },
+ /* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), 0,
+ TUN_CMD, ip_sioctl_tunparam, NULL },
/* 148 */ { SIOCSTUNPARAM, sizeof (struct iftun_req),
IPI_PRIV | IPI_WR,
TUN_CMD, ip_sioctl_tunparam, NULL },
@@ -1243,29 +1205,24 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
/* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
/* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
- /* 153 */ { SIOCLIFFAILOVER, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- LIF_CMD, ip_sioctl_move, ip_sioctl_move },
- /* 154 */ { SIOCLIFFAILBACK, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- LIF_CMD, ip_sioctl_move, ip_sioctl_move },
+ /* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
+
+ /* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD |
+ IPI_WR, LIF_CMD, ip_sioctl_get_binding, NULL },
/* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
+ IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_groupname, NULL },
- /* 157 */ { SIOCGLIFOINDEX, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_oindex, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL },
+ /* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t),
+ IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL },
/* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
/* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
- /* 161 */ { SIOCSLIFOINDEX, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
- LIF_CMD, ip_sioctl_slifoindex, NULL },
+ /* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* These are handled in ip_sioctl_copyin_setup itself */
/* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
@@ -1277,22 +1234,20 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
/* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
ip_sioctl_get_lifconf, NULL },
- /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV,
+ /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
XARP_CMD, ip_sioctl_arp, NULL },
- /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD | IPI_REPL,
+ /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD,
XARP_CMD, ip_sioctl_arp, NULL },
- /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV,
+ /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
XARP_CMD, ip_sioctl_arp, NULL },
/* SIOCPOPSOCKFS is not handled by IP */
/* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },
/* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_lifzone, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL },
/* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- LIF_CMD, ip_sioctl_slifzone,
+ IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone,
ip_sioctl_slifzone_restart },
/* 172-174 are SCTP ioctls and not handled by IP */
/* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
@@ -1315,8 +1270,7 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
MSFILT_CMD, ip_sioctl_msfilter, NULL },
/* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR,
MSFILT_CMD, ip_sioctl_msfilter, NULL },
- /* 182 */ { SIOCSIPMPFAILBACK, sizeof (int), IPI_PRIV, MISC_CMD,
- ip_sioctl_set_ipmpfailback, NULL },
+ /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* SIOCSENABLESDP is handled by SDP */
/* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
/* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL },
@@ -1326,7 +1280,7 @@ int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
{ OSIOCGTUNPARAM, sizeof (struct old_iftun_req),
- IPI_GET_CMD | IPI_REPL, TUN_CMD, ip_sioctl_tunparam, NULL },
+ IPI_GET_CMD, TUN_CMD, ip_sioctl_tunparam, NULL },
{ OSIOCSTUNPARAM, sizeof (struct old_iftun_req), IPI_PRIV | IPI_WR,
TUN_CMD, ip_sioctl_tunparam, NULL },
{ I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
@@ -1336,11 +1290,11 @@ ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
{ ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL },
{ ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
{ IP_IOCTL, 0, 0, 0, NULL, NULL },
- { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_REPL | IPI_GET_CMD,
+ { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
MISC_CMD, mrt_ioctl},
- { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_REPL | IPI_GET_CMD,
+ { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_GET_CMD,
MISC_CMD, mrt_ioctl},
- { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_REPL | IPI_GET_CMD,
+ { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD,
MISC_CMD, mrt_ioctl}
};
@@ -1629,8 +1583,6 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
ipif_t *ipif;
mblk_t *first_mp;
ipsec_in_t *ii;
- ire_t *src_ire;
- boolean_t onlink;
timestruc_t now;
uint32_t ill_index;
ip_stack_t *ipst;
@@ -2014,59 +1966,6 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
if (!IS_SIMPLE_IPH(ipha))
icmp_options_update(ipha);
- /*
- * ICMP echo replies should go out on the same interface
- * the request came on as probes used by in.mpathd for detecting
- * NIC failures are ECHO packets. We turn-off load spreading
- * by setting ipsec_in_attach_if to B_TRUE, which is copied
- * to ipsec_out_attach_if by ipsec_in_to_out called later in this
- * function. This is in turn handled by ip_wput and ip_newroute
- * to make sure that the packet goes out on the interface it came
- * in on. If we don't turnoff load spreading, the packets might get
- * dropped if there are no non-FAILED/INACTIVE interfaces for it
- * to go out and in.mpathd would wrongly detect a failure or
- * mis-detect a NIC failure for link failure. As load spreading
- * can happen only if ill_group is not NULL, we do only for
- * that case and this does not affect the normal case.
- *
- * We turn off load spreading only on echo packets that came from
- * on-link hosts. If the interface route has been deleted, this will
- * not be enforced as we can't do much. For off-link hosts, as the
- * default routes in IPv4 does not typically have an ire_ipif
- * pointer, we can't force MATCH_IRE_ILL in ip_wput/ip_newroute.
- * Moreover, expecting a default route through this interface may
- * not be correct. We use ipha_dst because of the swap above.
- */
- onlink = B_FALSE;
- if (icmph->icmph_type == ICMP_ECHO_REPLY && ill->ill_group != NULL) {
- /*
- * First, we need to make sure that it is not one of our
- * local addresses. If we set onlink when it is one of
- * our local addresses, we will end up creating IRE_CACHES
- * for one of our local addresses. Then, we will never
- * accept packets for them afterwards.
- */
- src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL,
- NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (src_ire == NULL) {
- ipif = ipif_get_next_ipif(NULL, ill);
- if (ipif == NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(mp);
- return;
- }
- src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0,
- IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
- NULL, MATCH_IRE_ILL | MATCH_IRE_TYPE, ipst);
- ipif_refrele(ipif);
- if (src_ire != NULL) {
- onlink = B_TRUE;
- ire_refrele(src_ire);
- }
- } else {
- ire_refrele(src_ire);
- }
- }
if (!mctl_present) {
/*
* This packet should go out the same way as it
@@ -2085,20 +1984,7 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
/* This is not a secure packet */
ii->ipsec_in_secure = B_FALSE;
- if (onlink) {
- ii->ipsec_in_attach_if = B_TRUE;
- ii->ipsec_in_ill_index =
- ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index =
- recv_ill->ill_phyint->phyint_ifindex;
- }
first_mp->b_cont = mp;
- } else if (onlink) {
- ii = (ipsec_in_t *)first_mp->b_rptr;
- ii->ipsec_in_attach_if = B_TRUE;
- ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index = recv_ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */
} else {
ii = (ipsec_in_t *)first_mp->b_rptr;
ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */
@@ -3733,7 +3619,6 @@ ipif_dup_recovery(void *arg)
ill_t *ill = ipif->ipif_ill;
mblk_t *arp_add_mp;
mblk_t *arp_del_mp;
- area_t *area;
ip_stack_t *ipst = ill->ill_ipst;
ipif->ipif_recovery_id = 0;
@@ -3744,12 +3629,13 @@ ipif_dup_recovery(void *arg)
*/
if (ill->ill_arp_closing || !(ipif->ipif_flags & IPIF_DUPLICATE) ||
(ipif->ipif_flags & IPIF_POINTOPOINT) ||
- (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))) {
+ (ipif->ipif_state_flags & (IPIF_CONDEMNED))) {
/* No reason to try to bring this address back. */
return;
}
- if ((arp_add_mp = ipif_area_alloc(ipif)) == NULL)
+ /* ACE_F_UNVERIFIED restarts DAD */
+ if ((arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL)
goto alloc_fail;
if (ipif->ipif_arp_del_mp == NULL) {
@@ -3758,10 +3644,6 @@ ipif_dup_recovery(void *arg)
ipif->ipif_arp_del_mp = arp_del_mp;
}
- /* Setting the 'unverified' flag restarts DAD */
- area = (area_t *)arp_add_mp->b_rptr;
- area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR |
- ACE_F_UNVERIFIED;
putnext(ill->ill_rq, arp_add_mp);
return;
@@ -3873,6 +3755,7 @@ ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
EINPROGRESS) {
ipif->ipif_addr_ready = 1;
(void) ipif_up_done(ipif);
+ ASSERT(ill->ill_move_ipif == NULL);
}
continue;
}
@@ -3893,6 +3776,7 @@ ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
ill->ill_net_type == IRE_IF_RESOLVER &&
!(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
ipst->ips_ip_dup_recovery > 0) {
+ ASSERT(ipif->ipif_recovery_id == 0);
ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
}
@@ -4196,8 +4080,9 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid,
{
mblk_t *mp;
ip_pktinfo_t *pinfo;
- ipha_t *ipha;
+ ipha_t *ipha;
struct ether_header *pether;
+ boolean_t ipmp_ill_held = B_FALSE;
mp = allocb(sizeof (ip_pktinfo_t), BPRI_MED);
if (mp == NULL) {
@@ -4205,12 +4090,53 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid,
return (data_mp);
}
- ipha = (ipha_t *)data_mp->b_rptr;
+ ipha = (ipha_t *)data_mp->b_rptr;
pinfo = (ip_pktinfo_t *)mp->b_rptr;
bzero(pinfo, sizeof (ip_pktinfo_t));
pinfo->ip_pkt_flags = (uchar_t)flags;
pinfo->ip_pkt_ulp_type = IN_PKTINFO; /* Tell ULP what type of info */
+ pether = (struct ether_header *)((char *)ipha
+ - sizeof (struct ether_header));
+
+ /*
+ * Make sure the interface is an ethernet type, since this option
+ * is currently supported only on this type of interface. Also make
+ * sure we are pointing correctly above db_base.
+ */
+ if ((flags & IPF_RECVSLLA) &&
+ ((uchar_t *)pether >= data_mp->b_datap->db_base) &&
+ (ill->ill_type == IFT_ETHER) &&
+ (ill->ill_net_type == IRE_IF_RESOLVER)) {
+ pinfo->ip_pkt_slla.sdl_type = IFT_ETHER;
+ bcopy(pether->ether_shost.ether_addr_octet,
+ pinfo->ip_pkt_slla.sdl_data, ETHERADDRL);
+ } else {
+ /*
+ * Clear the bit. Indicate to upper layer that IP is not
+ * sending this ancillary info.
+ */
+ pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA;
+ }
+
+ /*
+ * If `ill' is in an IPMP group, use the IPMP ill to determine
+ * IPF_RECVIF and IPF_RECVADDR. (This currently assumes that
+ * IPF_RECVADDR support on test addresses is not needed.)
+ *
+ * Note that `ill' may already be an IPMP ill if e.g. we're
+ * processing a packet looped back to an IPMP data address
+ * (since those IRE_LOCALs are tied to IPMP ills).
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ if ((ill = ipmp_ill_hold_ipmp_ill(ill)) == NULL) {
+ ip1dbg(("ip_add_info: cannot hold IPMP ill.\n"));
+ freemsg(mp);
+ return (data_mp);
+ }
+ ipmp_ill_held = B_TRUE;
+ }
+
if (flags & (IPF_RECVIF | IPF_RECVADDR))
pinfo->ip_pkt_ifindex = ill->ill_phyint->phyint_ifindex;
if (flags & IPF_RECVADDR) {
@@ -4239,7 +4165,7 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid,
ire = ire_ctable_lookup(ipha->ipha_dst, 0,
IRE_LOCAL | IRE_LOOPBACK,
ipif, zoneid, NULL,
- MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst);
+ MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
if (ire == NULL) {
/*
* packet must have come on a different
@@ -4276,29 +4202,8 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid,
}
}
- pether = (struct ether_header *)((char *)ipha
- - sizeof (struct ether_header));
- /*
- * Make sure the interface is an ethernet type, since this option
- * is currently supported only on this type of interface. Also make
- * sure we are pointing correctly above db_base.
- */
-
- if ((flags & IPF_RECVSLLA) &&
- ((uchar_t *)pether >= data_mp->b_datap->db_base) &&
- (ill->ill_type == IFT_ETHER) &&
- (ill->ill_net_type == IRE_IF_RESOLVER)) {
-
- pinfo->ip_pkt_slla.sdl_type = IFT_ETHER;
- bcopy((uchar_t *)pether->ether_shost.ether_addr_octet,
- (uchar_t *)pinfo->ip_pkt_slla.sdl_data, ETHERADDRL);
- } else {
- /*
- * Clear the bit. Indicate to upper layer that IP is not
- * sending this ancillary info.
- */
- pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA;
- }
+ if (ipmp_ill_held)
+ ill_refrele(ill);
mp->b_datap->db_type = M_CTL;
mp->b_wptr += sizeof (ip_pktinfo_t);
@@ -4946,8 +4851,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
}
}
- if (dst_ire != NULL &&
- dst_ire->ire_type == IRE_LOCAL &&
+ if (dst_ire != NULL && dst_ire->ire_type == IRE_LOCAL &&
dst_ire->ire_zoneid != zoneid && dst_ire->ire_zoneid != ALL_ZONES) {
/*
* If the IRE belongs to a different zone, look for a matching
@@ -4983,7 +4887,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
* Pick a source address so that a proper inbound
* load spreading would happen.
*/
- ill_t *dst_ill = dst_ire->ire_ipif->ipif_ill;
+ ill_t *ire_ill = dst_ire->ire_ipif->ipif_ill;
ipif_t *src_ipif = NULL;
ire_t *ipif_ire;
@@ -4998,10 +4902,10 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
* found above so that upper layers know that the
* destination address is a broadcast address.
*
- * 2) If this is part of a group, select a better
- * source address so that better inbound load
- * balancing happens. Do the same if the ipif
- * is DEPRECATED.
+ * 2) If the ipif is DEPRECATED, select a better
+ * source address. Similarly, if the ipif is on
+ * the IPMP meta-interface, pick a source address
+ * at random to improve inbound load spreading.
*
* 3) If the outgoing interface is part of a usesrc
* group, then try selecting a source address from
@@ -5011,9 +4915,9 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
dst_ire->ire_zoneid != ALL_ZONES) ||
(!(dst_ire->ire_flags & RTF_SETSRC)) &&
(!(dst_ire->ire_type & IRE_BROADCAST) &&
- ((dst_ill->ill_group != NULL) ||
+ (IS_IPMP(ire_ill) ||
(dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) ||
- (dst_ill->ill_usesrc_ifindex != 0)))) {
+ (ire_ill->ill_usesrc_ifindex != 0)))) {
/*
* If the destination is reachable via a
* given gateway, the selected source address
@@ -5035,7 +4939,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
*/
ipaddr_t saddr =
dst_ire->ire_ipif->ipif_src_addr;
- src_ipif = ipif_select_source(dst_ill,
+ src_ipif = ipif_select_source(ire_ill,
saddr, zoneid);
if (src_ipif != NULL) {
if (IS_VNI(src_ipif->ipif_ill)) {
@@ -5478,14 +5382,6 @@ ip_modclose(ill_t *ill)
(void) ill_frag_timeout(ill, 0);
/*
- * If MOVE was in progress, clear the
- * move_in_progress fields also.
- */
- if (ill->ill_move_in_progress) {
- ILL_CLEAR_MOVE(ill);
- }
-
- /*
* Call ill_delete to bring down the ipifs, ilms and ill on
* this ill. Then wait for the refcnts to drop to zero.
* ill_is_freeable checks whether the ill is really quiescent.
@@ -5510,7 +5406,7 @@ ip_modclose(ill_t *ill)
*/
netstack_hold(ipst->ips_netstack);
- /* qprocsoff is called in ill_delete_tail */
+ /* qprocsoff is done via ill_delete_tail */
ill_delete_tail(ill);
ASSERT(ill->ill_ipst == NULL);
@@ -5755,6 +5651,11 @@ ip_stack_shutdown(netstackid_t stackid, void *arg)
ipst->ips_capab_taskq_quit = B_TRUE;
cv_signal(&ipst->ips_capab_taskq_cv);
mutex_exit(&ipst->ips_capab_taskq_lock);
+
+ mutex_enter(&ipst->ips_mrt_lock);
+ ipst->ips_mrt_flags |= IP_MRT_STOP;
+ cv_signal(&ipst->ips_mrt_cv);
+ mutex_exit(&ipst->ips_mrt_lock);
}
/*
@@ -5766,6 +5667,9 @@ ip_stack_fini(netstackid_t stackid, void *arg)
ip_stack_t *ipst = (ip_stack_t *)arg;
int ret;
+#ifdef NS_DEBUG
+ printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid);
+#endif
/*
* At this point, all of the notifications that the events and
* protocols are going away have been run, meaning that we can
@@ -5779,9 +5683,14 @@ ip_stack_fini(netstackid_t stackid, void *arg)
cv_destroy(&ipst->ips_capab_taskq_cv);
list_destroy(&ipst->ips_capab_taskq_list);
-#ifdef NS_DEBUG
- printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid);
-#endif
+ mutex_enter(&ipst->ips_mrt_lock);
+ while (!(ipst->ips_mrt_flags & IP_MRT_DONE))
+ cv_wait(&ipst->ips_mrt_done_cv, &ipst->ips_mrt_lock);
+ mutex_destroy(&ipst->ips_mrt_lock);
+ cv_destroy(&ipst->ips_mrt_cv);
+ cv_destroy(&ipst->ips_mrt_done_cv);
+
+ ipmp_destroy(ipst);
rw_destroy(&ipst->ips_srcid_lock);
ip_kstat_fini(stackid, ipst->ips_ip_mibkp);
@@ -6038,10 +5947,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
"ip_cgtp_filter") == 0);
ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_data =
(caddr_t)&ipst->ips_ip_cgtp_filter;
- ASSERT(strcmp(ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_name,
- "ipmp_hook_emulation") == 0);
- ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_data =
- (caddr_t)&ipst->ips_ipmp_hook_emulation;
(void) ip_param_register(&ipst->ips_ip_g_nd,
ipst->ips_param_arr, A_CNT(lcl_param_arr),
@@ -6053,8 +5958,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
ipst->ips_ip6_kstat =
ip6_kstat_init(stackid, &ipst->ips_ip6_statistics);
- ipst->ips_ipmp_enable_failback = B_TRUE;
-
ipst->ips_ip_src_id = 1;
rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL);
@@ -6062,6 +5965,7 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
ip_net_init(ipst, ns);
ipv4_hook_init(ipst);
ipv6_hook_init(ipst);
+ ipmp_init(ipst);
/*
* Create the taskq dispatcher thread and initialize related stuff.
@@ -6073,6 +5977,15 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
list_create(&ipst->ips_capab_taskq_list, sizeof (mblk_t),
offsetof(mblk_t, b_next));
+ /*
+ * Create the mcast_restart_timers_thread() worker thread.
+ */
+ mutex_init(&ipst->ips_mrt_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ipst->ips_mrt_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&ipst->ips_mrt_done_cv, NULL, CV_DEFAULT, NULL);
+ ipst->ips_mrt_thread = thread_create(NULL, 0,
+ mcast_restart_timers_thread, ipst, 0, &p0, TS_RUN, minclsyspri);
+
major = mod_name_to_major(INET_NAME);
(void) ldi_ident_from_major(major, &ipst->ips_ldi_ident);
return (ipst);
@@ -6109,6 +6022,24 @@ ip_dlpi_alloc(size_t len, t_uscalar_t prim)
}
/*
+ * Allocate and initialize a DLPI notification. (May be called as writer.)
+ */
+mblk_t *
+ip_dlnotify_alloc(uint_t notification, uint_t data)
+{
+ dl_notify_ind_t *notifyp;
+ mblk_t *mp;
+
+ if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
+ return (NULL);
+
+ notifyp = (dl_notify_ind_t *)mp->b_rptr;
+ notifyp->dl_notification = notification;
+ notifyp->dl_data = data;
+ return (mp);
+}
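A hypothetical caller of this new helper might look like the following; the wrapper name, the DL_NOTE_LINK_DOWN notification and the queue the message is passed to are purely illustrative and not taken from this patch:

static void
ill_notify_link_down(ill_t *ill)
{
	mblk_t	*mp;

	/* illustrative only: fabricate a link-down indication */
	if ((mp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0)) != NULL)
		putnext(ill->ill_rq, mp);
}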
+
+/*
* Debug formatting routine. Returns a character string representation of the
* addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address
 * in the form of an ipaddr_t and calls ip_dot_saddr with a pointer.
@@ -7753,71 +7684,30 @@ ip_net_mask(ipaddr_t addr)
}
/*
- * Select an ill for the packet by considering load spreading across
- * a different ill in the group if dst_ill is part of some group.
- */
-ill_t *
-ip_newroute_get_dst_ill(ill_t *dst_ill)
-{
- ill_t *ill;
-
- /*
- * We schedule irrespective of whether the source address is
- * INADDR_ANY or not. illgrp_scheduler returns a held ill.
- */
- ill = illgrp_scheduler(dst_ill);
- if (ill == NULL)
- return (NULL);
-
- /*
- * For groups with names ip_sioctl_groupname ensures that all
- * ills are of same type. For groups without names, ifgrp_insert
- * ensures this.
- */
- ASSERT(dst_ill->ill_type == ill->ill_type);
-
- return (ill);
-}
-
-/*
- * Helper function for the IPIF_NOFAILOVER/ATTACH_IF interface attachment case.
+ * Helper ill lookup function used by IPsec.
*/
ill_t *
-ip_grab_attach_ill(ill_t *ill, mblk_t *first_mp, int ifindex, boolean_t isv6,
- ip_stack_t *ipst)
+ip_grab_ill(mblk_t *first_mp, int ifindex, boolean_t isv6, ip_stack_t *ipst)
{
ill_t *ret_ill;
ASSERT(ifindex != 0);
+
ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL,
ipst);
- if (ret_ill == NULL ||
- (ret_ill->ill_phyint->phyint_flags & PHYI_OFFLINE)) {
+ if (ret_ill == NULL) {
if (isv6) {
- if (ill != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
- } else {
- BUMP_MIB(&ipst->ips_ip6_mib,
- ipIfStatsOutDiscards);
- }
- ip1dbg(("ip_grab_attach_ill (IPv6): "
- "bad ifindex %d.\n", ifindex));
+ BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
+ ip1dbg(("ip_grab_ill (IPv6): bad ifindex %d.\n",
+ ifindex));
} else {
- if (ill != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
- } else {
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutDiscards);
- }
- ip1dbg(("ip_grab_attach_ill (IPv4): "
- "bad ifindex %d.\n", ifindex));
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+ ip1dbg(("ip_grab_ill (IPv4): bad ifindex %d.\n",
+ ifindex));
}
- if (ret_ill != NULL)
- ill_refrele(ret_ill);
freemsg(first_mp);
return (NULL);
}
-
return (ret_ill);
}
@@ -7859,7 +7749,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
ire_t *sire = NULL;
mblk_t *first_mp;
ire_t *save_ire;
- ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER address */
ushort_t ire_marks = 0;
boolean_t mctl_present;
ipsec_out_t *io;
@@ -7873,7 +7762,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
boolean_t multirt_is_resolvable;
boolean_t multirt_resolve_next;
boolean_t unspec_src;
- boolean_t do_attach_ill = B_FALSE;
boolean_t ip_nexthop = B_FALSE;
tsol_ire_gw_secattr_t *attrp = NULL;
tsol_gcgrp_t *gcgrp = NULL;
@@ -7902,22 +7790,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
return;
}
- if (mctl_present && io->ipsec_out_attach_if) {
- /* ip_grab_attach_ill returns a held ill */
- attach_ill = ip_grab_attach_ill(NULL, first_mp,
- io->ipsec_out_ill_index, B_FALSE, ipst);
-
- /* Failure case frees things for us. */
- if (attach_ill == NULL)
- return;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(attach_ill))
- ire_marks = IRE_MARK_HIDDEN;
- }
if (mctl_present && io->ipsec_out_ip_nexthop) {
ip_nexthop = B_TRUE;
nexthop_addr = io->ipsec_out_nexthop_addr;
@@ -7997,31 +7869,15 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
dst = nexthop_addr;
}
}
- } else if (attach_ill == NULL) {
+ } else {
ire = ire_ftable_lookup(dst, 0, 0, 0,
NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp),
MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT |
MATCH_IRE_SECATTR | MATCH_IRE_COMPLETE,
ipst);
- } else {
- /*
- * attach_ill is set only for communicating with
- * on-link hosts. So, don't look for DEFAULT.
- */
- ipif_t *attach_ipif;
-
- attach_ipif = ipif_get_next_ipif(NULL, attach_ill);
- if (attach_ipif == NULL) {
- ill_refrele(attach_ill);
- goto icmp_err_ret;
- }
- ire = ire_ftable_lookup(dst, 0, 0, 0, attach_ipif,
- &sire, zoneid, 0, MBLK_GETLABEL(mp),
- MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL |
- MATCH_IRE_SECATTR, ipst);
- ipif_refrele(attach_ipif);
}
+
ip3dbg(("ip_newroute: ire_ftable_lookup() "
"returned ire %p, sire %p\n", (void *)ire, (void *)sire));
@@ -8122,8 +7978,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
}
ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0,
RTA_DST, ipst);
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
goto icmp_err_ret;
}
@@ -8134,8 +7988,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
*/
if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) ||
(ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) {
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
goto icmp_err_ret;
}
/*
@@ -8157,119 +8009,51 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
sire->ire_last_used_time = lbolt;
}
/*
- * We have a route to reach the destination.
- *
- * 1) If the interface is part of ill group, try to get a new
- * ill taking load spreading into account.
- *
- * 2) After selecting the ill, get a source address that
- * might create good inbound load spreading.
- * ipif_select_source does this for us.
+ * We have a route to reach the destination. Find the
+ * appropriate ill, then get a source address using
+ * ipif_select_source().
*
- * If the application specified the ill (ifindex), we still
- * load spread. Only if the packets needs to go out
- * specifically on a given ill e.g. binding to
- * IPIF_NOFAILOVER address, then we don't try to use a
- * different ill for load spreading.
+ * If we are here trying to create an IRE_CACHE for an offlink
+ * destination and have an IRE_CACHE entry for VNI, then use
+ * ire_stq instead since VNI's queue is a black hole.
*/
- if (attach_ill == NULL) {
- /*
- * Don't perform outbound load spreading in the
- * case of an RTF_MULTIRT route, as we actually
- * typically want to replicate outgoing packets
- * through particular interfaces.
- */
- if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) {
- dst_ill = ire->ire_ipif->ipif_ill;
- /* for uniformity */
- ill_refhold(dst_ill);
- } else {
- /*
- * If we are here trying to create an IRE_CACHE
- * for an offlink destination and have the
- * IRE_CACHE for the next hop and the latter is
- * using virtual IP source address selection i.e
- * it's ire->ire_ipif is pointing to a virtual
- * network interface (vni) then
- * ip_newroute_get_dst_ll() will return the vni
- * interface as the dst_ill. Since the vni is
- * virtual i.e not associated with any physical
- * interface, it cannot be the dst_ill, hence
- * in such a case call ip_newroute_get_dst_ll()
- * with the stq_ill instead of the ire_ipif ILL.
- * The function returns a refheld ill.
- */
- if ((ire->ire_type == IRE_CACHE) &&
- IS_VNI(ire->ire_ipif->ipif_ill))
- dst_ill = ip_newroute_get_dst_ill(
- ire->ire_stq->q_ptr);
- else
- dst_ill = ip_newroute_get_dst_ill(
- ire->ire_ipif->ipif_ill);
- }
- if (dst_ill == NULL) {
- if (ip_debug > 2) {
- pr_addr_dbg("ip_newroute: "
- "no dst ill for dst"
- " %s\n", AF_INET, &dst);
- }
- goto icmp_err_ret;
- }
- } else {
- dst_ill = ire->ire_ipif->ipif_ill;
- /* for uniformity */
+ if ((ire->ire_type == IRE_CACHE) &&
+ IS_VNI(ire->ire_ipif->ipif_ill)) {
+ dst_ill = ire->ire_stq->q_ptr;
ill_refhold(dst_ill);
- /*
- * We should have found a route matching ill as we
- * called ire_ftable_lookup with MATCH_IRE_ILL.
- * Rather than asserting, when there is a mismatch,
- * we just drop the packet.
- */
- if (dst_ill != attach_ill) {
- ip0dbg(("ip_newroute: Packet dropped as "
- "IPIF_NOFAILOVER ill is %s, "
- "ire->ire_ipif->ipif_ill is %s\n",
- attach_ill->ill_name,
- dst_ill->ill_name));
- ill_refrele(attach_ill);
- goto icmp_err_ret;
+ } else {
+ ill_t *ill = ire->ire_ipif->ipif_ill;
+
+ if (IS_IPMP(ill)) {
+ dst_ill =
+ ipmp_illgrp_hold_next_ill(ill->ill_grp);
+ } else {
+ dst_ill = ill;
+ ill_refhold(dst_ill);
}
}
- /* attach_ill can't go in loop. IPMP and CGTP are disjoint */
- if (attach_ill != NULL) {
- ill_refrele(attach_ill);
- attach_ill = NULL;
- do_attach_ill = B_TRUE;
+
+ if (dst_ill == NULL) {
+ if (ip_debug > 2) {
+ pr_addr_dbg("ip_newroute: no dst "
+ "ill for dst %s\n", AF_INET, &dst);
+ }
+ goto icmp_err_ret;
}
- ASSERT(dst_ill != NULL);
ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name));
/*
* Pick the best source address from dst_ill.
*
- * 1) If it is part of a multipathing group, we would
- * like to spread the inbound packets across different
- * interfaces. ipif_select_source picks a random source
- * across the different ills in the group.
- *
- * 2) If it is not part of a multipathing group, we try
- * to pick the source address from the destination
+ * 1) Try to pick the source address from the destination
* route. Clustering assumes that when we have multiple
* prefixes hosted on an interface, the prefix of the
* source address matches the prefix of the destination
* route. We do this only if the address is not
* DEPRECATED.
*
- * 3) If the conn is in a different zone than the ire, we
+ * 2) If the conn is in a different zone than the ire, we
* need to pick a source address from the right zone.
- *
- * NOTE : If we hit case (1) above, the prefix of the source
- * address picked may not match the prefix of the
- * destination routes prefix as ipif_select_source
- * does not look at "dst" while picking a source
- * address.
- * If we want the same behavior as (2), we will need
- * to change the behavior of ipif_select_source.
*/
ASSERT(src_ipif == NULL);
if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) {
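In the rearchitected code above, outbound load spreading over an IPMP group is handled by ipmp_illgrp_hold_next_ill(), which hands back the group's active members in turn. The stand-alone C sketch below illustrates only that round-robin idea; grp_t, grp_next_member(), and the interface names are hypothetical and not part of this patch or the kernel sources.

    #include <stdio.h>

    /*
     * Illustrative round-robin member selection, loosely modeled on what
     * ipmp_illgrp_hold_next_ill() does for an IPMP group: each call hands
     * out the next active member, wrapping around at the end of the list.
     */
    typedef struct grp {
            const char **gr_members;   /* active member names */
            int gr_count;              /* number of members */
            int gr_cursor;             /* next member to hand out */
    } grp_t;

    static const char *
    grp_next_member(grp_t *grp)
    {
            const char *member;

            if (grp->gr_count == 0)
                    return (NULL);

            member = grp->gr_members[grp->gr_cursor];
            grp->gr_cursor = (grp->gr_cursor + 1) % grp->gr_count;
            return (member);
    }

    int
    main(void)
    {
            const char *members[] = { "bge0", "bge1", "ce0" };
            grp_t grp = { members, 3, 0 };
            int i;

            /* Successive sends are spread across the group members. */
            for (i = 0; i < 6; i++)
                    printf("packet %d -> %s\n", i, grp_next_member(&grp));
            return (0);
    }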
@@ -8287,7 +8071,8 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
if (src_ipif == NULL &&
(!unspec_src || ipha->ipha_src != INADDR_ANY)) {
ire_marks |= IRE_MARK_USESRC_CHECK;
- if ((dst_ill->ill_group != NULL) ||
+ if (!IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) &&
+ IS_IPMP(ire->ire_ipif->ipif_ill) ||
(ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) ||
(connp != NULL && ire->ire_zoneid != zoneid &&
ire->ire_zoneid != ALL_ZONES) ||
@@ -8312,6 +8097,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
* as dst_ire source address.
*/
ipaddr_t saddr = ire->ire_ipif->ipif_src_addr;
+
src_ipif = ipif_select_source(dst_ill, saddr,
zoneid);
if (src_ipif == NULL) {
@@ -8319,7 +8105,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
pr_addr_dbg("ip_newroute: "
"no src for dst %s ",
AF_INET, &dst);
- printf("through interface %s\n",
+ printf("on interface %s\n",
dst_ill->ill_name);
}
goto icmp_err_ret;
@@ -8558,6 +8344,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
MULTIRT_DEBUG_TAG(first_mp);
}
}
+
ire_add_then_send(q, ire, xmit_mp);
ire_refrele(save_ire);
@@ -8766,7 +8553,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
"ip_newroute: no "
"src for gw %s ",
AF_INET, &gw);
- printf("through "
+ printf("on "
"interface %s\n",
dst_ill->ill_name);
}
@@ -8867,16 +8654,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
areq = (areq_t *)mp->b_rptr;
addrp = (ipaddr_t *)((char *)areq +
areq->areq_sender_addr_offset);
- if (do_attach_ill) {
- /*
- * This is bind to no failover case.
- * arp packet also must go out on attach_ill.
- */
- ASSERT(ipha->ipha_src != NULL);
- *addrp = ipha->ipha_src;
- } else {
- *addrp = save_ire->ire_src_addr;
- }
+ *addrp = save_ire->ire_src_addr;
ire_refrele(save_ire);
addrp = (ipaddr_t *)((char *)areq +
@@ -9076,14 +8854,10 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
ipaddr_t *addrp;
mblk_t *first_mp;
ire_t *save_ire = NULL;
- ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER */
ipif_t *src_ipif = NULL;
ushort_t ire_marks = 0;
ill_t *dst_ill = NULL;
- boolean_t mctl_present;
- ipsec_out_t *io;
ipha_t *ipha;
- int ihandle = 0;
mblk_t *saved_mp;
ire_t *fire = NULL;
mblk_t *copy_mp = NULL;
@@ -9117,10 +8891,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst),
ipif->ipif_ill->ill_name));
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
- if (mctl_present)
- io = (ipsec_out_t *)first_mp->b_rptr;
-
+ first_mp = mp;
+ if (DB_TYPE(mp) == M_CTL)
+ mp = mp->b_cont;
ipha = (ipha_t *)mp->b_rptr;
/*
@@ -9161,64 +8934,29 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
(void *)ipif, ntohl(dst), (void *)fire));
}
- if (mctl_present && io->ipsec_out_attach_if) {
- attach_ill = ip_grab_attach_ill(NULL, first_mp,
- io->ipsec_out_ill_index, B_FALSE, ipst);
-
- /* Failure case frees things for us. */
- if (attach_ill == NULL) {
- ipif_refrele(ipif);
- if (fire != NULL)
- ire_refrele(fire);
- return;
- }
+ /*
+ * Note: While we pick a dst_ill we are really only
+ * interested in the ill for load spreading. The source
+ * ipif is determined by source address selection below.
+ */
+ if (IS_IPMP(ipif->ipif_ill)) {
+ ipmp_illgrp_t *illg = ipif->ipif_ill->ill_grp;
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(attach_ill)) {
- ire_marks = IRE_MARK_HIDDEN;
- }
- /*
- * ip_wput passes the right ipif for IPIF_NOFAILOVER
- * case.
- */
- dst_ill = ipif->ipif_ill;
- /* attach_ill has been refheld by ip_grab_attach_ill */
- ASSERT(dst_ill == attach_ill);
+ if (CLASSD(ipha_dst))
+ dst_ill = ipmp_illgrp_hold_cast_ill(illg);
+ else
+ dst_ill = ipmp_illgrp_hold_next_ill(illg);
} else {
- /*
- * If the interface belongs to an interface group,
- * make sure the next possible interface in the group
- * is used. This encourages load spreading among
- * peers in an interface group.
- * Note: load spreading is disabled for RTF_MULTIRT
- * routes.
- */
- if ((flags & RTF_MULTIRT) && (fire != NULL) &&
- (fire->ire_flags & RTF_MULTIRT)) {
- /*
- * Don't perform outbound load spreading
- * in the case of an RTF_MULTIRT issued route,
- * we actually typically want to replicate
- * outgoing packets through particular
- * interfaces.
- */
- dst_ill = ipif->ipif_ill;
- ill_refhold(dst_ill);
- } else {
- dst_ill = ip_newroute_get_dst_ill(
- ipif->ipif_ill);
- }
- if (dst_ill == NULL) {
- if (ip_debug > 2) {
- pr_addr_dbg("ip_newroute_ipif: "
- "no dst ill for dst %s\n",
- AF_INET, &dst);
- }
- goto err_ret;
+ dst_ill = ipif->ipif_ill;
+ ill_refhold(dst_ill);
+ }
+
+ if (dst_ill == NULL) {
+ if (ip_debug > 2) {
+ pr_addr_dbg("ip_newroute_ipif: no dst ill "
+ "for dst %s\n", AF_INET, &dst);
}
+ goto err_ret;
}
/*
@@ -9242,7 +8980,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
unspec_src = (connp != NULL && connp->conn_unspec_src);
- if (((!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) ||
+ if (!IS_UNDER_IPMP(ipif->ipif_ill) &&
+ (IS_IPMP(ipif->ipif_ill) ||
+ (!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) ||
(ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_UP)) != IPIF_UP ||
(connp != NULL && ipif->ipif_zoneid != zoneid &&
ipif->ipif_zoneid != ALL_ZONES)) &&
@@ -9256,7 +8996,7 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
"no src for dst %s",
AF_INET, &dst);
}
- ip1dbg((" through interface %s\n",
+ ip1dbg((" on interface %s\n",
dst_ill->ill_name));
goto err_ret;
}
@@ -9291,12 +9031,7 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
goto err_ret;
if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
goto err_ret;
- /*
- * ihandle is needed when the ire is added to
- * cache table.
- */
save_ire = ire;
- ihandle = save_ire->ire_ihandle;
ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, "
"flags %04x\n",
@@ -9328,10 +9063,6 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
ipha->ipha_src = fire->ire_src_addr;
}
} else {
- ASSERT((connp == NULL) ||
- (connp->conn_outgoing_ill != NULL) ||
- (connp->conn_dontroute) ||
- infop->ip_opt_ill_index != 0);
/*
* The only ways we can come here are:
* 1) IP_BOUND_IF socket option is set
@@ -9340,6 +9071,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
* In all cases, the new ire will not be added
* into cache table.
*/
+ ASSERT(connp == NULL || connp->conn_dontroute ||
+ connp->conn_outgoing_ill != NULL ||
+ infop->ip_opt_ill_index != 0);
ire_marks |= IRE_MARK_NOADD;
}
@@ -9374,7 +9108,8 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
(save_ire != NULL ? save_ire->ire_mask : 0),
(fire != NULL) ? /* Parent handle */
fire->ire_phandle : 0,
- ihandle, /* Interface handle */
+ (save_ire != NULL) ? /* Interface handle */
+ save_ire->ire_ihandle : 0,
(fire != NULL) ?
(fire->ire_flags &
(RTF_SETSRC | RTF_MULTIRT)) : 0,
@@ -9533,7 +9268,8 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
(save_ire != NULL ? save_ire->ire_mask : 0),
(fire != NULL) ? /* Parent handle */
fire->ire_phandle : 0,
- ihandle, /* Interface handle */
+ (save_ire != NULL) ? /* Interface handle */
+ save_ire->ire_ihandle : 0,
(fire != NULL) ? /* flags if any */
(fire->ire_flags &
(RTF_SETSRC | RTF_MULTIRT)) : 0,
@@ -9593,12 +9329,20 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
/*
* Fill in the source and dest addrs for the resolver.
* NOTE: this depends on memory layouts imposed by
- * ill_init().
+ * ill_init(). There are corner cases above where we
+ * might've created the IRE with an INADDR_ANY source
+ * address (e.g., if the zeroth ipif on an underlying
+ * ill in an IPMP group is 0.0.0.0, but another ipif
+ * on the ill has a usable test address). If so, tell
+ * ARP to use ipha_src as its sender address.
*/
areq = (areq_t *)mp->b_rptr;
addrp = (ipaddr_t *)((char *)areq +
areq->areq_sender_addr_offset);
- *addrp = ire->ire_src_addr;
+ if (ire->ire_src_addr != INADDR_ANY)
+ *addrp = ire->ire_src_addr;
+ else
+ *addrp = ipha->ipha_src;
addrp = (ipaddr_t *)((char *)areq +
areq->areq_target_addr_offset);
*addrp = dst;
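The comment in the hunk above notes that the IRE source address can legitimately be INADDR_ANY in certain IPMP corner cases, in which case ARP is told to use the packet's own source address instead. A minimal user-level sketch of that fallback follows; sender_addr() and the sample addresses are illustrative only.

    #include <stdio.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>

    /*
     * Prefer the IRE's source address for the ARP sender, but fall back
     * to the IP header source when the IRE source is unspecified.
     */
    static in_addr_t
    sender_addr(in_addr_t ire_src, in_addr_t ipha_src)
    {
            return (ire_src != INADDR_ANY ? ire_src : ipha_src);
    }

    int
    main(void)
    {
            struct in_addr out;

            out.s_addr = sender_addr(inet_addr("0.0.0.0"),
                inet_addr("192.168.10.5"));
            printf("ARP sender address: %s\n", inet_ntoa(out));
            return (0);
    }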
@@ -10136,7 +9880,7 @@ ip_ipsec_load_complete(ipsec_stack_t *ipss)
/*
* Can't be used. Need to call svr4* -> optset directly. the leaf routine
* determines the grp on which it has to become exclusive, queues the mp
- * and sq draining restarts the optmgmt
+ * and IPSQ draining restarts the optmgmt
*/
static boolean_t
ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp)
@@ -10482,28 +10226,6 @@ ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option,
}
switch (option) {
- case IP_DONTFAILOVER_IF:
- /*
- * This option is used by in.mpathd to ensure
- * that IPMP probe packets only go out on the
- * test interfaces. in.mpathd sets this option
- * on the non-failover interfaces.
- * For backward compatibility, this option
- * implicitly sets IP_MULTICAST_IF, as used
- * be done in bind(), so that ip_wput gets
- * this ipif to send mcast packets.
- */
- if (ipif != NULL) {
- ASSERT(addr != INADDR_ANY);
- connp->conn_nofailover_ill = ipif->ipif_ill;
- connp->conn_multicast_ipif = ipif;
- } else {
- ASSERT(addr == INADDR_ANY);
- connp->conn_nofailover_ill = NULL;
- connp->conn_multicast_ipif = NULL;
- }
- break;
-
case IP_MULTICAST_IF:
connp->conn_multicast_ipif = ipif;
break;
@@ -10551,7 +10273,7 @@ ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly,
ill_refrele(ill);
return (0);
}
- if (!ipif_lookup_zoneid_group(ill, connp->conn_zoneid,
+ if (!ipif_lookup_zoneid(ill, connp->conn_zoneid,
0, NULL)) {
ill_refrele(ill);
ill = NULL;
@@ -10596,8 +10318,6 @@ setit:
case IP_BOUND_IF:
connp->conn_incoming_ill = ill;
connp->conn_outgoing_ill = ill;
- connp->conn_orig_bound_ifindex = (ill == NULL) ?
- 0 : ifindex;
break;
case IP_MULTICAST_IF:
@@ -10650,40 +10370,6 @@ setit:
case IPV6_BOUND_IF:
connp->conn_incoming_ill = ill;
connp->conn_outgoing_ill = ill;
- connp->conn_orig_bound_ifindex = (ill == NULL) ?
- 0 : ifindex;
- break;
-
- case IPV6_BOUND_PIF:
- /*
- * Limit all transmit to this ill.
- * Unlike IPV6_BOUND_IF, using this option
- * prevents load spreading and failover from
- * happening when the interface is part of the
- * group. That's why we don't need to remember
- * the ifindex in orig_bound_ifindex as in
- * IPV6_BOUND_IF.
- */
- connp->conn_outgoing_pill = ill;
- break;
-
- case IPV6_DONTFAILOVER_IF:
- /*
- * This option is used by in.mpathd to ensure
- * that IPMP probe packets only go out on the
- * test interfaces. in.mpathd sets this option
- * on the non-failover interfaces.
- */
- connp->conn_nofailover_ill = ill;
- /*
- * For backward compatibility, this option
- * implicitly sets ip_multicast_ill as used in
- * IPV6_MULTICAST_IF so that ip_wput gets
- * this ill to send mcast packets.
- */
- connp->conn_multicast_ill = ill;
- connp->conn_orig_multicast_ifindex = (ill == NULL) ?
- 0 : ifindex;
break;
case IPV6_MULTICAST_IF:
@@ -10700,12 +10386,9 @@ setit:
if (!checkonly) {
if (ifindex == 0) {
connp->conn_multicast_ill = NULL;
- connp->conn_orig_multicast_ifindex = 0;
connp->conn_multicast_ipif = NULL;
} else if (ill != NULL) {
connp->conn_multicast_ill = ill;
- connp->conn_orig_multicast_ifindex =
- ifindex;
}
}
break;
@@ -10867,8 +10550,7 @@ ip_opt_set(queue_t *q, uint_t optset_context, int level, int name,
if (secpolicy_ip_config(cr, B_FALSE) != 0)
return (EPERM);
/* FALLTHRU */
- case IP_MULTICAST_IF:
- case IP_DONTFAILOVER_IF: {
+ case IP_MULTICAST_IF: {
ipaddr_t addr = *i1;
error = ip_opt_set_ipif(connp, addr, checkonly, name,
@@ -11189,8 +10871,6 @@ ip_opt_set(queue_t *q, uint_t optset_context, int level, int name,
case IPPROTO_IPV6:
switch (name) {
case IPV6_BOUND_IF:
- case IPV6_BOUND_PIF:
- case IPV6_DONTFAILOVER_IF:
error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly,
level, name, first_mp);
if (error != 0)
@@ -12288,11 +11968,10 @@ ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha,
* frees mp on failure.
*/
static boolean_t
-ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha,
+ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha,
uint32_t *cksum_val, uint16_t *cksum_flags)
{
uint32_t frag_offset_flags;
- ill_t *ill = (ill_t *)q->q_ptr;
mblk_t *mp = *mpp;
mblk_t *t_mp;
ipaddr_t dst;
@@ -12337,12 +12016,12 @@ ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha,
/*
* We utilize hardware computed checksum info only for UDP since
- * IP fragmentation is a normal occurence for the protocol. In
+ * IP fragmentation is a normal occurrence for the protocol. In
* addition, checksum offload support for IP fragments carrying
* UDP payload is commonly implemented across network adapters.
*/
- ASSERT(ill != NULL);
- if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) &&
+ ASSERT(recv_ill != NULL);
+ if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(recv_ill) &&
(DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
mblk_t *mp1 = mp->b_cont;
int32_t len;
@@ -12808,7 +12487,7 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
goto ipoptions;
/* Check the IP header checksum. */
- if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
+ if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) {
/* Clear the IP header h/w cksum flag */
DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
} else if (!mctl_present) {
@@ -12871,7 +12550,7 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
* Revert to software checksum calculation if the interface
* isn't capable of checksum offload or if IPsec is present.
*/
- if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+ if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum)
hck_flags = DB_CKSUMFLAGS(mp);
if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
@@ -12958,8 +12637,11 @@ fragmented:
* reassembled packet has a valid hardware computed
* checksum information associated with it.
*/
- if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags))
+ if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, &sum,
+ &reass_hck_flags)) {
goto slow_done;
+ }
+
/*
* Make sure that first_mp points back to mp as
* the mp we came in with could have changed in
@@ -13073,7 +12755,7 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
goto ipoptions;
} else if (!mctl_present) {
/* Check the IP header checksum. */
- if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
+ if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) {
/* Clear the IP header h/w cksum flag */
DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
} else if (!mctl_present) {
@@ -13159,7 +12841,7 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
* Revert to software checksum calculation if the interface
* isn't capable of checksum offload or if IPsec is present.
*/
- if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+ if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum)
hck_flags = DB_CKSUMFLAGS(mp);
if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
@@ -13386,7 +13068,7 @@ ipoptions:
u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
- if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) {
+ if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) {
if (mctl_present)
freeb(first_mp);
goto slow_done;
@@ -13530,7 +13212,7 @@ ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
goto ipoptions;
} else {
/* Check the IP header checksum. */
- if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, ill) &&
+ if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill) &&
!mctl_present) {
#define uph ((uint16_t *)ipha)
sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
@@ -13644,7 +13326,7 @@ ipoptions:
u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
- if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL))
+ if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL))
goto slow_done;
/*
* Make sure that first_mp points back to mp as
@@ -13877,6 +13559,11 @@ ip_check_and_align_header(queue_t *q, mblk_t *mp, ip_stack_t *ipst)
return (B_TRUE);
}
+/*
+ * Handle the situation where a packet came in on `ill' but matched an IRE
+ * whose ire_rfq doesn't match `ill'. We return the IRE that should be used
+ * for interface statistics.
+ */
ire_t *
ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
{
@@ -13887,16 +13574,22 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
boolean_t strict_check = B_FALSE;
/*
- * This packet came in on an interface other than the one associated
- * with the first ire we found for the destination address. We do
- * another ire lookup here, using the ingress ill, to see if the
- * interface is in an interface group.
+ * IPMP common case: if IRE and ILL are in the same group, there's no
+ * issue (e.g. packet received on an underlying interface matched an
+ * IRE_LOCAL on its associated group interface).
+ */
+ if (ire->ire_rfq != NULL &&
+ IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) {
+ return (ire);
+ }
+
+ /*
+ * Do another ire lookup here, using the ingress ill, to see if the
+ * interface is in a usesrc group.
* As long as the ills belong to the same group, we don't consider
* them to be arriving on the wrong interface. Thus, if the switch
* is doing inbound load spreading, we won't drop packets when the
- * ip*_strict_dst_multihoming switch is on. Note, the same holds true
- * for 'usesrc groups' where the destination address may belong to
- * another interface to allow multipathing to happen.
+ * ip*_strict_dst_multihoming switch is on.
* We also need to check for IPIF_UNNUMBERED point2point interfaces
* where the local address may not be unique. In this case we were
* at the mercy of the initial ire cache lookup and the IRE_LOCAL it
@@ -13910,18 +13603,18 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
strict_check = B_TRUE;
new_ire = ire_ctable_lookup(*((ipaddr_t *)addr), 0, IRE_LOCAL,
ill->ill_ipif, ALL_ZONES, NULL,
- (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst);
+ (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst);
} else {
ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr));
if (ipst->ips_ipv6_strict_dst_multihoming)
strict_check = B_TRUE;
new_ire = ire_ctable_lookup_v6((in6_addr_t *)addr, NULL,
IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL,
- (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst);
+ (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst);
}
/*
* If the same ire that was returned in ip_input() is found then this
- * is an indication that interface groups are in use. The packet
+ * is an indication that usesrc groups are in use. The packet
* arrived on a different ill in the group than the one associated with
* the destination address. If a different ire was found then the same
* IP address must be hosted on multiple ills. This is possible with
@@ -14075,11 +13768,10 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp)
/*
* Forwarding fastpath exception case:
- * If either of the follwoing case is true, we take
- * the slowpath
+ * If any of the following are true, we take the slowpath:
* o forwarding is not enabled
- * o incoming and outgoing interface are the same, or the same
- * IPMP group
+ * o incoming and outgoing interface are the same, or in the same
+ * IPMP group.
* o corresponding ire is in incomplete state
* o packet needs fragmentation
* o ARP cache is not resolved
@@ -14090,8 +13782,7 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp)
pkt_len = ntohs(ipha->ipha_length);
stq_ill = (ill_t *)ire->ire_stq->q_ptr;
if (!(stq_ill->ill_flags & ILLF_ROUTER) ||
- (ill == stq_ill) ||
- (ill->ill_group != NULL && ill->ill_group == stq_ill->ill_group) ||
+ (ill == stq_ill) || IS_IN_SAME_ILLGRP(ill, stq_ill) ||
(ire->ire_nce == NULL) ||
(pkt_len > ire->ire_max_frag) ||
((fpmp = ire->ire_nce->nce_fp_mp) == NULL) ||
@@ -14185,11 +13876,10 @@ static void
ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
ill_t *ill, boolean_t ll_multicast, boolean_t from_ip_fast_forward)
{
- ill_group_t *ill_group;
- ill_group_t *ire_group;
queue_t *dev_q;
ire_t *src_ire;
ip_stack_t *ipst = ill->ill_ipst;
+ boolean_t same_illgrp = B_FALSE;
ASSERT(ire->ire_stq != NULL);
@@ -14200,11 +13890,8 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
* If the caller of this function is ip_fast_forward() skip the
* next three checks as it does not apply.
*/
- if (from_ip_fast_forward) {
- ill_group = ill->ill_group;
- ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group;
+ if (from_ip_fast_forward)
goto skip;
- }
if (ll_multicast != 0) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
@@ -14230,13 +13917,10 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
goto drop_pkt;
}
- ill_group = ill->ill_group;
- ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group;
/*
* Check if we want to forward this one at this time.
* We allow source routed packets on a host provided that
- * they go out the same interface or same interface group
- * as they came in on.
+ * they go out the same ill or illgrp as they came in on.
*
* XXX To be quicker, we may wish to not chase pointers to
* get the ILLF_ROUTER flag and instead store the
@@ -14245,11 +13929,12 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
* whenever the ILLF_ROUTER flag changes.
*/
skip:
+ same_illgrp = IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr);
+
if (((ill->ill_flags &
- ((ill_t *)ire->ire_stq->q_ptr)->ill_flags &
- ILLF_ROUTER) == 0) &&
- !(ip_source_routed(ipha, ipst) && (ire->ire_rfq == q ||
- (ill_group != NULL && ill_group == ire_group)))) {
+ ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_ROUTER) == 0) &&
+ !(ip_source_routed(ipha, ipst) &&
+ (ire->ire_rfq == q || same_illgrp))) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
if (ip_source_routed(ipha, ipst)) {
q = WR(q);
@@ -14290,12 +13975,10 @@ skip:
ire_t *nhop_ire = NULL;
/*
- * Check whether ire_rfq and q are from the same ill
- * or if they are not same, they at least belong
- * to the same group. If so, send redirects.
+ * Check whether ire_rfq and q are from the same ill or illgrp.
+ * If so, send redirects.
*/
- if ((ire->ire_rfq == q ||
- (ill_group != NULL && ill_group == ire_group)) &&
+ if ((ire->ire_rfq == q || same_illgrp) &&
!ip_source_routed(ipha, ipst)) {
nhop = (ire->ire_gateway_addr != 0 ?
@@ -14396,26 +14079,15 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha,
}
/*
* For multicast we have set dst to be INADDR_BROADCAST
- * for delivering to all STREAMS. IRE_MARK_NORECV is really
- * only for broadcast packets.
+ * for delivering to all STREAMS.
*/
if (!CLASSD(ipha->ipha_dst)) {
ire_t *new_ire;
ipif_t *ipif;
- /*
- * For ill groups, as the switch duplicates broadcasts
- * across all the ports, we need to filter out and
- * send up only one copy. There is one copy for every
- * broadcast address on each ill. Thus, we look for a
- * specific IRE on this ill and look at IRE_MARK_NORECV
- * later to see whether this ill is eligible to receive
- * them or not. ill_nominate_bcast_rcv() nominates only
- * one set of IREs for receiving.
- */
ipif = ipif_get_next_ipif(NULL, ill);
if (ipif == NULL) {
- ire_refrele(ire);
+discard: ire_refrele(ire);
freemsg(mp);
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
return (NULL);
@@ -14425,13 +14097,17 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha,
ipif_refrele(ipif);
if (new_ire != NULL) {
- if (new_ire->ire_marks & IRE_MARK_NORECV) {
- ire_refrele(ire);
+ /*
+ * If the matching IRE_BROADCAST is part of an IPMP
+ * group, then drop the packet unless our ill has been
+ * nominated to receive for the group.
+ */
+ if (IS_IPMP(new_ire->ire_ipif->ipif_ill) &&
+ new_ire->ire_rfq != q) {
ire_refrele(new_ire);
- freemsg(mp);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return (NULL);
+ goto discard;
}
+
/*
* In the special case of multirouted broadcast
* packets, we unconditionally need to "gateway"
@@ -14571,6 +14247,13 @@ ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
ntohs(ipha->ipha_length));
/*
+	 * So that we don't end up with dups, only one ill in an IPMP group is
+ * nominated to receive multicast traffic.
+ */
+ if (IS_UNDER_IPMP(ill) && !ill->ill_nom_cast)
+ goto drop_pkt;
+
+ /*
* Forward packets only if we have joined the allmulti
* group on this interface.
*/
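The hunk above adds the rule that only the ill nominated within an IPMP group (ill_nom_cast) accepts inbound multicast, so the group delivers a single copy even though the switch replicates the frame to every member. A hedged sketch of that filtering idea; the member_t fields and interface names are invented for illustration.

    #include <stdio.h>
    #include <stdbool.h>

    /*
     * Model of duplicate suppression: every group member sees the frame,
     * but only the nominated member (or a non-grouped interface) accepts it.
     */
    typedef struct member {
            const char *m_name;
            bool m_under_ipmp;      /* interface is in an IPMP group */
            bool m_nom_cast;        /* nominated to receive multicast */
    } member_t;

    static bool
    accept_multicast(const member_t *m)
    {
            return (!m->m_under_ipmp || m->m_nom_cast);
    }

    int
    main(void)
    {
            member_t members[] = {
                    { "bge0", true, true },
                    { "bge1", true, false },
                    { "ce0", false, false },
            };
            int i;

            for (i = 0; i < 3; i++) {
                    printf("%s: %s\n", members[i].m_name,
                        accept_multicast(&members[i]) ? "deliver" : "drop");
            }
            return (0);
    }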
@@ -14619,18 +14302,15 @@ ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
}
}
- ILM_WALKER_HOLD(ill);
if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) {
/*
* This might just be caused by the fact that
* multiple IP Multicast addresses map to the same
* link layer multicast - no need to increment counter!
*/
- ILM_WALKER_RELE(ill);
freemsg(mp);
return (B_TRUE);
}
- ILM_WALKER_RELE(ill);
done:
ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp)));
/*
@@ -15498,8 +15178,8 @@ local:
* broadcast ire.
*/
if ((ire->ire_rfq != q) && (ire->ire_type != IRE_BROADCAST)) {
- if ((ire = ip_check_multihome(&ipha->ipha_dst, ire,
- ill)) == NULL) {
+ ire = ip_check_multihome(&ipha->ipha_dst, ire, ill);
+ if (ire == NULL) {
/* Drop packet */
BUMP_MIB(ill->ill_ip_mib,
ipIfStatsForwProhibits);
@@ -15935,19 +15615,12 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
ip1dbg(("ip_rput_dlpi_writer .."));
ill = (ill_t *)q->q_ptr;
- ASSERT(ipsq == ill->ill_phyint->phyint_ipsq);
-
+ ASSERT(ipsq->ipsq_xop == ill->ill_phyint->phyint_ipsq->ipsq_xop);
ASSERT(IAM_WRITER_ILL(ill));
ipst = ill->ill_ipst;
- /*
- * ipsq_pending_mp and ipsq_pending_ipif track each other. i.e.
- * both are null or non-null. However we can assert that only
- * after grabbing the ipsq_lock. So we don't make any assertion
- * here and in other places in the code.
- */
- ipif = ipsq->ipsq_pending_ipif;
+ ipif = ipsq->ipsq_xop->ipx_pending_ipif;
/*
* The current ioctl could have been aborted by the user and a new
* ioctl to bring up another ill could have started. We could still
@@ -16045,9 +15718,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
*/
ASSERT(connp != NULL);
q = CONNP_TO_WQ(connp);
- if (ill->ill_move_in_progress) {
- ILL_CLEAR_MOVE(ill);
- }
(void) ipif_down(ipif, NULL, NULL);
/* error is set below the switch */
}
@@ -16196,45 +15866,31 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
* ill_dl_up(), which stopped ipif_up()'s processing.
*/
if (ill->ill_isv6) {
- /*
- * v6 interfaces.
- * Unlike ARP which has to do another bind
- * and attach, once we get here we are
- * done with NDP. Except in the case of
- * ILLF_XRESOLV, in which case we send an
- * AR_INTERFACE_UP to the external resolver.
- * If all goes well, the ioctl will complete
- * in ip_rput(). If there's an error, we
- * complete it here.
- */
- if ((err = ipif_ndp_up(ipif)) == 0) {
- if (ill->ill_flags & ILLF_XRESOLV) {
- mutex_enter(&connp->conn_lock);
- mutex_enter(&ill->ill_lock);
- success = ipsq_pending_mp_add(
- connp, ipif, q, mp1, 0);
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- if (success) {
- err = ipif_resolver_up(ipif,
- Res_act_initial);
- if (err == EINPROGRESS) {
- freemsg(mp);
- return;
- }
- ASSERT(err != 0);
- mp1 = ipsq_pending_mp_get(ipsq,
- &connp);
- ASSERT(mp1 != NULL);
- } else {
- /* conn has started closing */
- err = EINTR;
- }
- } else { /* Non XRESOLV interface */
- (void) ipif_resolver_up(ipif,
+ if (ill->ill_flags & ILLF_XRESOLV) {
+ mutex_enter(&connp->conn_lock);
+ mutex_enter(&ill->ill_lock);
+ success = ipsq_pending_mp_add(connp, ipif, q,
+ mp1, 0);
+ mutex_exit(&ill->ill_lock);
+ mutex_exit(&connp->conn_lock);
+ if (success) {
+ err = ipif_resolver_up(ipif,
Res_act_initial);
- err = ipif_up_done_v6(ipif);
+ if (err == EINPROGRESS) {
+ freemsg(mp);
+ return;
+ }
+ ASSERT(err != 0);
+ mp1 = ipsq_pending_mp_get(ipsq, &connp);
+ ASSERT(mp1 != NULL);
+ } else {
+ /* conn has started closing */
+ err = EINTR;
}
+ } else { /* Non XRESOLV interface */
+ (void) ipif_resolver_up(ipif, Res_act_initial);
+ if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0)
+ err = ipif_up_done_v6(ipif);
}
} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
/*
@@ -16275,14 +15931,31 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
}
- if (ill->ill_up_ipifs) {
- ill_group_cleanup(ill);
+ /*
+ * If we have a moved ipif to bring up, and everything has
+ * succeeded to this point, bring it up on the IPMP ill.
+ * Otherwise, leave it down -- the admin can try to bring it
+ * up by hand if need be.
+ */
+ if (ill->ill_move_ipif != NULL) {
+ if (err != 0) {
+ ill->ill_move_ipif = NULL;
+ } else {
+ ipif = ill->ill_move_ipif;
+ ill->ill_move_ipif = NULL;
+ err = ipif_up(ipif, q, mp1);
+ if (err == EINPROGRESS) {
+ freemsg(mp);
+ return;
+ }
+ }
}
-
break;
+
case DL_NOTIFY_IND: {
dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr;
ire_t *ire;
+ uint_t orig_mtu;
boolean_t need_ire_walk_v4 = B_FALSE;
boolean_t need_ire_walk_v6 = B_FALSE;
@@ -16322,17 +15995,27 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
* which it is being derived.
*/
mutex_enter(&ill->ill_lock);
+
+ orig_mtu = ill->ill_max_mtu;
ill->ill_max_frag = (uint_t)notify->dl_data;
+ ill->ill_max_mtu = (uint_t)notify->dl_data;
+
+ /*
+ * If ill_user_mtu was set (via SIOCSLIFLNKINFO),
+ * clamp ill_max_mtu at it.
+ */
+ if (ill->ill_user_mtu != 0 &&
+ ill->ill_user_mtu < ill->ill_max_mtu)
+ ill->ill_max_mtu = ill->ill_user_mtu;
/*
- * If an SIOCSLIFLNKINFO has changed the ill_max_mtu
- * leave it alone
+ * If the MTU is unchanged, we're done.
*/
- if (ill->ill_mtu_userspecified) {
+ if (orig_mtu == ill->ill_max_mtu) {
mutex_exit(&ill->ill_lock);
break;
}
- ill->ill_max_mtu = ill->ill_max_frag;
+
if (ill->ill_isv6) {
if (ill->ill_max_mtu < IPV6_MIN_MTU)
ill->ill_max_mtu = IPV6_MIN_MTU;
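The DL_NOTE_SDU_SIZE handling above recomputes ill_max_mtu from the driver's advertised SDU, clamps it at any administratively configured value (ill_user_mtu), and returns early when the effective MTU is unchanged. The clamp-and-compare logic is sketched below with illustrative names; it is not the kernel code itself.

    #include <stdio.h>

    /*
     * Effective MTU: start from the driver-advertised SDU and clamp at the
     * user-configured ceiling, where 0 means "no ceiling configured".
     */
    static unsigned int
    effective_mtu(unsigned int driver_sdu, unsigned int user_mtu)
    {
            unsigned int mtu = driver_sdu;

            if (user_mtu != 0 && user_mtu < mtu)
                    mtu = user_mtu;
            return (mtu);
    }

    int
    main(void)
    {
            unsigned int cur = 1500;
            unsigned int upd = effective_mtu(9000, 1500);

            if (upd == cur)
                    printf("MTU unchanged (%u); nothing to update\n", upd);
            else
                    printf("MTU changed: %u -> %u\n", cur, upd);
            return (0);
    }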
@@ -16371,7 +16054,14 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
if (need_ire_walk_v6)
ire_walk_v6(ill_mtu_change, (char *)ill,
ALL_ZONES, ipst);
+
+ /*
+ * Refresh IPMP meta-interface MTU if necessary.
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_illgrp_refresh_mtu(ill->ill_grp);
break;
+
case DL_NOTE_LINK_UP:
case DL_NOTE_LINK_DOWN: {
/*
@@ -16385,9 +16075,17 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
went_up = notify->dl_notification == DL_NOTE_LINK_UP;
mutex_enter(&phyint->phyint_lock);
+
new_phyint_flags = went_up ?
phyint->phyint_flags | PHYI_RUNNING :
phyint->phyint_flags & ~PHYI_RUNNING;
+
+ if (IS_IPMP(ill)) {
+ new_phyint_flags = went_up ?
+ new_phyint_flags & ~PHYI_FAILED :
+ new_phyint_flags | PHYI_FAILED;
+ }
+
if (new_phyint_flags != phyint->phyint_flags) {
phyint->phyint_flags = new_phyint_flags;
changed = B_TRUE;
@@ -16474,7 +16172,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
* is invoked from an ill queue, conn_oper_pending_ill is not
* available, but we know the ioctl is pending on ill_wq.)
*/
- uint_t paddrlen, paddroff;
+ uint_t paddrlen, paddroff;
paddrreq = ill->ill_phys_addr_pend;
paddrlen = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_length;
@@ -16592,29 +16290,59 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
freemsg(mp);
- if (mp1 != NULL) {
+ if (mp1 == NULL)
+ return;
+
+ /*
+ * The operation must complete without EINPROGRESS since
+ * ipsq_pending_mp_get() has removed the mblk (mp1). Otherwise,
+ * the operation will be stuck forever inside the IPSQ.
+ */
+ ASSERT(err != EINPROGRESS);
+
+ switch (ipsq->ipsq_xop->ipx_current_ioctl) {
+ case 0:
+ ipsq_current_finish(ipsq);
+ break;
+
+ case SIOCSLIFNAME:
+ case IF_UNITSEL: {
+ ill_t *ill_other = ILL_OTHER(ill);
+
/*
- * The operation must complete without EINPROGRESS
- * since ipsq_pending_mp_get() has removed the mblk
- * from ipsq_pending_mp. Otherwise, the operation
- * will be stuck forever in the ipsq.
+ * If SIOCSLIFNAME or IF_UNITSEL is about to succeed, and the
+ * ill has a peer which is in an IPMP group, then place ill
+ * into the same group. One catch: although ifconfig plumbs
+ * the appropriate IPMP meta-interface prior to plumbing this
+ * ill, it is possible for multiple ifconfig applications to
+ * race (or for another application to adjust plumbing), in
+ * which case the IPMP meta-interface we need will be missing.
+ * If so, kick the phyint out of the group.
*/
- ASSERT(err != EINPROGRESS);
+ if (err == 0 && ill_other != NULL && IS_UNDER_IPMP(ill_other)) {
+ ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
+ ipmp_illgrp_t *illg;
- switch (ipsq->ipsq_current_ioctl) {
- case 0:
- ipsq_current_finish(ipsq);
- break;
+ illg = ill->ill_isv6 ? grp->gr_v6 : grp->gr_v4;
+ if (illg == NULL)
+ ipmp_phyint_leave_grp(ill->ill_phyint);
+ else
+ ipmp_ill_join_illgrp(ill, illg);
+ }
- case SIOCLIFADDIF:
- case SIOCSLIFNAME:
+ if (ipsq->ipsq_xop->ipx_current_ioctl == IF_UNITSEL)
+ ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
+ else
ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
- break;
+ break;
+ }
+ case SIOCLIFADDIF:
+ ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
+ break;
- default:
- ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
- break;
- }
+ default:
+ ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
+ break;
}
}
@@ -16626,20 +16354,16 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
void
ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
- ill_t *ill;
+ ill_t *ill = q->q_ptr;
struct iocblk *iocp;
mblk_t *mp1;
conn_t *connp = NULL;
ip1dbg(("ip_rput_other "));
- ill = (ill_t *)q->q_ptr;
- /*
- * This routine is not a writer in the case of SIOCGTUNPARAM
- * in which case ipsq is NULL.
- */
if (ipsq != NULL) {
ASSERT(IAM_WRITER_IPSQ(ipsq));
- ASSERT(ipsq == ill->ill_phyint->phyint_ipsq);
+ ASSERT(ipsq->ipsq_xop ==
+ ill->ill_phyint->phyint_ipsq->ipsq_xop);
}
switch (mp->b_datap->db_type) {
@@ -16752,7 +16476,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
case DL_IOC_HDR_INFO:
/*
- * If this was the first attempt turn of the
+ * If this was the first attempt, turn off the
* fastpath probing.
*/
mutex_enter(&ill->ill_lock);
@@ -16768,7 +16492,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
freemsg(mp);
break;
- case SIOCSTUNPARAM:
+ case SIOCSTUNPARAM:
case OSIOCSTUNPARAM:
ASSERT(ipsq != NULL);
/*
@@ -17017,14 +16741,13 @@ ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif)
/*
* Find an IRE which matches the destination and the outgoing
* queue in the cache table. All we need is an IRE_CACHE which
- * is pointing at ipif->ipif_ill. If it is part of some ill group,
- * then it is enough to have some IRE_CACHE in the group.
+ * is pointing at ipif->ipif_ill.
*/
if (ipif->ipif_flags & IPIF_POINTOPOINT)
dst = ipif->ipif_pp_dst_addr;
ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, MBLK_GETLABEL(mp),
- MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR, ipst);
+ MATCH_IRE_ILL | MATCH_IRE_SECATTR, ipst);
if (ire == NULL) {
/*
* Mark this packet to make it be delivered to
@@ -17321,7 +17044,8 @@ ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire)
*/
mp->b_datap->db_type = M_DATA;
icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp,
- ip6h, icmp6, ill, B_TRUE, ii->ipsec_in_zoneid);
+ ip6h, icmp6, ill, recv_ill, B_TRUE,
+ ii->ipsec_in_zoneid);
}
if (ill_need_rele)
ill_refrele(ill);
@@ -17357,37 +17081,36 @@ ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire)
}
switch (ipha->ipha_protocol) {
- case IPPROTO_UDP:
- ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire,
- recv_ill);
- if (ire_need_rele)
- ire_refrele(ire);
- break;
- case IPPROTO_TCP:
- if (!ire_need_rele)
- IRE_REFHOLD(ire);
- mp = ip_tcp_input(mp, ipha, ill, B_TRUE,
- ire, ipsec_mp, 0, ill->ill_rq, NULL);
- IRE_REFRELE(ire);
- if (mp != NULL) {
-
- SQUEUE_ENTER(GET_SQUEUE(mp), mp,
- mp, 1, SQ_PROCESS,
- SQTAG_IP_PROTO_AGAIN);
- }
- break;
- case IPPROTO_SCTP:
- if (!ire_need_rele)
- IRE_REFHOLD(ire);
- ip_sctp_input(mp, ipha, ill, B_TRUE, ire,
- ipsec_mp, 0, ill->ill_rq, dst);
- break;
- default:
- ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire,
- recv_ill, 0);
- if (ire_need_rele)
- ire_refrele(ire);
- break;
+ case IPPROTO_UDP:
+ ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire,
+ recv_ill);
+ if (ire_need_rele)
+ ire_refrele(ire);
+ break;
+ case IPPROTO_TCP:
+ if (!ire_need_rele)
+ IRE_REFHOLD(ire);
+ mp = ip_tcp_input(mp, ipha, ill, B_TRUE,
+ ire, ipsec_mp, 0, ill->ill_rq, NULL);
+ IRE_REFRELE(ire);
+ if (mp != NULL) {
+ SQUEUE_ENTER(GET_SQUEUE(mp), mp,
+ mp, 1, SQ_PROCESS,
+ SQTAG_IP_PROTO_AGAIN);
+ }
+ break;
+ case IPPROTO_SCTP:
+ if (!ire_need_rele)
+ IRE_REFHOLD(ire);
+ ip_sctp_input(mp, ipha, ill, B_TRUE, ire,
+ ipsec_mp, 0, ill->ill_rq, dst);
+ break;
+ default:
+ ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire,
+ recv_ill, 0);
+ if (ire_need_rele)
+ ire_refrele(ire);
+ break;
}
} else {
uint32_t rput_flags = 0;
@@ -17621,9 +17344,9 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
*/
ASSERT(!mctl_present);
ASSERT(first_mp == mp);
- if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) {
+ if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL))
return;
- }
+
/*
* Make sure that first_mp points back to mp as
* the mp we came in with could have changed in
@@ -17647,17 +17370,10 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
ilm_t *ilm;
mblk_t *mp1;
zoneid_t last_zoneid;
+ ilm_walker_t ilw;
if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(recv_ill)) {
ASSERT(ire->ire_type == IRE_BROADCAST);
- /*
- * Inactive/Failed interfaces are not supposed to
- * respond to the multicast packets.
- */
- if (ill_is_probeonly(ill)) {
- freemsg(first_mp);
- return;
- }
/*
* In the multicast case, applications may have joined
@@ -17680,11 +17396,9 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
* have been exhausted.
*/
last_zoneid = -1;
- ILM_WALKER_HOLD(recv_ill);
- for (ilm = recv_ill->ill_ilm; ilm != NULL;
- ilm = ilm->ilm_next) {
- if ((ilm->ilm_flags & ILM_DELETED) ||
- ipha->ipha_dst != ilm->ilm_addr ||
+ ilm = ilm_walker_start(&ilw, recv_ill);
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
+ if (ipha->ipha_dst != ilm->ilm_addr ||
ilm->ilm_zoneid == last_zoneid ||
ilm->ilm_zoneid == ire->ire_zoneid ||
ilm->ilm_zoneid == ALL_ZONES ||
@@ -17693,12 +17407,12 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
mp1 = ip_copymsg(first_mp);
if (mp1 == NULL)
continue;
- icmp_inbound(q, mp1, B_TRUE, ill,
+ icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill,
0, sum, mctl_present, B_TRUE,
recv_ill, ilm->ilm_zoneid);
last_zoneid = ilm->ilm_zoneid;
}
- ILM_WALKER_RELE(recv_ill);
+ ilm_walker_finish(&ilw);
} else if (ire->ire_type == IRE_BROADCAST) {
/*
* In the broadcast case, there may be many zones
@@ -18580,14 +18294,13 @@ ip_snmp_get(queue_t *q, mblk_t *mpctl, int level)
return (1);
}
- if ((mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, ipst)) == NULL) {
+ mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, level, ipst);
+ if (mpctl == NULL)
return (1);
- }
- mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, ipst);
- if (mpctl == NULL) {
+ mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, level, ipst);
+ if (mpctl == NULL)
return (1);
- }
if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) {
return (1);
@@ -19048,6 +18761,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
mblk_t *mp_tail = NULL;
ill_walk_context_t ctx;
zoneid_t zoneid;
+ ilm_walker_t ilw;
/*
* make a copy of the original message
@@ -19064,7 +18778,10 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- ILM_WALKER_HOLD(ill);
+ if (IS_UNDER_IPMP(ill))
+ continue;
+
+ ilm = ilm_walker_start(&ilw, ill);
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
if (ipif->ipif_zoneid != zoneid &&
@@ -19074,7 +18791,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
OCTET_LENGTH);
ipm.ipGroupMemberIfIndex.o_length =
mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes);
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
ASSERT(ilm->ilm_ipif != NULL);
ASSERT(ilm->ilm_ill == NULL);
if (ilm->ilm_ipif != ipif)
@@ -19090,7 +18807,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
}
}
}
- ILM_WALKER_RELE(ill);
+ ilm_walker_finish(&ilw);
}
rw_exit(&ipst->ips_ill_g_lock);
optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
@@ -19112,6 +18829,7 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
mblk_t *mp_tail = NULL;
ill_walk_context_t ctx;
zoneid_t zoneid;
+ ilm_walker_t ilw;
/*
* make a copy of the original message
@@ -19127,9 +18845,12 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V6(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- ILM_WALKER_HOLD(ill);
+ if (IS_UNDER_IPMP(ill))
+ continue;
+
+ ilm = ilm_walker_start(&ilw, ill);
ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex;
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
ASSERT(ilm->ilm_ipif == NULL);
ASSERT(ilm->ilm_ill != NULL);
if (ilm->ilm_zoneid != zoneid)
@@ -19145,7 +18866,7 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
(uint_t)sizeof (ipm6)));
}
}
- ILM_WALKER_RELE(ill);
+ ilm_walker_finish(&ilw);
}
rw_exit(&ipst->ips_ill_g_lock);
@@ -19171,6 +18892,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
zoneid_t zoneid;
int i;
slist_t *sl;
+ ilm_walker_t ilw;
/*
* make a copy of the original message
@@ -19187,7 +18909,10 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- ILM_WALKER_HOLD(ill);
+ if (IS_UNDER_IPMP(ill))
+ continue;
+
+ ilm = ilm_walker_start(&ilw, ill);
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
if (ipif->ipif_zoneid != zoneid)
@@ -19196,7 +18921,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
OCTET_LENGTH);
ips.ipGroupSourceIfIndex.o_length =
mi_strlen(ips.ipGroupSourceIfIndex.o_bytes);
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
ASSERT(ilm->ilm_ipif != NULL);
ASSERT(ilm->ilm_ill == NULL);
sl = ilm->ilm_filter;
@@ -19220,7 +18945,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
}
}
}
- ILM_WALKER_RELE(ill);
+ ilm_walker_finish(&ilw);
}
rw_exit(&ipst->ips_ill_g_lock);
optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
@@ -19244,6 +18969,7 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
zoneid_t zoneid;
int i;
slist_t *sl;
+ ilm_walker_t ilw;
/*
* make a copy of the original message
@@ -19259,9 +18985,12 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V6(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- ILM_WALKER_HOLD(ill);
+ if (IS_UNDER_IPMP(ill))
+ continue;
+
+ ilm = ilm_walker_start(&ilw, ill);
ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex;
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
ASSERT(ilm->ilm_ipif == NULL);
ASSERT(ilm->ilm_ill != NULL);
sl = ilm->ilm_filter;
@@ -19279,7 +19008,7 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
}
}
}
- ILM_WALKER_RELE(ill);
+ ilm_walker_finish(&ilw);
}
rw_exit(&ipst->ips_ill_g_lock);
@@ -19345,7 +19074,8 @@ ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
* in one IRE walk.
*/
static mblk_t *
-ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
+ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level,
+ ip_stack_t *ipst)
{
struct opthdr *optp;
mblk_t *mp2ctl; /* Returned */
@@ -19377,6 +19107,14 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
ird.ird_route.lp_head = mpctl->b_cont;
ird.ird_netmedia.lp_head = mp3ctl->b_cont;
ird.ird_attrs.lp_head = mp4ctl->b_cont;
+ /*
+	 * If the level has been set to the special EXPER_IP_AND_TESTHIDDEN
+	 * value, then also include IRE_MARK_TESTHIDDEN IREs. This is
+	 * intended as a temporary solution until a proper MIB API is provided
+	 * that supports complete filtering/caller-opt-in.
+ */
+ if (level == EXPER_IP_AND_TESTHIDDEN)
+ ird.ird_flags |= IRD_REPORT_TESTHIDDEN;
zoneid = Q_TO_CONN(q)->conn_zoneid;
ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst);
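The comment above describes how the EXPER_IP_AND_TESTHIDDEN level acts as a caller opt-in: hidden IPMP test routes are skipped from the MIB report unless the flag is set. A small sketch of that gating follows; the flag values and report_entry() are illustrative rather than the actual kernel definitions.

    #include <stdio.h>

    #define IRD_REPORT_TESTHIDDEN  0x1   /* caller opted in to hidden entries */
    #define IRE_MARK_TESTHIDDEN    0x2   /* entry is an IPMP test route */

    /*
     * Skip hidden (test) entries unless the caller explicitly opted in.
     */
    static int
    report_entry(unsigned int ird_flags, unsigned int ire_marks)
    {
            if (!(ird_flags & IRD_REPORT_TESTHIDDEN) &&
                (ire_marks & IRE_MARK_TESTHIDDEN))
                    return (0);
            return (1);
    }

    int
    main(void)
    {
            printf("default caller, test route: %s\n",
                report_entry(0, IRE_MARK_TESTHIDDEN) ? "report" : "skip");
            printf("opted-in caller, test route: %s\n",
                report_entry(IRD_REPORT_TESTHIDDEN, IRE_MARK_TESTHIDDEN) ?
                "report" : "skip");
            return (0);
    }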
@@ -19419,7 +19157,8 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
* ipv6NetToMediaEntryTable in an NDP walk.
*/
static mblk_t *
-ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
+ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level,
+ ip_stack_t *ipst)
{
struct opthdr *optp;
mblk_t *mp2ctl; /* Returned */
@@ -19451,6 +19190,14 @@ ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
ird.ird_route.lp_head = mpctl->b_cont;
ird.ird_netmedia.lp_head = mp3ctl->b_cont;
ird.ird_attrs.lp_head = mp4ctl->b_cont;
+ /*
+	 * If the level has been set to the special EXPER_IP_AND_TESTHIDDEN
+	 * value, then also include IRE_MARK_TESTHIDDEN IREs. This is
+	 * intended as a temporary solution until a proper MIB API is provided
+	 * that supports complete filtering/caller-opt-in.
+ */
+ if (level == EXPER_IP_AND_TESTHIDDEN)
+ ird.ird_flags |= IRD_REPORT_TESTHIDDEN;
zoneid = Q_TO_CONN(q)->conn_zoneid;
ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst);
@@ -19671,6 +19418,11 @@ ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird)
ASSERT(ire->ire_ipversion == IPV4_VERSION);
+ if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) &&
+ ire->ire_marks & IRE_MARK_TESTHIDDEN) {
+ return;
+ }
+
if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
return;
@@ -19812,6 +19564,11 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
ASSERT(ire->ire_ipversion == IPV6_VERSION);
+ if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) &&
+ ire->ire_marks & IRE_MARK_TESTHIDDEN) {
+ return;
+ }
+
if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
return;
@@ -20518,8 +20275,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller,
boolean_t mctl_present;
ipsec_out_t *io;
int match_flags;
- ill_t *attach_ill = NULL;
- /* Bind to IPIF_NOFAILOVER ill etc. */
ill_t *xmit_ill = NULL; /* IP_PKTINFO etc. */
ipif_t *dst_ipif;
boolean_t multirt_need_resolve = B_FALSE;
@@ -20639,16 +20394,11 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller,
}
/*
- * IP_DONTFAILOVER_IF and IP_BOUND_IF have precedence over ill index
- * passed in IP_PKTINFO.
+ * IP_BOUND_IF has precedence over the ill index passed in IP_PKTINFO.
*/
- if (infop->ip_opt_ill_index != 0 &&
- connp->conn_outgoing_ill == NULL &&
- connp->conn_nofailover_ill == NULL) {
-
- xmit_ill = ill_lookup_on_ifindex(
- infop->ip_opt_ill_index, B_FALSE, NULL, NULL, NULL, NULL,
- ipst);
+ if (infop->ip_opt_ill_index != 0 && connp->conn_outgoing_ill == NULL) {
+ xmit_ill = ill_lookup_on_ifindex(infop->ip_opt_ill_index,
+ B_FALSE, NULL, NULL, NULL, NULL, ipst);
if (xmit_ill == NULL || IS_VNI(xmit_ill))
goto drop_pkt;
@@ -20659,7 +20409,7 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller,
* accessible from all zones i.e has a valid ipif in
* all zones.
*/
- if (!ipif_lookup_zoneid_group(xmit_ill, zoneid, 0, NULL)) {
+ if (!ipif_lookup_zoneid(xmit_ill, zoneid, 0, NULL)) {
goto drop_pkt;
}
}
@@ -20696,18 +20446,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller,
goto version_hdrlen_check;
dst = ipha->ipha_dst;
- if (connp->conn_nofailover_ill != NULL) {
- attach_ill = conn_get_held_ill(connp,
- &connp->conn_nofailover_ill, &err);
- if (err == ILL_LOOKUP_FAILED) {
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- if (need_decref)
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- return;
- }
- }
-
/* If IP_BOUND_IF has been set, use that ill. */
if (connp->conn_outgoing_ill != NULL) {
xmit_ill = conn_get_held_ill(connp,
@@ -20761,9 +20499,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller,
ire = NULL;
}
- if (attach_ill != NULL)
- goto send_from_ill;
-
/*
* We cache IRE_CACHEs to avoid lookups. We don't do
* this for the tcp global queue and listen end point
@@ -21074,45 +20809,21 @@ notdata:
}
ASSERT(first_mp != NULL);
- /*
- * ICMP echo replies attach an ipsec_out and set ipsec_out_attach_if
- * to make sure that this packet goes out on the same interface it
- * came in. We handle that here.
- */
- if (mctl_present) {
- uint_t ifindex;
+ if (mctl_present) {
io = (ipsec_out_t *)first_mp->b_rptr;
- if (io->ipsec_out_attach_if || io->ipsec_out_ip_nexthop) {
+ if (io->ipsec_out_ip_nexthop) {
/*
* We may have lost the conn context if we are
* coming here from ip_newroute(). Copy the
* nexthop information.
*/
- if (io->ipsec_out_ip_nexthop) {
- ip_nexthop = B_TRUE;
- nexthop_addr = io->ipsec_out_nexthop_addr;
+ ip_nexthop = B_TRUE;
+ nexthop_addr = io->ipsec_out_nexthop_addr;
- ipha = (ipha_t *)mp->b_rptr;
- dst = ipha->ipha_dst;
- goto send_from_ill;
- } else {
- ASSERT(io->ipsec_out_ill_index != 0);
- ifindex = io->ipsec_out_ill_index;
- attach_ill = ill_lookup_on_ifindex(ifindex,
- B_FALSE, NULL, NULL, NULL, NULL, ipst);
- if (attach_ill == NULL) {
- ASSERT(xmit_ill == NULL);
- ip1dbg(("ip_output: bad ifindex for "
- "(BIND TO IPIF_NOFAILOVER) %d\n",
- ifindex));
- freemsg(first_mp);
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutDiscards);
- ASSERT(!need_decref);
- return;
- }
- }
+ ipha = (ipha_t *)mp->b_rptr;
+ dst = ipha->ipha_dst;
+ goto send_from_ill;
}
}
@@ -21161,7 +20872,7 @@ hdrtoosmall:
ipha = (ipha_t *)mp->b_rptr;
if (first_mp == NULL) {
- ASSERT(attach_ill == NULL && xmit_ill == NULL);
+ ASSERT(xmit_ill == NULL);
/*
* If we got here because of "goto hdrtoosmall"
* We need to attach a IPSEC_OUT.
@@ -21213,8 +20924,6 @@ version_hdrlen_check:
*/
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutWrongIPVersion);
ASSERT(xmit_ill == NULL);
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
if (need_decref)
mp->b_flag |= MSGHASREF;
(void) ip_output_v6(arg, first_mp, arg2, caller);
@@ -21255,8 +20964,6 @@ version_hdrlen_check:
zoneid, ipst)) {
ASSERT(xmit_ill == NULL);
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
"ip_wput_end: q %p (%S)", q, "badopts");
if (need_decref)
@@ -21295,22 +21002,6 @@ multicast:
*/
ill_t *ill = (ill_t *)q->q_ptr;
- /*
- * Don't honor attach_if for this case. If ill
- * is part of the group, ipif could belong to
- * any ill and we cannot maintain attach_ill
- * and ipif_ill same anymore and the assert
- * below would fail.
- */
- if (mctl_present && io->ipsec_out_attach_if) {
- io->ipsec_out_ill_index = 0;
- io->ipsec_out_attach_if = B_FALSE;
- ASSERT(attach_ill != NULL);
- ill_refrele(attach_ill);
- attach_ill = NULL;
- }
-
- ASSERT(attach_ill == NULL);
ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID);
if (ipif == NULL) {
if (need_decref)
@@ -21429,25 +21120,11 @@ multicast:
first_mp->b_cont = mp;
mctl_present = B_TRUE;
}
- if (attach_ill != NULL) {
- ASSERT(attach_ill == ipif->ipif_ill);
- match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(attach_ill)) {
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- }
- io->ipsec_out_ill_index =
- attach_ill->ill_phyint->phyint_ifindex;
- io->ipsec_out_attach_if = B_TRUE;
- } else {
- match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR;
- io->ipsec_out_ill_index =
- ipif->ipif_ill->ill_phyint->phyint_ifindex;
- }
+ match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
+ io->ipsec_out_ill_index =
+ ipif->ipif_ill->ill_phyint->phyint_ifindex;
+
if (connp != NULL) {
io->ipsec_out_multicast_loop =
connp->conn_multicast_loop;
@@ -21469,9 +21146,7 @@ multicast:
*
* NOTE : We need to do it for non-secure case also as
* this might go out secure if there is a global policy
- * match in ip_wput_ire. For bind to IPIF_NOFAILOVER
- * address, the source should be initialized already and
- * hence we won't be initializing here.
+ * match in ip_wput_ire.
*
* As we do not have the ire yet, it is possible that
* we set the source address here and then later discover
@@ -21507,14 +21182,6 @@ multicast:
zoneid, MBLK_GETLABEL(mp), match_flags, ipst);
}
- /*
- * refrele attach_ill as its not needed anymore.
- */
- if (attach_ill != NULL) {
- ill_refrele(attach_ill);
- attach_ill = NULL;
- }
-
if (ire == NULL) {
/*
* Multicast loopback and multicast forwarding is
@@ -21630,33 +21297,9 @@ noroute:
ipif_refrele(dst_ipif);
}
}
- /*
- * If we are bound to IPIF_NOFAILOVER address, look for
- * an IRE_CACHE matching the ill.
- */
-send_from_ill:
- if (attach_ill != NULL) {
- ipif_t *attach_ipif;
- match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(attach_ill)) {
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- }
-
- attach_ipif = ipif_get_next_ipif(NULL, attach_ill);
- if (attach_ipif == NULL) {
- ip1dbg(("ip_wput: No ipif for attach_ill\n"));
- goto discard_pkt;
- }
- ire = ire_ctable_lookup(dst, 0, 0, attach_ipif,
- zoneid, MBLK_GETLABEL(mp), match_flags, ipst);
- ipif_refrele(attach_ipif);
- } else if (xmit_ill != NULL) {
+send_from_ill:
+ if (xmit_ill != NULL) {
ipif_t *ipif;
/*
@@ -21681,6 +21324,10 @@ send_from_ill:
goto drop_pkt;
}
+ match_flags = 0;
+ if (IS_UNDER_IPMP(xmit_ill))
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
+
/*
* Look for a ire that is part of the group,
* if found use it else call ip_newroute_ipif.
@@ -21689,7 +21336,7 @@ send_from_ill:
* ill is accessible from all zones i.e has a
* valid ipif in all zones.
*/
- match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR;
+ match_flags |= MATCH_IRE_ILL | MATCH_IRE_SECATTR;
ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid,
MBLK_GETLABEL(mp), match_flags, ipst);
/*
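
The hunk above folds the old attach_ill special case into a single lookup: when the transmit ill is an underlying interface of an IPMP group, the IRE cache lookup must explicitly ask for test-address entries, which are otherwise hidden from normal lookups. Below is a small user-space sketch of that flag selection; the structures and flag values are simplified stand-ins, not the kernel's ill_t/ire_t or the real MATCH_IRE_* constants.

#include <stdio.h>
#include <stdbool.h>

#define MATCH_IRE_ILL             0x0001	/* placeholder values, */
#define MATCH_IRE_SECATTR         0x0002	/* not the kernel's */
#define MATCH_IRE_MARK_TESTHIDDEN 0x0004

struct ill { bool under_ipmp; };		/* stand-in for ill_t */
struct ire { bool testhidden; };		/* stand-in for ire_t marks */

/*
 * Mirror of the send_from_ill flag selection: only consider hidden
 * (test-traffic) cache entries when sending over an underlying ill.
 */
static int
select_match_flags(const struct ill *xmit_ill)
{
	int match_flags = 0;

	if (xmit_ill->under_ipmp)
		match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
	match_flags |= MATCH_IRE_ILL | MATCH_IRE_SECATTR;
	return (match_flags);
}

/* A lookup honors a hidden IRE only when the caller asked for it. */
static bool
ire_matches(const struct ire *ire, int match_flags)
{
	if (ire->testhidden && !(match_flags & MATCH_IRE_MARK_TESTHIDDEN))
		return (false);
	return (true);
}

int
main(void)
{
	struct ill data_ill = { false }, under_ill = { true };
	struct ire probe_ire = { true };

	printf("data ill sees probe IRE: %d\n",
	    ire_matches(&probe_ire, select_match_flags(&data_ill)));
	printf("underlying ill sees probe IRE: %d\n",
	    ire_matches(&probe_ire, select_match_flags(&under_ill)));
	return (0);
}
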
@@ -21729,12 +21376,7 @@ send_from_ill:
ipst);
}
if (!ire) {
- /*
- * Make sure we don't load spread if this
- * is IPIF_NOFAILOVER case.
- */
- if ((attach_ill != NULL) ||
- (ip_nexthop && !ignore_nexthop)) {
+ if (ip_nexthop && !ignore_nexthop) {
if (mctl_present) {
io = (ipsec_out_t *)first_mp->b_rptr;
ASSERT(first_mp->b_datap->db_type ==
@@ -21764,15 +21406,8 @@ send_from_ill:
first_mp->b_cont = mp;
mctl_present = B_TRUE;
}
- if (attach_ill != NULL) {
- io->ipsec_out_ill_index = attach_ill->
- ill_phyint->phyint_ifindex;
- io->ipsec_out_attach_if = B_TRUE;
- } else {
- io->ipsec_out_ip_nexthop = ip_nexthop;
- io->ipsec_out_nexthop_addr =
- nexthop_addr;
- }
+ io->ipsec_out_ip_nexthop = ip_nexthop;
+ io->ipsec_out_nexthop_addr = nexthop_addr;
}
noirefound:
/*
@@ -21787,8 +21422,6 @@ noirefound:
ip_newroute(q, first_mp, dst, connp, zoneid, ipst);
TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
"ip_wput_end: q %p (%S)", q, "newroute");
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
if (xmit_ill != NULL)
ill_refrele(xmit_ill);
if (need_decref)
@@ -21869,8 +21502,6 @@ noirefound:
ip_newroute(q, copy_mp, dst, connp, zoneid, ipst);
}
}
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
if (xmit_ill != NULL)
ill_refrele(xmit_ill);
if (need_decref)
@@ -21896,8 +21527,6 @@ drop_pkt:
if (need_decref)
CONN_DEC_REF(connp);
freemsg(first_mp);
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
if (xmit_ill != NULL)
ill_refrele(xmit_ill);
TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
@@ -21923,8 +21552,8 @@ ip_wput(queue_t *q, mblk_t *mp)
/*
*
* The following rules must be observed when accessing any ipif or ill
- * that has been cached in the conn. Typically conn_nofailover_ill,
- * conn_outgoing_ill, conn_multicast_ipif and conn_multicast_ill.
+ * that has been cached in the conn. Typically conn_outgoing_ill,
+ * conn_multicast_ipif and conn_multicast_ill.
*
* Access: The ipif or ill pointed to from the conn can be accessed under
* the protection of the conn_lock or after it has been refheld under the
@@ -21944,10 +21573,8 @@ ip_wput(queue_t *q, mblk_t *mp)
* The list of ipifs hanging off the ill is protected by ill_g_lock and ill_lock
* On the other hand to access ipif->ipif_ill, we need one of either ill_g_lock
* or a reference to the ipif or a reference to an ire that references the
- * ipif. An ipif does not change its ill except for failover/failback. Since
- * failover/failback happens only after bringing down the ipif and making sure
- * the ipif refcnt has gone to zero and holding the ill_g_lock and ill_lock
- * the above holds.
+ * ipif. An ipif only changes its ill when migrating from an underlying ill
+ * to an IPMP ill in ipif_up().
*/
ipif_t *
conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err)
@@ -22302,96 +21929,6 @@ ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire,
zoneid));
}
-ire_t *
-conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill)
-{
- ipaddr_t addr;
- ire_t *save_ire;
- irb_t *irb;
- ill_group_t *illgrp;
- int err;
-
- save_ire = ire;
- addr = ire->ire_addr;
-
- ASSERT(ire->ire_type == IRE_BROADCAST);
-
- illgrp = connp->conn_outgoing_ill->ill_group;
- if (illgrp == NULL) {
- *conn_outgoing_ill = conn_get_held_ill(connp,
- &connp->conn_outgoing_ill, &err);
- if (err == ILL_LOOKUP_FAILED) {
- ire_refrele(save_ire);
- return (NULL);
- }
- return (save_ire);
- }
- /*
- * If IP_BOUND_IF has been done, conn_outgoing_ill will be set.
- * If it is part of the group, we need to send on the ire
- * that has been cleared of IRE_MARK_NORECV and that belongs
- * to this group. This is okay as IP_BOUND_IF really means
- * any ill in the group. We depend on the fact that the
- * first ire in the group is always cleared of IRE_MARK_NORECV
- * if such an ire exists. This is possible only if you have
- * at least one ill in the group that has not failed.
- *
- * First get to the ire that matches the address and group.
- *
- * We don't look for an ire with a matching zoneid because a given zone
- * won't always have broadcast ires on all ills in the group.
- */
- irb = ire->ire_bucket;
- rw_enter(&irb->irb_lock, RW_READER);
- if (ire->ire_marks & IRE_MARK_NORECV) {
- /*
- * If the current zone only has an ire broadcast for this
- * address marked NORECV, the ire we want is ahead in the
- * bucket, so we look it up deliberately ignoring the zoneid.
- */
- for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_addr != addr)
- continue;
- /* skip over deleted ires */
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
- }
- }
- while (ire != NULL) {
- /*
- * If a new interface is coming up, we could end up
- * seeing the loopback ire and the non-loopback ire
- * may not have been added yet. So check for ire_stq
- */
- if (ire->ire_stq != NULL && (ire->ire_addr != addr ||
- ire->ire_ipif->ipif_ill->ill_group == illgrp)) {
- break;
- }
- ire = ire->ire_next;
- }
- if (ire != NULL && ire->ire_addr == addr &&
- ire->ire_ipif->ipif_ill->ill_group == illgrp) {
- IRE_REFHOLD(ire);
- rw_exit(&irb->irb_lock);
- ire_refrele(save_ire);
- *conn_outgoing_ill = ire_to_ill(ire);
- /*
- * Refhold the ill to make the conn_outgoing_ill
- * independent of the ire. ip_wput_ire goes in a loop
- * and may refrele the ire. Since we have an ire at this
- * point we don't need to use ILL_CAN_LOOKUP on the ill.
- */
- ill_refhold(*conn_outgoing_ill);
- return (ire);
- }
- rw_exit(&irb->irb_lock);
- ip1dbg(("conn_set_outgoing_ill: No matching ire\n"));
- /*
- * If we can't find a suitable ire, return the original ire.
- */
- return (save_ire);
-}
-
/*
* This function does the ire_refrele of the ire passed in as the
* argument. As this function looks up more ires i.e broadcast ires,
@@ -22401,7 +21938,6 @@ conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill)
* IPQoS Notes:
* IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for
* IPsec packets are done in ipsec_out_process.
- *
*/
void
ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller,
@@ -22471,9 +22007,8 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller,
if ((first_ire->ire_flags & RTF_MULTIRT) &&
(first_ire->ire_addr == ire->ire_addr) &&
!(first_ire->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) {
+ (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)))
break;
- }
}
if ((first_ire != NULL) && (first_ire != ire)) {
@@ -22489,36 +22024,15 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller,
* conn_outgoing_ill variable is used only in the broadcast loop.
* for performance we don't grab the mutexs in the fastpath
*/
- if ((connp != NULL) &&
- (ire->ire_type == IRE_BROADCAST) &&
- ((connp->conn_nofailover_ill != NULL) ||
- (connp->conn_outgoing_ill != NULL))) {
- /*
- * Bind to IPIF_NOFAILOVER address overrides IP_BOUND_IF
- * option. So, see if this endpoint is bound to a
- * IPIF_NOFAILOVER address. If so, honor it. This implies
- * that if the interface is failed, we will still send
- * the packet on the same ill which is what we want.
- */
+ if (ire->ire_type == IRE_BROADCAST && connp != NULL &&
+ connp->conn_outgoing_ill != NULL) {
conn_outgoing_ill = conn_get_held_ill(connp,
- &connp->conn_nofailover_ill, &err);
+ &connp->conn_outgoing_ill, &err);
if (err == ILL_LOOKUP_FAILED) {
ire_refrele(ire);
freemsg(mp);
return;
}
- if (conn_outgoing_ill == NULL) {
- /*
- * Choose a good ill in the group to send the
- * packets on.
- */
- ire = conn_set_outgoing_ill(connp, ire,
- &conn_outgoing_ill);
- if (ire == NULL) {
- freemsg(mp);
- return;
- }
- }
}
if (mp->b_datap->db_type != M_CTL) {
@@ -22578,7 +22092,7 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller,
if (src_ire != NULL &&
!(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) &&
(!ipst->ips_ip_restrict_interzone_loopback ||
- ire_local_same_ill_group(ire, src_ire))) {
+ ire_local_same_lan(ire, src_ire))) {
if (ipha->ipha_src == INADDR_ANY && !unspec_src)
ipha->ipha_src = src_ire->ire_src_addr;
ire_refrele(src_ire);
@@ -22741,39 +22255,7 @@ another:;
*/
ASSERT(ire->ire_ipversion == IPV4_VERSION);
- /*
- * With IP multipathing, broadcast packets are sent on the ire
- * that has been cleared of IRE_MARK_NORECV and that belongs to
- * the group. However, this ire might not be in the same zone so
- * we can't always use its source address. We look for a
- * broadcast ire in the same group and in the right zone.
- */
- if (ire->ire_type == IRE_BROADCAST &&
- ire->ire_zoneid != zoneid) {
- ire_t *src_ire = ire_ctable_lookup(dst, 0,
- IRE_BROADCAST, ire->ire_ipif, zoneid, NULL,
- (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst);
- if (src_ire != NULL) {
- src = src_ire->ire_src_addr;
- ire_refrele(src_ire);
- } else {
- ire_refrele(ire);
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- freemsg(first_mp);
- if (ill != NULL) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsOutDiscards);
- } else {
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutDiscards);
- }
- return;
- }
- } else {
- src = ire->ire_src_addr;
- }
-
+ src = ire->ire_src_addr;
if (connp == NULL) {
ip1dbg(("ip_wput_ire: no connp and no src "
"address for dst 0x%x, using src 0x%x\n",
@@ -22917,10 +22399,9 @@ another:;
ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t));
io = (ipsec_out_t *)first_mp->b_rptr;
- io->ipsec_out_ill_index = ((ill_t *)stq->q_ptr)->
- ill_phyint->phyint_ifindex;
-
- ipsec_out_process(q, first_mp, ire, ill_index);
+ io->ipsec_out_ill_index =
+ ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex;
+ ipsec_out_process(q, first_mp, ire, 0);
ire_refrele(ire);
if (conn_outgoing_ill != NULL)
ill_refrele(conn_outgoing_ill);
@@ -22960,7 +22441,7 @@ another:;
if (ire1->ire_addr != ire->ire_addr)
continue;
if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))
+ (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))
continue;
/* Got one */
@@ -23147,71 +22628,16 @@ broadcast:
* back outbound packets in different zones but on the
* same ill, as the application would see duplicates.
*
- * If the interfaces are part of the same group,
- * we would want to send only one copy out for
- * whole group.
- *
* This logic assumes that ire_add_v4() groups the
* IRE_BROADCAST entries so that those with the same
- * ire_addr and ill_group are kept together.
+ * ire_addr are kept together.
*/
ire_ill = ire->ire_ipif->ipif_ill;
- if (ire->ire_stq == NULL && ire1->ire_stq != NULL) {
- if (ire_ill->ill_group != NULL &&
- (ire->ire_marks & IRE_MARK_NORECV)) {
- /*
- * If the current zone only has an ire
- * broadcast for this address marked
- * NORECV, the ire we want is ahead in
- * the bucket, so we look it up
- * deliberately ignoring the zoneid.
- */
- for (ire1 = ire->ire_bucket->irb_ire;
- ire1 != NULL;
- ire1 = ire1->ire_next) {
- ire1_ill =
- ire1->ire_ipif->ipif_ill;
- if (ire1->ire_addr != dst)
- continue;
- /* skip over the current ire */
- if (ire1 == ire)
- continue;
- /* skip over deleted ires */
- if (ire1->ire_marks &
- IRE_MARK_CONDEMNED)
- continue;
- /*
- * non-loopback ire in our
- * group: use it for the next
- * pass in the loop
- */
- if (ire1->ire_stq != NULL &&
- ire1_ill->ill_group ==
- ire_ill->ill_group)
- break;
- }
- }
- } else {
+ if (ire->ire_stq != NULL || ire1->ire_stq == NULL) {
while (ire1 != NULL && ire1->ire_addr == dst) {
ire1_ill = ire1->ire_ipif->ipif_ill;
- /*
- * We can have two broadcast ires on the
- * same ill in different zones; here
- * we'll send a copy of the packet on
- * each ill and the fanout code will
- * call conn_wantpacket() to check that
- * the zone has the broadcast address
- * configured on the ill. If the two
- * ires are in the same group we only
- * send one copy up.
- */
- if (ire1_ill != ire_ill &&
- (ire1_ill->ill_group == NULL ||
- ire_ill->ill_group == NULL ||
- ire1_ill->ill_group !=
- ire_ill->ill_group)) {
+ if (ire1_ill != ire_ill)
break;
- }
ire1 = ire1->ire_next;
}
}
@@ -23403,13 +22829,8 @@ multi_loopback:
* logic.
*/
if (ill != NULL) {
- ilm_t *ilm;
-
- ILM_WALKER_HOLD(ill);
- ilm = ilm_lookup_ill(ill, ipha->ipha_dst,
- ALL_ZONES);
- ILM_WALKER_RELE(ill);
- if (ilm != NULL) {
+ if (ilm_lookup_ill(ill, ipha->ipha_dst,
+ ALL_ZONES) != NULL) {
/*
* Pass along the virtual output q.
* ip_wput_local() will distribute the
@@ -23565,18 +22986,17 @@ checksumoptions:
ire1 != NULL;
ire1 = ire1->ire_next) {
if (!(ire1->ire_flags &
- RTF_MULTIRT)) {
+ RTF_MULTIRT))
continue;
- }
+
if (ire1->ire_addr !=
- ire->ire_addr) {
+ ire->ire_addr)
continue;
- }
+
if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED|
- IRE_MARK_HIDDEN)) {
+ (IRE_MARK_CONDEMNED |
+ IRE_MARK_TESTHIDDEN))
continue;
- }
/* Got one */
IRE_REFHOLD(ire1);
@@ -24743,9 +24163,8 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
if ((first_ire->ire_flags & RTF_MULTIRT) &&
(first_ire->ire_addr == ire->ire_addr) &&
!(first_ire->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) {
+ (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)))
break;
- }
}
if (first_ire != NULL) {
@@ -24808,7 +24227,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
if (ire1->ire_addr != ire->ire_addr)
continue;
if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))
+ (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))
continue;
/*
* Ensure we do not exceed the MTU
@@ -25130,10 +24549,9 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
if (ire1->ire_addr != ire->ire_addr)
continue;
if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED|
- IRE_MARK_HIDDEN)) {
+ (IRE_MARK_CONDEMNED |
+ IRE_MARK_TESTHIDDEN))
continue;
- }
/*
* Ensure we do not exceed the MTU
* of the next route.
@@ -25500,6 +24918,7 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire,
ilm_t *ilm;
mblk_t *mp1;
zoneid_t last_zoneid;
+ ilm_walker_t ilw;
if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(ill)) {
ASSERT(ire_type == IRE_BROADCAST);
@@ -25524,11 +24943,9 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire,
* have been exhausted.
*/
last_zoneid = -1;
- ILM_WALKER_HOLD(ill);
- for (ilm = ill->ill_ilm; ilm != NULL;
- ilm = ilm->ilm_next) {
- if ((ilm->ilm_flags & ILM_DELETED) ||
- ipha->ipha_dst != ilm->ilm_addr ||
+ ilm = ilm_walker_start(&ilw, ill);
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
+ if (ipha->ipha_dst != ilm->ilm_addr ||
ilm->ilm_zoneid == last_zoneid ||
ilm->ilm_zoneid == zoneid ||
!(ilm->ilm_ipif->ipif_flags & IPIF_UP))
@@ -25536,12 +24953,12 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire,
mp1 = ip_copymsg(first_mp);
if (mp1 == NULL)
continue;
- icmp_inbound(q, mp1, B_TRUE, ill, 0, 0,
- mctl_present, B_FALSE, ill,
+ icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill,
+ 0, 0, mctl_present, B_FALSE, ill,
ilm->ilm_zoneid);
last_zoneid = ilm->ilm_zoneid;
}
- ILM_WALKER_RELE(ill);
+ ilm_walker_finish(&ilw);
/*
* Loopback case: the sending endpoint has
* IP_MULTICAST_LOOP disabled, therefore we don't
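
The hunk above replaces the open-coded ill_ilm traversal bracketed by ILM_WALKER_HOLD/RELE with the new ilm_walker_start/step/finish cursor, which skips deleted memberships itself and hands back the ill each ilm was found on (ilw.ilw_walk_ill). The following is a rough model of such a cursor over a singly linked list; the types are stand-ins, and the real walker also handles the locking that the old HOLD/RELE macros provided.

#include <stdio.h>
#include <stddef.h>
#include <stdbool.h>

/* Stand-ins for ilm_t and ilm_walker_t; the real structures live in the
 * kernel and carry considerably more state. */
struct ilm {
	int		addr;
	bool		deleted;
	struct ilm	*next;
};

struct ilm_walker {
	struct ilm	*head;
};

static struct ilm *
walker_skip(struct ilm *ilm)
{
	while (ilm != NULL && ilm->deleted)	/* hide condemned entries */
		ilm = ilm->next;
	return (ilm);
}

static struct ilm *
ilm_walker_start(struct ilm_walker *ilw, struct ilm *head)
{
	ilw->head = head;		/* the real code also takes locks here */
	return (walker_skip(head));
}

static struct ilm *
ilm_walker_step(struct ilm_walker *ilw, struct ilm *ilm)
{
	(void)ilw;
	return (walker_skip(ilm->next));
}

static void
ilm_walker_finish(struct ilm_walker *ilw)
{
	(void)ilw;			/* the real code drops locks here */
}

int
main(void)
{
	struct ilm c = { 3, false, NULL };
	struct ilm b = { 2, true, &c };	/* deleted: should be skipped */
	struct ilm a = { 1, false, &b };
	struct ilm_walker ilw;
	struct ilm *ilm;

	for (ilm = ilm_walker_start(&ilw, &a); ilm != NULL;
	    ilm = ilm_walker_step(&ilw, ilm))
		printf("addr %d\n", ilm->addr);
	ilm_walker_finish(&ilw);
	return (0);
}
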
@@ -25859,14 +25276,9 @@ ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif, zoneid_t zoneid)
* caller and hence matching on ILL (MATCH_IRE_ILL) would
* be sufficient rather than MATCH_IRE_IPIF.
*
- * This function is used for sending IGMP packets. We need
- * to make sure that we send the packet out of the interface
- * (ipif->ipif_ill) where we joined the group. This is to
- * prevent from switches doing IGMP snooping to send us multicast
- * packets for a given group on the interface we have joined.
- * If we can't find an ire, igmp_sendpkt has already initialized
- * ipsec_out_attach_if so that this will not be load spread in
- * ip_newroute_ipif.
+ * This function is used for sending IGMP packets. For IPMP,
+ * we sidestep IGMP snooping issues by sending all multicast
+ * traffic on a single interface in the IPMP group.
*/
ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, NULL,
MATCH_IRE_ILL, ipst);
@@ -26035,7 +25447,7 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill,
ip6_t *ip6h1;
uint_t ill_index;
ipsec_out_t *io;
- boolean_t attach_if, hwaccel;
+ boolean_t hwaccel;
uint32_t flags = IP6_NO_IPPOLICY;
int match_flags;
zoneid_t zoneid;
@@ -26052,42 +25464,22 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill,
if (io->ipsec_out_reachable) {
flags |= IPV6_REACHABILITY_CONFIRMATION;
}
- attach_if = io->ipsec_out_attach_if;
hwaccel = io->ipsec_out_accelerated;
zoneid = io->ipsec_out_zoneid;
ASSERT(zoneid != ALL_ZONES);
- match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR;
+ match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
/* Multicast addresses should have non-zero ill_index. */
v6dstp = &ip6h->ip6_dst;
ASSERT(ip6h->ip6_nxt != IPPROTO_RAW);
ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0);
- ASSERT(!attach_if || ill_index != 0);
- if (ill_index != 0) {
- if (ill == NULL) {
- ill = ip_grab_attach_ill(NULL, ipsec_mp, ill_index,
- B_TRUE, ipst);
- /* Failure case frees things for us. */
- if (ill == NULL)
- return;
-
- ill_need_rele = B_TRUE;
- }
- /*
- * If this packet needs to go out on a particular interface
- * honor it.
- */
- if (attach_if) {
- match_flags = MATCH_IRE_ILL;
+ if (ill == NULL && ill_index != 0) {
+ ill = ip_grab_ill(ipsec_mp, ill_index, B_TRUE, ipst);
+ /* Failure case frees things for us. */
+ if (ill == NULL)
+ return;
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill)) {
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- }
- }
+ ill_need_rele = B_TRUE;
}
ASSERT(mp != NULL);
@@ -26138,32 +25530,15 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill,
return;
}
- ip_newroute_ipif_v6(q, ipsec_mp, ipif, *v6dstp,
+ ip_newroute_ipif_v6(q, ipsec_mp, ipif, v6dstp, &ip6h->ip6_src,
unspec_src, zoneid);
ipif_refrele(ipif);
} else {
- if (attach_if) {
- ipif_t *ipif;
-
- ipif = ipif_get_next_ipif(NULL, ill);
- if (ipif == NULL) {
- if (ill_need_rele)
- ill_refrele(ill);
- freemsg(ipsec_mp);
- return;
- }
- ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif,
- zoneid, MBLK_GETLABEL(mp), match_flags, ipst);
- ire_need_rele = B_TRUE;
- ipif_refrele(ipif);
+ if (ire_arg != NULL) {
+ ire = ire_arg;
} else {
- if (ire_arg != NULL) {
- ire = ire_arg;
- } else {
- ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL,
- ipst);
- ire_need_rele = B_TRUE;
- }
+ ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, ipst);
+ ire_need_rele = B_TRUE;
}
if (ire != NULL)
goto send;
@@ -26350,7 +25725,6 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill,
ipha_t *ipha1;
uint_t ill_index;
ipsec_out_t *io;
- boolean_t attach_if;
int match_flags;
irb_t *irb = NULL;
boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE;
@@ -26372,39 +25746,19 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill,
io = (ipsec_out_t *)ipsec_mp->b_rptr;
ill_index = io->ipsec_out_ill_index;
- attach_if = io->ipsec_out_attach_if;
zoneid = io->ipsec_out_zoneid;
ASSERT(zoneid != ALL_ZONES);
ipst = io->ipsec_out_ns->netstack_ip;
ASSERT(io->ipsec_out_ns != NULL);
- match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR;
- if (ill_index != 0) {
- if (ill == NULL) {
- ill = ip_grab_attach_ill(NULL, ipsec_mp,
- ill_index, B_FALSE, ipst);
-
- /* Failure case frees things for us. */
- if (ill == NULL)
- return;
-
- ill_need_rele = B_TRUE;
- }
- /*
- * If this packet needs to go out on a particular interface
- * honor it.
- */
- if (attach_if) {
- match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
+ match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
+ if (ill == NULL && ill_index != 0) {
+ ill = ip_grab_ill(ipsec_mp, ill_index, B_FALSE, ipst);
+ /* Failure case frees things for us. */
+ if (ill == NULL)
+ return;
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill)) {
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- }
- }
+ ill_need_rele = B_TRUE;
}
if (CLASSD(dst)) {
@@ -26474,17 +25828,12 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill,
ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT,
zoneid, &zero_info);
} else {
- if (attach_if) {
- ire = ire_ctable_lookup(dst, 0, 0, ill->ill_ipif,
- zoneid, MBLK_GETLABEL(mp), match_flags, ipst);
+ if (ire_arg != NULL) {
+ ire = ire_arg;
+ ire_need_rele = B_FALSE;
} else {
- if (ire_arg != NULL) {
- ire = ire_arg;
- ire_need_rele = B_FALSE;
- } else {
- ire = ire_cache_lookup(dst, zoneid,
- MBLK_GETLABEL(mp), ipst);
- }
+ ire = ire_cache_lookup(dst, zoneid,
+ MBLK_GETLABEL(mp), ipst);
}
if (ire != NULL) {
goto send;
@@ -26613,11 +25962,9 @@ send:
(void *)ire->ire_ipif, (void *)ipif));
/*
- * Multiroute the secured packet, unless IPsec really
- * requires the packet to go out only through a particular
- * interface.
+ * Multiroute the secured packet.
*/
- if ((ire->ire_flags & RTF_MULTIRT) && !attach_if) {
+ if (ire->ire_flags & RTF_MULTIRT) {
ire_t *first_ire;
irb = ire->ire_bucket;
ASSERT(irb != NULL);
@@ -26634,9 +25981,8 @@ send:
if ((first_ire->ire_flags & RTF_MULTIRT) &&
(first_ire->ire_addr == ire->ire_addr) &&
!(first_ire->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) {
+ (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)))
break;
- }
}
if ((first_ire != NULL) && (first_ire != ire)) {
@@ -26657,11 +26003,6 @@ send:
multirt_send = B_TRUE;
max_frag = ire->ire_max_frag;
- } else {
- if ((ire->ire_flags & RTF_MULTIRT) && attach_if) {
- ip1dbg(("ip_wput_ipsec_out: ignoring multirouting "
- "flag, attach_if %d\n", attach_if));
- }
}
/*
@@ -26689,7 +26030,7 @@ send:
if (ire1->ire_addr != ire->ire_addr)
continue;
if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))
+ (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))
continue;
/* No loopback here */
if (ire1->ire_stq == NULL)
@@ -27155,10 +26496,8 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index)
* before sending it the accelerated packet.
*/
if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) {
- int ifindex;
ill = ire_to_ill(ire);
- ifindex = ill->ill_phyint->phyint_ifindex;
- io->ipsec_out_capab_ill_index = ifindex;
+ io->ipsec_out_capab_ill_index = ill->ill_phyint->phyint_ifindex;
}
/*
@@ -27284,17 +26623,18 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index)
}
}
/*
- * We are done with IPsec processing. Send it over
- * the wire.
+ * We are done with IPsec processing. Send it over the wire.
*/
done:
mp = ipsec_mp->b_cont;
ipha = (ipha_t *)mp->b_rptr;
if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
- ip_wput_ipsec_out(q, ipsec_mp, ipha, ill, ire);
+ ip_wput_ipsec_out(q, ipsec_mp, ipha, ire->ire_ipif->ipif_ill,
+ ire);
} else {
ip6h = (ip6_t *)ipha;
- ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ill, ire);
+ ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ire->ire_ipif->ipif_ill,
+ ire);
}
if (ill != NULL && ill_need_rele)
ill_refrele(ill);
@@ -27356,18 +26696,16 @@ ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
ipip = ip_sioctl_lookup(iocp->ioc_cmd);
if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) {
/*
- * Special case where ipsq_current_ipif is not set:
+ * Special case where ipx_current_ipif is not set:
* ill_phyint_reinit merged the v4 and v6 into a single ipsq.
- * ill could also have become part of a ipmp group in the
- * process, we are here as were not able to complete the
- * operation in ipif_set_values because we could not become
- * exclusive on the new ipsq, In such a case ipsq_current_ipif
- * will not be set so we need to set it.
+	 * We are here as we were not able to complete the operation in
+ * ipif_set_values because we could not become exclusive on
+ * the new ipsq.
*/
ill_t *ill = q->q_ptr;
ipsq_current_start(ipsq, ill->ill_ipif, ipip->ipi_cmd);
}
- ASSERT(ipsq->ipsq_current_ipif != NULL);
+ ASSERT(ipsq->ipsq_xop->ipx_current_ipif != NULL);
if (ipip->ipi_cmd_type == IF_CMD) {
/* This a old style SIOC[GS]IF* command */
@@ -27381,8 +26719,8 @@ ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
sin = NULL;
}
- err = (*ipip->ipi_func_restart)(ipsq->ipsq_current_ipif, sin, q, mp,
- ipip, mp1->b_rptr);
+ err = (*ipip->ipi_func_restart)(ipsq->ipsq_xop->ipx_current_ipif, sin,
+ q, mp, ipip, mp1->b_rptr);
ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
}
@@ -27424,6 +26762,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
ip_extract_func_t *extract_funcp;
cmd_info_t ci;
int err;
+ boolean_t entered_ipsq = B_FALSE;
ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd));
@@ -27505,18 +26844,21 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
return;
}
+ ASSERT(ci.ci_ipif != NULL);
+
/*
- * If ipsq is non-null, we are already being called exclusively on an
- * ill but in the case of a failover in progress it is the "from" ill,
- * rather than the "to" ill (which is the ill ptr passed in).
- * In order to ensure we are exclusive on both ILLs we rerun
- * ipsq_try_enter() here, ipsq's support recursive entry.
+ * If ipsq is non-NULL, we are already being called exclusively.
*/
ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq));
- ASSERT(ci.ci_ipif != NULL);
-
- ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl,
- NEW_OP, B_TRUE);
+ if (ipsq == NULL) {
+ ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl,
+ NEW_OP, B_TRUE);
+ if (ipsq == NULL) {
+ ipif_refrele(ci.ci_ipif);
+ return;
+ }
+ entered_ipsq = B_TRUE;
+ }
/*
* Release the ipif so that ipif_down and friends that wait for
@@ -27525,8 +26867,6 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
* the ipif.
*/
ipif_refrele(ci.ci_ipif);
- if (ipsq == NULL)
- return;
ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd);
@@ -27535,19 +26875,12 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
* where we set the IPIF_CHANGING flag. This ensures that there won't
* be any new references to the ipif. This helps functions that go
* through this path and end up trying to wait for the refcnts
- * associated with the ipif to go down to zero. Some exceptions are
- * Failover, Failback, and Groupname commands that operate on more than
- * just the ci.ci_ipif. These commands internally determine the
- * set of ipif's they operate on and set and clear the IPIF_CHANGING
- * flags on that set. Another exception is the Removeif command that
- * sets the IPIF_CONDEMNED flag internally after identifying the right
- * ipif to operate on.
+ * associated with the ipif to go down to zero. The exception is
+ * SIOCSLIFREMOVEIF, which sets IPIF_CONDEMNED internally after
+ * identifying the right ipif to operate on.
*/
mutex_enter(&(ci.ci_ipif)->ipif_ill->ill_lock);
- if (ipip->ipi_cmd != SIOCLIFREMOVEIF &&
- ipip->ipi_cmd != SIOCLIFFAILOVER &&
- ipip->ipi_cmd != SIOCLIFFAILBACK &&
- ipip->ipi_cmd != SIOCSLIFGROUPNAME)
+ if (ipip->ipi_cmd != SIOCLIFREMOVEIF)
(ci.ci_ipif)->ipif_state_flags |= IPIF_CHANGING;
mutex_exit(&(ci.ci_ipif)->ipif_ill->ill_lock);
@@ -27560,7 +26893,8 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
- ipsq_exit(ipsq);
+ if (entered_ipsq)
+ ipsq_exit(ipsq);
}
/*
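
The reworked entry logic above only calls ipsq_try_enter() when the caller is not already exclusive, and records entered_ipsq so that ipsq_exit() is paired only with an enter that was actually performed. The same acquire-if-needed pattern is sketched below with a pthread mutex standing in for the ipsq; it is an illustration of the control flow, not the kernel serialization machinery.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t ipsq_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * If the caller is already exclusive, reuse that; otherwise try to enter
 * and remember that we did, so we only exit what we entered.
 */
static int
process_ioctl(bool already_writer)
{
	bool entered = false;

	if (!already_writer) {
		if (pthread_mutex_trylock(&ipsq_lock) != 0)
			return (-1);	/* the kernel would requeue instead */
		entered = true;
	}

	/* ... exclusive work happens here ... */

	if (entered)
		pthread_mutex_unlock(&ipsq_lock);
	return (0);
}

int
main(void)
{
	printf("%d\n", process_ioctl(false));
	return (0);
}
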
@@ -27708,7 +27042,7 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
* Refhold the conn, till the ioctl completes. This is
* needed in case the ioctl ends up in the pending mp
* list. Every mp in the ill_pending_mp list and
- * the ipsq_pending_mp must have a refhold on the conn
+ * the ipx_pending_mp must have a refhold on the conn
* to resume processing. The refhold is released when
* the ioctl completes. (normally or abnormally)
* In all cases ip_ioctl_finish is called to finish
@@ -27753,8 +27087,25 @@ nak:
if (CONN_Q(q))
goto nak;
- /* Finish socket ioctls passed through to ARP. */
- ip_sioctl_iocack(q, mp);
+ /*
+ * Finish socket ioctls passed through to ARP. We use the
+ * ioc_cmd values we set in ip_sioctl_arp() to decide whether
+ * we need to become writer before calling ip_sioctl_iocack().
+ * Note that qwriter_ip() will release the refhold, and that a
+ * refhold is OK without ILL_CAN_LOOKUP() since we're on the
+ * ill stream.
+ */
+ iocp = (struct iocblk *)mp->b_rptr;
+ if (iocp->ioc_cmd == AR_ENTRY_SQUERY) {
+ ip_sioctl_iocack(NULL, q, mp, NULL);
+ return;
+ }
+
+ ASSERT(iocp->ioc_cmd == AR_ENTRY_DELETE ||
+ iocp->ioc_cmd == AR_ENTRY_ADD);
+ ill = q->q_ptr;
+ ill_refhold(ill);
+ qwriter_ip(ill, q, mp, ip_sioctl_iocack, CUR_OP, B_FALSE);
return;
case M_FLUSH:
if (*mp->b_rptr & FLUSHW)
@@ -28021,11 +27372,11 @@ nak:
gw_addr_v6 = ire->ire_gateway_addr_v6;
mutex_exit(&ire->ire_lock);
if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
- nce = ndp_lookup_v6(ill,
+ nce = ndp_lookup_v6(ill, B_FALSE,
&ire->ire_addr_v6, B_FALSE);
} else {
- nce = ndp_lookup_v6(ill, &gw_addr_v6,
- B_FALSE);
+ nce = ndp_lookup_v6(ill, B_FALSE,
+ &gw_addr_v6, B_FALSE);
}
if (nce != NULL) {
nce_resolv_failed(nce);
@@ -28061,10 +27412,11 @@ nak:
gw_addr_v6 = ire->ire_gateway_addr_v6;
mutex_exit(&ire->ire_lock);
if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
- nce = ndp_lookup_v6(ill, &ire->ire_addr_v6,
- B_FALSE);
+ nce = ndp_lookup_v6(ill, B_FALSE,
+ &ire->ire_addr_v6, B_FALSE);
} else {
- nce = ndp_lookup_v6(ill, &gw_addr_v6, B_FALSE);
+ nce = ndp_lookup_v6(ill, B_FALSE,
+ &gw_addr_v6, B_FALSE);
}
if (nce != NULL) {
/*
@@ -28238,13 +27590,14 @@ nak:
fake_ire = (ire_t *)mp->b_rptr;
/*
- * By the time we come back here from ARP the incomplete ire
- * created in ire_forward() could have been removed. We use
- * the parameters stored in the fake_ire to specify the real
- * ire as explicitly as possible. This avoids problems when
- * IPMP groups are configured as an ipif can 'float'
- * across several ill queues. We can be confident that the
- * the inability to find an ire is because it no longer exists.
+ * By the time we come back here from ARP the logical outgoing
+ * interface of the incomplete ire we added in ire_forward()
+ * could have disappeared, causing the incomplete ire to also
+	 * disappear.  So we need to retrieve the proper ipif for the
+ * ire before looking in ctable. In the case of IPMP, the
+ * ipif may be on the IPMP ill, so look it up based on the
+ * ire_ipif_ifindex we stashed back in ire_init_common().
+ * Then, we can verify that ire_ipif_seqid still exists.
*/
ill = ill_lookup_on_ifindex(fake_ire->ire_ipif_ifindex, B_FALSE,
NULL, NULL, NULL, NULL, ipst);
@@ -28299,6 +27652,7 @@ nak:
freemsg(mp); /* fake ire */
return;
}
+
nce = ire->ire_nce;
DTRACE_PROBE2(ire__arpresolve__type,
ire_t *, ire, nce_t *, nce);
@@ -29030,7 +28384,7 @@ boolean_t
conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags,
zoneid_t zoneid)
{
- ill_t *in_ill;
+ ill_t *bound_ill;
boolean_t found;
ipif_t *ipif;
ire_t *ire;
@@ -29045,32 +28399,15 @@ conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags,
* unicast, broadcast and multicast reception to
* conn_incoming_ill. conn_wantpacket itself is called
* only for BROADCAST and multicast.
- *
- * 1) ip_rput supresses duplicate broadcasts if the ill
- * is part of a group. Hence, we should be receiving
- * just one copy of broadcast for the whole group.
- * Thus, if it is part of the group the packet could
- * come on any ill of the group and hence we need a
- * match on the group. Otherwise, match on ill should
- * be sufficient.
- *
- * 2) ip_rput does not suppress duplicate multicast packets.
- * If there are two interfaces in a ill group and we have
- * 2 applications (conns) joined a multicast group G on
- * both the interfaces, ilm_lookup_ill filter in ip_rput
- * will give us two packets because we join G on both the
- * interfaces rather than nominating just one interface
- * for receiving multicast like broadcast above. So,
- * we have to call ilg_lookup_ill to filter out duplicate
- * copies, if ill is part of a group.
- */
- in_ill = connp->conn_incoming_ill;
- if (in_ill != NULL) {
- if (in_ill->ill_group == NULL) {
- if (in_ill != ill)
+ */
+ bound_ill = connp->conn_incoming_ill;
+ if (bound_ill != NULL) {
+ if (IS_IPMP(bound_ill)) {
+ if (bound_ill->ill_grp != ill->ill_grp)
+ return (B_FALSE);
+ } else {
+ if (bound_ill != ill)
return (B_FALSE);
- } else if (in_ill->ill_group != ill->ill_group) {
- return (B_FALSE);
}
}
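
The rewritten binding check above distinguishes a conn bound to the IPMP meta-interface, which accepts traffic arriving on any ill in its group, from a conn bound to a specific ill, which requires an exact match. A compact model of that decision follows, with stand-in types in place of ill_t and IS_IPMP().

#include <stdio.h>
#include <stdbool.h>

struct ill_group { int id; };
struct ill {
	bool			is_ipmp;	/* IPMP meta-interface? */
	struct ill_group	*grp;
};

/*
 * Mirrors the conn_wantpacket() binding check: a conn bound to the IPMP
 * interface accepts packets from any ill in the group; a conn bound to a
 * specific ill accepts packets only from that ill.
 */
static bool
bound_ill_accepts(const struct ill *bound_ill, const struct ill *in_ill)
{
	if (bound_ill == NULL)
		return (true);
	if (bound_ill->is_ipmp)
		return (bound_ill->grp == in_ill->grp);
	return (bound_ill == in_ill);
}

int
main(void)
{
	struct ill_group grp = { 1 };
	struct ill ipmp0 = { true, &grp };
	struct ill under0 = { false, &grp };
	struct ill other = { false, NULL };

	printf("%d %d %d\n",
	    bound_ill_accepts(&ipmp0, &under0),	/* 1: same group */
	    bound_ill_accepts(&under0, &under0),	/* 1: exact ill */
	    bound_ill_accepts(&under0, &other));	/* 0: different ill */
	return (0);
}
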
@@ -29079,15 +28416,14 @@ conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags,
return (B_TRUE);
/*
* The conn is in a different zone; we need to check that this
- * broadcast address is configured in the application's zone and
- * on one ill in the group.
+ * broadcast address is configured in the application's zone.
*/
ipif = ipif_get_next_ipif(NULL, ill);
if (ipif == NULL)
return (B_FALSE);
ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif,
connp->conn_zoneid, NULL,
- (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst);
+ (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst);
ipif_refrele(ipif);
if (ire != NULL) {
ire_refrele(ire);
@@ -29171,7 +28507,7 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
ipsq = ill->ill_phyint->phyint_ipsq;
- ipif = ipsq->ipsq_pending_ipif;
+ ipif = ipsq->ipsq_xop->ipx_pending_ipif;
mp1 = ipsq_pending_mp_get(ipsq, &connp);
ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
if (mp1 == NULL) {
@@ -29181,12 +28517,12 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
/*
- * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
+ * If an IOCTL is waiting on this (ipx_current_ioctl != 0), then we
* must have an associated conn_t. Otherwise, we're bringing this
* interface back up as part of handling an asynchronous event (e.g.,
* physical address change).
*/
- if (ipsq->ipsq_current_ioctl != 0) {
+ if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
ASSERT(connp != NULL);
q = CONNP_TO_WQ(connp);
} else {
@@ -29219,16 +28555,28 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg)
return;
}
- if (ill->ill_up_ipifs)
- ill_group_cleanup(ill);
+ /*
+ * If we have a moved ipif to bring up, and everything has succeeded
+ * to this point, bring it up on the IPMP ill. Otherwise, leave it
+ * down -- the admin can try to bring it up by hand if need be.
+ */
+ if (ill->ill_move_ipif != NULL) {
+ ipif = ill->ill_move_ipif;
+ ill->ill_move_ipif = NULL;
+ if (err == 0) {
+ err = ipif_up(ipif, q, mp1);
+ if (err == EINPROGRESS)
+ return;
+ }
+ }
/*
* The operation must complete without EINPROGRESS since
- * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
- * Otherwise, the operation will be stuck forever in the ipsq.
+ * ipsq_pending_mp_get() has removed the mblk. Otherwise, the
+ * operation will be stuck forever in the ipsq.
*/
ASSERT(err != EINPROGRESS);
- if (ipsq->ipsq_current_ioctl != 0)
+ if (ipsq->ipsq_xop->ipx_current_ioctl != 0)
ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
else
ipsq_current_finish(ipsq);
@@ -29649,124 +28997,6 @@ ip_int_set(queue_t *q, mblk_t *mp, char *value,
return (0);
}
-/*
- * Handle changes to ipmp_hook_emulation ndd variable.
- * Need to update phyint_hook_ifindex.
- * Also generate a nic plumb event should a new ifidex be assigned to a group.
- */
-static void
-ipmp_hook_emulation_changed(ip_stack_t *ipst)
-{
- phyint_t *phyi;
- phyint_t *phyi_tmp;
- char *groupname;
- int namelen;
- ill_t *ill;
- boolean_t new_group;
-
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- /*
- * Group indicies are stored in the phyint - a common structure
- * to both IPv4 and IPv6.
- */
- phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
- for (; phyi != NULL;
- phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
- phyi, AVL_AFTER)) {
- /* Ignore the ones that do not have a group */
- if (phyi->phyint_groupname_len == 0)
- continue;
-
- /*
- * Look for other phyint in group.
- * Clear name/namelen so the lookup doesn't find ourselves.
- */
- namelen = phyi->phyint_groupname_len;
- groupname = phyi->phyint_groupname;
- phyi->phyint_groupname_len = 0;
- phyi->phyint_groupname = NULL;
-
- phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst);
- /* Restore */
- phyi->phyint_groupname_len = namelen;
- phyi->phyint_groupname = groupname;
-
- new_group = B_FALSE;
- if (ipst->ips_ipmp_hook_emulation) {
- /*
- * If the group already exists and has already
- * been assigned a group ifindex, we use the existing
- * group_ifindex, otherwise we pick a new group_ifindex
- * here.
- */
- if (phyi_tmp != NULL &&
- phyi_tmp->phyint_group_ifindex != 0) {
- phyi->phyint_group_ifindex =
- phyi_tmp->phyint_group_ifindex;
- } else {
- /* XXX We need a recovery strategy here. */
- if (!ip_assign_ifindex(
- &phyi->phyint_group_ifindex, ipst))
- cmn_err(CE_PANIC,
- "ip_assign_ifindex() failed");
- new_group = B_TRUE;
- }
- } else {
- phyi->phyint_group_ifindex = 0;
- }
- if (ipst->ips_ipmp_hook_emulation)
- phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex;
- else
- phyi->phyint_hook_ifindex = phyi->phyint_ifindex;
-
- /*
- * For IP Filter to find out the relationship between
- * names and interface indicies, we need to generate
- * a NE_PLUMB event when a new group can appear.
- * We always generate events when a new interface appears
- * (even when ipmp_hook_emulation is set) so there
- * is no need to generate NE_PLUMB events when
- * ipmp_hook_emulation is turned off.
- * And since it isn't critical for IP Filter to get
- * the NE_UNPLUMB events we skip those here.
- */
- if (new_group) {
- /*
- * First phyint in group - generate group PLUMB event.
- * Since we are not running inside the ipsq we do
- * the dispatch immediately.
- */
- if (phyi->phyint_illv4 != NULL)
- ill = phyi->phyint_illv4;
- else
- ill = phyi->phyint_illv6;
-
- if (ill != NULL)
- ill_nic_event_plumb(ill, B_TRUE);
- }
- }
- rw_exit(&ipst->ips_ill_g_lock);
-}
-
-/* ARGSUSED */
-static int
-ipmp_hook_emulation_set(queue_t *q, mblk_t *mp, char *value,
- caddr_t addr, cred_t *cr)
-{
- int *v = (int *)addr;
- long new_value;
- ip_stack_t *ipst = CONNQ_TO_IPST(q);
-
- if (ddi_strtol(value, NULL, 10, &new_value) != 0)
- return (EINVAL);
-
- if (*v != new_value) {
- *v = new_value;
- ipmp_hook_emulation_changed(ipst);
- }
- return (0);
-}
-
static void *
ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp)
{
@@ -30448,12 +29678,12 @@ next_mp:
arpce->nce_state = ND_INCOMPLETE;
mutex_exit(&arpce->nce_lock);
+
/*
* Note that ire_add() (called from ire_forward())
* holds a ref on the ire until ARP is completed.
*/
-
- ire_arpresolve(ire, ire_to_ill(ire));
+ ire_arpresolve(ire);
return (LOOKUP_IN_PROGRESS);
default:
ASSERT(0);
@@ -30596,7 +29826,7 @@ ip_get_zoneid_v6(in6_addr_t *addr, mblk_t *mp, const ill_t *ill,
return (ALL_ZONES);
if (IN6_IS_ADDR_LINKLOCAL(addr)) {
- ire_flags |= MATCH_IRE_ILL_GROUP;
+ ire_flags |= MATCH_IRE_ILL;
ipif_arg = ill->ill_ipif;
}
if (lookup_zoneid != ALL_ZONES)
@@ -30648,20 +29878,24 @@ void
ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst,
const ill_t *ill, int ipver, uint32_t hlen, ip_stack_t *ipst)
{
+ mblk_t *mp2;
ipobs_cb_t *ipobs_cb;
+ ipobs_hook_data_t *ihd;
+ uint64_t grifindex = 0;
ASSERT(DB_TYPE(mp) == M_DATA);
+ if (IS_UNDER_IPMP(ill))
+ grifindex = ipmp_ill_get_ipmp_ifindex(ill);
+
mutex_enter(&ipst->ips_ipobs_cb_lock);
ipst->ips_ipobs_cb_nwalkers++;
mutex_exit(&ipst->ips_ipobs_cb_lock);
for (ipobs_cb = list_head(&ipst->ips_ipobs_cb_list); ipobs_cb != NULL;
ipobs_cb = list_next(&ipst->ips_ipobs_cb_list, ipobs_cb)) {
- mblk_t *mp2 = allocb(sizeof (ipobs_hook_data_t),
- BPRI_HI);
+ mp2 = allocb(sizeof (ipobs_hook_data_t), BPRI_HI);
if (mp2 != NULL) {
- ipobs_hook_data_t *ihd =
- (ipobs_hook_data_t *)mp2->b_rptr;
+ ihd = (ipobs_hook_data_t *)mp2->b_rptr;
if (((ihd->ihd_mp = dupmsg(mp)) == NULL) &&
((ihd->ihd_mp = copymsg(mp)) == NULL)) {
freemsg(mp2);
@@ -30673,6 +29907,7 @@ ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst,
ihd->ihd_zsrc = zsrc;
ihd->ihd_zdst = zdst;
ihd->ihd_ifindex = ill->ill_phyint->phyint_ifindex;
+ ihd->ihd_grifindex = grifindex;
ihd->ihd_stack = ipst->ips_netstack;
mp2->b_wptr += sizeof (*ihd);
ipobs_cb->ipobs_cbfunc(mp2);
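
With the change above, each observability callback now receives the IPMP group ifindex (ihd_grifindex) alongside the physical ifindex when the packet arrived on an underlying interface. A minimal model of how the two indices are filled in, using stand-in structures rather than the real ipobs_hook_data_t layout:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct ill {
	uint32_t	ifindex;
	bool		under_ipmp;
	uint32_t	ipmp_ifindex;	/* group interface index, if any */
};

struct hook_data {
	uint32_t	ihd_ifindex;	/* interface the packet arrived on */
	uint32_t	ihd_grifindex;	/* IPMP group interface, or 0 */
};

/*
 * Observers always see the physical ifindex, plus the group ifindex
 * when the receiving ill is under IPMP.
 */
static void
fill_hook_data(struct hook_data *ihd, const struct ill *ill)
{
	ihd->ihd_ifindex = ill->ifindex;
	ihd->ihd_grifindex = ill->under_ipmp ? ill->ipmp_ifindex : 0;
}

int
main(void)
{
	struct ill under0 = { 2, true, 5 };
	struct hook_data ihd;

	fill_hook_data(&ihd, &under0);
	printf("ifindex %u grifindex %u\n", ihd.ihd_ifindex, ihd.ihd_grifindex);
	return (0);
}
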
diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c
index fe326778c2..6e63af32b3 100644
--- a/usr/src/uts/common/inet/ip/ip6.c
+++ b/usr/src/uts/common/inet/ip/ip6.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -95,7 +95,6 @@
#include <sys/pattr.h>
#include <inet/ipclassifier.h>
#include <inet/ipsecah.h>
-#include <inet/udp_impl.h>
#include <inet/rawip_impl.h>
#include <inet/rts_impl.h>
#include <sys/squeue_impl.h>
@@ -186,7 +185,7 @@ const in6_addr_t ipv6_solicited_node_mcast =
#define IP6_MBLK_HDR_ERR 1
#define IP6_MBLK_LEN_ERR 2
-static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *ill,
+static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *, ill_t *,
boolean_t, zoneid_t);
static void icmp_pkt_v6(queue_t *, mblk_t *, void *, size_t,
const in6_addr_t *, boolean_t, zoneid_t, ip_stack_t *);
@@ -208,11 +207,13 @@ static void ip_fanout_udp_v6(queue_t *, mblk_t *, ip6_t *, uint32_t,
ill_t *, ill_t *, uint_t, boolean_t, zoneid_t);
static int ip_process_options_v6(queue_t *, mblk_t *, ip6_t *,
uint8_t *, uint_t, uint8_t, ip_stack_t *);
-static mblk_t *ip_rput_frag_v6(queue_t *, mblk_t *, ip6_t *,
+static mblk_t *ip_rput_frag_v6(ill_t *, ill_t *, mblk_t *, ip6_t *,
ip6_frag_t *, uint_t, uint_t *, uint32_t *, uint16_t *);
static boolean_t ip_source_routed_v6(ip6_t *, mblk_t *, ip_stack_t *);
static void ip_wput_ire_v6(queue_t *, mblk_t *, ire_t *, int, int,
- conn_t *, int, int, int, zoneid_t);
+ conn_t *, int, int, zoneid_t);
+static boolean_t ipif_lookup_testaddr_v6(ill_t *, const in6_addr_t *,
+ ipif_t **);
/*
* A template for an IPv6 AR_ENTRY_QUERY
@@ -248,15 +249,14 @@ static areq_t ipv6_areq_template = {
* call icmp_inbound_v6() for each relevant zone.
*/
static void
-icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length,
- boolean_t mctl_present, uint_t flags, zoneid_t zoneid, mblk_t *dl_mp)
+icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill,
+ uint_t hdr_length, boolean_t mctl_present, uint_t flags, zoneid_t zoneid,
+ mblk_t *dl_mp)
{
icmp6_t *icmp6;
ip6_t *ip6h;
boolean_t interested;
- ip6i_t *ip6i;
in6_addr_t origsrc;
- ire_t *ire;
mblk_t *first_mp;
ipsec_in_t *ii;
ip_stack_t *ipst = ill->ill_ipst;
@@ -344,7 +344,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length,
break;
case ICMP6_PACKET_TOO_BIG:
- icmp_inbound_too_big_v6(q, first_mp, ill, mctl_present,
+ icmp_inbound_too_big_v6(q, first_mp, ill, inill, mctl_present,
zoneid);
return;
case ICMP6_ECHO_REQUEST:
@@ -422,66 +422,6 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length,
* checksum field. The checksum is calculated in ip_wput_v6.
*/
icmp6->icmp6_cksum = ip6h->ip6_plen;
- /*
- * ICMP echo replies should go out on the same interface
- * the request came on as probes used by in.mpathd for
- * detecting NIC failures are ECHO packets. We turn-off load
- * spreading by allocating a ip6i and setting ip6i_attach_if
- * to B_TRUE which is handled both by ip_wput_v6 and
- * ip_newroute_v6. If we don't turnoff load spreading,
- * the packets might get dropped if there are no
- * non-FAILED/INACTIVE interfaces for it to go out on and
- * in.mpathd would wrongly detect a failure or mis-detect
- * a NIC failure as a link failure. As load spreading can
- * happen only if ill_group is not NULL, we do only for
- * that case and this does not affect the normal case.
- *
- * We force this only on echo packets that came from on-link
- * hosts. We restrict this to link-local addresses which
- * is used by in.mpathd for probing. In the IPv6 case,
- * default routes typically have an ire_ipif pointer and
- * hence a MATCH_IRE_ILL later in ip_newroute_v6/ip_wput_v6
- * might work. As a default route out of this interface
- * may not be present, enforcing this packet to go out in
- * this case may not work.
- */
- if (ill->ill_group != NULL &&
- IN6_IS_ADDR_LINKLOCAL(&origsrc)) {
- /*
- * If we are sending replies to ourselves, don't
- * set ATTACH_IF as we may not be able to find
- * the IRE_LOCAL on this ill i.e setting ATTACH_IF
- * causes ip_wput_v6 to look for an IRE_LOCAL on
- * "ill" which it may not find and will try to
- * create an IRE_CACHE for our local address. Once
- * we do this, we will try to forward all packets
- * meant to our LOCAL address.
- */
- ire = ire_cache_lookup_v6(&ip6h->ip6_dst, ALL_ZONES,
- NULL, ipst);
- if (ire == NULL || ire->ire_type != IRE_LOCAL) {
- mp = ip_add_info_v6(mp, NULL, &ip6h->ip6_dst);
- if (mp == NULL) {
- BUMP_MIB(ill->ill_icmp6_mib,
- ipv6IfIcmpInErrors);
- if (ire != NULL)
- ire_refrele(ire);
- if (mctl_present)
- freeb(first_mp);
- return;
- } else if (mctl_present) {
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
- }
- ip6i = (ip6i_t *)mp->b_rptr;
- ip6i->ip6i_flags = IP6I_ATTACH_IF;
- ip6i->ip6i_ifindex =
- ill->ill_phyint->phyint_ifindex;
- }
- if (ire != NULL)
- ire_refrele(ire);
- }
if (!mctl_present) {
/*
@@ -529,7 +469,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length,
if (mctl_present)
freeb(first_mp);
/* XXX may wish to pass first_mp up to ndp_input someday. */
- ndp_input(ill, mp, dl_mp);
+ ndp_input(inill, mp, dl_mp);
return;
case ND_NEIGHBOR_ADVERT:
@@ -538,7 +478,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length,
if (mctl_present)
freeb(first_mp);
/* XXX may wish to pass first_mp up to ndp_input someday. */
- ndp_input(ill, mp, dl_mp);
+ ndp_input(inill, mp, dl_mp);
return;
case ND_REDIRECT: {
@@ -579,7 +519,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length,
}
if (interested) {
icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill,
- mctl_present, zoneid);
+ inill, mctl_present, zoneid);
} else {
freemsg(first_mp);
}
@@ -592,7 +532,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length,
*/
/* ARGSUSED */
static void
-icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill,
+icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill,
boolean_t mctl_present, zoneid_t zoneid)
{
ip6_t *ip6h;
@@ -658,11 +598,10 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill,
* sufficient. Same link local addresses for different ILL's is
* possible.
*/
-
if (IN6_IS_ADDR_LINKLOCAL(&inner_ip6h->ip6_dst)) {
first_ire = ire_ctable_lookup_v6(&inner_ip6h->ip6_dst, NULL,
IRE_CACHE, ill->ill_ipif, ALL_ZONES, NULL,
- MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst);
+ MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
if (first_ire == NULL) {
if (ip_debug > 2) {
@@ -773,7 +712,7 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill,
}
rw_exit(&irb->irb_lock);
}
- icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill,
+ icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, inill,
mctl_present, zoneid);
}
@@ -783,7 +722,8 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill,
*/
void
icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
- icmp6_t *icmp6, ill_t *ill, boolean_t mctl_present, zoneid_t zoneid)
+ icmp6_t *icmp6, ill_t *ill, ill_t *inill, boolean_t mctl_present,
+ zoneid_t zoneid)
{
uint16_t *up; /* Pointer to ports in ULP header */
uint32_t ports; /* reversed ports for fanout */
@@ -861,7 +801,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
((uint16_t *)&ports)[0] = up[1];
((uint16_t *)&ports)[1] = up[0];
- ip_fanout_udp_v6(q, first_mp, &rip6h, ports, ill, ill,
+ ip_fanout_udp_v6(q, first_mp, &rip6h, ports, ill, inill,
IP6_NO_IPPOLICY, mctl_present, zoneid);
return;
}
@@ -908,7 +848,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
up = (uint16_t *)((uchar_t *)ip6h + hdr_length);
((uint16_t *)&ports)[0] = up[1];
((uint16_t *)&ports)[1] = up[0];
- ip_fanout_sctp(first_mp, ill, (ipha_t *)ip6h, ports, 0,
+ ip_fanout_sctp(first_mp, inill, (ipha_t *)ip6h, ports, 0,
mctl_present, IP6_NO_IPPOLICY, zoneid);
return;
case IPPROTO_ESP:
@@ -940,7 +880,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
ASSERT(ill != NULL);
ii->ipsec_in_ill_index =
ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index = ii->ipsec_in_ill_index;
+ ii->ipsec_in_rill_index =
+ inill->ill_phyint->phyint_ifindex;
first_mp->b_cont->b_datap->db_type = M_CTL;
} else {
/*
@@ -970,7 +911,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
mp->b_datap->db_type = M_CTL;
ii->ipsec_in_ill_index =
ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index = ii->ipsec_in_ill_index;
+ ii->ipsec_in_rill_index =
+ inill->ill_phyint->phyint_ifindex;
}
if (!ipsec_loaded(ipss)) {
@@ -985,7 +927,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
if (ipsec_rc == IPSEC_STATUS_FAILED)
return;
- ip_fanout_proto_again(first_mp, ill, ill, NULL);
+ ip_fanout_proto_again(first_mp, ill, inill, NULL);
return;
}
case IPPROTO_ENCAP:
@@ -1083,8 +1025,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
* doing here.
*/
icmp_inbound_error_fanout_v6(q, first_mp,
- (ip6_t *)mp->b_rptr, icmp6, ill, mctl_present,
- zoneid);
+ (ip6_t *)mp->b_rptr, icmp6, ill, inill,
+ mctl_present, zoneid);
return;
}
/* FALLTHRU */
@@ -1096,7 +1038,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
rip6h.ip6_src = ip6h->ip6_dst;
rip6h.ip6_dst = ip6h->ip6_src;
rip6h.ip6_nxt = nexthdr;
- ip_fanout_proto_v6(q, first_mp, &rip6h, ill, ill, nexthdr, 0,
+ ip_fanout_proto_v6(q, first_mp, &rip6h, ill, inill, nexthdr, 0,
IP6_NO_IPPOLICY, mctl_present, zoneid);
return;
}
@@ -1194,9 +1136,8 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill)
* redirect packet.)
*/
- prev_ire = ire_route_lookup_v6(dst, 0, src, 0, ipif, NULL,
- ALL_ZONES, NULL, MATCH_IRE_GW | MATCH_IRE_ILL_GROUP |
- MATCH_IRE_DEFAULT, ipst);
+ prev_ire = ire_route_lookup_v6(dst, 0, src, 0, ipif, NULL, ALL_ZONES,
+ NULL, MATCH_IRE_GW | MATCH_IRE_ILL | MATCH_IRE_DEFAULT, ipst);
/*
* Check that
@@ -1260,6 +1201,7 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill)
opt = ndp_get_option(opt, optlen, ND_OPT_TARGET_LINKADDR);
if (opt != NULL) {
err = ndp_lookup_then_add_v6(ill,
+ B_FALSE, /* don't match across illgrp */
(uchar_t *)&opt[1], /* Link layer address */
gateway,
&ipv6_all_ones, /* prefix mask */
@@ -1367,8 +1309,7 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill)
*/
redir_ire = ire_ftable_lookup_v6(dst, 0, src, IRE_HOST,
ire->ire_ipif, NULL, ALL_ZONES, 0, NULL,
- (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP),
- ipst);
+ (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst);
ire_refrele(ire); /* Held in ire_add_v6 */
@@ -1457,15 +1398,11 @@ icmp_pick_source_v6(queue_t *wq, in6_addr_t *origsrc, in6_addr_t *origdst,
BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutNoRoutes);
return (NULL);
}
- /*
- * Does not matter whether we use ire_stq or ire_ipif here.
- * Just pick an ill for ICMP replies.
- */
ASSERT(ire->ire_ipif != NULL);
ill = ire->ire_ipif->ipif_ill;
ire_refrele(ire);
}
- ipif = ipif_select_source_v6(ill, origsrc, RESTRICT_TO_NONE,
+ ipif = ipif_select_source_v6(ill, origsrc, B_FALSE,
IPV6_PREFER_SRC_DEFAULT, zoneid);
if (ipif != NULL) {
*src = ipif->ipif_v6src_addr;
@@ -1858,7 +1795,7 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp,
mp = icmp_pkt_err_ok_v6(q, mp, llbcast, B_FALSE, ipst);
if (mp == NULL)
return;
- nce = ndp_lookup_v6(ill, targetp, B_FALSE);
+ nce = ndp_lookup_v6(ill, B_TRUE, targetp, B_FALSE);
if (nce != NULL && nce->nce_state != ND_INCOMPLETE) {
ll_opt_len = (sizeof (nd_opt_hdr_t) +
ill->ill_phys_addr_length + 7)/8 * 8;
@@ -1908,31 +1845,8 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp,
rdh->nd_opt_rh_reserved1 = 0;
rdh->nd_opt_rh_reserved2 = 0;
/* ipif_v6src_addr contains the link-local source address */
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- if (ill->ill_group != NULL) {
- /*
- * The receiver of the redirect will verify whether it
- * had a route through us (srcp that we will use in
- * the redirect) or not. As we load spread even link-locals,
- * we don't know which source address the receiver of
- * redirect has in its route for communicating with us.
- * Thus we randomly choose a source here and finally we
- * should get to the right one and it will eventually
- * accept the redirect from us. We can't call
- * ip_lookup_scope_v6 because we don't have the right
- * link-local address here. Thus we randomly choose one.
- */
- int cnt = ill->ill_group->illgrp_ill_count;
+ srcp = &ill->ill_ipif->ipif_v6src_addr;
- ill = ill->ill_group->illgrp_ill;
- cnt = ++ipst->ips_icmp_redirect_v6_src_index % cnt;
- while (cnt--)
- ill = ill->ill_group_next;
- srcp = &ill->ill_ipif->ipif_v6src_addr;
- } else {
- srcp = &ill->ill_ipif->ipif_v6src_addr;
- }
- rw_exit(&ipst->ips_ill_g_lock);
/* Redirects sent by router, and router is global zone */
icmp_pkt_v6(q, mp, buf, len, srcp, B_FALSE, GLOBAL_ZONEID, ipst);
kmem_free(buf, len);
@@ -2231,6 +2145,7 @@ ip_bind_post_handling_v6(conn_t *connp, mblk_t *mp,
if (version_changed) {
ip_setpktversion(connp, connp->conn_pkt_isv6, B_TRUE, ipst);
}
+
/*
* Pass the IPSEC headers size in ire_ipsec_overhead.
* We can't do this in ip_bind_insert_ire because the policy
@@ -2771,8 +2686,8 @@ ip_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol,
}
if (ip6_asp_can_lookup(ipst)) {
src_ipif = ipif_select_source_v6(dst_ill,
- v6dst, RESTRICT_TO_NONE,
- connp->conn_src_preferences, zoneid);
+ v6dst, B_FALSE, connp->conn_src_preferences,
+ zoneid);
ip6_asp_table_refrele(ipst);
if (src_ipif == NULL) {
pr_addr_dbg("ip_bind_connected_v6: "
@@ -3111,7 +3026,15 @@ ip_add_info_v6(mblk_t *mp, ill_t *ill, const in6_addr_t *dst)
ip6i->ip6i_nxt = IPPROTO_RAW;
if (ill != NULL) {
ip6i->ip6i_flags = IP6I_IFINDEX;
- ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
+ /*
+ * If `ill' is in an IPMP group, make sure we use the IPMP
+ * interface index so that e.g. IPV6_RECVPKTINFO will get the
+ * IPMP interface index and not an underlying interface index.
+ */
+ if (IS_UNDER_IPMP(ill))
+ ip6i->ip6i_ifindex = ipmp_ill_get_ipmp_ifindex(ill);
+ else
+ ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
} else {
ip6i->ip6i_flags = 0;
}
@@ -4257,33 +4180,6 @@ ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h)
}
/*
- * Select an ill for the packet by considering load spreading across
- * a different ill in the group if dst_ill is part of some group.
- */
-static ill_t *
-ip_newroute_get_dst_ill_v6(ill_t *dst_ill)
-{
- ill_t *ill;
-
- /*
- * We schedule irrespective of whether the source address is
- * INADDR_UNSPECIED or not.
- */
- ill = illgrp_scheduler(dst_ill);
- if (ill == NULL)
- return (NULL);
-
- /*
- * For groups with names ip_sioctl_groupname ensures that all
- * ills are of same type. For groups without names, ifgrp_insert
- * ensures this.
- */
- ASSERT(dst_ill->ill_type == ill->ill_type);
-
- return (ill);
-}
-
-/*
* IPv6 -
* ip_newroute_v6 is called by ip_rput_data_v6 or ip_wput_v6 whenever we need
* to send out a packet to a destination address for which we do not have
@@ -4303,14 +4199,6 @@ ip_newroute_get_dst_ill_v6(ill_t *dst_ill)
* node sits at a site boundary).
* We create the cache entries in the regular ctable since
* it can not "confuse" things for other destinations.
- * table.
- *
- * When ill is part of a ill group, we subject the packets
- * to load spreading even if the ill is specified by the
- * means described above. We disable only for IPV6_BOUND_PIF
- * and for the cases where IP6I_ATTACH_IF is set i.e NS/NA/
- * Echo replies to link-local destinations have IP6I_ATTACH_IF
- * set.
*
* NOTE : These are the scopes of some of the variables that point at IRE,
* which needs to be followed while making any future modifications
@@ -4327,8 +4215,6 @@ ip_newroute_get_dst_ill_v6(ill_t *dst_ill)
*
* Thus on failures, we have to REFRELE only ire and sire, if they
* are not NULL.
- *
- * v6srcp may be used in the future. Currently unused.
*/
/* ARGSUSED */
void
@@ -4346,10 +4232,8 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
int err = 0;
mblk_t *first_mp;
ipsec_out_t *io;
- ill_t *attach_ill = NULL;
ushort_t ire_marks = 0;
int match_flags;
- boolean_t ip6i_present;
ire_t *first_sire = NULL;
mblk_t *copy_mp = NULL;
mblk_t *xmit_mp = NULL;
@@ -4359,7 +4243,6 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
boolean_t multirt_is_resolvable;
boolean_t multirt_resolve_next;
boolean_t need_rele = B_FALSE;
- boolean_t do_attach_ill = B_FALSE;
boolean_t ip6_asp_table_held = B_FALSE;
tsol_ire_gw_secattr_t *attrp = NULL;
tsol_gcgrp_t *gcgrp = NULL;
@@ -4376,39 +4259,12 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
io = NULL;
}
- /*
- * If this end point is bound to IPIF_NOFAILOVER, set bnf_ill and
- * bind_to_nofailover B_TRUE. We can't use conn to determine as it
- * could be NULL.
- *
- * This information can appear either in an ip6i_t or an IPSEC_OUT
- * message.
- */
ip6h = (ip6_t *)mp->b_rptr;
- ip6i_present = (ip6h->ip6_nxt == IPPROTO_RAW);
- if (ip6i_present || (io != NULL && io->ipsec_out_attach_if)) {
- if (!ip6i_present ||
- ((ip6i_t *)ip6h)->ip6i_flags & IP6I_ATTACH_IF) {
- attach_ill = ip_grab_attach_ill(ill, first_mp,
- (ip6i_present ? ((ip6i_t *)ip6h)->ip6i_ifindex :
- io->ipsec_out_ill_index), B_TRUE, ipst);
- /* Failure case frees things for us. */
- if (attach_ill == NULL)
- return;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(attach_ill))
- ire_marks = IRE_MARK_HIDDEN;
- }
- }
if (IN6_IS_ADDR_LOOPBACK(v6dstp)) {
ip1dbg(("ip_newroute_v6: dst with loopback addr\n"));
goto icmp_err_ret;
- } else if ((v6srcp != NULL) && IN6_IS_ADDR_LOOPBACK(v6srcp)) {
+ } else if (IN6_IS_ADDR_LOOPBACK(v6srcp)) {
ip1dbg(("ip_newroute_v6: src with loopback addr\n"));
goto icmp_err_ret;
}
@@ -4436,30 +4292,24 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0,
NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp),
match_flags, ipst);
+ } else {
+ match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
+ MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL;
+ match_flags |= MATCH_IRE_PARENT | MATCH_IRE_SECATTR;
+
/*
- * ire_add_then_send -> ip_newroute_v6 in the CGTP case passes
- * in a NULL ill, but the packet could be a neighbor
- * solicitation/advertisement and could have a valid attach_ill.
+ * Because nce_xmit() calls ip_output_v6() and NCEs are always
+ * tied to an underlying interface, IS_UNDER_IPMP() may be
+ * true even when building IREs that will be used for data
+ * traffic. As such, use the packet's source address to
+ * determine whether the traffic is test traffic, and set
+ * MATCH_IRE_MARK_TESTHIDDEN if so.
*/
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
- } else {
- if (attach_ill != NULL) {
- /*
- * attach_ill is set only for communicating with
- * on-link hosts. So, don't look for DEFAULT.
- * ip_wput_v6 passes the right ill in this case and
- * hence we can assert.
- */
- ASSERT(ill == attach_ill);
- ill_refrele(attach_ill);
- do_attach_ill = B_TRUE;
- match_flags = MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL;
- } else {
- match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL_GROUP;
+ if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) {
+ if (ipif_lookup_testaddr_v6(ill, v6srcp, NULL))
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
}
- match_flags |= MATCH_IRE_PARENT | MATCH_IRE_SECATTR;
+
ire = ire_ftable_lookup_v6(v6dstp, NULL, NULL, 0, ill->ill_ipif,
&sire, zoneid, 0, MBLK_GETLABEL(mp), match_flags, ipst);
}
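This test-traffic check recurs throughout the patch (the same reasoning block appears again in ip_newroute_ipif_v6() and ip_output_v6() below). Boiled down to a sketch, using only the macros and the new helper visible in this diff -- illustrative, not the committed code:

/*
 * Sketch: widen the route-lookup match flags when `ill' is an
 * underlying IPMP interface and the packet's source `v6srcp' is one of
 * its test addresses, so that the lookup can see test-hidden IREs.
 */
static uint_t
probe_match_flags(ill_t *ill, const in6_addr_t *v6srcp, uint_t match_flags)
{
	if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(v6srcp) &&
	    ipif_lookup_testaddr_v6(ill, v6srcp, NULL))
		match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
	return (match_flags);
}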
@@ -4601,106 +4451,56 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
}
/*
- * We have a route to reach the destination.
- *
- * 1) If the interface is part of ill group, try to get a new
- * ill taking load spreading into account.
+ * We have a route to reach the destination. Find the
+ * appropriate ill, then get a source address that matches the
+ * right scope via ipif_select_source_v6().
*
- * 2) After selecting the ill, get a source address that might
- * create good inbound load spreading and that matches the
- * right scope. ipif_select_source_v6 does this for us.
+ * If we are here trying to create an IRE_CACHE for an offlink
+ * destination and have an IRE_CACHE entry for VNI, then use
+ * ire_stq instead since VNI's queue is a black hole.
*
- * If the application specified the ill (ifindex), we still
- * load spread. Only if the packets needs to go out specifically
- * on a given ill e.g. bind to IPIF_NOFAILOVER address,
- * IPV6_BOUND_PIF we don't try to use a different ill for load
- * spreading.
+ * Note: While we pick a dst_ill we are really only interested
+ * in the ill for load spreading. The source ipif is
+ * determined by source address selection below.
*/
- if (!do_attach_ill) {
- /*
- * If the interface belongs to an interface group,
- * make sure the next possible interface in the group
- * is used. This encourages load spreading among
- * peers in an interface group. However, in the case
- * of multirouting, load spreading is not used, as we
- * actually want to replicate outgoing packets through
- * particular interfaces.
- *
- * Note: While we pick a dst_ill we are really only
- * interested in the ill for load spreading.
- * The source ipif is determined by source address
- * selection below.
- */
- if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) {
- dst_ill = ire->ire_ipif->ipif_ill;
- /* For uniformity do a refhold */
- ill_refhold(dst_ill);
+ if ((ire->ire_type == IRE_CACHE) &&
+ IS_VNI(ire->ire_ipif->ipif_ill)) {
+ dst_ill = ire->ire_stq->q_ptr;
+ ill_refhold(dst_ill);
+ } else {
+ ill_t *ill = ire->ire_ipif->ipif_ill;
+
+ if (IS_IPMP(ill)) {
+ dst_ill =
+ ipmp_illgrp_hold_next_ill(ill->ill_grp);
} else {
- /*
- * If we are here trying to create an IRE_CACHE
- * for an offlink destination and have the
- * IRE_CACHE for the next hop and the latter is
- * using virtual IP source address selection i.e
- * it's ire->ire_ipif is pointing to a virtual
- * network interface (vni) then
- * ip_newroute_get_dst_ll() will return the vni
- * interface as the dst_ill. Since the vni is
- * virtual i.e not associated with any physical
- * interface, it cannot be the dst_ill, hence
- * in such a case call ip_newroute_get_dst_ll()
- * with the stq_ill instead of the ire_ipif ILL.
- * The function returns a refheld ill.
- */
- if ((ire->ire_type == IRE_CACHE) &&
- IS_VNI(ire->ire_ipif->ipif_ill))
- dst_ill = ip_newroute_get_dst_ill_v6(
- ire->ire_stq->q_ptr);
- else
- dst_ill = ip_newroute_get_dst_ill_v6(
- ire->ire_ipif->ipif_ill);
+ dst_ill = ill;
+ ill_refhold(dst_ill);
}
- if (dst_ill == NULL) {
- if (ip_debug > 2) {
- pr_addr_dbg("ip_newroute_v6 : no dst "
- "ill for dst %s\n",
- AF_INET6, v6dstp);
- }
- goto icmp_err_ret;
- } else if (dst_ill->ill_group == NULL && ill != NULL &&
- dst_ill != ill) {
- /*
- * If "ill" is not part of any group, we should
- * have found a route matching "ill" as we
- * called ire_ftable_lookup_v6 with
- * MATCH_IRE_ILL_GROUP.
- * Rather than asserting when there is a
- * mismatch, we just drop the packet.
- */
- ip0dbg(("ip_newroute_v6: BOUND_IF failed : "
- "dst_ill %s ill %s\n",
- dst_ill->ill_name,
- ill->ill_name));
- goto icmp_err_ret;
+ }
+
+ if (dst_ill == NULL) {
+ if (ip_debug > 2) {
+ pr_addr_dbg("ip_newroute_v6 : no dst "
+ "ill for dst %s\n", AF_INET6, v6dstp);
}
- } else {
- dst_ill = ire->ire_ipif->ipif_ill;
- /* For uniformity do refhold */
- ill_refhold(dst_ill);
+ goto icmp_err_ret;
+ }
+
+ if (ill != NULL && dst_ill != ill &&
+ !IS_IN_SAME_ILLGRP(dst_ill, ill)) {
/*
- * We should have found a route matching ill as we
- * called ire_ftable_lookup_v6 with MATCH_IRE_ILL.
- * Rather than asserting, while there is a mismatch,
- * we just drop the packet.
+ * We should have found a route matching "ill"
+ * as we called ire_ftable_lookup_v6 with
+ * MATCH_IRE_ILL. Rather than asserting when
+ * there is a mismatch, we just drop the packet.
*/
- if (dst_ill != ill) {
- ip0dbg(("ip_newroute_v6: Packet dropped as "
- "IP6I_ATTACH_IF ill is %s, "
- "ire->ire_ipif->ipif_ill is %s\n",
- ill->ill_name,
- dst_ill->ill_name));
- goto icmp_err_ret;
- }
+ ip0dbg(("ip_newroute_v6: BOUND_IF failed: "
+ "dst_ill %s ill %s\n", dst_ill->ill_name,
+ ill->ill_name));
+ goto icmp_err_ret;
}
+
/*
* Pick a source address which matches the scope of the
* destination address.
@@ -4708,7 +4508,20 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
* parent ire (sire).
*/
ASSERT(src_ipif == NULL);
- if (ire->ire_type == IRE_IF_RESOLVER &&
+
+ /*
+ * Because nce_xmit() calls ip_output_v6() and NCEs are always
+ * tied to the underlying interface, IS_UNDER_IPMP() may be
+ * true even when building IREs that will be used for data
+ * traffic. As such, see if the packet's source address is a
+ * test address, and if so use that test address's ipif for
+ * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in
+ * ire_add_v6() can work properly.
+ */
+ if (ill != NULL && IS_UNDER_IPMP(ill))
+ (void) ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif);
+
+ if (src_ipif == NULL && ire->ire_type == IRE_IF_RESOLVER &&
!IN6_IS_ADDR_UNSPECIFIED(&v6gw) &&
ip6_asp_can_lookup(ipst)) {
/*
@@ -4718,10 +4531,10 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
*/
ip6_asp_table_held = B_TRUE;
src_ipif = ipif_select_source_v6(dst_ill, &v6gw,
- RESTRICT_TO_GROUP, IPV6_PREFER_SRC_DEFAULT, zoneid);
+ B_TRUE, IPV6_PREFER_SRC_DEFAULT, zoneid);
if (src_ipif != NULL)
ire_marks |= IRE_MARK_USESRC_CHECK;
- } else {
+ } else if (src_ipif == NULL) {
if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) {
/*
* Check that the ipif matching the requested
@@ -4732,14 +4545,9 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
NULL, NULL, NULL, NULL, ipst);
}
if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) {
- uint_t restrict_ill = RESTRICT_TO_NONE;
-
- if (ip6i_present && ((ip6i_t *)ip6h)->ip6i_flags
- & IP6I_ATTACH_IF)
- restrict_ill = RESTRICT_TO_ILL;
ip6_asp_table_held = B_TRUE;
src_ipif = ipif_select_source_v6(dst_ill,
- v6dstp, restrict_ill,
+ v6dstp, B_FALSE,
IPV6_PREFER_SRC_DEFAULT, zoneid);
if (src_ipif != NULL)
ire_marks |= IRE_MARK_USESRC_CHECK;
@@ -4750,7 +4558,7 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
if (ip_debug > 2) {
/* ip1dbg */
pr_addr_dbg("ip_newroute_v6: no src for "
- "dst %s\n, ", AF_INET6, v6dstp);
+ "dst %s\n", AF_INET6, v6dstp);
printf("ip_newroute_v6: interface name %s\n",
dst_ill->ill_name);
}
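One recurring mechanical change in these hunks is the third argument of ipif_select_source_v6(): every former RESTRICT_TO_GROUP call site now passes B_TRUE, while the RESTRICT_TO_NONE and RESTRICT_TO_ILL sites pass B_FALSE, so the restriction appears to have collapsed into a single boolean. A hedged sketch of a call under that reading (the parameter's actual name is not shown in this diff; dst_ill, v6dstp, zoneid are placeholders):

/*
 * Sketch (inferred from the call sites above): select a source address
 * for traffic leaving `dst_ill' toward `v6dstp'.  B_TRUE would restrict
 * the choice to dst_ill's IPMP group; B_FALSE allows any interface.
 */
src_ipif = ipif_select_source_v6(dst_ill, v6dstp, B_FALSE,
    IPV6_PREFER_SRC_DEFAULT, zoneid);
if (src_ipif == NULL) {
	/* no usable source address; caller bails out (icmp_err_ret) */
}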
@@ -4837,14 +4645,7 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
"ire_ihandle_lookup_offlink_v6 failed\n"));
goto icmp_err_ret;
}
- /*
- * Assume DL_UNITDATA_REQ is same for all physical
- * interfaces in the ifgrp. If it isn't, this code will
- * have to be seriously rewhacked to allow the
- * fastpath probing (such that I cache the link
- * header in the IRE_CACHE) to work over ifgrps.
- * We have what we need to build an IRE_CACHE.
- */
+
/*
* Note: the new ire inherits RTF_SETSRC
* and RTF_MULTIRT to propagate these flags from prefix
@@ -5659,24 +5460,22 @@ icmp_err_ret:
*/
void
ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
- in6_addr_t v6dst, int unspec_src, zoneid_t zoneid)
+ const in6_addr_t *v6dstp, const in6_addr_t *v6srcp, int unspec_src,
+ zoneid_t zoneid)
{
ire_t *ire = NULL;
ipif_t *src_ipif = NULL;
int err = 0;
ill_t *dst_ill = NULL;
ire_t *save_ire;
- ushort_t ire_marks = 0;
ipsec_out_t *io;
- ill_t *attach_ill = NULL;
ill_t *ill;
- ip6_t *ip6h;
mblk_t *first_mp;
- boolean_t ip6i_present;
ire_t *fire = NULL;
mblk_t *copy_mp = NULL;
+ const in6_addr_t *ire_v6srcp;
+ boolean_t probe = B_FALSE;
boolean_t multirt_resolve_next;
- in6_addr_t *v6dstp = &v6dst;
boolean_t ipif_held = B_FALSE;
boolean_t ill_held = B_FALSE;
boolean_t ip6_asp_table_held = B_FALSE;
@@ -5728,35 +5527,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
if (!(ill->ill_flags & ILLF_MULTICAST)) {
goto err_ret;
}
- /*
- * If this end point is bound to IPIF_NOFAILOVER, set bnf_ill
- * and bind_to_nofailover B_TRUE. We can't use conn to determine
- * as it could be NULL.
- *
- * This information can appear either in an ip6i_t or an
- * IPSEC_OUT message.
- */
- ip6h = (ip6_t *)mp->b_rptr;
- ip6i_present = (ip6h->ip6_nxt == IPPROTO_RAW);
- if (ip6i_present || (io != NULL && io->ipsec_out_attach_if)) {
- if (!ip6i_present ||
- ((ip6i_t *)ip6h)->ip6i_flags & IP6I_ATTACH_IF) {
- attach_ill = ip_grab_attach_ill(ill, first_mp,
- (ip6i_present ?
- ((ip6i_t *)ip6h)->ip6i_ifindex :
- io->ipsec_out_ill_index), B_TRUE, ipst);
- /* Failure case frees things for us. */
- if (attach_ill == NULL)
- return;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(attach_ill))
- ire_marks = IRE_MARK_HIDDEN;
- }
- }
/*
* We check if an IRE_OFFSUBNET for the addr that goes through
@@ -5770,76 +5540,93 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
(void *)ipif, ntohl(V4_PART_OF_V6((*v6dstp))),
(void *)fire));
+ ASSERT(src_ipif == NULL);
+
/*
- * If the application specified the ill (ifindex), we still
- * load spread. Only if the packets needs to go out specifically
- * on a given ill e.g. binding to IPIF_NOFAILOVER address or
- * IPV6_BOUND_PIF, or there is a parent ire entry that specified
- * multirouting, then we don't try to use a different ill for
- * load spreading.
+ * Because nce_xmit() calls ip_output_v6() and NCEs are always
+ * tied to the underlying interface, IS_UNDER_IPMP() may be
+ * true even when building IREs that will be used for data
+ * traffic. As such, see if the packet's source address is a
+ * test address, and if so use that test address's ipif for
+ * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in
+ * ire_add_v6() can work properly.
+ */
+ if (IS_UNDER_IPMP(ill))
+ probe = ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif);
+
+ /*
+ * Determine the outbound (destination) ill for this route.
+ * If IPMP is not in use, that's the same as our ill. If IPMP
+ * is in-use and we're on the IPMP interface, or we're on an
+ * underlying ill but sending data traffic, use a suitable
+ * destination ill from the group. The latter case covers a
+ * subtle edge condition with multicast: when we bring up an
+ * IPv6 data address, we will create an NCE on an underlying
+	 * interface, and send solicitations to ff02::1, which would
+ * take us through here, and cause us to create an IRE for
+ * ff02::1. To meet our defined semantics for multicast (and
+ * ensure there aren't unexpected echoes), that IRE needs to
+ * use the IPMP group's nominated multicast interface.
+ *
+ * Note: the source ipif is determined by source address
+ * selection later.
*/
- if (attach_ill == NULL) {
- /*
- * If the interface belongs to an interface group,
- * make sure the next possible interface in the group
- * is used. This encourages load spreading among peers
- * in an interface group.
- *
- * Note: While we pick a dst_ill we are really only
- * interested in the ill for load spreading. The source
- * ipif is determined by source address selection below.
- */
- if ((fire != NULL) && (fire->ire_flags & RTF_MULTIRT)) {
- dst_ill = ipif->ipif_ill;
- /* For uniformity do a refhold */
- ill_refhold(dst_ill);
+ if (IS_IPMP(ill) || (IS_UNDER_IPMP(ill) && !probe)) {
+ ill_t *ipmp_ill;
+ ipmp_illgrp_t *illg;
+
+ if (IS_UNDER_IPMP(ill)) {
+ ipmp_ill = ipmp_ill_hold_ipmp_ill(ill);
} else {
- /* refheld by ip_newroute_get_dst_ill_v6 */
- dst_ill =
- ip_newroute_get_dst_ill_v6(ipif->ipif_ill);
+ ipmp_ill = ill;
+ ill_refhold(ipmp_ill); /* for symmetry */
}
- if (dst_ill == NULL) {
- if (ip_debug > 2) {
- pr_addr_dbg("ip_newroute_ipif_v6: "
- "no dst ill for dst %s\n",
- AF_INET6, v6dstp);
- }
+
+ if (ipmp_ill == NULL)
goto err_ret;
- }
+
+ illg = ipmp_ill->ill_grp;
+ if (IN6_IS_ADDR_MULTICAST(v6dstp))
+ dst_ill = ipmp_illgrp_hold_cast_ill(illg);
+ else
+ dst_ill = ipmp_illgrp_hold_next_ill(illg);
+
+ ill_refrele(ipmp_ill);
} else {
- dst_ill = ipif->ipif_ill;
- /*
- * ip_wput_v6 passes the right ipif for IPIF_NOFAILOVER
- * and IPV6_BOUND_PIF case.
- */
- ASSERT(dst_ill == attach_ill);
- /* attach_ill is already refheld */
+ dst_ill = ill;
+ ill_refhold(dst_ill); /* for symmetry */
+ }
+
+ if (dst_ill == NULL) {
+ if (ip_debug > 2) {
+ pr_addr_dbg("ip_newroute_ipif_v6: "
+ "no dst ill for dst %s\n",
+ AF_INET6, v6dstp);
+ }
+ goto err_ret;
}
+
/*
* Pick a source address which matches the scope of the
* destination address.
* For RTF_SETSRC routes, the source address is imposed by the
* parent ire (fire).
*/
- ASSERT(src_ipif == NULL);
- if ((fire != NULL) && (fire->ire_flags & RTF_SETSRC)) {
+
+ if (src_ipif == NULL && fire != NULL &&
+ (fire->ire_flags & RTF_SETSRC)) {
/*
* Check that the ipif matching the requested source
* address still exists.
*/
- src_ipif =
- ipif_lookup_addr_v6(&fire->ire_src_addr_v6,
+ src_ipif = ipif_lookup_addr_v6(&fire->ire_src_addr_v6,
NULL, zoneid, NULL, NULL, NULL, NULL, ipst);
}
- if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) {
- uint_t restrict_ill = RESTRICT_TO_NONE;
- if (ip6i_present && ((ip6i_t *)ip6h)->ip6i_flags
- & IP6I_ATTACH_IF)
- restrict_ill = RESTRICT_TO_ILL;
+ if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) {
ip6_asp_table_held = B_TRUE;
src_ipif = ipif_select_source_v6(dst_ill, v6dstp,
- restrict_ill, IPV6_PREFER_SRC_DEFAULT, zoneid);
+ B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid);
}
if (src_ipif == NULL) {
@@ -5847,16 +5634,20 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
if (ip_debug > 2) {
/* ip1dbg */
pr_addr_dbg("ip_newroute_ipif_v6: "
- "no src for dst %s\n,",
+ "no src for dst %s\n",
AF_INET6, v6dstp);
printf(" through interface %s\n",
dst_ill->ill_name);
}
goto err_ret;
}
+ ire_v6srcp = &ipv6_all_zeros;
src_ipif = ipif;
ipif_refhold(src_ipif);
+ } else {
+ ire_v6srcp = &src_ipif->ipif_v6src_addr;
}
+
ire = ipif_to_ire_v6(ipif);
if (ire == NULL) {
if (ip_debug > 2) {
@@ -5903,7 +5694,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
}
}
- ASSERT((attach_ill == NULL) || (dst_ill == attach_ill));
switch (ire->ire_type) {
case IRE_IF_NORESOLVER: {
/*
@@ -5921,7 +5711,7 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
ire = ire_create_v6(
v6dstp, /* dest address */
&ipv6_all_ones, /* mask */
- &src_ipif->ipif_v6src_addr, /* source address */
+ ire_v6srcp, /* source address */
NULL, /* gateway address */
&save_ire->ire_max_frag,
NULL, /* no src nce */
@@ -5946,8 +5736,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
break;
}
- ire->ire_marks |= ire_marks;
-
err = ndp_noresolver(dst_ill, v6dstp);
if (err != 0) {
ire_refrele(save_ire);
@@ -6051,7 +5839,7 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
ire = ire_create_v6(
v6dstp, /* dest address */
&ipv6_all_ones, /* mask */
- &src_ipif->ipif_v6src_addr, /* source address */
+ ire_v6srcp, /* source address */
NULL, /* gateway address */
&save_ire->ire_max_frag,
NULL, /* src nce */
@@ -6076,8 +5864,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
break;
}
- ire->ire_marks |= ire_marks;
-
/* Resolve and add ire to the ctable */
err = ndp_resolver(dst_ill, v6dstp, first_mp, zoneid);
switch (err) {
@@ -6273,8 +6059,8 @@ err_ret:
ipif_refrele(ipif);
if (src_ipif != NULL)
ipif_refrele(src_ipif);
+
/* Multicast - no point in trying to generate ICMP error */
- ASSERT((attach_ill == NULL) || (dst_ill == attach_ill));
if (dst_ill != NULL) {
ill = dst_ill;
ill_held = B_TRUE;
@@ -6499,7 +6285,7 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
&ip6h->ip6_dst)) {
ipif = ipif_select_source_v6(
ill, &ip6h->ip6_src,
- RESTRICT_TO_GROUP,
+ B_TRUE,
IPV6_PREFER_SRC_DEFAULT,
ALL_ZONES);
if (ipif != NULL) {
@@ -7050,7 +6836,7 @@ ipsec_needs_processing_v6(mblk_t *mp, uint8_t *nexthdr)
*/
static boolean_t
ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present,
- ill_t *ill, mblk_t *hada_mp, zoneid_t zoneid)
+ ill_t *ill, ill_t *inill, mblk_t *hada_mp, zoneid_t zoneid)
{
mblk_t *mp;
uint8_t nexthdr;
@@ -7093,7 +6879,7 @@ ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present,
*/
ii = (ipsec_in_t *)first_mp->b_rptr;
ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index = ii->ipsec_in_ill_index;
+ ii->ipsec_in_rill_index = inill->ill_phyint->phyint_ifindex;
first_mp->b_cont = mp;
}
/*
@@ -7122,7 +6908,7 @@ ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present,
switch (ipsec_rc) {
case IPSEC_STATUS_SUCCESS:
/* we're done with IPsec processing, send it up */
- ip_fanout_proto_again(first_mp, ill, ill, NULL);
+ ip_fanout_proto_again(first_mp, ill, inill, NULL);
break;
case IPSEC_STATUS_FAILED:
BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards);
@@ -7225,7 +7011,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
ip6_hbh_t *hbhhdr;
boolean_t ll_multicast = (flags & IP6_IN_LLMCAST);
conn_t *connp;
- ilm_t *ilm;
uint32_t ports;
zoneid_t zoneid = GLOBAL_ZONEID;
uint16_t hck_flags, reass_hck_flags;
@@ -7347,10 +7132,8 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
/*
* XXX TODO Give to mrouted to for multicast forwarding.
*/
- ILM_WALKER_HOLD(ill);
- ilm = ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, ALL_ZONES);
- ILM_WALKER_RELE(ill);
- if (ilm == NULL) {
+ if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE,
+ ALL_ZONES) == NULL) {
if (ip_debug > 3) {
/* ip2dbg */
pr_addr_dbg("ip_rput_data_v6: got mcast packet"
@@ -7405,7 +7188,7 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst)) {
ire = ire_ctable_lookup_v6(&ip6h->ip6_dst, NULL,
IRE_CACHE|IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL,
- MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst);
+ MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
} else {
ire = ire_cache_lookup_v6(&ip6h->ip6_dst, ALL_ZONES,
MBLK_GETLABEL(mp), ipst);
@@ -7466,9 +7249,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
}
/* we have a matching IRE */
if (ire->ire_stq != NULL) {
- ill_group_t *ill_group;
- ill_group_t *ire_group;
-
/*
* To be quicker, we may wish not to chase pointers
* (ire->ire_ipif->ipif_ill...) and instead store the
@@ -7483,7 +7263,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
no_forward = ((ill->ill_flags &
ire->ire_ipif->ipif_ill->ill_flags & ILLF_ROUTER) == 0);
-
ASSERT(first_mp == mp);
/*
* This ire has a send-to queue - forward the packet.
@@ -7568,10 +7347,8 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
* we're forwarding onto the same link), conditionally send
* a redirect message.
*/
- ill_group = ill->ill_group;
- ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group;
- if (ire->ire_rfq != q && (ill_group == NULL ||
- ill_group != ire_group)) {
+ if (ire->ire_rfq != q &&
+ !IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) {
if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst) ||
IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) {
BUMP_MIB(ill->ill_ip_mib,
@@ -8006,7 +7783,10 @@ tcp_fanout:
* where there is no conn.
*/
if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
- ASSERT(!IS_LOOPBACK((ill)));
+ ilm_t *ilm;
+ ilm_walker_t ilw;
+
+ ASSERT(!IS_LOOPBACK(ill));
/*
* In the multicast case, applications may have
* joined the group from different zones, so we
@@ -8015,32 +7795,32 @@ tcp_fanout:
* structures (ilm) on the receive ill and send
* a copy of the packet up each matching one.
*/
- ILM_WALKER_HOLD(ill);
- for (ilm = ill->ill_ilm; ilm != NULL;
- ilm = ilm->ilm_next) {
- if (ilm->ilm_flags & ILM_DELETED)
- continue;
+ ilm = ilm_walker_start(&ilw, inill);
+ for (; ilm != NULL;
+ ilm = ilm_walker_step(&ilw, ilm)) {
if (!IN6_ARE_ADDR_EQUAL(
&ilm->ilm_v6addr, &ip6h->ip6_dst))
continue;
- if (!ipif_lookup_zoneid(ill,
- ilm->ilm_zoneid, IPIF_UP, NULL))
+ if (!ipif_lookup_zoneid(
+ ilw.ilw_walk_ill, ilm->ilm_zoneid,
+ IPIF_UP, NULL))
continue;
first_mp1 = ip_copymsg(first_mp);
if (first_mp1 == NULL)
continue;
- icmp_inbound_v6(q, first_mp1, ill,
+ icmp_inbound_v6(q, first_mp1,
+ ilw.ilw_walk_ill, inill,
hdr_len, mctl_present, 0,
ilm->ilm_zoneid, dl_mp);
}
- ILM_WALKER_RELE(ill);
+ ilm_walker_finish(&ilw);
} else {
first_mp1 = ip_copymsg(first_mp);
if (first_mp1 != NULL)
icmp_inbound_v6(q, first_mp1, ill,
- hdr_len, mctl_present, 0, zoneid,
- dl_mp);
+ inill, hdr_len, mctl_present, 0,
+ zoneid, dl_mp);
}
}
/* FALLTHRU */
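The hunk above replaces the open-coded ILM_WALKER_HOLD()/ILM_WALKER_RELE() loop with the new ilm_walker_start()/ilm_walker_step()/ilm_walker_finish() interface; ip_wput_local_v6() is converted the same way later in this patch. A minimal sketch of the new idiom (names taken from these hunks; exactly which ills the walker visits is not spelled out in this diff, though the use of ilw.ilw_walk_ill suggests it can step across related interfaces; `ill' and `group' are placeholders):

/*
 * Sketch: walk the multicast memberships (ilms) reachable from `ill'
 * and hand a copy of the packet to each zone joined to `group'.
 */
ilm_t *ilm;
ilm_walker_t ilw;

for (ilm = ilm_walker_start(&ilw, ill); ilm != NULL;
    ilm = ilm_walker_step(&ilw, ilm)) {
	if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, group))
		continue;
	/* ilw.ilw_walk_ill is the ill this membership lives on */
	if (!ipif_lookup_zoneid(ilw.ilw_walk_ill, ilm->ilm_zoneid,
	    IPIF_UP, NULL))
		continue;
	/* ... copy and deliver for ilm->ilm_zoneid ... */
}
ilm_walker_finish(&ilw);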
@@ -8082,7 +7862,7 @@ tcp_fanout:
/* Check if AH is present. */
if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill,
- hada_mp, zoneid)) {
+ inill, hada_mp, zoneid)) {
ip0dbg(("dst early hada drop\n"));
return;
}
@@ -8206,7 +7986,7 @@ tcp_fanout:
/* Restore the flags */
DB_CKSUMFLAGS(mp) = hck_flags;
- mp = ip_rput_frag_v6(q, mp, ip6h, fraghdr,
+ mp = ip_rput_frag_v6(ill, inill, mp, ip6h, fraghdr,
remlen - used, &prev_nexthdr_offset,
&reass_sum, &reass_hck_flags);
if (mp == NULL) {
@@ -8249,7 +8029,7 @@ tcp_fanout:
/* Check if AH is present. */
if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill,
- hada_mp, zoneid)) {
+ inill, hada_mp, zoneid)) {
ip0dbg(("routing hada drop\n"));
return;
}
@@ -8322,7 +8102,7 @@ tcp_fanout:
ii->ipsec_in_ill_index =
ill->ill_phyint->phyint_ifindex;
ii->ipsec_in_rill_index =
- ii->ipsec_in_ill_index;
+ inill->ill_phyint->phyint_ifindex;
first_mp->b_cont = mp;
/*
* Cache hardware acceleration info.
@@ -8480,11 +8260,10 @@ hada_drop:
* nexthdr field when reassembly completes.
*/
static mblk_t *
-ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
+ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset,
uint32_t *cksum_val, uint16_t *cksum_flags)
{
- ill_t *ill = (ill_t *)q->q_ptr;
uint32_t ident = ntohl(fraghdr->ip6f_ident);
uint16_t offset;
boolean_t more_frags;
@@ -8518,8 +8297,8 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
* addition, checksum offload support for IP fragments carrying
* UDP payload is commonly implemented across network adapters.
*/
- ASSERT(ill != NULL);
- if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) &&
+ ASSERT(inill != NULL);
+ if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(inill) &&
(DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
mblk_t *mp1 = mp->b_cont;
int32_t len;
@@ -8581,7 +8360,7 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
freemsg(mp);
return (NULL);
}
- icmp_param_problem_v6(WR(q), mp, ICMP6_PARAMPROB_HEADER,
+ icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER,
(uint32_t)((char *)&ip6h->ip6_plen -
(char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst);
return (NULL);
@@ -8607,7 +8386,7 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
freemsg(mp);
return (NULL);
}
- icmp_param_problem_v6(WR(q), mp, ICMP6_PARAMPROB_HEADER,
+ icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER,
(uint32_t)((char *)&fraghdr->ip6f_offlg -
(char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst);
return (NULL);
@@ -9204,16 +8983,14 @@ ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst)
* The routine can handle an ICMPv6 header that is not in the first mblk.
*
* The order to determine the outgoing interface is as follows:
- * 1. IPV6_BOUND_PIF is set, use that ill (conn_outgoing_pill)
- * 2. If conn_nofailover_ill is set then use that ill.
- * 3. If an ip6i_t with IP6I_IFINDEX set then use that ill.
- * 4. If q is an ill queue and (link local or multicast destination) then
+ * 1. If an ip6i_t with IP6I_IFINDEX set then use that ill.
+ * 2. If q is an ill queue and (link local or multicast destination) then
* use that ill.
- * 5. If IPV6_BOUND_IF has been set use that ill.
- * 6. For multicast: if IPV6_MULTICAST_IF has been set use it. Otherwise
+ * 3. If IPV6_BOUND_IF has been set use that ill.
+ * 4. For multicast: if IPV6_MULTICAST_IF has been set use it. Otherwise
* look for the best IRE match for the unspecified group to determine
* the ill.
- * 7. For unicast: Just do an IRE lookup for the best match.
+ * 5. For unicast: Just do an IRE lookup for the best match.
*
* arg2 is always a queue_t *.
* When that queue is an ill_t (i.e. q_next != NULL), then arg must be
@@ -9238,12 +9015,10 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
int unspec_src;
boolean_t do_outrequests; /* Increment OutRequests? */
mib2_ipIfStatsEntry_t *mibptr;
- int match_flags = MATCH_IRE_ILL_GROUP;
- boolean_t attach_if = B_FALSE;
+ int match_flags = MATCH_IRE_ILL;
mblk_t *first_mp;
boolean_t mctl_present;
ipsec_out_t *io;
- boolean_t drop_if_delayed = B_FALSE;
boolean_t multirt_need_resolve = B_FALSE;
mblk_t *copy_mp = NULL;
int err = 0;
@@ -9574,16 +9349,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
*/
mp->b_rptr = (uchar_t *)ip6h;
- /*
- * IP6I_ATTACH_IF is set in this function when we had a
- * conn and it was either bound to the IPFF_NOFAILOVER address
- * or IPV6_BOUND_PIF was set. These options override other
- * options that set the ifindex. We come here with
- * IP6I_ATTACH_IF set when we can't find the ire and
- * ip_newroute_v6 is feeding the packet for second time.
- */
- if ((ip6i->ip6i_flags & IP6I_IFINDEX) ||
- (ip6i->ip6i_flags & IP6I_ATTACH_IF)) {
+ if (ip6i->ip6i_flags & IP6I_IFINDEX) {
ASSERT(ip6i->ip6i_ifindex != 0);
if (ill != NULL)
ill_refrele(ill);
@@ -9603,33 +9369,13 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
return;
}
mibptr = ill->ill_ip_mib;
- if (ip6i->ip6i_flags & IP6I_IFINDEX) {
- /*
- * Preserve the index so that when we return
- * from IPSEC processing, we know where to
- * send the packet.
- */
- if (mctl_present) {
- ASSERT(io != NULL);
- io->ipsec_out_ill_index =
- ip6i->ip6i_ifindex;
- }
- }
- if (ip6i->ip6i_flags & IP6I_ATTACH_IF) {
- /*
- * This is a multipathing probe packet that has
- * been delayed in ND resolution. Drop the
- * packet for the reasons mentioned in
- * nce_queue_mp()
- */
- if ((ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) &&
- (ip6i->ip6i_flags & IP6I_ND_DELAYED)) {
- freemsg(first_mp);
- ill_refrele(ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
+ /*
+ * Preserve the index so that when we return from
+ * IPSEC processing, we know where to send the packet.
+ */
+ if (mctl_present) {
+ ASSERT(io != NULL);
+ io->ipsec_out_ill_index = ip6i->ip6i_ifindex;
}
}
if (ip6i->ip6i_flags & IP6I_VERIFY_SRC) {
@@ -9698,114 +9444,20 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
if (IN6_IS_ADDR_MULTICAST(v6dstp))
goto ipv6multicast;
- /* 1. IPV6_BOUND_PIF takes precedence over all the ifindex settings. */
- if (connp != NULL && connp->conn_outgoing_pill != NULL) {
- ill_t *conn_outgoing_pill;
-
- conn_outgoing_pill = conn_get_held_ill(connp,
- &connp->conn_outgoing_pill, &err);
- if (err == ILL_LOOKUP_FAILED) {
- if (ill != NULL)
- ill_refrele(ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- return;
- }
- if (conn_outgoing_pill != NULL) {
- if (ill != NULL)
- ill_refrele(ill);
- ill = conn_outgoing_pill;
- attach_if = B_TRUE;
- match_flags = MATCH_IRE_ILL;
- mibptr = ill->ill_ip_mib;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill))
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- goto send_from_ill;
- }
- }
-
- /* 2. If ipc_nofailover_ill is set then use that ill. */
- if (connp != NULL && connp->conn_nofailover_ill != NULL) {
- ill_t *conn_nofailover_ill;
-
- conn_nofailover_ill = conn_get_held_ill(connp,
- &connp->conn_nofailover_ill, &err);
- if (err == ILL_LOOKUP_FAILED) {
- if (ill != NULL)
- ill_refrele(ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- return;
- }
- if (conn_nofailover_ill != NULL) {
- if (ill != NULL)
- ill_refrele(ill);
- ill = conn_nofailover_ill;
- attach_if = B_TRUE;
- /*
- * Assumes that ipc_nofailover_ill is used only for
- * multipathing probe packets. These packets are better
- * dropped, if they are delayed in ND resolution, for
- * the reasons described in nce_queue_mp().
- * IP6I_DROP_IFDELAYED will be set later on in this
- * function for this packet.
- */
- drop_if_delayed = B_TRUE;
- match_flags = MATCH_IRE_ILL;
- mibptr = ill->ill_ip_mib;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill))
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- goto send_from_ill;
- }
- }
-
- /*
- * Redo 1. If we did not find an IRE_CACHE the first time, we should
- * have an ip6i_t with IP6I_ATTACH_IF if IPV6_BOUND_PIF or
- * bind to the IPIF_NOFAILOVER address was used on this endpoint.
- */
- if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_ATTACH_IF)) {
- ASSERT(ip6i->ip6i_ifindex != 0);
- attach_if = B_TRUE;
- ASSERT(ill != NULL);
- match_flags = MATCH_IRE_ILL;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill))
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- goto send_from_ill;
- }
-
- /* 3. If an ip6i_t with IP6I_IFINDEX set then use that ill. */
+ /* 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. */
if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) {
ASSERT(ill != NULL);
goto send_from_ill;
}
/*
- * 4. If q is an ill queue and (link local or multicast destination)
+ * 2. If q is an ill queue and there's a link-local destination
* then use that ill.
*/
- if (ill != NULL && IN6_IS_ADDR_LINKLOCAL(v6dstp)) {
+ if (ill != NULL && IN6_IS_ADDR_LINKLOCAL(v6dstp))
goto send_from_ill;
- }
- /* 5. If IPV6_BOUND_IF has been set use that ill. */
+ /* 3. If IPV6_BOUND_IF has been set use that ill. */
if (connp != NULL && connp->conn_outgoing_ill != NULL) {
ill_t *conn_outgoing_ill;
@@ -9827,7 +9479,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
}
/*
- * 6. For unicast: Just do an IRE lookup for the best match.
+ * 4. For unicast: Just do an IRE lookup for the best match.
* If we get here for a link-local address it is rather random
* what interface we pick on a multihomed host.
* *If* there is an IRE_CACHE (and the link-local address
@@ -9913,7 +9565,6 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
}
BUMP_MIB(mibptr, ipIfStatsHCOutRequests);
}
- ASSERT(!attach_if);
/*
* Check if the ire has the RTF_MULTIRT flag, inherited
@@ -9966,7 +9617,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
}
}
ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request,
- connp, caller, 0, ip6i_flags, zoneid);
+ connp, caller, ip6i_flags, zoneid);
if (need_decref) {
CONN_DEC_REF(connp);
connp = NULL;
@@ -10086,9 +9737,6 @@ ipv6multicast:
ip2dbg(("ip_wput_v6: multicast\n"));
/*
- * 1. IPV6_BOUND_PIF takes precedence over all the ifindex settings
- * 2. If conn_nofailover_ill is set then use that ill.
- *
* Hold the conn_lock till we refhold the ill of interest that is
* pointed to from the conn. Since we cannot do an ill/ipif_refrele
* while holding any locks, postpone the refrele until after the
@@ -10100,79 +9748,12 @@ ipv6multicast:
} else {
conn_lock_held = B_FALSE;
}
- if (connp != NULL && connp->conn_outgoing_pill != NULL) {
- err = ill_check_and_refhold(connp->conn_outgoing_pill);
- if (err == ILL_LOOKUP_FAILED) {
- ip1dbg(("ip_output_v6: multicast"
- " conn_outgoing_pill no ipif\n"));
-multicast_discard:
- ASSERT(saved_ill == NULL);
- if (conn_lock_held)
- mutex_exit(&connp->conn_lock);
- if (ill != NULL)
- ill_refrele(ill);
- freemsg(first_mp);
- if (do_outrequests)
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
- saved_ill = ill;
- ill = connp->conn_outgoing_pill;
- attach_if = B_TRUE;
- match_flags = MATCH_IRE_ILL;
- mibptr = ill->ill_ip_mib;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill))
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- } else if (connp != NULL && connp->conn_nofailover_ill != NULL) {
- err = ill_check_and_refhold(connp->conn_nofailover_ill);
- if (err == ILL_LOOKUP_FAILED) {
- ip1dbg(("ip_output_v6: multicast"
- " conn_nofailover_ill no ipif\n"));
- goto multicast_discard;
- }
- saved_ill = ill;
- ill = connp->conn_nofailover_ill;
- attach_if = B_TRUE;
- match_flags = MATCH_IRE_ILL;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill))
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- } else if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_ATTACH_IF)) {
- /*
- * Redo 1. If we did not find an IRE_CACHE the first time,
- * we should have an ip6i_t with IP6I_ATTACH_IF if
- * IPV6_BOUND_PIF or bind to the IPIF_NOFAILOVER address was
- * used on this endpoint.
- */
- ASSERT(ip6i->ip6i_ifindex != 0);
- attach_if = B_TRUE;
- ASSERT(ill != NULL);
- match_flags = MATCH_IRE_ILL;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill))
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- } else if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) {
- /* 3. If an ip6i_t with IP6I_IFINDEX set then use that ill. */
-
+ if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) {
+ /* 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. */
ASSERT(ill != NULL);
} else if (ill != NULL) {
/*
- * 4. If q is an ill queue and (link local or multicast
+ * 2. If q is an ill queue and (link local or multicast
* destination) then use that ill.
* We don't need the ipif initialization here.
* This useless assert below is just to prevent lint from
@@ -10181,9 +9762,9 @@ multicast_discard:
ASSERT(ill != NULL);
} else if (connp != NULL) {
/*
- * 5. If IPV6_BOUND_IF has been set use that ill.
+ * 3. If IPV6_BOUND_IF has been set use that ill.
*
- * 6. For multicast: if IPV6_MULTICAST_IF has been set use it.
+ * 4. For multicast: if IPV6_MULTICAST_IF has been set use it.
* Otherwise look for the best IRE match for the unspecified
* group to determine the ill.
*
@@ -10198,7 +9779,18 @@ multicast_discard:
if (err == ILL_LOOKUP_FAILED) {
ip1dbg(("ip_output_v6: multicast"
" conn_outgoing_ill no ipif\n"));
- goto multicast_discard;
+multicast_discard:
+ ASSERT(saved_ill == NULL);
+ if (conn_lock_held)
+ mutex_exit(&connp->conn_lock);
+ if (ill != NULL)
+ ill_refrele(ill);
+ freemsg(first_mp);
+ if (do_outrequests)
+ BUMP_MIB(mibptr, ipIfStatsOutDiscards);
+ if (need_decref)
+ CONN_DEC_REF(connp);
+ return;
}
ill = connp->conn_outgoing_ill;
} else if (connp->conn_multicast_ill != NULL) {
@@ -10239,8 +9831,6 @@ multicast_discard:
*/
mutex_enter(&connp->conn_lock);
connp->conn_multicast_ill = ill;
- connp->conn_orig_multicast_ifindex =
- ill->ill_phyint->phyint_ifindex;
mutex_exit(&connp->conn_lock);
}
}
@@ -10307,11 +9897,55 @@ multicast_discard:
send_from_ill:
ASSERT(ill != NULL);
ASSERT(mibptr == ill->ill_ip_mib);
+
if (do_outrequests) {
BUMP_MIB(mibptr, ipIfStatsHCOutRequests);
do_outrequests = B_FALSE;
}
+ /*
+ * Because nce_xmit() calls ip_output_v6() and NCEs are always tied to
+ * an underlying interface, IS_UNDER_IPMP() may be true even when
+ * building IREs that will be used for data traffic. As such, use the
+ * packet's source address to determine whether the traffic is test
+ * traffic, and set MATCH_IRE_MARK_TESTHIDDEN if so.
+ *
+ * Separately, we also need to mark probe packets so that ND can
+ * process them specially; see the comments in nce_queue_mp_common().
+ */
+ if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
+ ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL)) {
+ if (ip6i == NULL) {
+ if ((mp = ip_add_info_v6(mp, NULL, v6dstp)) == NULL) {
+ if (mctl_present)
+ freeb(first_mp);
+ goto discard;
+ }
+
+ if (mctl_present)
+ first_mp->b_cont = mp;
+ else
+ first_mp = mp;
+
+ /* ndp_resolver() expects a pulled-up message */
+ if (MBLKL(mp) == sizeof (ip6i_t) &&
+ pullupmsg(mp, -1) == 0) {
+ ip1dbg(("ip_output_v6: pullupmsg failed\n"));
+discard: BUMP_MIB(mibptr, ipIfStatsOutDiscards);
+ ill_refrele(ill);
+ if (need_decref)
+ CONN_DEC_REF(connp);
+ return;
+ }
+ ip6i = (ip6i_t *)mp->b_rptr;
+ ip6h = (ip6_t *)&ip6i[1];
+ v6dstp = &ip6h->ip6_dst;
+ mp->b_rptr = (uchar_t *)ip6h; /* rewound below */
+ }
+ ip6i->ip6i_flags |= IP6I_IPMP_PROBE;
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
+ }
+
if (io != NULL)
io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
@@ -10390,9 +10024,7 @@ send_from_ill:
ill->ill_name, (void *)ire,
ill->ill_phyint->phyint_ifindex));
ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request,
- connp, caller,
- (attach_if ? ill->ill_phyint->phyint_ifindex : 0),
- ip6i_flags, zoneid);
+ connp, caller, ip6i_flags, zoneid);
ire_refrele(ire);
if (need_decref) {
CONN_DEC_REF(connp);
@@ -10422,7 +10054,8 @@ send_from_ill:
return;
}
ip_newroute_ipif_v6(q, copy_mp, ipif,
- ip6h->ip6_dst, unspec_src, zoneid);
+ &ip6h->ip6_dst, &ip6h->ip6_src, unspec_src,
+ zoneid);
ipif_refrele(ipif);
} else {
ip_newroute_v6(q, copy_mp, &ip6h->ip6_dst,
@@ -10440,12 +10073,11 @@ send_from_ill:
/* Update rptr if there was an ip6i_t header. */
if (ip6i != NULL)
mp->b_rptr -= sizeof (ip6i_t);
- if (unspec_src || attach_if) {
+ if (unspec_src) {
if (ip6i == NULL) {
/*
* Add ip6i_t header to carry unspec_src
- * or attach_if until the packet comes back in
- * ip_wput_v6.
+ * until the packet comes back in ip_wput_v6.
*/
if (mctl_present) {
first_mp->b_cont =
@@ -10481,28 +10113,15 @@ send_from_ill:
ip6h = (ip6_t *)&ip6i[1];
v6dstp = &ip6h->ip6_dst;
}
- if (unspec_src)
- ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
- if (attach_if) {
- /*
- * Bind to nofailover/BOUND_PIF overrides ifindex.
- */
- ip6i->ip6i_flags |= IP6I_ATTACH_IF;
- ip6i->ip6i_flags &= ~IP6I_IFINDEX;
- ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
- if (drop_if_delayed) {
- /* This is a multipathing probe packet */
- ip6i->ip6i_flags |= IP6I_DROP_IFDELAYED;
- }
- }
+ ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
if (mctl_present) {
ASSERT(io != NULL);
io->ipsec_out_unspec_src = unspec_src;
}
}
if (IN6_IS_ADDR_MULTICAST(v6dstp)) {
- ip_newroute_ipif_v6(q, first_mp, ill->ill_ipif, *v6dstp,
- unspec_src, zoneid);
+ ip_newroute_ipif_v6(q, first_mp, ill->ill_ipif, v6dstp,
+ &ip6h->ip6_src, unspec_src, zoneid);
} else {
ip_newroute_v6(q, first_mp, v6dstp, &ip6h->ip6_src, ill,
zoneid, ipst);
@@ -10544,14 +10163,6 @@ ip_wput_v6(queue_t *q, mblk_t *mp)
ip_output_v6(GLOBAL_ZONEID, mp, q, IP_WPUT);
}
-static void
-ipsec_out_attach_if(ipsec_out_t *io, int attach_index)
-{
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- io->ipsec_out_attach_if = B_TRUE;
- io->ipsec_out_ill_index = attach_index;
-}
-
/*
* NULL send-to queue - packet is to be delivered locally.
*/
@@ -10731,6 +10342,8 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp,
*/
if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
!IS_LOOPBACK(ill)) {
+ ilm_walker_t ilw;
+
/*
* In the multicast case, applications may have
* joined the group from different zones, so we
@@ -10742,11 +10355,9 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp,
* on the loopback interface (PHYI_LOOPBACK flag
* set) as they must stay in the sender's zone.
*/
- ILM_WALKER_HOLD(ill);
- for (ilm = ill->ill_ilm; ilm != NULL;
- ilm = ilm->ilm_next) {
- if (ilm->ilm_flags & ILM_DELETED)
- continue;
+ ilm = ilm_walker_start(&ilw, ill);
+ for (; ilm != NULL;
+ ilm = ilm_walker_step(&ilw, ilm)) {
if (!IN6_ARE_ADDR_EQUAL(
&ilm->ilm_v6addr, &ip6h->ip6_dst))
continue;
@@ -10754,23 +10365,24 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp,
IP_FF_NO_MCAST_LOOP) &&
ilm->ilm_zoneid == ire->ire_zoneid)
continue;
- if (!ipif_lookup_zoneid(ill,
- ilm->ilm_zoneid, IPIF_UP, NULL))
+ if (!ipif_lookup_zoneid(
+ ilw.ilw_walk_ill, ilm->ilm_zoneid,
+ IPIF_UP, NULL))
continue;
first_mp1 = ip_copymsg(first_mp);
if (first_mp1 == NULL)
continue;
- icmp_inbound_v6(q, first_mp1, ill,
- hdr_length, mctl_present,
- IP6_NO_IPPOLICY, ilm->ilm_zoneid,
- NULL);
+ icmp_inbound_v6(q, first_mp1,
+ ilw.ilw_walk_ill, ill, hdr_length,
+ mctl_present, IP6_NO_IPPOLICY,
+ ilm->ilm_zoneid, NULL);
}
- ILM_WALKER_RELE(ill);
+ ilm_walker_finish(&ilw);
} else {
first_mp1 = ip_copymsg(first_mp);
if (first_mp1 != NULL)
- icmp_inbound_v6(q, first_mp1, ill,
+ icmp_inbound_v6(q, first_mp1, ill, ill,
hdr_length, mctl_present,
IP6_NO_IPPOLICY, ire->ire_zoneid,
NULL);
@@ -10823,8 +10435,7 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp,
*/
static void
ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
- int cksum_request, conn_t *connp, int caller, int attach_index, int flags,
- zoneid_t zoneid)
+ int cksum_request, conn_t *connp, int caller, int flags, zoneid_t zoneid)
{
ip6_t *ip6h;
uint8_t nexthdr;
@@ -10917,7 +10528,7 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
if (src_ire != NULL &&
!(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) &&
(!ipst->ips_ip_restrict_interzone_loopback ||
- ire_local_same_ill_group(ire, src_ire))) {
+ ire_local_same_lan(ire, src_ire))) {
if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
!unspec_src) {
ip6h->ip6_src = src_ire->ire_src_addr_v6;
@@ -10974,20 +10585,14 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
/*
* Select the source address using ipif_select_source_v6.
*/
- if (attach_index != 0) {
- ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst,
- RESTRICT_TO_ILL, IPV6_PREFER_SRC_DEFAULT, zoneid);
- } else {
- ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst,
- RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, zoneid);
- }
+ ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst, B_FALSE,
+ IPV6_PREFER_SRC_DEFAULT, zoneid);
if (ipif == NULL) {
if (ip_debug > 2) {
/* ip1dbg */
pr_addr_dbg("ip_wput_ire_v6: no src for "
- "dst %s\n, ", AF_INET6, &ip6h->ip6_dst);
- printf("ip_wput_ire_v6: interface name %s\n",
- ill->ill_name);
+ "dst %s\n", AF_INET6, &ip6h->ip6_dst);
+ printf("through interface %s\n", ill->ill_name);
}
freemsg(first_mp);
return;
@@ -10998,12 +10603,8 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
if ((connp != NULL && connp->conn_multicast_loop) ||
!IS_LOOPBACK(ill)) {
- ilm_t *ilm;
-
- ILM_WALKER_HOLD(ill);
- ilm = ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, ALL_ZONES);
- ILM_WALKER_RELE(ill);
- if (ilm != NULL) {
+ if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE,
+ ALL_ZONES) != NULL) {
mblk_t *nmp;
int fanout_flags = 0;
@@ -11417,8 +11018,6 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
}
/* Do IPSEC processing first */
if (mctl_present) {
- if (attach_index != 0)
- ipsec_out_attach_if(io, attach_index);
ipsec_out_process(q, first_mp, ire, ill_index);
return;
}
@@ -11456,8 +11055,6 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
max_frag, B_FALSE, B_TRUE, zoneid, ipst);
return;
}
- if (attach_index != 0)
- ipsec_out_attach_if(io, attach_index);
ipsec_out_process(q, first_mp, ire, ill_index);
return;
}
@@ -11948,8 +11545,8 @@ boolean_t
conn_wantpacket_v6(conn_t *connp, ill_t *ill, ip6_t *ip6h, int fanout_flags,
zoneid_t zoneid)
{
- ill_t *in_ill;
- boolean_t wantpacket = B_TRUE;
+ ill_t *bound_ill;
+ boolean_t wantpacket;
in6_addr_t *v6dst_ptr = &ip6h->ip6_dst;
in6_addr_t *v6src_ptr = &ip6h->ip6_src;
@@ -11958,42 +11555,16 @@ conn_wantpacket_v6(conn_t *connp, ill_t *ill, ip6_t *ip6h, int fanout_flags,
* unicast and multicast reception to conn_incoming_ill.
* conn_wantpacket_v6 is called both for unicast and
* multicast.
- *
- * 1) The unicast copy of the packet can come anywhere in
- * the ill group if it is part of the group. Thus, we
- * need to check to see whether the ill group matches
- * if in_ill is part of a group.
- *
- * 2) ip_rput does not suppress duplicate multicast packets.
- * If there are two interfaces in a ill group and we have
- * 2 applications (conns) joined a multicast group G on
- * both the interfaces, ilm_lookup_ill filter in ip_rput
- * will give us two packets because we join G on both the
- * interfaces rather than nominating just one interface
- * for receiving multicast like broadcast above. So,
- * we have to call ilg_lookup_ill to filter out duplicate
- * copies, if ill is part of a group, to supress duplicates.
*/
- in_ill = connp->conn_incoming_ill;
- if (in_ill != NULL) {
- mutex_enter(&connp->conn_lock);
- in_ill = connp->conn_incoming_ill;
- mutex_enter(&ill->ill_lock);
- /*
- * No IPMP, and the packet did not arrive on conn_incoming_ill
- * OR, IPMP in use and the packet arrived on an IPMP group
- * different from the conn_incoming_ill's IPMP group.
- * Reject the packet.
- */
- if ((in_ill->ill_group == NULL && in_ill != ill) ||
- (in_ill->ill_group != NULL &&
- in_ill->ill_group != ill->ill_group)) {
- wantpacket = B_FALSE;
+ bound_ill = connp->conn_incoming_ill;
+ if (bound_ill != NULL) {
+ if (IS_IPMP(bound_ill)) {
+ if (bound_ill->ill_grp != ill->ill_grp)
+ return (B_FALSE);
+ } else {
+ if (bound_ill != ill)
+ return (B_FALSE);
}
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- if (!wantpacket)
- return (B_FALSE);
}
if (connp->conn_multi_router)
@@ -12140,7 +11711,7 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp,
(IN6_ARE_ADDR_EQUAL(&first_ire->ire_addr_v6,
&ire->ire_addr_v6)) &&
!(first_ire->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)))
+ (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)))
break;
}
@@ -12204,8 +11775,7 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp,
&ire->ire_addr_v6))
continue;
if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED|
- IRE_MARK_HIDDEN))
+ IRE_MARK_CONDEMNED)
continue;
/* Got one */
@@ -13279,3 +12849,31 @@ ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah)
size += ehdrlen;
}
}
+
+/*
+ * Utility routine that checks if `v6srcp' is a valid address on underlying
+ * interface `ill'. If `ipifp' is non-NULL, it's set to a held ipif
+ * associated with `v6srcp' on success. NOTE: if this is not called from
+ * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the
+ * group during or after this lookup.
+ */
+static boolean_t
+ipif_lookup_testaddr_v6(ill_t *ill, const in6_addr_t *v6srcp, ipif_t **ipifp)
+{
+ ipif_t *ipif;
+
+ ipif = ipif_lookup_addr_exact_v6(v6srcp, ill, ill->ill_ipst);
+ if (ipif != NULL) {
+ if (ipifp != NULL)
+ *ipifp = ipif;
+ else
+ ipif_refrele(ipif);
+ return (B_TRUE);
+ }
+
+ if (ip_debug > 2) {
+ pr_addr_dbg("ipif_lookup_testaddr_v6: cannot find ipif for "
+ "src %s\n", AF_INET6, v6srcp);
+ }
+ return (B_FALSE);
+}
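A short caller-side sketch for the new helper, mirroring how the routines above use it (illustrative only; `ill' and `v6srcp' are placeholders, and releasing the held ipif is the caller's responsibility):

/*
 * Sketch: if the packet's source is a test address on underlying
 * interface `ill', take a hold on its ipif so the IRE can be built
 * against it; otherwise src_ipif stays NULL and normal source
 * selection runs.
 */
ipif_t *src_ipif = NULL;

if (IS_UNDER_IPMP(ill) &&
    ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif)) {
	/* probe traffic: src_ipif is held; ipif_refrele() when done */
} else {
	/* data traffic: fall through to ipif_select_source_v6() */
}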
diff --git a/usr/src/uts/common/inet/ip/ip6_if.c b/usr/src/uts/common/inet/ip/ip6_if.c
index 81447c2e30..c729118fec 100644
--- a/usr/src/uts/common/inet/ip/ip6_if.c
+++ b/usr/src/uts/common/inet/ip/ip6_if.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -53,7 +53,6 @@
#include <netinet/igmp_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
-#include <netinet/in.h>
#include <inet/common.h>
#include <inet/nd.h>
@@ -178,10 +177,12 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst,
} else if (IPIF_CAN_WAIT(ipif, q)) {
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ill->ill_lock);
rw_exit(&ipst->ips_ill_g_lock);
ipsq_enq(ipsq, q, mp, func, NEW_OP,
ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
RELEASE_CONN_LOCK(q);
if (error != NULL)
@@ -202,16 +203,12 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst,
}
/*
- * Look for an ipif with the specified address. For point-point links
- * we look for matches on either the destination address and the local
- * address, but we ignore the check on the local address if IPIF_UNNUMBERED
- * is set.
- * Matches on a specific ill if match_ill is set.
+ * Common function for ipif_lookup_addr_v6() and ipif_lookup_addr_exact_v6().
*/
-/* ARGSUSED */
-ipif_t *
-ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid,
- queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
+static ipif_t *
+ipif_lookup_addr_common_v6(const in6_addr_t *addr, ill_t *match_ill,
+ boolean_t match_illgrp, zoneid_t zoneid, queue_t *q, mblk_t *mp,
+ ipsq_func_t func, int *error, ip_stack_t *ipst)
{
ipif_t *ipif;
ill_t *ill;
@@ -230,7 +227,8 @@ ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid,
repeat:
ill = ILL_START_WALK_V6(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if (match_ill != NULL && ill != match_ill) {
+ if (match_ill != NULL && ill != match_ill &&
+ (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
continue;
}
GRAB_CONN_LOCK(q);
@@ -257,10 +255,12 @@ repeat:
} else if (IPIF_CAN_WAIT(ipif, q)) {
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ill->ill_lock);
rw_exit(&ipst->ips_ill_g_lock);
ipsq_enq(ipsq, q, mp, func, NEW_OP,
ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
RELEASE_CONN_LOCK(q);
if (error != NULL)
@@ -323,11 +323,41 @@ ip_addr_exists_v6(const in6_addr_t *addr, zoneid_t zoneid,
}
/*
+ * Lookup an ipif with the specified address. For point-to-point links we
+ * look for matches on either the destination address or the local address,
+ * but we skip the local address check if IPIF_UNNUMBERED is set. If the
+ * `match_ill' argument is non-NULL, the lookup is restricted to that ill
+ * (or illgrp if `match_ill' is in an IPMP group).
+ */
+ipif_t *
+ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid,
+ queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
+{
+ return (ipif_lookup_addr_common_v6(addr, match_ill, B_TRUE, zoneid, q,
+ mp, func, error, ipst));
+}
+
+/*
+ * Special abbreviated version of ipif_lookup_addr_v6() that doesn't match
+ * `match_ill' across the IPMP group. This function is only needed in some
+ * corner-cases; almost everything should use ipif_lookup_addr_v6().
+ */
+ipif_t *
+ipif_lookup_addr_exact_v6(const in6_addr_t *addr, ill_t *match_ill,
+ ip_stack_t *ipst)
+{
+ ASSERT(match_ill != NULL);
+ return (ipif_lookup_addr_common_v6(addr, match_ill, B_FALSE, ALL_ZONES,
+ NULL, NULL, NULL, NULL, ipst));
+}
+
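+/*
+ * A brief caller-side sketch contrasting the two lookups introduced
+ * above (both prototypes are taken from this hunk; `addr', `ill' and
+ * `ipst' are placeholders, and this is an illustration rather than a
+ * call site from the patch).  ipif_lookup_addr_v6() may match `addr'
+ * on `ill' or on any other ill in its IPMP group; the "exact" variant
+ * matches on `ill' alone, which is what per-interface test addresses
+ * need:
+ *
+ *	ipif_t *group_ipif, *exact_ipif;
+ *
+ *	group_ipif = ipif_lookup_addr_v6(&addr, ill, ALL_ZONES, NULL,
+ *	    NULL, NULL, NULL, ipst);
+ *	exact_ipif = ipif_lookup_addr_exact_v6(&addr, ill, ipst);
+ */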
+/*
* Look for an ipif with the specified address. For point-point links
* we look for matches on either the destination address and the local
* address, but we ignore the check on the local address if IPIF_UNNUMBERED
* is set.
- * Matches on a specific ill if match_ill is set.
+ * If the `match_ill' argument is non-NULL, the lookup is restricted to that
+ * ill (or illgrp if `match_ill' is in an IPMP group).
* Return the zoneid for the ipif. ALL_ZONES if none found.
*/
zoneid_t
@@ -348,7 +378,8 @@ ipif_lookup_addr_zoneid_v6(const in6_addr_t *addr, ill_t *match_ill,
repeat:
ill = ILL_START_WALK_V6(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if (match_ill != NULL && ill != match_ill) {
+ if (match_ill != NULL && ill != match_ill &&
+ !IS_IN_SAME_ILLGRP(ill, match_ill)) {
continue;
}
mutex_enter(&ill->ill_lock);
@@ -1120,11 +1151,10 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
boolean_t
ill_setdefaulttoken(ill_t *ill)
{
- int i;
+ int i;
in6_addr_t v6addr, v6mask;
- if (!MEDIA_V6INTFID(ill->ill_media, ill->ill_phys_addr_length,
- ill->ill_phys_addr, &v6addr))
+ if (!MEDIA_V6INTFID(ill->ill_media, ill, &v6addr))
return (B_FALSE);
(void) ip_plen_to_mask_v6(IPV6_TOKEN_LEN, &v6mask);
@@ -1161,7 +1191,7 @@ ipif_set_tun_auto_addr(ipif_t *ipif, struct iftun_req *ta)
{
sin6_t sin6;
sin_t *sin;
- ill_t *ill = ipif->ipif_ill;
+ ill_t *ill = ipif->ipif_ill;
tun_t *tp = (tun_t *)ill->ill_wq->q_next->q_ptr;
if (ta->ifta_saddr.ss_family != AF_INET ||
@@ -1227,7 +1257,7 @@ ipif_set_tun_llink(ill_t *ill, struct iftun_req *ta)
if ((ta->ifta_flags & IFTUN_DST) &&
IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)) {
- sin6_t sin6;
+ sin6_t sin6;
ASSERT(!(ipif->ipif_flags & IPIF_UP));
bzero(&sin6, sizeof (sin6_t));
@@ -1344,13 +1374,22 @@ ipif_ndp_setup_multicast(ipif_t *ipif, nce_t **ret_nce)
if (ret_nce != NULL)
*ret_nce = NULL;
+
+ /*
+ * IPMP meta-interfaces don't have any inherent multicast mappings,
+ * and instead use the ones on the underlying interfaces.
+ */
+ if (IS_IPMP(ill))
+ return (0);
+
/*
* Delete the mapping nce. Normally these should not exist
* as a previous ipif_down -> ipif_ndp_down should have deleted
* all the nces. But they can exist if ip_rput_dlpi_writer
- * calls this when PHYI_MULTI_BCAST is set.
+ * calls this when PHYI_MULTI_BCAST is set. Mappings are always
+ * tied to the underlying ill, so don't match across the illgrp.
*/
- mnce = ndp_lookup_v6(ill, &v6_mcast_addr, B_FALSE);
+ mnce = ndp_lookup_v6(ill, B_FALSE, &v6_mcast_addr, B_FALSE);
if (mnce != NULL) {
ndp_delete(mnce);
NCE_REFRELE(mnce);
@@ -1424,13 +1463,15 @@ ipif_ndp_setup_multicast(ipif_t *ipif, nce_t **ret_nce)
* Get the resolver set up for a new ipif. (Always called as writer.)
*/
int
-ipif_ndp_up(ipif_t *ipif)
+ipif_ndp_up(ipif_t *ipif, boolean_t initial)
{
ill_t *ill = ipif->ipif_ill;
int err = 0;
nce_t *nce = NULL;
nce_t *mnce = NULL;
+ boolean_t added_ipif = B_FALSE;
+ ASSERT(IAM_WRITER_ILL(ill));
ip1dbg(("ipif_ndp_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));
/*
@@ -1464,7 +1505,10 @@ ipif_ndp_up(ipif_t *ipif)
if ((ipif->ipif_flags & (IPIF_UNNUMBERED|IPIF_NOLOCAL)) == 0) {
uint16_t flags;
- uchar_t *hw_addr = NULL;
+ uint16_t state;
+ uchar_t *hw_addr = NULL;
+ ill_t *bound_ill;
+ ipmp_illgrp_t *illg = ill->ill_grp;
/* Permanent entries don't need NUD */
flags = NCE_F_PERMANENT | NCE_F_NONUD;
@@ -1474,26 +1518,65 @@ ipif_ndp_up(ipif_t *ipif)
if (ipif->ipif_flags & IPIF_ANYCAST)
flags |= NCE_F_ANYCAST;
- if (ill->ill_net_type == IRE_IF_RESOLVER) {
- hw_addr = ill->ill_nd_lla;
-
- if (ill->ill_move_in_progress) {
- /*
- * Addresses are failing over to this ill.
- * Don't wait for NUD to see this change.
- * Publish our new link-layer address.
- */
- flags |= NCE_F_UNSOL_ADV;
+ if (IS_IPMP(ill)) {
+ ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
+ /*
+ * If we're here via ipif_up(), then the ipif won't be
+ * bound yet -- add it to the group, which will bind
+ * it if possible. (We would add it in ipif_up(), but
+ * deleting on failure there is gruesome.) If we're
+ * here via ipmp_ill_bind_ipif(), then the ipif has
+ * already been added to the group and we just need to
+ * use the binding.
+ */
+ if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) {
+ bound_ill = ipmp_illgrp_add_ipif(illg, ipif);
+ if (bound_ill == NULL) {
+ /*
+ * We couldn't bind the ipif to an ill
+ * yet, so we have nothing to publish.
+ * Set ipif_addr_ready so that this
+ * address can be used locally for now.
+ * The routing socket message will be
+ * sent from ipif_up_done_v6().
+ */
+ ipif->ipif_addr_ready = 1;
+ return (0);
+ }
+ added_ipif = B_TRUE;
}
+ hw_addr = bound_ill->ill_nd_lla;
+ } else {
+ bound_ill = ill;
+ if (ill->ill_net_type == IRE_IF_RESOLVER)
+ hw_addr = ill->ill_nd_lla;
+ }
+
+ /*
+ * If this is an initial bring-up (or the ipif was never
+ * completely brought up), do DAD. Otherwise, we're here
+ * because IPMP has rebound an address to this ill: send
+ * unsolicited advertisements to inform others.
+ */
+ if (initial || !ipif->ipif_addr_ready) {
+ state = ND_PROBE;
+ } else {
+ state = ND_REACHABLE;
+ flags |= NCE_F_UNSOL_ADV;
}
- err = ndp_lookup_then_add_v6(ill,
+ /*
+ * NOTE: for IPMP, local addresses are always associated with
+ * the ill they're bound to, so don't match across the illgrp.
+ */
+ err = ndp_lookup_then_add_v6(bound_ill,
+ B_FALSE,
hw_addr,
&ipif->ipif_v6lcl_addr,
&ipv6_all_ones,
&ipv6_all_zeros,
0,
flags,
- ND_PROBE, /* Causes Duplicate Address Detection to run */
+ state,
&nce);
switch (err) {
case 0:
@@ -1509,19 +1592,11 @@ ipif_ndp_up(ipif_t *ipif)
NCE_REFRELE(nce);
ip1dbg(("ipif_ndp_up: NCE already exists for %s\n",
ill->ill_name));
- if (mnce != NULL) {
- ndp_delete(mnce);
- NCE_REFRELE(mnce);
- }
- return (err);
+ goto fail;
default:
- ip1dbg(("ipif_ndp_up: NCE creation failed %s\n",
+ ip1dbg(("ipif_ndp_up: NCE creation failed for %s\n",
ill->ill_name));
- if (mnce != NULL) {
- ndp_delete(mnce);
- NCE_REFRELE(mnce);
- }
- return (err);
+ goto fail;
}
} else {
/* No local NCE for this entry */
@@ -1532,6 +1607,15 @@ ipif_ndp_up(ipif_t *ipif)
if (mnce != NULL)
NCE_REFRELE(mnce);
return (0);
+fail:
+ if (mnce != NULL) {
+ ndp_delete(mnce);
+ NCE_REFRELE(mnce);
+ }
+ if (added_ipif)
+ ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
+
+ return (err);
}
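
To illustrate the state choice in ipif_ndp_up() above (sketch only, with a hypothetical helper name; the constants are the ones the patch uses): a first-time bring-up, or an address that never completed DAD, starts out probing, while an address that IPMP has merely rebound is installed as reachable and announced with unsolicited advertisements.

static uint16_t
nce_initial_state(boolean_t initial, boolean_t addr_ready, uint16_t *flagsp)
{
        /* New (or never-verified) address: run duplicate address detection. */
        if (initial || !addr_ready)
                return (ND_PROBE);

        /* Rebound, already-verified address: skip DAD and advertise it. */
        *flagsp |= NCE_F_UNSOL_ADV;
        return (ND_REACHABLE);
}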
/* Remove all cache entries for this logical interface */
@@ -1539,23 +1623,42 @@ void
ipif_ndp_down(ipif_t *ipif)
{
nce_t *nce;
+ ill_t *ill = ipif->ipif_ill;
+
+ ASSERT(IAM_WRITER_ILL(ill));
if (ipif->ipif_isv6) {
- nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr,
- B_FALSE);
- if (nce != NULL) {
- ndp_delete(nce);
- NCE_REFRELE(nce);
+ ill_t *bound_ill;
+
+ if (IS_IPMP(ill))
+ bound_ill = ipmp_ipif_bound_ill(ipif);
+ else
+ bound_ill = ill;
+
+ if (bound_ill != NULL) {
+ nce = ndp_lookup_v6(bound_ill,
+ B_FALSE, /* see comment in ipif_ndp_up() */
+ &ipif->ipif_v6lcl_addr,
+ B_FALSE);
+ if (nce != NULL) {
+ ndp_delete(nce);
+ NCE_REFRELE(nce);
+ }
}
+
+ /*
+ * Make IPMP aware of the deleted data address.
+ */
+ if (IS_IPMP(ill))
+ ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
}
+
/*
* Remove mapping and all other nces dependent on this ill
* when the last ipif is going away.
*/
- if (ipif->ipif_ill->ill_ipif_up_count == 0) {
- ndp_walk(ipif->ipif_ill, (pfi_t)ndp_delete_per_ill,
- (uchar_t *)ipif->ipif_ill, ipif->ipif_ill->ill_ipst);
- }
+ if (ill->ill_ipif_up_count == 0)
+ ndp_walk(ill, (pfi_t)ndp_delete_per_ill, ill, ill->ill_ipst);
}
/*
@@ -1936,9 +2039,7 @@ rule_preferred(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
}
/*
- * Prefer source addresses that are assigned to the outgoing interface, or
- * to an interface that is in the same IPMP group as the outgoing
- * interface.
+ * Prefer source addresses that are assigned to the outgoing interface.
*/
/* ARGSUSED3 */
static rule_res_t
@@ -1955,15 +2056,11 @@ rule_interface(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
return (CAND_TIE);
if (!bc->cand_matchedinterface_set) {
- bc->cand_matchedinterface = (bc->cand_ill == dstill ||
- (dstill->ill_group != NULL &&
- dstill->ill_group == bc->cand_ill->ill_group));
+ bc->cand_matchedinterface = bc->cand_ill == dstill;
bc->cand_matchedinterface_set = B_TRUE;
}
- cc->cand_matchedinterface = (cc->cand_ill == dstill ||
- (dstill->ill_group != NULL &&
- dstill->ill_group == cc->cand_ill->ill_group));
+ cc->cand_matchedinterface = cc->cand_ill == dstill;
cc->cand_matchedinterface_set = B_TRUE;
if (bc->cand_matchedinterface == cc->cand_matchedinterface)
@@ -2134,6 +2231,13 @@ rule_addr_type(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
static rule_res_t
rule_prefix(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, ip_stack_t *ipst)
{
+ /*
+ * For IPMP, we always want to choose a random source address from
+ * among any equally usable addresses, so always report a tie.
+ */
+ if (IS_IPMP(dstinfo->dst_ill))
+ return (CAND_TIE);
+
if (!bc->cand_common_pref_set) {
bc->cand_common_pref = ip_common_prefix_v6(&bc->cand_srcaddr,
dstinfo->dst_addr);
@@ -2177,10 +2281,9 @@ rule_must_be_last(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
* specification's algorithm could traverse the list of addresses once for
* every rule).
*
- * The restrict_ill argument restricts the algorithm to chose a source
- * address that is assigned to the destination ill or an ill in the same
- * IPMP group as the destination ill. This is used when the destination
- * address is a link-local or multicast address, and when
+ * The restrict_ill argument restricts the algorithm to choose a source
+ * address that is assigned to the destination ill. This is used when
+ * the destination address is a link-local or multicast address, and when
* ipv6_strict_dst_multihoming is turned on.
*
* src_prefs is the caller's set of source address preferences. If source
@@ -2192,13 +2295,13 @@ rule_must_be_last(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
*/
ipif_t *
ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
- uint_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid)
+ boolean_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid)
{
dstinfo_t dstinfo;
char dstr[INET6_ADDRSTRLEN];
char sstr[INET6_ADDRSTRLEN];
- ipif_t *ipif;
- ill_t *ill, *usesrc_ill = NULL;
+ ipif_t *ipif, *start_ipif, *next_ipif;
+ ill_t *ill, *usesrc_ill = NULL, *ipmp_ill = NULL;
ill_walk_context_t ctx;
cand_t best_c; /* The best candidate */
cand_t curr_c; /* The current candidate */
@@ -2247,6 +2350,16 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
} else {
return (NULL);
}
+ } else if (IS_UNDER_IPMP(dstill)) {
+ /*
+ * Test addresses should never be used for source address
+ * selection, so if we were passed an underlying ill, switch
+ * to the IPMP meta-interface.
+ */
+ if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(dstill)) != NULL)
+ dstinfo.dst_ill = ipmp_ill;
+ else
+ return (NULL);
} else {
dstinfo.dst_ill = dstill;
}
@@ -2286,10 +2399,7 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
*/
if (IN6_IS_ADDR_LINKLOCAL(dst) || IN6_IS_ADDR_MULTICAST(dst) ||
ipst->ips_ipv6_strict_dst_multihoming || usesrc_ill != NULL) {
- if (restrict_ill == RESTRICT_TO_NONE)
- dstinfo.dst_restrict_ill = RESTRICT_TO_GROUP;
- else
- dstinfo.dst_restrict_ill = restrict_ill;
+ dstinfo.dst_restrict_ill = B_TRUE;
} else {
dstinfo.dst_restrict_ill = restrict_ill;
}
@@ -2297,39 +2407,41 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
bzero(&best_c, sizeof (cand_t));
/*
- * Take a pass through the list of IPv6 interfaces to chose the
- * best possible source address. If restrict_ill is true, we only
- * iterate through the ill's that are in the same IPMP group as the
- * destination's outgoing ill. If restrict_ill is false, we walk
- * the entire list of IPv6 ill's.
+ * Take a pass through the list of IPv6 interfaces to choose the best
+ * possible source address. If restrict_ill is set, just use dst_ill.
*/
- if (dstinfo.dst_restrict_ill != RESTRICT_TO_NONE) {
- if (dstinfo.dst_ill->ill_group != NULL &&
- dstinfo.dst_restrict_ill == RESTRICT_TO_GROUP) {
- ill = dstinfo.dst_ill->ill_group->illgrp_ill;
- } else {
- ill = dstinfo.dst_ill;
- }
- } else {
+ if (dstinfo.dst_restrict_ill)
+ ill = dstinfo.dst_ill;
+ else
ill = ILL_START_WALK_V6(&ctx, ipst);
- }
- while (ill != NULL) {
+ for (; ill != NULL; ill = ill_next(&ctx, ill)) {
ASSERT(ill->ill_isv6);
/*
- * Avoid FAILED/OFFLINE ills.
- * Global and site local addresses will failover and
- * will be available on the new ill.
- * But link local addresses don't move.
+ * Test addresses should never be used for source address
+ * selection, so ignore underlying ills.
*/
- if (dstinfo.dst_restrict_ill != RESTRICT_TO_ILL &&
- ill->ill_phyint->phyint_flags &
- (PHYI_OFFLINE | PHYI_FAILED))
- goto next_ill;
+ if (IS_UNDER_IPMP(ill))
+ continue;
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
+ /*
+ * For source address selection, we treat the ipif list as
+ * circular and continue until we get back to where we
+ * started. This allows IPMP to vary source address selection
+ * (which improves inbound load spreading) by caching its last
+ * ending point and starting from there. NOTE: we don't have
+ * to worry about ill_src_ipif changing ills since that can't
+ * happen on the IPMP ill.
+ */
+ start_ipif = ill->ill_ipif;
+ if (IS_IPMP(ill) && ill->ill_src_ipif != NULL)
+ start_ipif = ill->ill_src_ipif;
+
+ ipif = start_ipif;
+ do {
+ if ((next_ipif = ipif->ipif_next) == NULL)
+ next_ipif = ill->ill_ipif;
if (!IPIF_VALID_IPV6_SOURCE(ipif))
continue;
@@ -2387,9 +2499,8 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
*/
for (index = 0; rules[index] != NULL; index++) {
/* Apply a comparison rule. */
- rule_result =
- (rules[index])(&best_c, &curr_c, &dstinfo,
- ipst);
+ rule_result = (rules[index])(&best_c, &curr_c,
+ &dstinfo, ipst);
if (rule_result == CAND_AVOID) {
/*
* The best candidate is still the
@@ -2417,21 +2528,29 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
* have been preferred as the best candidate so far.
*/
ASSERT(rule_result != CAND_TIE);
+ } while ((ipif = next_ipif) != start_ipif);
+
+ /*
+ * For IPMP, update the source ipif rotor to the next ipif,
+ * provided we can look it up. (We must not use it if it's
+ * IPIF_CONDEMNED since we may have grabbed ill_g_lock after
+ * ipif_free() checked ill_src_ipif.)
+ */
+ if (IS_IPMP(ill) && ipif != NULL) {
+ mutex_enter(&ipif->ipif_ill->ill_lock);
+ next_ipif = ipif->ipif_next;
+ if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif))
+ ill->ill_src_ipif = next_ipif;
+ else
+ ill->ill_src_ipif = NULL;
+ mutex_exit(&ipif->ipif_ill->ill_lock);
}
/*
- * We may be walking the linked-list of ill's in an
- * IPMP group or traversing the IPv6 ill avl tree. If it is a
- * usesrc ILL then it can't be part of IPMP group and we
- * will exit the while loop.
+ * Only one ill to consider if dst_restrict_ill is set.
*/
-next_ill:
- if (dstinfo.dst_restrict_ill == RESTRICT_TO_ILL)
- ill = NULL;
- else if (dstinfo.dst_restrict_ill == RESTRICT_TO_GROUP)
- ill = ill->ill_group_next;
- else
- ill = ill_next(&ctx, ill);
+ if (dstinfo.dst_restrict_ill)
+ break;
}
ipif = best_c.cand_ipif;
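
The circular walk above, combined with the ill_src_ipif rotor, is what lets the IPMP meta-interface hand out different, equally preferred source addresses on successive lookups and thereby spread inbound load. A standalone sketch of the idea follows; the types and names are hypothetical, not the kernel's.

typedef struct cand {
        struct cand     *c_next;        /* NULL-terminated singly-linked list */
        int             c_score;        /* higher is better */
} cand_t;

/*
 * Walk the list as if it were circular, starting at *rotorp (or the head
 * if the rotor is unset), and advance the rotor past the chosen element
 * so the next call starts somewhere else.  Ties are broken by position,
 * which is what varies the answer from call to call.
 */
static cand_t *
cand_select(cand_t *head, cand_t **rotorp)
{
        cand_t *start, *c, *best = NULL;

        if (head == NULL)
                return (NULL);

        start = (*rotorp != NULL) ? *rotorp : head;
        c = start;
        do {
                if (best == NULL || c->c_score > best->c_score)
                        best = c;
                c = (c->c_next != NULL) ? c->c_next : head;     /* wrap */
        } while (c != start);

        *rotorp = (best->c_next != NULL) ? best->c_next : head;
        return (best);
}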
@@ -2444,6 +2563,9 @@ next_ill:
if (usesrc_ill != NULL)
ill_refrele(usesrc_ill);
+ if (ipmp_ill != NULL)
+ ill_refrele(ipmp_ill);
+
if (dst_rhtp != NULL)
TPC_RELE(dst_rhtp);
@@ -2474,8 +2596,7 @@ next_ill:
* ipif_update_other_ipifs calls us.
*
* If old_ipif is NULL, just redo the source address selection
- * if needed. This happens when illgrp_insert or ipif_up_done_v6
- * calls us.
+ * if needed. This happens when ipif_up_done_v6 calls us.
*/
void
ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif)
@@ -2561,8 +2682,7 @@ ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif)
if (ip6_asp_can_lookup(ipst)) {
ip6_asp_table_held = B_TRUE;
nipif = ipif_select_source_v6(ill, &ipif->ipif_v6subnet,
- RESTRICT_TO_GROUP, IPV6_PREFER_SRC_DEFAULT,
- ipif->ipif_zoneid);
+ B_TRUE, IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid);
}
if (nipif == NULL) {
/* Last resort - all ipif's have IPIF_NOLOCAL */
@@ -2630,13 +2750,9 @@ ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif)
* Find the IRE_INTERFACE for such ipif's and recreate them
* to use a different source address following the rules in
* ipif_up_done_v6.
- *
- * This function takes an illgrp as an argument so that illgrp_delete
- * can call this to update source address even after deleting the
- * old_ipif->ipif_ill from the ill group.
*/
void
-ipif_update_other_ipifs_v6(ipif_t *old_ipif, ill_group_t *illgrp)
+ipif_update_other_ipifs_v6(ipif_t *old_ipif)
{
ipif_t *ipif;
ill_t *ill;
@@ -2651,23 +2767,9 @@ ipif_update_other_ipifs_v6(ipif_t *old_ipif, ill_group_t *illgrp)
inet_ntop(AF_INET6, &old_ipif->ipif_v6lcl_addr,
buf, sizeof (buf))));
- /*
- * If this part of a group, look at all ills as ipif_select_source
- * borrows a source address across all the ills in the group.
- */
- if (illgrp != NULL)
- ill = illgrp->illgrp_ill;
-
- /* Don't need a lock since this is a writer */
- for (; ill != NULL; ill = ill->ill_group_next) {
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
-
- if (ipif == old_ipif)
- continue;
-
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
+ if (ipif != old_ipif)
ipif_recreate_interface_routes_v6(old_ipif, ipif);
- }
}
}
@@ -2828,12 +2930,10 @@ ipif_up_done_v6(ipif_t *ipif)
boolean_t flush_ire_cache = B_TRUE;
int err;
char buf[INET6_ADDRSTRLEN];
- phyint_t *phyi;
ire_t **ipif_saved_irep = NULL;
int ipif_saved_ire_cnt;
int cnt;
boolean_t src_ipif_held = B_FALSE;
- boolean_t ire_added = B_FALSE;
boolean_t loopback = B_FALSE;
boolean_t ip6_asp_table_held = B_FALSE;
ip_stack_t *ipst = ill->ill_ipst;
@@ -2868,8 +2968,8 @@ ipif_up_done_v6(ipif_t *ipif)
break;
}
if (flush_ire_cache)
- ire_walk_ill_v6(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE,
- IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill);
+ ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
+ IRE_CACHE, ill_ipif_cache_delete, ill, ill);
/*
* Figure out which way the send-to queue should go. Only
@@ -2900,7 +3000,9 @@ ipif_up_done_v6(ipif_t *ipif)
ipif->ipif_ire_type = IRE_LOCAL;
}
- if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) {
+ if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) ||
+ ((ipif->ipif_flags & IPIF_DEPRECATED) &&
+ !(ipif->ipif_flags & IPIF_NOFAILOVER))) {
/*
* Can't use our source address. Select a different
* source address for the IRE_INTERFACE and IRE_LOCAL
@@ -2908,7 +3010,7 @@ ipif_up_done_v6(ipif_t *ipif)
if (ip6_asp_can_lookup(ipst)) {
ip6_asp_table_held = B_TRUE;
src_ipif = ipif_select_source_v6(ipif->ipif_ill,
- &ipif->ipif_v6subnet, RESTRICT_TO_NONE,
+ &ipif->ipif_v6subnet, B_FALSE,
IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid);
}
if (src_ipif == NULL)
@@ -3090,9 +3192,9 @@ ipif_up_done_v6(ipif_t *ipif)
ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
/*
- * Need to atomically check for ip_addr_availablity_check
- * now under ill_g_lock, and if it fails got bad, and remove
- * from group also
+ * Need to atomically check for IP address availability under
+ * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new
+ * ills or new ipifs can be added while we are checking availability.
*/
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
mutex_enter(&ipst->ips_ip_addr_avail_lock);
@@ -3125,9 +3227,7 @@ ipif_up_done_v6(ipif_t *ipif)
}
/*
- * Add in all newly created IREs. We want to add before
- * we call ifgrp_insert which wants to know whether
- * IRE_IF_RESOLVER exists or not.
+ * Add in all newly created IREs.
*
* NOTE : We refrele the ire though we may branch to "bad"
* later on where we do ire_delete. This is okay
@@ -3148,36 +3248,6 @@ ipif_up_done_v6(ipif_t *ipif)
ip6_asp_table_refrele(ipst);
ip6_asp_table_held = B_FALSE;
}
- ire_added = B_TRUE;
-
- /*
- * Form groups if possible.
- *
- * If we are supposed to be in a ill_group with a name, insert it
- * now as we know that at least one ipif is UP. Otherwise form
- * nameless groups.
- *
- * If ip_enable_group_ifs is set and ipif address is not ::0, insert
- * this ipif into the appropriate interface group, or create a
- * new one. If this is already in a nameless group, we try to form
- * a bigger group looking at other ills potentially sharing this
- * ipif's prefix.
- */
- phyi = ill->ill_phyint;
- if (phyi->phyint_groupname_len != 0) {
- ASSERT(phyi->phyint_groupname != NULL);
- if (ill->ill_ipif_up_count == 1) {
- ASSERT(ill->ill_group == NULL);
- err = illgrp_insert(&ipst->ips_illgrp_head_v6, ill,
- phyi->phyint_groupname, NULL, B_TRUE);
- if (err != 0) {
- ip1dbg(("ipif_up_done_v6: illgrp allocation "
- "failed, error %d\n", err));
- goto bad;
- }
- }
- ASSERT(ill->ill_group != NULL);
- }
/* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */
ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt;
@@ -3190,19 +3260,23 @@ ipif_up_done_v6(ipif_t *ipif)
*/
ill_recover_multicast(ill);
}
- /* Join the allhosts multicast address and the solicited node MC */
- ipif_multicast_up(ipif);
- if (!loopback) {
+ if (ill->ill_ipif_up_count == 1) {
/*
- * See whether anybody else would benefit from the
- * new ipif that we added. We call this always rather
- * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST
- * ipif for the benefit of illgrp_insert (done above)
- * which does not do source address selection as it does
- * not want to re-create interface routes that we are
- * having reference to it here.
+ * Since the interface is now up, it may now be active.
*/
+ if (IS_UNDER_IPMP(ill))
+ ipmp_ill_refresh_active(ill);
+ }
+
+ /* Join the allhosts multicast address and the solicited node MC */
+ ipif_multicast_up(ipif);
+
+ /*
+ * See if anybody else would benefit from our new ipif.
+ */
+ if (!loopback &&
+ !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) {
ill_update_source_selection(ill);
}
@@ -3238,29 +3312,11 @@ ipif_up_done_v6(ipif_t *ipif)
bad:
if (ip6_asp_table_held)
ip6_asp_table_refrele(ipst);
- /*
- * We don't have to bother removing from ill groups because
- *
- * 1) For groups with names, we insert only when the first ipif
- * comes up. In that case if it fails, it will not be in any
- * group. So, we need not try to remove for that case.
- *
- * 2) For groups without names, either we tried to insert ipif_ill
- * in a group as singleton or found some other group to become
- * a bigger group. For the former, if it fails we don't have
- * anything to do as ipif_ill is not in the group and for the
- * latter, there are no failures in illgrp_insert/illgrp_delete
- * (ENOMEM can't occur for this. Check ifgrp_insert).
- */
while (irep > ire_array) {
irep--;
- if (*irep != NULL) {
+ if (*irep != NULL)
ire_delete(*irep);
- if (ire_added)
- ire_refrele(*irep);
- }
-
}
(void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
@@ -3272,8 +3328,7 @@ bad:
ipif_refrele(src_ipif);
ipif_ndp_down(ipif);
- if (ipif->ipif_ill->ill_flags & ILLF_XRESOLV)
- ipif_arp_down(ipif);
+ ipif_resolver_down(ipif);
return (err);
}
@@ -3286,15 +3341,14 @@ int
ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
- in6_addr_t addr;
sin6_t *sin6;
nce_t *nce;
struct lifreq *lifr;
lif_nd_req_t *lnr;
- mblk_t *mp1;
+ ill_t *ill = ipif->ipif_ill;
+ ire_t *ire;
- mp1 = mp->b_cont->b_cont;
- lifr = (struct lifreq *)mp1->b_rptr;
+ lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr;
lnr = &lifr->lifr_nd;
/* Only allow for logical unit zero i.e. not on "le0:17" */
if (ipif->ipif_id != 0)
@@ -3307,8 +3361,28 @@ ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
return (EAFNOSUPPORT);
sin6 = (sin6_t *)&lnr->lnr_addr;
- addr = sin6->sin6_addr;
- nce = ndp_lookup_v6(ipif->ipif_ill, &addr, B_FALSE);
+
+ /*
+ * Since ND mappings must be consistent across an IPMP group, prohibit
+ * deleting ND mappings on underlying interfaces. Also, since ND
+ * mappings for IPMP data addresses are owned by IP itself, prohibit
+ * deleting them.
+ */
+ if (IS_UNDER_IPMP(ill))
+ return (EPERM);
+
+ if (IS_IPMP(ill)) {
+ ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL,
+ ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL,
+ ill->ill_ipst);
+ if (ire != NULL) {
+ ire_refrele(ire);
+ return (EPERM);
+ }
+ }
+
+ /* See comment in ndp_query() regarding IS_IPMP(ill) usage */
+ nce = ndp_lookup_v6(ill, IS_IPMP(ill), &sin6->sin6_addr, B_FALSE);
if (nce == NULL)
return (ESRCH);
ndp_delete(nce);
@@ -3354,11 +3428,11 @@ int
ip_siocsetndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
+ sin6_t *sin6;
ill_t *ill = ipif->ipif_ill;
struct lifreq *lifr;
lif_nd_req_t *lnr;
-
- ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
+ ire_t *ire;
lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr;
lnr = &lifr->lifr_nd;
@@ -3372,5 +3446,26 @@ ip_siocsetndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
if (lnr->lnr_addr.ss_family != AF_INET6)
return (EAFNOSUPPORT);
+ sin6 = (sin6_t *)&lnr->lnr_addr;
+
+ /*
+ * Since ND mappings must be consistent across an IPMP group, prohibit
+ * updating ND mappings on underlying interfaces. Also, since ND
+ * mappings for IPMP data addresses are owned by IP itself, prohibit
+ * updating them.
+ */
+ if (IS_UNDER_IPMP(ill))
+ return (EPERM);
+
+ if (IS_IPMP(ill)) {
+ ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL,
+ ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL,
+ ill->ill_ipst);
+ if (ire != NULL) {
+ ire_refrele(ire);
+ return (EPERM);
+ }
+ }
+
return (ndp_sioc_update(ill, lnr));
}
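
ip_siocdelndp_v6() and ip_siocsetndp_v6() above apply the same IPMP policy; purely as an illustration, the shared check could be factored as below (hypothetical helper, the patch open-codes it in each handler):

static int
ndp_ioctl_allowed(ill_t *ill, ipif_t *ipif, const in6_addr_t *addr)
{
        ire_t *ire;

        /* ND mappings must stay consistent across the IPMP group. */
        if (IS_UNDER_IPMP(ill))
                return (EPERM);

        /* ND mappings for IPMP data addresses are owned by IP itself. */
        if (IS_IPMP(ill)) {
                ire = ire_ctable_lookup_v6(addr, NULL, IRE_LOCAL, ipif,
                    ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL,
                    ill->ill_ipst);
                if (ire != NULL) {
                        ire_refrele(ire);
                        return (EPERM);
                }
        }
        return (0);
}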
diff --git a/usr/src/uts/common/inet/ip/ip6_ire.c b/usr/src/uts/common/inet/ip/ip6_ire.c
index 41461ca96f..0d0f3621f5 100644
--- a/usr/src/uts/common/inet/ip/ip6_ire.c
+++ b/usr/src/uts/common/inet/ip/ip6_ire.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -73,7 +73,6 @@ static ire_t *ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *,
const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *);
static ire_t *ip6_ctable_lookup_impl(ire_ctable_args_t *);
-
/*
* Initialize the ire that is specific to IPv6 part and call
* ire_init_common to finish it.
@@ -261,13 +260,11 @@ ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst)
* Make sure we follow ire_ipif.
*
* We need to determine the interface route through
- * which the gateway will be reached. We don't really
- * care which interface is picked if the interface is
- * part of a group.
+ * which the gateway will be reached.
*/
if (ire->ire_ipif != NULL) {
ipif = ire->ire_ipif;
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
}
switch (ire->ire_type) {
@@ -409,35 +406,54 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
ire_t *ire = *ire_p;
int error;
ip_stack_t *ipst = ire->ire_ipst;
+ uint_t marks = 0;
ASSERT(ire->ire_ipversion == IPV6_VERSION);
ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
ASSERT(ire->ire_nce == NULL);
+ /*
+ * IREs with source addresses hosted on interfaces that are under IPMP
+ * should be hidden so that applications don't accidentally end up
+ * sending packets with test addresses as their source addresses, or
+ * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here.
+ * (We let IREs with unspecified source addresses slip through since
+ * ire_send_v6() will delete them automatically.)
+ */
+ if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) &&
+ !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) {
+ DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
+ marks |= IRE_MARK_TESTHIDDEN;
+ }
+
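/*
 * Editorial sketch (not part of this patch): on the lookup side, entries
 * marked IRE_MARK_TESTHIDDEN are skipped unless the caller explicitly
 * passes MATCH_IRE_MARK_TESTHIDDEN.  The filter below is a hypothetical
 * illustration of that rule, not the kernel's matching code.
 */
static boolean_t
ire_is_visible(const ire_t *ire, int match_flags)
{
        if (!(ire->ire_marks & IRE_MARK_TESTHIDDEN))
                return (B_TRUE);
        return ((match_flags & MATCH_IRE_MARK_TESTHIDDEN) != 0);
}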
/* Find the appropriate list head. */
switch (ire->ire_type) {
case IRE_HOST:
ire->ire_mask_v6 = ipv6_all_ones;
ire->ire_masklen = IPV6_ABITS;
+ ire->ire_marks |= marks;
if ((ire->ire_flags & RTF_SETSRC) == 0)
ire->ire_src_addr_v6 = ipv6_all_zeros;
break;
case IRE_CACHE:
+ ire->ire_mask_v6 = ipv6_all_ones;
+ ire->ire_masklen = IPV6_ABITS;
+ ire->ire_marks |= marks;
+ break;
case IRE_LOCAL:
case IRE_LOOPBACK:
ire->ire_mask_v6 = ipv6_all_ones;
ire->ire_masklen = IPV6_ABITS;
break;
case IRE_PREFIX:
- if ((ire->ire_flags & RTF_SETSRC) == 0)
- ire->ire_src_addr_v6 = ipv6_all_zeros;
- break;
case IRE_DEFAULT:
+ ire->ire_marks |= marks;
if ((ire->ire_flags & RTF_SETSRC) == 0)
ire->ire_src_addr_v6 = ipv6_all_zeros;
break;
case IRE_IF_RESOLVER:
case IRE_IF_NORESOLVER:
+ ire->ire_marks |= marks;
break;
default:
printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n",
@@ -543,9 +559,8 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
* 2) We could have multiple packets trying to create
* an IRE_CACHE for the same ill.
*
- * Moreover, IPIF_NOFAILOVER and IPV6_BOUND_PIF endpoints wants
- * to go out on a particular ill. Rather than looking at the
- * packet, we depend on the above for MATCH_IRE_ILL here.
+ * Rather than looking at the packet, we depend on the above for
+ * MATCH_IRE_ILL here.
*
* Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have
* multiple IRE_CACHES for an ill for the same destination
@@ -555,20 +570,15 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
*/
if (ire->ire_ipif != NULL)
flags |= MATCH_IRE_IPIF;
+
/*
- * If we are creating hidden ires, make sure we search on
- * this ill (MATCH_IRE_ILL) and a hidden ire, while we are
- * searching for duplicates below. Otherwise we could
- * potentially find an IRE on some other interface
- * and it may not be a IRE marked with IRE_MARK_HIDDEN. We
- * shouldn't do this as this will lead to an infinite loop as
- * eventually we need an hidden ire for this packet to go
- * out. MATCH_IRE_ILL is already marked above.
+ * If we are creating a hidden IRE, make sure we search for
+ * hidden IREs when searching for duplicates below.
+ * Otherwise, we might find an IRE on some other interface
+ * that's not marked hidden.
*/
- if (ire->ire_marks & IRE_MARK_HIDDEN) {
- ASSERT(ire->ire_type == IRE_CACHE);
- flags |= MATCH_IRE_MARK_HIDDEN;
- }
+ if (ire->ire_marks & IRE_MARK_TESTHIDDEN)
+ flags |= MATCH_IRE_MARK_TESTHIDDEN;
/*
* Start the atomic add of the ire. Grab the ill locks,
@@ -692,7 +702,7 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
}
}
if (ire->ire_type == IRE_CACHE) {
- in6_addr_t gw_addr_v6;
+ const in6_addr_t *addr_v6;
ill_t *ill = ire_to_ill(ire);
char buf[INET6_ADDRSTRLEN];
nce_t *nce;
@@ -712,12 +722,12 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
* time on the list and rts_setgwr_v6 could not
* be changing this.
*/
- gw_addr_v6 = ire->ire_gateway_addr_v6;
- if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
- nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, B_TRUE);
- } else {
- nce = ndp_lookup_v6(ill, &gw_addr_v6, B_TRUE);
- }
+ addr_v6 = &ire->ire_gateway_addr_v6;
+ if (IN6_IS_ADDR_UNSPECIFIED(addr_v6))
+ addr_v6 = &ire->ire_addr_v6;
+
+ /* nce fastpath is per-ill; don't match across illgrp */
+ nce = ndp_lookup_v6(ill, B_FALSE, addr_v6, B_TRUE);
if (nce == NULL)
goto failed;
@@ -1217,28 +1227,29 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
in6_addr_t gw_addr_v6;
ill_t *ire_ill = NULL, *dst_ill;
ill_t *ipif_ill = NULL;
- ill_group_t *ire_ill_group = NULL;
- ill_group_t *ipif_ill_group = NULL;
ipif_t *src_ipif;
ASSERT(ire->ire_ipversion == IPV6_VERSION);
ASSERT(addr != NULL);
ASSERT(mask != NULL);
ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
- ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) ||
+ ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
(ipif != NULL && ipif->ipif_isv6));
/*
- * HIDDEN cache entries have to be looked up specifically with
- * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set
- * when the interface is FAILED or INACTIVE. In that case,
- * any IRE_CACHES that exists should be marked with
- * IRE_MARK_HIDDEN. So, we don't really need to match below
- * for IRE_MARK_HIDDEN. But we do so for consistency.
+ * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it
+ * is in fact hidden, to ensure the caller gets the right one. One
+ * exception: if the caller passed MATCH_IRE_IHANDLE, then they
+ * already know the identity of the given IRE_INTERFACE entry and
+ * there's no point trying to hide it from them.
*/
- if (!(match_flags & MATCH_IRE_MARK_HIDDEN) &&
- (ire->ire_marks & IRE_MARK_HIDDEN))
- return (B_FALSE);
+ if (ire->ire_marks & IRE_MARK_TESTHIDDEN) {
+ if (match_flags & MATCH_IRE_IHANDLE)
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
+
+ if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN))
+ return (B_FALSE);
+ }
if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
ire->ire_zoneid != ALL_ZONES) {
@@ -1288,7 +1299,7 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
*/
if ((dst_ill->ill_usesrc_ifindex != 0) &&
(src_ipif = ipif_select_source_v6(dst_ill, addr,
- RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, zoneid))
+ B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid))
!= NULL) {
ip3dbg(("ire_match_args: src_ipif %p"
" dst_ill %p", (void *)src_ipif,
@@ -1326,20 +1337,20 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
gw_addr_v6 = ire->ire_gateway_addr_v6;
mutex_exit(&ire->ire_lock);
}
+
/*
- * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that
- * somebody wants to send out on a particular interface which
- * is given by ire_stq and hence use ire_stq to derive the ill
- * value. ire_ipif for IRE_CACHES is just the
- * means of getting a source address i.e ire_src_addr_v6 =
- * ire->ire_ipif->ipif_src_addr_v6.
+ * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to
+ * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means
+ * of getting a source address -- i.e., ire_src_addr_v6 ==
+ * ire->ire_ipif->ipif_v6src_addr). ire_to_ill() handles this.
+ *
+ * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group.
+ * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for
+ * IPMP test traffic), then the ill must match exactly.
*/
- if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) {
+ if (match_flags & MATCH_IRE_ILL) {
ire_ill = ire_to_ill(ire);
- if (ire_ill != NULL)
- ire_ill_group = ire_ill->ill_group;
ipif_ill = ipif->ipif_ill;
- ipif_ill_group = ipif_ill->ill_group;
}
/* No ire_addr_v6 bits set past the mask */
@@ -1357,17 +1368,14 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
&ipif->ipif_v6src_addr)) &&
((!(match_flags & MATCH_IRE_IPIF)) ||
(ire->ire_ipif == ipif)) &&
- ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) ||
- (ire->ire_type != IRE_CACHE ||
- ire->ire_marks & IRE_MARK_HIDDEN)) &&
+ ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) ||
+ (ire->ire_marks & IRE_MARK_TESTHIDDEN)) &&
((!(match_flags & MATCH_IRE_ILL)) ||
- (ire_ill == ipif_ill)) &&
+ (ire_ill == ipif_ill ||
+ (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) &&
+ ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) &&
((!(match_flags & MATCH_IRE_IHANDLE)) ||
(ire->ire_ihandle == ihandle)) &&
- ((!(match_flags & MATCH_IRE_ILL_GROUP)) ||
- (ire_ill == ipif_ill) ||
- (ire_ill_group != NULL &&
- ire_ill_group == ipif_ill_group)) &&
((!(match_flags & MATCH_IRE_SECATTR)) ||
(!is_system_labeled()) ||
(tsol_ire_match_gwattr(ire, tsl) == 0))) {
@@ -1391,8 +1399,7 @@ ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
* ire_match_args_v6() will dereference ipif if MATCH_IRE_SRC or
* MATCH_IRE_ILL is set.
*/
- if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
- (ipif == NULL))
+ if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
return (NULL);
/*
@@ -1477,8 +1484,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
* ire_match_args_v6() will dereference ipif if MATCH_IRE_SRC or
* MATCH_IRE_ILL is set.
*/
- if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
- (ipif == NULL))
+ if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
return (NULL);
/*
@@ -1661,8 +1667,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
mutex_enter(&ire->ire_lock);
gw_addr_v6 = ire->ire_gateway_addr_v6;
mutex_exit(&ire->ire_lock);
- match_flags = MATCH_IRE_ILL_GROUP |
- MATCH_IRE_SECATTR;
+ match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
rire = ire_ctable_lookup_v6(&gw_addr_v6, NULL,
0, ire->ire_ipif, zoneid, tsl, match_flags,
ipst);
@@ -1703,7 +1708,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
if (ire->ire_ipif != NULL) {
ire_match_flags |=
- MATCH_IRE_ILL_GROUP;
+ MATCH_IRE_ILL;
}
rire = ire_route_lookup_v6(&gw_addr_v6,
NULL, NULL, IRE_INTERFACE,
@@ -1791,21 +1796,8 @@ found_ire_held:
*/
saved_ire = ire;
- /*
- * Currently MATCH_IRE_ILL is never used with
- * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while
- * sending out packets as MATCH_IRE_ILL is used only
- * for communicating with on-link hosts. We can't assert
- * that here as RTM_GET calls this function with
- * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE.
- * We have already used the MATCH_IRE_ILL in determining
- * the right prefix route at this point. To match the
- * behavior of how we locate routes while sending out
- * packets, we don't want to use MATCH_IRE_ILL below
- * while locating the interface route.
- */
if (ire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
mutex_enter(&ire->ire_lock);
gw_addr_v6 = ire->ire_gateway_addr_v6;
@@ -1958,9 +1950,7 @@ ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway,
}
/*
- * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers
- * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get
- * to the hidden ones.
+ * Lookup cache.
*
* In general the zoneid has to match (where ALL_ZONES match all of them).
* But for IRE_LOCAL we also need to handle the case where L2 should
@@ -1968,8 +1958,7 @@ ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway,
* Ethernet drivers nor Ethernet hardware loops back packets sent to their
* own MAC address. This loopback is needed when the normal
* routes (ignoring IREs with different zoneids) would send out the packet on
- * the same ill (or ill group) as the ill with which this IRE_LOCAL is
- * associated.
+ * the same ill as the ill with which this IRE_LOCAL is associated.
*
* Earlier versions of this code always matched an IRE_LOCAL independently of
* the zoneid. We preserve that earlier behavior when
@@ -1986,7 +1975,7 @@ ire_cache_lookup_v6(const in6_addr_t *addr, zoneid_t zoneid,
ipst->ips_ip6_cache_table_size)];
rw_enter(&irb_ptr->irb_lock, RW_READER);
for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) {
- if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN))
+ if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN))
continue;
if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, addr)) {
/*
@@ -2125,13 +2114,8 @@ ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire)
ASSERT(cire != NULL && pire != NULL);
match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
- /*
- * ip_newroute_v6 calls ire_ftable_lookup with MATCH_IRE_ILL only
- * for on-link hosts. We should never be here for onlink.
- * Thus, use MATCH_IRE_ILL_GROUP.
- */
if (pire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
/*
* We know that the mask of the interface ire equals cire->ire_cmask.
* (When ip_newroute_v6() created 'cire' for an on-link destn. it set
@@ -2168,7 +2152,7 @@ ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire)
*/
match_flags = MATCH_IRE_TYPE;
if (pire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
mutex_enter(&pire->ire_lock);
gw_addr = pire->ire_gateway_addr_v6;
@@ -2210,24 +2194,30 @@ ire_t *
ipif_to_ire_v6(const ipif_t *ipif)
{
ire_t *ire;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF;
+
+ /*
+ * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN
+ * so that they aren't accidentally returned. However, if the
+ * caller's ipif is on an ill under IPMP, there's no need to hide 'em.
+ */
+ if (IS_UNDER_IPMP(ipif->ipif_ill))
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
ASSERT(ipif->ipif_isv6);
if (ipif->ipif_ire_type == IRE_LOOPBACK) {
ire = ire_ctable_lookup_v6(&ipif->ipif_v6lcl_addr, NULL,
- IRE_LOOPBACK, ipif, ALL_ZONES, NULL,
- (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ipst);
+ IRE_LOOPBACK, ipif, ALL_ZONES, NULL, match_flags, ipst);
} else if (ipif->ipif_flags & IPIF_POINTOPOINT) {
/* In this case we need to lookup destination address. */
ire = ire_ftable_lookup_v6(&ipif->ipif_v6pp_dst_addr,
&ipv6_all_ones, NULL, IRE_INTERFACE, ipif, NULL, ALL_ZONES,
- 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
- MATCH_IRE_MASK), ipst);
+ 0, NULL, (match_flags | MATCH_IRE_MASK), ipst);
} else {
ire = ire_ftable_lookup_v6(&ipif->ipif_v6subnet,
&ipif->ipif_v6net_mask, NULL, IRE_INTERFACE, ipif, NULL,
- ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
- MATCH_IRE_MASK), ipst);
+ ALL_ZONES, 0, NULL, (match_flags | MATCH_IRE_MASK), ipst);
}
return (ire);
}
@@ -2296,7 +2286,7 @@ ire_multirt_need_resolve_v6(const in6_addr_t *v6dstp, const ts_label_t *tsl,
continue;
if (!IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, v6dstp))
continue;
- if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN))
+ if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN))
continue;
unres_cnt--;
}
@@ -2434,7 +2424,7 @@ ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
continue;
if (cire->ire_marks &
(IRE_MARK_CONDEMNED|
- IRE_MARK_HIDDEN))
+ IRE_MARK_TESTHIDDEN))
continue;
if (cire->ire_gw_secattr != NULL &&
@@ -2635,8 +2625,7 @@ ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
&cire->ire_addr_v6, &v6dst))
continue;
if (cire->ire_marks &
- (IRE_MARK_CONDEMNED|
- IRE_MARK_HIDDEN))
+ IRE_MARK_CONDEMNED)
continue;
if (cire->ire_gw_secattr != NULL &&
@@ -2845,8 +2834,7 @@ ip6_ctable_lookup_impl(ire_ctable_args_t *margs)
ire_t *ire;
ip_stack_t *ipst = margs->ict_ipst;
- if ((margs->ict_flags &
- (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
+ if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) &&
(margs->ict_ipif == NULL)) {
return (NULL);
}
diff --git a/usr/src/uts/common/inet/ip/ip6_rts.c b/usr/src/uts/common/inet/ip/ip6_rts.c
index 7d2ddd5c04..dcf429c8ba 100644
--- a/usr/src/uts/common/inet/ip/ip6_rts.c
+++ b/usr/src/uts/common/inet/ip/ip6_rts.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -38,8 +38,6 @@
* @(#)rtsock.c 8.6 (Berkeley) 2/11/95
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This file contains routines that processes routing socket requests.
*/
@@ -216,5 +214,5 @@ ip_rts_change_v6(int type, const in6_addr_t *dst_addr,
rtm->rtm_errno = error;
rtm->rtm_flags |= RTF_DONE;
rtm->rtm_addrs = rtm_addrs;
- rts_queue_input(mp, NULL, AF_INET6, ipst);
+ rts_queue_input(mp, NULL, AF_INET6, RTSQ_ALL, ipst);
}
diff --git a/usr/src/uts/common/inet/ip/ip_ftable.c b/usr/src/uts/common/inet/ip/ip_ftable.c
index 4fa3c7a74d..31f83c842d 100644
--- a/usr/src/uts/common/inet/ip/ip_ftable.c
+++ b/usr/src/uts/common/inet/ip/ip_ftable.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -67,7 +67,6 @@
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
-#include <sys/kmem.h>
#include <inet/tcp.h>
#include <inet/ipclassifier.h>
#include <sys/zone.h>
@@ -159,8 +158,7 @@ ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
* ire_match_args() will dereference ipif if MATCH_IRE_SRC or
* MATCH_IRE_ILL is set.
*/
- if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
- (ipif == NULL))
+ if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
return (NULL);
(void) memset(&rdst, 0, sizeof (rdst));
@@ -290,28 +288,16 @@ found_ire_held:
*/
save_ire = ire;
+ if (ire->ire_ipif != NULL)
+ match_flags |= MATCH_IRE_ILL;
+
/*
- * Currently MATCH_IRE_ILL is never used with
- * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while
- * sending out packets as MATCH_IRE_ILL is used only
- * for communicating with on-link hosts. We can't assert
- * that here as RTM_GET calls this function with
- * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE.
- * We have already used the MATCH_IRE_ILL in determining
- * the right prefix route at this point. To match the
- * behavior of how we locate routes while sending out
- * packets, we don't want to use MATCH_IRE_ILL below
- * while locating the interface route.
- *
* ire_ftable_lookup may end up with an incomplete IRE_CACHE
* entry for the gateway (i.e., one for which the
* ire_nce->nce_state is not yet ND_REACHABLE). If the caller
* has specified MATCH_IRE_COMPLETE, such entries will not
* be returned; instead, we return the IF_RESOLVER ire.
*/
- if (ire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL_GROUP;
-
ire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, 0,
ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst);
DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire,
@@ -532,7 +518,7 @@ ire_ftable_lookup_simple(ipaddr_t addr,
}
}
if (ire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
ire = ire_route_lookup(ire->ire_gateway_addr, 0,
0, 0, ire->ire_ipif, NULL, zoneid, NULL, match_flags, ipst);
@@ -678,13 +664,11 @@ ire_lookup_multi(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst)
* Make sure we follow ire_ipif.
*
* We need to determine the interface route through
- * which the gateway will be reached. We don't really
- * care which interface is picked if the interface is
- * part of a group.
+ * which the gateway will be reached.
*/
if (ire->ire_ipif != NULL) {
ipif = ire->ire_ipif;
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
}
switch (ire->ire_type) {
@@ -854,40 +838,26 @@ ire_get_next_default_ire(ire_t *ire, ire_t *ire_origin)
}
static ipif_t *
-ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, ill_t *dst_ill,
+ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire,
int zoneid, ushort_t *marks)
{
ipif_t *src_ipif;
- ip_stack_t *ipst = dst_ill->ill_ipst;
+ ill_t *ill = ire->ire_ipif->ipif_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
/*
- * Pick the best source address from dst_ill.
+ * Pick the best source address from ill.
*
- * 1) If it is part of a multipathing group, we would
- * like to spread the inbound packets across different
- * interfaces. ipif_select_source picks a random source
- * across the different ills in the group.
- *
- * 2) If it is not part of a multipathing group, we try
- * to pick the source address from the destination
+ * 1) Try to pick the source address from the destination
* route. Clustering assumes that when we have multiple
* prefixes hosted on an interface, the prefix of the
* source address matches the prefix of the destination
* route. We do this only if the address is not
* DEPRECATED.
*
- * 3) If the conn is in a different zone than the ire, we
+ * 2) If the conn is in a different zone than the ire, we
* need to pick a source address from the right zone.
- *
- * NOTE : If we hit case (1) above, the prefix of the source
- * address picked may not match the prefix of the
- * destination routes prefix as ipif_select_source
- * does not look at "dst" while picking a source
- * address.
- * If we want the same behavior as (2), we will need
- * to change the behavior of ipif_select_source.
*/
-
if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) {
/*
* The RTF_SETSRC flag is set in the parent ire (sire).
@@ -899,13 +869,10 @@ ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, ill_t *dst_ill,
return (src_ipif);
}
*marks |= IRE_MARK_USESRC_CHECK;
- if ((dst_ill->ill_group != NULL) ||
+ if (IS_IPMP(ill) ||
(ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) ||
- (dst_ill->ill_usesrc_ifindex != 0)) {
- src_ipif = ipif_select_source(dst_ill, dst, zoneid);
- if (src_ipif == NULL)
- return (NULL);
-
+ (ill->ill_usesrc_ifindex != 0)) {
+ src_ipif = ipif_select_source(ill, dst, zoneid);
} else {
src_ipif = ire->ire_ipif;
ASSERT(src_ipif != NULL);
@@ -1071,18 +1038,20 @@ create_irecache:
sire->ire_last_used_time = lbolt;
}
- /* Obtain dst_ill */
- dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill);
+ dst_ill = ire->ire_ipif->ipif_ill;
+ if (IS_IPMP(dst_ill))
+ dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp);
+ else
+ ill_refhold(dst_ill);
+
if (dst_ill == NULL) {
- ip2dbg(("ire_forward no dst ill; ire 0x%p\n",
- (void *)ire));
+ ip2dbg(("ire_forward no dst ill; ire 0x%p\n", (void *)ire));
goto icmp_err_ret;
}
ASSERT(src_ipif == NULL);
/* Now obtain the src_ipif */
- src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill,
- zoneid, &ire_marks);
+ src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks);
if (src_ipif == NULL)
goto icmp_err_ret;
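
The dst_ill selection above replaces the old ip_newroute_get_dst_ill() logic: when the route resolves to an IPMP meta-interface, transmits are spread by holding the next active underlying ill in the group. A hedged sketch of that pattern, with a hypothetical helper name:

static ill_t *
forward_hold_dst_ill(ire_t *ire)
{
        ill_t *ill = ire->ire_ipif->ipif_ill;

        /* IPMP meta-interface: rotate across the active underlying ills. */
        if (IS_IPMP(ill))
                return (ipmp_illgrp_hold_next_ill(ill->ill_grp));

        /* Ordinary interface: just take a hold on it. */
        ill_refhold(ill);
        return (ill);
}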
@@ -1254,18 +1223,13 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action,
ire_t *sire = NULL, *save_ire;
ill_t *dst_ill = NULL;
int error;
- zoneid_t zoneid;
+ zoneid_t zoneid = GLOBAL_ZONEID;
ipif_t *src_ipif = NULL;
mblk_t *res_mp;
ushort_t ire_marks = 0;
- zoneid = GLOBAL_ZONEID;
-
-
ire = ire_ftable_lookup_simple(dst, &sire, zoneid,
- MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_RJ_BHOLE, ipst);
-
+ MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE, ipst);
if (ire == NULL) {
ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst);
goto icmp_err_ret;
@@ -1288,9 +1252,7 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action,
* nexthop router, just hand over the cache entry
* and we are done.
*/
-
if (ire->ire_type & IRE_CACHE) {
-
/*
* If we are using this ire cache entry as a
* gateway to forward packets, chances are we
@@ -1334,18 +1296,21 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action,
UPDATE_OB_PKT_COUNT(sire);
}
- /* Obtain dst_ill */
- dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill);
+ dst_ill = ire->ire_ipif->ipif_ill;
+ if (IS_IPMP(dst_ill))
+ dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp);
+ else
+ ill_refhold(dst_ill); /* for symmetry */
+
if (dst_ill == NULL) {
- ip2dbg(("ire_forward no dst ill; ire 0x%p\n",
+ ip2dbg(("ire_forward_simple: no dst ill; ire 0x%p\n",
(void *)ire));
goto icmp_err_ret;
}
ASSERT(src_ipif == NULL);
/* Now obtain the src_ipif */
- src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill,
- zoneid, &ire_marks);
+ src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks);
if (src_ipif == NULL)
goto icmp_err_ret;
@@ -1720,33 +1685,24 @@ ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT |
MATCH_IRE_RECURSIVE| MATCH_IRE_RJ_BHOLE|
- MATCH_IRE_SECATTR);
+ MATCH_IRE_SECATTR | MATCH_IRE_ILL);
/*
* If supplied ifindex is non-null, the only valid
- * nexthop is one off of the interface or group corresponding
+ * nexthop is one off of the interface corresponding
* to the specified ifindex.
*/
ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
NULL, NULL, NULL, NULL, ipst);
if (ill != NULL) {
- match_flags |= MATCH_IRE_ILL;
+ supplied_ipif = ipif_get_next_ipif(NULL, ill);
} else {
- /* Fallback to group names if hook_emulation set */
- if (ipst->ips_ipmp_hook_emulation) {
- ill = ill_group_lookup_on_ifindex(ifindex,
- B_FALSE, ipst);
- }
- if (ill == NULL) {
- ip1dbg(("ipfil_sendpkt: Could not find"
- " route to dst\n"));
- value = ECOMM;
- freemsg(mp);
- goto discard;
- }
- match_flags |= MATCH_IRE_ILL_GROUP;
+ ip1dbg(("ipfil_sendpkt: Could not find"
+ " route to dst\n"));
+ value = ECOMM;
+ freemsg(mp);
+ goto discard;
}
- supplied_ipif = ipif_get_next_ipif(NULL, ill);
ire = ire_route_lookup(dst, 0, 0, 0, supplied_ipif,
&sire, zoneid, MBLK_GETLABEL(mp), match_flags, ipst);
@@ -2325,9 +2281,9 @@ ire_round_robin(irb_t *irb_ptr, zoneid_t zoneid, ire_ftable_args_t *margs,
* interested in routers that are
* reachable through ipifs within our zone.
*/
- if (ire->ire_ipif != NULL) {
- match_flags |= MATCH_IRE_ILL_GROUP;
- }
+ if (ire->ire_ipif != NULL)
+ match_flags |= MATCH_IRE_ILL;
+
rire = ire_route_lookup(ire->ire_gateway_addr, 0, 0,
IRE_INTERFACE, ire->ire_ipif, NULL, zoneid, margs->ift_tsl,
match_flags, ipst);
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index 0597245499..9771c87721 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -46,6 +46,7 @@
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
+#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
@@ -61,10 +62,10 @@
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
-#include <sys/strsun.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
+#include <sys/md5.h>
#include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
@@ -85,7 +86,6 @@
#include <inet/tun.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
-#include <inet/mib2.h>
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
@@ -93,7 +93,6 @@
#include <inet/ipsec_impl.h>
#include <sys/iphada.h>
-
#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
@@ -158,7 +157,7 @@ static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
static void ipsq_delete(ipsq_t *);
static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type,
- boolean_t initialize);
+ boolean_t initialize, boolean_t insert);
static void ipif_check_bcast_ires(ipif_t *test_ipif);
static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
@@ -169,7 +168,6 @@ static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void ipif_free(ipif_t *ipif);
static void ipif_free_tail(ipif_t *ipif);
static void ipif_mtu_change(ire_t *ire, char *ipif_arg);
-static void ipif_multicast_down(ipif_t *ipif);
static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
static void ipif_set_default(ipif_t *ipif);
static int ipif_set_values(queue_t *q, mblk_t *mp,
@@ -179,8 +177,7 @@ static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
static ipif_t *ipif_lookup_on_name(char *name, size_t namelen,
boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *);
-static int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp);
-static void ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp);
+static void ipif_update_other_ipifs(ipif_t *old_ipif);
static int ill_alloc_ppa(ill_if_t *, ill_t *);
static int ill_arp_off(ill_t *ill);
@@ -192,33 +189,18 @@ static void ill_down(ill_t *ill);
static void ill_downi(ire_t *ire, char *ill_arg);
static void ill_free_mib(ill_t *ill);
static void ill_glist_delete(ill_t *);
-static boolean_t ill_has_usable_ipif(ill_t *);
-static int ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int);
-static void ill_nominate_bcast_rcv(ill_group_t *illgrp);
-static void ill_phyint_free(ill_t *ill);
static void ill_phyint_reinit(ill_t *ill);
static void ill_set_nce_router_flags(ill_t *, boolean_t);
static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
-static void ill_signal_ipsq_ills(ipsq_t *, boolean_t);
-static boolean_t ill_split_ipsq(ipsq_t *cur_sq);
-static void ill_stq_cache_delete(ire_t *, char *);
-
-static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *);
-static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *);
-static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
- in6_addr_t *);
-static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
- ipaddr_t *);
-static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *);
-static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
- in6_addr_t *);
-static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
- ipaddr_t *);
-
+static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
+static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
+static ip_v6mapinfo_func_t ip_ether_v6mapinfo, ip_ib_v6mapinfo;
+static ip_v4mapinfo_func_t ip_ether_v4mapinfo, ip_ib_v4mapinfo;
static void ipif_save_ire(ipif_t *, ire_t *);
static void ipif_remove_ire(ipif_t *, ire_t *);
static void ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *);
static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
+static void phyint_free(phyint_t *);
/*
* Per-ill IPsec capabilities management.
@@ -250,18 +232,14 @@ static void ill_capability_ack_thr(void *);
static void ill_capability_lso_enable(ill_t *);
static void ill_capability_send(ill_t *, mblk_t *);
-static void illgrp_cache_delete(ire_t *, char *);
-static void illgrp_delete(ill_t *ill);
-static void illgrp_reset_schednext(ill_t *ill);
-
static ill_t *ill_prev_usesrc(ill_t *);
static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void ill_disband_usesrc_group(ill_t *);
static void conn_cleanup_stale_ire(conn_t *, caddr_t);
#ifdef DEBUG
-static void ill_trace_cleanup(const ill_t *);
-static void ipif_trace_cleanup(const ipif_t *);
+static void ill_trace_cleanup(const ill_t *);
+static void ipif_trace_cleanup(const ipif_t *);
#endif
/*
@@ -491,6 +469,7 @@ static nv_t ipif_nv_tbl[] = {
{ PHYI_STANDBY, "STANDBY" },
{ PHYI_INACTIVE, "INACTIVE" },
{ PHYI_OFFLINE, "OFFLINE" },
+ { PHYI_IPMP, "IPMP" }
};
static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
@@ -508,7 +487,8 @@ static ip_m_t ip_m_tbl[] = {
ip_ether_v6intfid },
{ DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo,
ip_ib_v6intfid },
- { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL},
+ { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL },
+ { SUNW_DL_IPMP, IFT_OTHER, NULL, NULL, ip_ipmp_v6intfid },
{ DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
ip_nodef_v6intfid }
};
@@ -529,14 +509,6 @@ static ipif_t ipif_zero;
*/
uint_t ill_no_arena = 12; /* Setable in /etc/system */
-static uint_t
-ipif_rand(ip_stack_t *ipst)
-{
- ipst->ips_ipif_src_random = ipst->ips_ipif_src_random * 1103515245 +
- 12345;
- return ((ipst->ips_ipif_src_random >> 16) & 0x7fff);
-}
-
/*
* Allocate per-interface mibs.
* Returns true if ok. False otherwise.
@@ -623,7 +595,7 @@ ill_allocate_mibs(ill_t *ill)
* (Always called as writer.)
*/
mblk_t *
-ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr)
+ill_arp_alloc(ill_t *ill, const uchar_t *template, caddr_t addr)
{
arc_t *arc = (arc_t *)template;
char *cp;
@@ -669,17 +641,69 @@ ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr)
}
mblk_t *
-ipif_area_alloc(ipif_t *ipif)
+ipif_area_alloc(ipif_t *ipif, uint_t optflags)
{
- return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template,
- (char *)&ipif->ipif_lcl_addr));
+ caddr_t addr;
+ mblk_t *mp;
+ area_t *area;
+ uchar_t *areap;
+ ill_t *ill = ipif->ipif_ill;
+
+ if (ill->ill_isv6) {
+ ASSERT(ill->ill_flags & ILLF_XRESOLV);
+ addr = (caddr_t)&ipif->ipif_v6lcl_addr;
+ areap = (uchar_t *)&ip6_area_template;
+ } else {
+ addr = (caddr_t)&ipif->ipif_lcl_addr;
+ areap = (uchar_t *)&ip_area_template;
+ }
+
+ if ((mp = ill_arp_alloc(ill, areap, addr)) == NULL)
+ return (NULL);
+
+ /*
+ * IPMP requires that the hardware address be included in all
+ * AR_ENTRY_ADD requests so that ARP can deduce the arl to send on.
+ * If there are no active underlying ills in the group (and thus no
+	 * hardware address), DAD will be deferred until an underlying ill
+ * becomes active.
+ */
+ if (IS_IPMP(ill)) {
+ if ((ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) {
+ freemsg(mp);
+ return (NULL);
+ }
+ } else {
+ ill_refhold(ill);
+ }
+
+ area = (area_t *)mp->b_rptr;
+ area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR;
+ area->area_flags |= optflags;
+ area->area_hw_addr_length = ill->ill_phys_addr_length;
+ bcopy(ill->ill_phys_addr, mp->b_rptr + area->area_hw_addr_offset,
+ area->area_hw_addr_length);
+
+ ill_refrele(ill);
+ return (mp);
}
mblk_t *
ipif_ared_alloc(ipif_t *ipif)
{
- return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template,
- (char *)&ipif->ipif_lcl_addr));
+ caddr_t addr;
+ uchar_t *aredp;
+
+ if (ipif->ipif_ill->ill_isv6) {
+ ASSERT(ipif->ipif_ill->ill_flags & ILLF_XRESOLV);
+ addr = (caddr_t)&ipif->ipif_v6lcl_addr;
+ aredp = (uchar_t *)&ip6_ared_template;
+ } else {
+ addr = (caddr_t)&ipif->ipif_lcl_addr;
+ aredp = (uchar_t *)&ip_ared_template;
+ }
+
+ return (ill_arp_alloc(ipif->ipif_ill, aredp, addr));
}
mblk_t *
@@ -689,6 +713,19 @@ ill_ared_alloc(ill_t *ill, ipaddr_t addr)
(char *)&addr));
}
+mblk_t *
+ill_arie_alloc(ill_t *ill, const char *grifname, const void *template)
+{
+ mblk_t *mp = ill_arp_alloc(ill, template, 0);
+ arie_t *arie;
+
+ if (mp != NULL) {
+ arie = (arie_t *)mp->b_rptr;
+ (void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ);
+ }
+ return (mp);
+}
+
/*
* Completely vaporize a lower level tap and all associated interfaces.
* ill_delete is called only out of ip_close when the device control
@@ -751,6 +788,12 @@ ill_delete(ill_t *ill)
ip_purge_allmulti(ill);
/*
+ * If the ill being deleted is under IPMP, boot it out of the illgrp.
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_ill_leave_illgrp(ill);
+
+ /*
* ill_down will arrange to blow off any IRE's dependent on this
* ILL, and shut down fragmentation reassembly.
*/
@@ -890,8 +933,19 @@ ill_delete_tail(ill_t *ill)
* ill references.
*/
ASSERT(ilm_walk_ill(ill) == 0);
+
/*
- * Take us out of the list of ILLs. ill_glist_delete -> ill_phyint_free
+ * If this ill is an IPMP meta-interface, blow away the illgrp. This
+ * is safe to do because the illgrp has already been unlinked from the
+ * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
+ */
+ if (IS_IPMP(ill)) {
+ ipmp_illgrp_destroy(ill->ill_grp);
+ ill->ill_grp = NULL;
+ }
+
+ /*
+ * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
* could free the phyint. No more reference to the phyint after this
* point.
*/
@@ -1139,7 +1193,7 @@ ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id)
* Add the pending mp to the list. There can be only 1 pending mp
* in the list. Any exclusive ioctl that needs to wait for a response
* from another module or driver needs to use this function to set
- * the ipsq_pending_mp to the ioctl mblk and wait for the response from
+ * the ipx_pending_mp to the ioctl mblk and wait for the response from
* the other module/driver. This is also used while waiting for the
* ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
*/
@@ -1147,19 +1201,19 @@ boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
int waitfor)
{
- ipsq_t *ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
+ ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;
ASSERT(IAM_WRITER_IPIF(ipif));
ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
- ASSERT(ipsq->ipsq_pending_mp == NULL);
+ ASSERT(ipx->ipx_pending_mp == NULL);
/*
* The caller may be using a different ipif than the one passed into
* ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
* ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
- * that `ipsq_current_ipif == ipif'.
+ * that `ipx_current_ipif == ipif'.
*/
- ASSERT(ipsq->ipsq_current_ipif != NULL);
+ ASSERT(ipx->ipx_current_ipif != NULL);
/*
* M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls,
@@ -1180,8 +1234,8 @@ ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
if (connp->conn_state_flags & CONN_CLOSING)
return (B_FALSE);
}
- mutex_enter(&ipsq->ipsq_lock);
- ipsq->ipsq_pending_ipif = ipif;
+ mutex_enter(&ipx->ipx_lock);
+ ipx->ipx_pending_ipif = ipif;
/*
* Note down the queue in b_queue. This will be returned by
* ipsq_pending_mp_get. Caller will then use these values to restart
@@ -1189,38 +1243,40 @@ ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
*/
add_mp->b_next = NULL;
add_mp->b_queue = q;
- ipsq->ipsq_pending_mp = add_mp;
- ipsq->ipsq_waitfor = waitfor;
+ ipx->ipx_pending_mp = add_mp;
+ ipx->ipx_waitfor = waitfor;
+ mutex_exit(&ipx->ipx_lock);
if (connp != NULL)
connp->conn_oper_pending_ill = ipif->ipif_ill;
- mutex_exit(&ipsq->ipsq_lock);
+
return (B_TRUE);
}
/*
- * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp
+ * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
* queued in the list.
*/
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
mblk_t *curr = NULL;
+ ipxop_t *ipx = ipsq->ipsq_xop;
- mutex_enter(&ipsq->ipsq_lock);
*connpp = NULL;
- if (ipsq->ipsq_pending_mp == NULL) {
- mutex_exit(&ipsq->ipsq_lock);
+ mutex_enter(&ipx->ipx_lock);
+ if (ipx->ipx_pending_mp == NULL) {
+ mutex_exit(&ipx->ipx_lock);
return (NULL);
}
/* There can be only 1 such excl message */
- curr = ipsq->ipsq_pending_mp;
- ASSERT(curr != NULL && curr->b_next == NULL);
- ipsq->ipsq_pending_ipif = NULL;
- ipsq->ipsq_pending_mp = NULL;
- ipsq->ipsq_waitfor = 0;
- mutex_exit(&ipsq->ipsq_lock);
+ curr = ipx->ipx_pending_mp;
+ ASSERT(curr->b_next == NULL);
+ ipx->ipx_pending_ipif = NULL;
+ ipx->ipx_pending_mp = NULL;
+ ipx->ipx_waitfor = 0;
+ mutex_exit(&ipx->ipx_lock);
if (CONN_Q(curr->b_queue)) {
/*
@@ -1237,7 +1293,7 @@ ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
}
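
The pending-mp functions above implement a single-slot mailbox: at most one message may be parked on the exclusive operation (ipx_pending_mp) while it waits for a response, and the get path empties the slot atomically under ipx_lock. The following userland sketch shows the same pattern with pthreads; the names are hypothetical and it is only an illustration of the idea, not the kernel code.

#include <assert.h>
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

/* One-slot "pending message" mailbox, analogous to ipx_pending_mp. */
typedef struct pending_slot {
	pthread_mutex_t	ps_lock;
	void		*ps_msg;	/* NULL means the slot is empty */
} pending_slot_t;

/* Park a message; fails (returns 0) only if a message is already pending. */
static int
pending_add(pending_slot_t *ps, void *msg)
{
	int ok;

	pthread_mutex_lock(&ps->ps_lock);
	ok = (ps->ps_msg == NULL);
	if (ok)
		ps->ps_msg = msg;
	pthread_mutex_unlock(&ps->ps_lock);
	return (ok);
}

/* Atomically take the pending message, if any, and empty the slot. */
static void *
pending_get(pending_slot_t *ps)
{
	void *msg;

	pthread_mutex_lock(&ps->ps_lock);
	msg = ps->ps_msg;
	ps->ps_msg = NULL;
	pthread_mutex_unlock(&ps->ps_lock);
	return (msg);
}

int
main(void)
{
	pending_slot_t ps = { PTHREAD_MUTEX_INITIALIZER, NULL };
	int req = 42;

	assert(pending_add(&ps, &req));		/* first add succeeds */
	assert(!pending_add(&ps, &req));	/* only one message may pend */
	assert(pending_get(&ps) == &req);	/* get empties the slot */
	assert(pending_get(&ps) == NULL);
	(void) printf("single-slot mailbox behaves as expected\n");
	return (0);
}
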
/*
- * Cleanup the ioctl mp queued in ipsq_pending_mp
+ * Cleanup the ioctl mp queued in ipx_pending_mp
* - Called in the ill_delete path
* - Called in the M_ERROR or M_HANGUP path on the ill.
* - Called in the conn close path.
@@ -1246,48 +1302,41 @@ boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
mblk_t *mp;
- ipsq_t *ipsq;
+ ipxop_t *ipx;
queue_t *q;
ipif_t *ipif;
ASSERT(IAM_WRITER_ILL(ill));
- ipsq = ill->ill_phyint->phyint_ipsq;
- mutex_enter(&ipsq->ipsq_lock);
+ ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
+
/*
- * If connp is null, unconditionally clean up the ipsq_pending_mp.
+ * If connp is null, unconditionally clean up the ipx_pending_mp.
* This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl
* even if it is meant for another ill, since we have to enqueue
- * a new mp now in ipsq_pending_mp to complete the ipif_down.
+ * a new mp now in ipx_pending_mp to complete the ipif_down.
* If connp is non-null we are called from the conn close path.
*/
- mp = ipsq->ipsq_pending_mp;
+ mutex_enter(&ipx->ipx_lock);
+ mp = ipx->ipx_pending_mp;
if (mp == NULL || (connp != NULL &&
mp->b_queue != CONNP_TO_WQ(connp))) {
- mutex_exit(&ipsq->ipsq_lock);
+ mutex_exit(&ipx->ipx_lock);
return (B_FALSE);
}
- /* Now remove from the ipsq_pending_mp */
- ipsq->ipsq_pending_mp = NULL;
+ /* Now remove from the ipx_pending_mp */
+ ipx->ipx_pending_mp = NULL;
q = mp->b_queue;
mp->b_next = NULL;
mp->b_prev = NULL;
mp->b_queue = NULL;
- /* If MOVE was in progress, clear the move_in_progress fields also. */
- ill = ipsq->ipsq_pending_ipif->ipif_ill;
- if (ill->ill_move_in_progress) {
- ILL_CLEAR_MOVE(ill);
- } else if (ill->ill_up_ipifs) {
- ill_group_cleanup(ill);
- }
-
- ipif = ipsq->ipsq_pending_ipif;
- ipsq->ipsq_pending_ipif = NULL;
- ipsq->ipsq_waitfor = 0;
- ipsq->ipsq_current_ipif = NULL;
- ipsq->ipsq_current_ioctl = 0;
- ipsq->ipsq_current_done = B_TRUE;
- mutex_exit(&ipsq->ipsq_lock);
+ ipif = ipx->ipx_pending_ipif;
+ ipx->ipx_pending_ipif = NULL;
+ ipx->ipx_waitfor = 0;
+ ipx->ipx_current_ipif = NULL;
+ ipx->ipx_current_ioctl = 0;
+ ipx->ipx_current_done = B_TRUE;
+ mutex_exit(&ipx->ipx_lock);
if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
if (connp == NULL) {
@@ -1437,7 +1486,7 @@ conn_ioctl_cleanup(conn_t *connp)
* Is any exclusive ioctl pending ? If so clean it up. If the
* ioctl has not yet started, the mp is pending in the list headed by
* ipsq_xopq_head. If the ioctl has started the mp could be present in
- * ipsq_pending_mp. If the ioctl timed out in the streamhead but
+ * ipx_pending_mp. If the ioctl timed out in the streamhead but
* is currently executing now the mp is not queued anywhere but
* conn_oper_pending_ill is null. The conn close will wait
* till the conn_ref drops to zero.
@@ -1468,9 +1517,9 @@ conn_ioctl_cleanup(conn_t *connp)
ill_waiter_dcr(ill);
/*
* Check whether this ioctl has started and is
- * pending now in ipsq_pending_mp. If it is not
- * found there then check whether this ioctl has
- * not even started and is in the ipsq_xopq list.
+ * pending. If it is not found there then check
+ * whether this ioctl has not even started and is in
+ * the ipsq_xopq list.
*/
if (!ipsq_pending_mp_cleanup(ill, connp))
ipsq_xopq_mp_cleanup(ill, connp);
@@ -1506,16 +1555,11 @@ conn_cleanup_ill(conn_t *connp, caddr_t arg)
if (connp->conn_multicast_ill == ill) {
/* Revert to late binding */
connp->conn_multicast_ill = NULL;
- connp->conn_orig_multicast_ifindex = 0;
}
if (connp->conn_incoming_ill == ill)
connp->conn_incoming_ill = NULL;
if (connp->conn_outgoing_ill == ill)
connp->conn_outgoing_ill = NULL;
- if (connp->conn_outgoing_pill == ill)
- connp->conn_outgoing_pill = NULL;
- if (connp->conn_nofailover_ill == ill)
- connp->conn_nofailover_ill = NULL;
if (connp->conn_dhcpinit_ill == ill) {
connp->conn_dhcpinit_ill = NULL;
ASSERT(ill->ill_dhcpinit != 0);
@@ -1524,11 +1568,11 @@ conn_cleanup_ill(conn_t *connp, caddr_t arg)
if (connp->conn_ire_cache != NULL) {
ire = connp->conn_ire_cache;
/*
- * ip_newroute creates IRE_CACHE with ire_stq coming from
- * interface X and ipif coming from interface Y, if interface
- * X and Y are part of the same IPMPgroup. Thus whenever
- * interface X goes down, remove all references to it by
- * checking both on ire_ipif and ire_stq.
+ * Source address selection makes it possible for IRE_CACHE
+ * entries to be created with ire_stq coming from interface X
+ * and ipif coming from interface Y. Thus whenever interface
+ * X goes down, remove all references to it by checking both
+ * on ire_ipif and ire_stq.
*/
if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
(ire->ire_type == IRE_CACHE &&
@@ -1601,14 +1645,10 @@ ill_down(ill_t *ill)
ip_stack_t *ipst = ill->ill_ipst;
/* Blow off any IREs dependent on this ILL. */
- ire_walk(ill_downi, (char *)ill, ipst);
+ ire_walk(ill_downi, ill, ipst);
/* Remove any conn_*_ill depending on this ill */
ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);
-
- if (ill->ill_group != NULL) {
- illgrp_delete(ill);
- }
}
/*
@@ -1621,9 +1661,9 @@ ill_downi(ire_t *ire, char *ill_arg)
ill_t *ill = (ill_t *)ill_arg;
/*
- * ip_newroute creates IRE_CACHE with ire_stq coming from
- * interface X and ipif coming from interface Y, if interface
- * X and Y are part of the same IPMP group. Thus whenever interface
+ * Source address selection makes it possible for IRE_CACHE
+ * entries to be created with ire_stq coming from interface X
+ * and ipif coming from interface Y. Thus whenever interface
* X goes down, remove all references to it by checking both
* on ire_ipif and ire_stq.
*/
@@ -3696,16 +3736,39 @@ nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp,
}
/*
- * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an
- * IPMP group, make sure all ill's in the group adopt the new policy. Send
- * up RTS_IFINFO routing socket messages for each interface whose flags we
- * change.
+ * Helper function for ill_forward_set().
+ */
+static void
+ill_forward_set_on_ill(ill_t *ill, boolean_t enable)
+{
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
+
+ ip1dbg(("ill_forward_set: %s %s forwarding on %s",
+ (enable ? "Enabling" : "Disabling"),
+ (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
+ mutex_enter(&ill->ill_lock);
+ if (enable)
+ ill->ill_flags |= ILLF_ROUTER;
+ else
+ ill->ill_flags &= ~ILLF_ROUTER;
+ mutex_exit(&ill->ill_lock);
+ if (ill->ill_isv6)
+ ill_set_nce_router_flags(ill, enable);
+ /* Notify routing socket listeners of this change. */
+ ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
+}
+
+/*
+ * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing
+ * socket messages for each interface whose flags we change.
*/
int
ill_forward_set(ill_t *ill, boolean_t enable)
{
- ill_group_t *illgrp;
- ip_stack_t *ipst = ill->ill_ipst;
+ ipmp_illgrp_t *illg;
+ ip_stack_t *ipst = ill->ill_ipst;
ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
@@ -3716,47 +3779,23 @@ ill_forward_set(ill_t *ill, boolean_t enable)
if (IS_LOOPBACK(ill))
return (EINVAL);
- /*
- * If the ill is in an IPMP group, set the forwarding policy on all
- * members of the group to the same value.
- */
- illgrp = ill->ill_group;
- if (illgrp != NULL) {
- ill_t *tmp_ill;
+ if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
+ /*
+ * Update all of the interfaces in the group.
+ */
+ illg = ill->ill_grp;
+ ill = list_head(&illg->ig_if);
+ for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
+ ill_forward_set_on_ill(ill, enable);
- for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL;
- tmp_ill = tmp_ill->ill_group_next) {
- ip1dbg(("ill_forward_set: %s %s forwarding on %s",
- (enable ? "Enabling" : "Disabling"),
- (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"),
- tmp_ill->ill_name));
- mutex_enter(&tmp_ill->ill_lock);
- if (enable)
- tmp_ill->ill_flags |= ILLF_ROUTER;
- else
- tmp_ill->ill_flags &= ~ILLF_ROUTER;
- mutex_exit(&tmp_ill->ill_lock);
- if (tmp_ill->ill_isv6)
- ill_set_nce_router_flags(tmp_ill, enable);
- /* Notify routing socket listeners of this change. */
- ip_rts_ifmsg(tmp_ill->ill_ipif);
- }
- } else {
- ip1dbg(("ill_forward_set: %s %s forwarding on %s",
- (enable ? "Enabling" : "Disabling"),
- (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
- mutex_enter(&ill->ill_lock);
- if (enable)
- ill->ill_flags |= ILLF_ROUTER;
- else
- ill->ill_flags &= ~ILLF_ROUTER;
- mutex_exit(&ill->ill_lock);
- if (ill->ill_isv6)
- ill_set_nce_router_flags(ill, enable);
- /* Notify routing socket listeners of this change. */
- ip_rts_ifmsg(ill->ill_ipif);
+ /*
+ * Update the IPMP meta-interface.
+ */
+ ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable);
+ return (0);
}
+ ill_forward_set_on_ill(ill, enable);
return (0);
}
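
The rewritten ill_forward_set() is a simple fan-out: when the interface is part of an IPMP group, the per-ill helper is applied to every member of the group and then to the IPMP meta-interface; otherwise it is applied to the lone interface. A rough standalone illustration of that fan-out follows (hypothetical types; it does not use the kernel's list_t API).

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical miniature model of an interface and its optional group. */
struct iface;
struct group {
	struct iface	*g_members;	/* singly-linked member list */
	struct iface	*g_meta;	/* IPMP-style meta-interface */
};
struct iface {
	const char	*if_name;
	bool		if_router;
	struct group	*if_grp;	/* NULL when not grouped */
	struct iface	*if_next;	/* next member in the group */
};

static void
set_on_iface(struct iface *ifp, bool enable)
{
	ifp->if_router = enable;
	(void) printf("%s: forwarding %s\n", ifp->if_name,
	    enable ? "enabled" : "disabled");
}

/* Mirror of the fan-out: group members first, then the meta-interface. */
static void
forward_set(struct iface *ifp, bool enable)
{
	struct group *grp = ifp->if_grp;

	if (grp != NULL) {
		for (ifp = grp->g_members; ifp != NULL; ifp = ifp->if_next)
			set_on_iface(ifp, enable);
		set_on_iface(grp->g_meta, enable);
		return;
	}
	set_on_iface(ifp, enable);
}

int
main(void)
{
	struct iface meta = { "ipmp0", false, NULL, NULL };
	struct iface b = { "net1", false, NULL, NULL };
	struct iface a = { "net0", false, NULL, &b };
	struct group grp = { &a, &meta };

	a.if_grp = b.if_grp = &grp;
	forward_set(&a, true);		/* updates net0, net1, and ipmp0 */
	return (0);
}
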
@@ -3772,7 +3811,12 @@ ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
nce_t *nce;
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE);
+ /*
+ * NOTE: we're called separately for each ill in an illgrp,
+ * so don't match across the illgrp.
+ */
+ nce = ndp_lookup_v6(ill, B_FALSE, &ipif->ipif_v6lcl_addr,
+ B_FALSE);
if (nce != NULL) {
mutex_enter(&nce->nce_lock);
if (enable)
@@ -3928,36 +3972,45 @@ ill_next(ill_walk_context_t *ctx, ill_t *lastill)
}
/*
- * Check interface name for correct format which is name+ppa.
- * name can contain characters and digits, the right most digits
- * make up the ppa number. use of octal is not allowed, name must contain
- * a ppa, return pointer to the start of ppa.
- * In case of error return NULL.
+ * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
+ * The final number (PPA) must not have any leading zeros. Upon success, a
+ * pointer to the start of the PPA is returned; otherwise NULL is returned.
*/
static char *
ill_get_ppa_ptr(char *name)
{
- int namelen = mi_strlen(name);
+ int namelen = strlen(name);
+ int end_ndx = namelen - 1;
+ int ppa_ndx, i;
- int len = namelen;
+ /*
+ * Check that the first character is [a-zA-Z], and that the last
+ * character is [0-9].
+ */
+ if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx]))
+ return (NULL);
- name += len;
- while (len > 0) {
- name--;
- if (*name < '0' || *name > '9')
+ /*
+ * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
+ */
+ for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--)
+ if (!isdigit(name[ppa_ndx - 1]))
break;
- len--;
- }
- /* empty string, all digits, or no trailing digits */
- if (len == 0 || len == (int)namelen)
+ if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx)
return (NULL);
- name++;
- /* check for attempted use of octal */
- if (*name == '0' && len != (int)namelen - 1)
- return (NULL);
- return (name);
+ /*
+	 * Check that the intermediate characters are [a-zA-Z0-9._]
+ */
+ for (i = 1; i < ppa_ndx; i++) {
+ if (!isalpha(name[i]) && !isdigit(name[i]) &&
+ name[i] != '.' && name[i] != '_') {
+ return (NULL);
+ }
+ }
+
+ return (name + ppa_ndx);
}
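
The name format documented above ([a-zA-Z]+[a-zA-Z0-9._]*[0-9]+ with no leading zero in the PPA) can be exercised with a small userland re-implementation; this is an illustrative sketch only, not the kernel routine.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/*
 * Return a pointer to the start of the PPA if `name' matches
 * [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+ with no leading zero in the PPA,
 * or NULL otherwise.
 */
static const char *
ppa_ptr(const char *name)
{
	size_t len = strlen(name);
	size_t end, ppa, i;

	if (len == 0 || !isalpha((unsigned char)name[0]) ||
	    !isdigit((unsigned char)name[len - 1]))
		return (NULL);

	end = len - 1;
	for (ppa = end; ppa > 0; ppa--)
		if (!isdigit((unsigned char)name[ppa - 1]))
			break;

	if (name[ppa] == '0' && ppa < end)	/* leading zero in PPA */
		return (NULL);

	for (i = 1; i < ppa; i++) {
		if (!isalnum((unsigned char)name[i]) &&
		    name[i] != '.' && name[i] != '_')
			return (NULL);
	}
	return (name + ppa);
}

int
main(void)
{
	const char *names[] = { "e1000g0", "bge1", "net007", "0bad", "hme" };
	size_t i;

	for (i = 0; i < sizeof (names) / sizeof (names[0]); i++) {
		const char *p = ppa_ptr(names[i]);
		(void) printf("%-8s -> %s\n", names[i],
		    p != NULL ? p : "(invalid)");
	}
	return (0);
}
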
/*
@@ -4037,8 +4090,10 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp,
} else if (ILL_CAN_WAIT(ill, q)) {
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ill->ill_lock);
ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
RELEASE_CONN_LOCK(q);
if (error != NULL)
@@ -4102,6 +4157,7 @@ static void
ill_glist_delete(ill_t *ill)
{
ip_stack_t *ipst;
+ phyint_t *phyi;
if (ill == NULL)
return;
@@ -4139,8 +4195,41 @@ ill_glist_delete(ill_t *ill)
ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
ill->ill_name_length);
- ill_phyint_free(ill);
+ ASSERT(ill->ill_phyint != NULL);
+ phyi = ill->ill_phyint;
+ ill->ill_phyint = NULL;
+
+ /*
+	 * ill_init always allocates a phyint to store the copy of flags
+	 * relevant to the phyint. At that point the ill has not yet been
+	 * named, so phyint_illv4/v6 cannot be initialized. Later, in
+	 * ipif_set_values, we assign the name to the ill and set
+	 * phyint_illv4/v6. Thus we can't rely on phyint_illv6 always
+	 * being initialized.
+ */
+ if (ill->ill_flags & ILLF_IPV6)
+ phyi->phyint_illv6 = NULL;
+ else
+ phyi->phyint_illv4 = NULL;
+
+ if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
+ rw_exit(&ipst->ips_ill_g_lock);
+ return;
+ }
+
+ /*
+ * There are no ills left on this phyint; pull it out of the phyint
+ * avl trees, and free it.
+ */
+ if (phyi->phyint_ifindex > 0) {
+ avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
+ phyi);
+ avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
+ phyi);
+ }
rw_exit(&ipst->ips_ill_g_lock);
+
+ phyint_free(phyi);
}
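
The tail of ill_glist_delete() follows a common teardown rule: each of the two optional ills (v4 and v6) clears its slot in the shared phyint, and only when both slots are empty is the phyint pulled out of its AVL trees and freed. A toy model of that rule, with hypothetical names, is sketched below.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy analog of a phyint with optional v4/v6 children. */
struct parent {
	void	*p_childv4;
	void	*p_childv6;
};

/* Detach one child; free the parent only once both slots are empty. */
static bool
detach_child(struct parent *pp, bool isv6)
{
	if (isv6)
		pp->p_childv6 = NULL;
	else
		pp->p_childv4 = NULL;

	if (pp->p_childv4 != NULL || pp->p_childv6 != NULL)
		return (false);		/* parent still referenced */

	/* Here the real code also removes the parent from its AVL trees. */
	free(pp);
	return (true);
}

int
main(void)
{
	struct parent *pp = calloc(1, sizeof (*pp));
	int v4 = 4, v6 = 6;

	pp->p_childv4 = &v4;
	pp->p_childv6 = &v6;
	(void) printf("freed after v4 detach: %d\n", detach_child(pp, false));
	(void) printf("freed after v6 detach: %d\n", detach_child(pp, true));
	return (0);
}
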
/*
@@ -4367,30 +4456,32 @@ ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
return (0);
}
-/* Initialize the per phyint (per IPMP group) ipsq used for serialization */
+/* Initialize the per phyint ipsq used for serialization */
static boolean_t
-ipsq_init(ill_t *ill)
+ipsq_init(ill_t *ill, boolean_t enter)
{
ipsq_t *ipsq;
+ ipxop_t *ipx;
- /* Init the ipsq and impicitly enter as writer */
- ill->ill_phyint->phyint_ipsq =
- kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP);
- if (ill->ill_phyint->phyint_ipsq == NULL)
+ if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL)
return (B_FALSE);
- ipsq = ill->ill_phyint->phyint_ipsq;
- ipsq->ipsq_phyint_list = ill->ill_phyint;
- ill->ill_phyint->phyint_ipsq_next = NULL;
+
+ ill->ill_phyint->phyint_ipsq = ipsq;
+ ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop;
+ ipx->ipx_ipsq = ipsq;
+ ipsq->ipsq_next = ipsq;
+ ipsq->ipsq_phyint = ill->ill_phyint;
mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0);
- ipsq->ipsq_refs = 1;
- ipsq->ipsq_writer = curthread;
- ipsq->ipsq_reentry_cnt = 1;
+ mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0);
ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */
+ if (enter) {
+ ipx->ipx_writer = curthread;
+ ipx->ipx_forced = B_FALSE;
+ ipx->ipx_reentry_cnt = 1;
#ifdef DEBUG
- ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack,
- IPSQ_STACK_DEPTH);
+ ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
#endif
- (void) strcpy(ipsq->ipsq_name, ill->ill_name);
+ }
return (B_TRUE);
}
@@ -4468,7 +4559,7 @@ ill_init(queue_t *q, ill_t *ill)
ill->ill_ppa = UINT_MAX;
ill->ill_fastpath_list = &ill->ill_fastpath_list;
- if (!ipsq_init(ill)) {
+ if (!ipsq_init(ill, B_TRUE)) {
freemsg(info_mp);
mi_free(frag_ptr);
mi_free(ill->ill_phyint);
@@ -4589,29 +4680,16 @@ loopback_kstat_update(kstat_t *ksp, int rw)
}
/*
- * Has ifindex been plumbed already.
- * Compares both phyint_ifindex and phyint_group_ifindex.
+ * Has ifindex been plumbed already?
*/
static boolean_t
phyint_exists(uint_t index, ip_stack_t *ipst)
{
- phyint_t *phyi;
-
ASSERT(index != 0);
ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
- /*
- * Indexes are stored in the phyint - a common structure
- * to both IPv4 and IPv6.
- */
- phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
- for (; phyi != NULL;
- phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
- phyi, AVL_AFTER)) {
- if (phyi->phyint_ifindex == index ||
- phyi->phyint_group_ifindex == index)
- return (B_TRUE);
- }
- return (B_FALSE);
+
+ return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
+ &index, NULL) != NULL);
}
/* Pick a unique ifindex */
@@ -4675,9 +4753,9 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
{
ill_t *ill;
ipif_t *ipif;
+ ipsq_t *ipsq;
kstat_named_t *kn;
boolean_t isloopback;
- ipsq_t *old_ipsq;
in6_addr_t ov6addr;
isloopback = mi_strcmp(name, ipif_loopback_name) == 0;
@@ -4761,16 +4839,10 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
ill->ill_net_type = IRE_LOOPBACK;
/* Initialize the ipsq */
- if (!ipsq_init(ill))
+ if (!ipsq_init(ill, B_FALSE))
goto done;
- ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL;
- ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--;
- ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0);
-#ifdef DEBUG
- ill->ill_phyint->phyint_ipsq->ipsq_depth = 0;
-#endif
- ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE);
+ ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE);
if (ipif == NULL)
goto done;
@@ -4807,7 +4879,7 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
ill->ill_frag_free_num_pkts = 0;
ill->ill_last_frag_clean_time = 0;
- old_ipsq = ill->ill_phyint->phyint_ipsq;
+ ipsq = ill->ill_phyint->phyint_ipsq;
if (ill_glist_insert(ill, "lo", isv6) != 0)
cmn_err(CE_PANIC, "cannot insert loopback interface");
@@ -4824,13 +4896,11 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
sctp_update_ipif_addr(ipif, ov6addr);
/*
- * If the ipsq was changed in ill_phyint_reinit free the old ipsq.
+ * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs.
+ * If so, free our original one.
*/
- if (old_ipsq != ill->ill_phyint->phyint_ipsq) {
- /* Loopback ills aren't in any IPMP group */
- ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP));
- ipsq_delete(old_ipsq);
- }
+ if (ipsq != ill->ill_phyint->phyint_ipsq)
+ ipsq_delete(ipsq);
/*
* Delay this till the ipif is allocated as ipif_allocate
@@ -4871,12 +4941,10 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
done:
if (ill != NULL) {
if (ill->ill_phyint != NULL) {
- ipsq_t *ipsq;
-
ipsq = ill->ill_phyint->phyint_ipsq;
if (ipsq != NULL) {
- ipsq->ipsq_ipst = NULL;
- kmem_free(ipsq, sizeof (ipsq_t));
+ ipsq->ipsq_phyint = NULL;
+ ipsq_delete(ipsq);
}
mi_free(ill->ill_phyint);
}
@@ -4954,9 +5022,11 @@ ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp,
} else if (ILL_CAN_WAIT(ill, q)) {
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
rw_exit(&ipst->ips_ill_g_lock);
mutex_exit(&ill->ill_lock);
ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
RELEASE_CONN_LOCK(q);
if (err != NULL)
@@ -5294,6 +5364,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
dl_info_ack_t *dlia;
ip_m_t *ipm;
dl_qos_cl_sel1_t *sel1;
+ int min_mtu;
ASSERT(IAM_WRITER_ILL(ill));
@@ -5336,7 +5407,14 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
ill->ill_bcast_addr_length = brdcst_addr_length;
ill->ill_phys_addr_length = phys_addr_length;
ill->ill_sap_length = sap_length;
- ill->ill_max_frag = dlia->dl_max_sdu;
+
+ /*
+ * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU,
+ * but we must ensure a minimum IP MTU is used since other bits of
+ * IP will fly apart otherwise.
+ */
+ min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
+ ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
ill->ill_max_mtu = ill->ill_max_frag;
ill->ill_type = ipm->ip_m_type;
@@ -5358,7 +5436,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
* the wakeup.
*/
(void) ipif_allocate(ill, 0, IRE_LOCAL,
- dlia->dl_provider_style == DL_STYLE2 ? B_FALSE : B_TRUE);
+ dlia->dl_provider_style != DL_STYLE2, B_TRUE);
mutex_enter(&ill->ill_lock);
ASSERT(ill->ill_dlpi_style_set == 0);
ill->ill_dlpi_style_set = 1;
@@ -5397,8 +5475,13 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
/*
* Free ill_resolver_mp and ill_bcast_mp as things could have
* changed now.
+ *
+ * NOTE: The IPMP meta-interface is special-cased because it starts
+ * with no underlying interfaces (and thus an unknown broadcast
+ * address length), but we enforce that an interface is broadcast-
+ * capable as part of allowing it to join a group.
*/
- if (ill->ill_bcast_addr_length == 0) {
+ if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) {
if (ill->ill_resolver_mp != NULL)
freemsg(ill->ill_resolver_mp);
if (ill->ill_bcast_mp != NULL)
@@ -5451,6 +5534,11 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
if (!ill->ill_isv6)
ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
}
+
+ /* For IPMP, PHYI_IPMP should already be set by ipif_allocate() */
+ if (ill->ill_mactype == SUNW_DL_IPMP)
+ ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP);
+
/* By default an interface does not support any CoS marking */
ill->ill_flags &= ~ILLF_COS_ENABLED;
@@ -5552,16 +5640,18 @@ ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
}
/*
- * Find any non-virtual, not condemned, and up multicast capable interface
- * given an IP instance and zoneid. Order of preference is:
+ * Find a multicast-capable ipif given an IP instance and zoneid.
+ * The ipif must be up, and its ill must be multicast-capable, not
+ * condemned, not an underlying interface in an IPMP group, and
+ * not a VNI interface. Order of preference:
*
- * 1. normal
- * 1.1 normal, but deprecated
- * 2. point to point
- * 2.1 point to point, but deprecated
- * 3. link local
- * 3.1 link local, but deprecated
- * 4. loopback.
+ * 1a. normal
+ * 1b. normal, but deprecated
+ * 2a. point to point
+ * 2b. point to point, but deprecated
+ * 3a. link local
+ * 3b. link local, but deprecated
+ * 4. loopback.
*/
ipif_t *
ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
@@ -5580,7 +5670,7 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
mutex_enter(&ill->ill_lock);
- if (IS_VNI(ill) || !ILL_CAN_LOOKUP(ill) ||
+ if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || !ILL_CAN_LOOKUP(ill) ||
!(ill->ill_flags & ILLF_MULTICAST)) {
mutex_exit(&ill->ill_lock);
continue;
@@ -5736,10 +5826,12 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp,
} else if (IPIF_CAN_WAIT(ipif, q)) {
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ill->ill_lock);
rw_exit(&ipst->ips_ill_g_lock);
ipsq_enq(ipsq, q, mp, func, NEW_OP,
ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
RELEASE_CONN_LOCK(q);
if (error != NULL)
@@ -5761,15 +5853,12 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp,
}
/*
- * Look for an ipif with the specified address. For point-point links
- * we look for matches on either the destination address and the local
- * address, but we ignore the check on the local address if IPIF_UNNUMBERED
- * is set.
- * Matches on a specific ill if match_ill is set.
+ * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact().
*/
-ipif_t *
-ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q,
- mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
+static ipif_t *
+ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, boolean_t match_illgrp,
+ zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error,
+ ip_stack_t *ipst)
{
ipif_t *ipif;
ill_t *ill;
@@ -5788,7 +5877,8 @@ ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q,
repeat:
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if (match_ill != NULL && ill != match_ill) {
+ if (match_ill != NULL && ill != match_ill &&
+ (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
continue;
}
GRAB_CONN_LOCK(q);
@@ -5817,10 +5907,12 @@ repeat:
} else if (IPIF_CAN_WAIT(ipif, q)) {
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ill->ill_lock);
rw_exit(&ipst->ips_ill_g_lock);
ipsq_enq(ipsq, q, mp, func, NEW_OP,
ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
RELEASE_CONN_LOCK(q);
if (error != NULL)
@@ -5894,11 +5986,40 @@ ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
}
/*
+ * Lookup an ipif with the specified address. For point-to-point links we
+ * look for matches on either the destination address or the local address,
+ * but we skip the local address check if IPIF_UNNUMBERED is set. If the
+ * `match_ill' argument is non-NULL, the lookup is restricted to that ill
+ * (or illgrp if `match_ill' is in an IPMP group).
+ */
+ipif_t *
+ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q,
+ mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
+{
+ return (ipif_lookup_addr_common(addr, match_ill, B_TRUE, zoneid, q, mp,
+ func, error, ipst));
+}
+
+/*
+ * Special abbreviated version of ipif_lookup_addr() that doesn't match
+ * `match_ill' across the IPMP group. This function is only needed in some
+ * corner-cases; almost everything should use ipif_lookup_addr().
+ */
+static ipif_t *
+ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
+{
+ ASSERT(match_ill != NULL);
+ return (ipif_lookup_addr_common(addr, match_ill, B_FALSE, ALL_ZONES,
+ NULL, NULL, NULL, NULL, ipst));
+}
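
The `continue' test in ipif_lookup_addr_common() is easier to read in its positive form: a candidate ill is accepted when no specific match_ill was requested, when it is the requested ill, or when group matching is allowed and the two share an illgrp. The sketch below states that predicate directly; same_group() is a hypothetical stand-in for IS_IN_SAME_ILLGRP().

#include <stdbool.h>
#include <stdio.h>

struct ill {
	const char	*name;
	int		grp;	/* 0 means not in any group */
};

/* Hypothetical stand-in for IS_IN_SAME_ILLGRP(). */
static bool
same_group(const struct ill *a, const struct ill *b)
{
	return (a->grp != 0 && a->grp == b->grp);
}

/*
 * Accept `cand' when no specific ill was requested, when it is the
 * requested ill, or when group matching is allowed and both ills are
 * in the same group -- the positive form of the `continue' test above.
 */
static bool
ill_matches(const struct ill *cand, const struct ill *match,
    bool match_illgrp)
{
	return (match == NULL || cand == match ||
	    (match_illgrp && same_group(cand, match)));
}

int
main(void)
{
	struct ill a = { "net0", 1 }, b = { "net1", 1 }, c = { "net2", 0 };

	(void) printf("%d %d %d\n",
	    ill_matches(&b, &a, true),		/* 1: same group allowed */
	    ill_matches(&b, &a, false),		/* 0: exact match required */
	    ill_matches(&c, NULL, true));	/* 1: no restriction */
	return (0);
}
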
+
+/*
* Look for an ipif with the specified address. For point-point links
* we look for matches on either the destination address and the local
* address, but we ignore the check on the local address if IPIF_UNNUMBERED
* is set.
- * Matches on a specific ill if match_ill is set.
+ * If the `match_ill' argument is non-NULL, the lookup is restricted to that
+ * ill (or illgrp if `match_ill' is in an IPMP group).
* Return the zoneid for the ipif which matches. ALL_ZONES if no match.
*/
zoneid_t
@@ -5918,7 +6039,8 @@ ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
repeat:
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if (match_ill != NULL && ill != match_ill) {
+ if (match_ill != NULL && ill != match_ill &&
+ !IS_IN_SAME_ILLGRP(ill, match_ill)) {
continue;
}
mutex_enter(&ill->ill_lock);
@@ -6008,7 +6130,7 @@ ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
/*
	 * The callers of this function want to know the
* interface on which they have to send the replies
- * back. For IRE_CACHES that have ire_stq and ire_ipif
+ * back. For IREs that have ire_stq and ire_ipif
* derived from different ills, we really don't care
* what we return here.
*/
@@ -6109,30 +6231,6 @@ ipif_is_freeable(ipif_t *ipif)
}
/*
- * This func does not prevent refcnt from increasing. But if
- * the caller has taken steps to that effect, then this func
- * can be used to determine whether the ipifs marked with IPIF_MOVING
- * have become quiescent and can be moved in a failover/failback.
- */
-static ipif_t *
-ill_quiescent_to_move(ill_t *ill)
-{
- ipif_t *ipif;
-
- ASSERT(MUTEX_HELD(&ill->ill_lock));
-
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if (ipif->ipif_state_flags & IPIF_MOVING) {
- if (ipif->ipif_refcnt != 0 ||
- !IPIF_DOWN_OK(ipif)) {
- return (ipif);
- }
- }
- }
- return (NULL);
-}
-
-/*
* The ipif/ill/ire has been refreled. Do the tail processing.
* Determine if the ipif or ill in question has become quiescent and if so
* wakeup close and/or restart any queued pending ioctl that is waiting
@@ -6144,87 +6242,61 @@ ipif_ill_refrele_tail(ill_t *ill)
mblk_t *mp;
conn_t *connp;
ipsq_t *ipsq;
+ ipxop_t *ipx;
ipif_t *ipif;
dl_notify_ind_t *dlindp;
ASSERT(MUTEX_HELD(&ill->ill_lock));
- if ((ill->ill_state_flags & ILL_CONDEMNED) &&
- ill_is_freeable(ill)) {
- /* ill_close may be waiting */
+ if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) {
+ /* ip_modclose() may be waiting */
cv_broadcast(&ill->ill_cv);
}
- /* ipsq can't change because ill_lock is held */
ipsq = ill->ill_phyint->phyint_ipsq;
- if (ipsq->ipsq_waitfor == 0) {
- /* Not waiting for anything, just return. */
- mutex_exit(&ill->ill_lock);
- return;
- }
- ASSERT(ipsq->ipsq_pending_mp != NULL &&
- ipsq->ipsq_pending_ipif != NULL);
- /*
- * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF.
- * Last ipif going down needs to down the ill, so ill_ire_cnt must
- * be zero for restarting an ioctl that ends up downing the ill.
- */
- ipif = ipsq->ipsq_pending_ipif;
- if (ipif->ipif_ill != ill) {
- /* The ioctl is pending on some other ill. */
- mutex_exit(&ill->ill_lock);
- return;
- }
+ mutex_enter(&ipsq->ipsq_lock);
+ ipx = ipsq->ipsq_xop;
+ mutex_enter(&ipx->ipx_lock);
+ if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */
+ goto unlock;
+
+ ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL);
+
+ ipif = ipx->ipx_pending_ipif;
+ if (ipif->ipif_ill != ill) /* wait is for another ill; bail */
+ goto unlock;
- switch (ipsq->ipsq_waitfor) {
+ switch (ipx->ipx_waitfor) {
case IPIF_DOWN:
- if (!ipif_is_quiescent(ipif)) {
- mutex_exit(&ill->ill_lock);
- return;
- }
+ if (!ipif_is_quiescent(ipif))
+ goto unlock;
break;
case IPIF_FREE:
- if (!ipif_is_freeable(ipif)) {
- mutex_exit(&ill->ill_lock);
- return;
- }
+ if (!ipif_is_freeable(ipif))
+ goto unlock;
break;
-
case ILL_DOWN:
- if (!ill_is_quiescent(ill)) {
- mutex_exit(&ill->ill_lock);
- return;
- }
+ if (!ill_is_quiescent(ill))
+ goto unlock;
break;
case ILL_FREE:
/*
- * case ILL_FREE arises only for loopback. otherwise ill_delete
- * waits synchronously in ip_close, and no message is queued in
- * ipsq_pending_mp at all in this case
+ * ILL_FREE is only for loopback; normal ill teardown waits
+		 * synchronously in ip_modclose() without using ipx_waitfor;
+		 * that case is handled by the cv_broadcast() at the top of
+		 * this function.
*/
- if (!ill_is_freeable(ill)) {
- mutex_exit(&ill->ill_lock);
- return;
- }
- break;
-
- case ILL_MOVE_OK:
- if (ill_quiescent_to_move(ill) != NULL) {
- mutex_exit(&ill->ill_lock);
- return;
- }
+ if (!ill_is_freeable(ill))
+ goto unlock;
break;
default:
- cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n",
- (void *)ipsq, ipsq->ipsq_waitfor);
+ cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n",
+ (void *)ipsq, ipx->ipx_waitfor);
}
- /*
- * Incr refcnt for the qwriter_ip call below which
- * does a refrele
- */
- ill_refhold_locked(ill);
+ ill_refhold_locked(ill); /* for qwriter_ip() call below */
+ mutex_exit(&ipx->ipx_lock);
mp = ipsq_pending_mp_get(ipsq, &connp);
+ mutex_exit(&ipsq->ipsq_lock);
mutex_exit(&ill->ill_lock);
ASSERT(mp != NULL);
@@ -6249,6 +6321,7 @@ ipif_ill_refrele_tail(ill_t *ill)
return;
default:
ASSERT(0);
+ ill_refrele(ill);
}
break;
@@ -6268,6 +6341,11 @@ ipif_ill_refrele_tail(ill_t *ill)
cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
"db_type %d\n", (void *)mp, mp->b_datap->db_type);
}
+ return;
+unlock:
+ mutex_exit(&ipsq->ipsq_lock);
+ mutex_exit(&ipx->ipx_lock);
+ mutex_exit(&ill->ill_lock);
}
#ifdef DEBUG
@@ -6902,10 +6980,23 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
ipif = ipif_arg;
if (ipif_arg != NULL)
match_flags |= MATCH_IRE_ILL;
+again:
gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL,
ALL_ZONES, 0, NULL, match_flags, ipst);
- if (gw_ire == NULL)
+ if (gw_ire == NULL) {
+ /*
+ * With IPMP, we allow host routes to influence in.mpathd's
+ * target selection. However, if the test addresses are on
+ * their own network, the above lookup will fail since the
+ * underlying IRE_INTERFACEs are marked hidden. So allow
+ * hidden test IREs to be found and try again.
+ */
+ if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) {
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
+ goto again;
+ }
return (ENETUNREACH);
+ }
/*
* We create one of three types of IREs as a result of this request
@@ -7355,9 +7446,11 @@ void
ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
ill_t *pending_ill)
{
- conn_t *connp = NULL;
+ conn_t *connp;
+ ipxop_t *ipx = ipsq->ipsq_xop;
ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
+ ASSERT(MUTEX_HELD(&ipx->ipx_lock));
ASSERT(func != NULL);
mp->b_queue = q;
@@ -7366,14 +7459,14 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
switch (type) {
case CUR_OP:
- if (ipsq->ipsq_mptail != NULL) {
- ASSERT(ipsq->ipsq_mphead != NULL);
- ipsq->ipsq_mptail->b_next = mp;
+ if (ipx->ipx_mptail != NULL) {
+ ASSERT(ipx->ipx_mphead != NULL);
+ ipx->ipx_mptail->b_next = mp;
} else {
- ASSERT(ipsq->ipsq_mphead == NULL);
- ipsq->ipsq_mphead = mp;
+ ASSERT(ipx->ipx_mphead == NULL);
+ ipx->ipx_mphead = mp;
}
- ipsq->ipsq_mptail = mp;
+ ipx->ipx_mptail = mp;
break;
case NEW_OP:
@@ -7385,6 +7478,15 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
ipsq->ipsq_xopq_mphead = mp;
}
ipsq->ipsq_xopq_mptail = mp;
+ ipx->ipx_ipsq_queued = B_TRUE;
+ break;
+
+ case SWITCH_OP:
+ ASSERT(ipsq->ipsq_swxop != NULL);
+ /* only one switch operation is currently allowed */
+ ASSERT(ipsq->ipsq_switch_mp == NULL);
+ ipsq->ipsq_switch_mp = mp;
+ ipx->ipx_ipsq_queued = B_TRUE;
break;
default:
cmn_err(CE_PANIC, "ipsq_enq %d type \n", type);
@@ -7392,55 +7494,273 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
if (CONN_Q(q) && pending_ill != NULL) {
connp = Q_TO_CONN(q);
-
ASSERT(MUTEX_HELD(&connp->conn_lock));
connp->conn_oper_pending_ill = pending_ill;
}
}
/*
- * Return the mp at the head of the ipsq. After emptying the ipsq
- * look at the next ioctl, if this ioctl is complete. Otherwise
- * return, we will resume when we complete the current ioctl.
- * The current ioctl will wait till it gets a response from the
- * driver below.
+ * Dequeue the next message that requested exclusive access to this IPSQ's
+ * xop. Specifically:
+ *
+ * 1. If we're still processing the current operation on `ipsq', then
+ * dequeue the next message for the operation (from ipx_mphead), or
+ * return NULL if there are no queued messages for the operation.
+ * These messages are queued via CUR_OP to qwriter_ip() and friends.
+ *
+ * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is
+ * not set) see if the ipsq has requested an xop switch. If so, switch
+ * `ipsq' to a different xop. Xop switches only happen when joining or
+ * leaving IPMP groups and require a careful dance -- see the comments
+ * in-line below for details. If we're leaving a group xop or if we're
+ * joining a group xop and become writer on it, then we proceed to (3).
+ * Otherwise, we return NULL and exit the xop.
+ *
+ * 3. For each IPSQ in the xop, return any switch operation stored on
+ * ipsq_switch_mp (set via SWITCH_OP); these must be processed before
+ * any other messages queued on the IPSQ. Otherwise, dequeue the next
+ * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead.
+ * Note that if the phyint tied to `ipsq' is not using IPMP there will
+ * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for
+ * each phyint in the group, including the IPMP meta-interface phyint.
*/
static mblk_t *
ipsq_dq(ipsq_t *ipsq)
{
+ ill_t *illv4, *illv6;
mblk_t *mp;
+ ipsq_t *xopipsq;
+ ipsq_t *leftipsq = NULL;
+ ipxop_t *ipx;
+ phyint_t *phyi = ipsq->ipsq_phyint;
+ ip_stack_t *ipst = ipsq->ipsq_ipst;
+ boolean_t emptied = B_FALSE;
- ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
+ /*
+ * Grab all the locks we need in the defined order (ill_g_lock ->
+ * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next.
+ */
+ rw_enter(&ipst->ips_ill_g_lock,
+ ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER);
+ mutex_enter(&ipsq->ipsq_lock);
+ ipx = ipsq->ipsq_xop;
+ mutex_enter(&ipx->ipx_lock);
- mp = ipsq->ipsq_mphead;
- if (mp != NULL) {
- ipsq->ipsq_mphead = mp->b_next;
- if (ipsq->ipsq_mphead == NULL)
- ipsq->ipsq_mptail = NULL;
- mp->b_next = NULL;
- return (mp);
+ /*
+ * Dequeue the next message associated with the current exclusive
+ * operation, if any.
+ */
+ if ((mp = ipx->ipx_mphead) != NULL) {
+ ipx->ipx_mphead = mp->b_next;
+ if (ipx->ipx_mphead == NULL)
+ ipx->ipx_mptail = NULL;
+ mp->b_next = (void *)ipsq;
+ goto out;
}
- if (ipsq->ipsq_current_ipif != NULL)
- return (NULL);
- mp = ipsq->ipsq_xopq_mphead;
- if (mp != NULL) {
- ipsq->ipsq_xopq_mphead = mp->b_next;
- if (ipsq->ipsq_xopq_mphead == NULL)
- ipsq->ipsq_xopq_mptail = NULL;
- mp->b_next = NULL;
- return (mp);
+
+ if (ipx->ipx_current_ipif != NULL)
+ goto empty;
+
+ if (ipsq->ipsq_swxop != NULL) {
+ /*
+ * The exclusive operation that is now being completed has
+ * requested a switch to a different xop. This happens
+ * when an interface joins or leaves an IPMP group. Joins
+ * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()).
+ * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb
+ * (phyint_free()), or interface plumb for an ill type
+ * not in the IPMP group (ip_rput_dlpi_writer()).
+ *
+ * Xop switches are not allowed on the IPMP meta-interface.
+ */
+ ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP));
+ ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
+ DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq);
+
+ if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) {
+ /*
+ * We're switching back to our own xop, so we have two
+ * xop's to drain/exit: our own, and the group xop
+ * that we are leaving.
+ *
+ * First, pull ourselves out of the group ipsq list.
+ * This is safe since we're writer on ill_g_lock.
+ */
+ ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop);
+
+ xopipsq = ipx->ipx_ipsq;
+ while (xopipsq->ipsq_next != ipsq)
+ xopipsq = xopipsq->ipsq_next;
+
+ xopipsq->ipsq_next = ipsq->ipsq_next;
+ ipsq->ipsq_next = ipsq;
+ ipsq->ipsq_xop = ipsq->ipsq_swxop;
+ ipsq->ipsq_swxop = NULL;
+
+ /*
+ * Second, prepare to exit the group xop. The actual
+ * ipsq_exit() is done at the end of this function
+ * since we cannot hold any locks across ipsq_exit().
+ * Note that although we drop the group's ipx_lock, no
+ * threads can proceed since we're still ipx_writer.
+ */
+ leftipsq = xopipsq;
+ mutex_exit(&ipx->ipx_lock);
+
+ /*
+ * Third, set ipx to point to our own xop (which was
+ * inactive and therefore can be entered).
+ */
+ ipx = ipsq->ipsq_xop;
+ mutex_enter(&ipx->ipx_lock);
+ ASSERT(ipx->ipx_writer == NULL);
+ ASSERT(ipx->ipx_current_ipif == NULL);
+ } else {
+ /*
+ * We're switching from our own xop to a group xop.
+ * The requestor of the switch must ensure that the
+ * group xop cannot go away (e.g. by ensuring the
+ * phyint associated with the xop cannot go away).
+ *
+ * If we can become writer on our new xop, then we'll
+ * do the drain. Otherwise, the current writer of our
+ * new xop will do the drain when it exits.
+ *
+ * First, splice ourselves into the group IPSQ list.
+ * This is safe since we're writer on ill_g_lock.
+ */
+ ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
+
+ xopipsq = ipsq->ipsq_swxop->ipx_ipsq;
+ while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq)
+ xopipsq = xopipsq->ipsq_next;
+
+ xopipsq->ipsq_next = ipsq;
+ ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq;
+ ipsq->ipsq_xop = ipsq->ipsq_swxop;
+ ipsq->ipsq_swxop = NULL;
+
+ /*
+ * Second, exit our own xop, since it's now unused.
+ * This is safe since we've got the only reference.
+ */
+ ASSERT(ipx->ipx_writer == curthread);
+ ipx->ipx_writer = NULL;
+ VERIFY(--ipx->ipx_reentry_cnt == 0);
+ ipx->ipx_ipsq_queued = B_FALSE;
+ mutex_exit(&ipx->ipx_lock);
+
+ /*
+ * Third, set ipx to point to our new xop, and check
+ * if we can become writer on it. If we cannot, then
+ * the current writer will drain the IPSQ group when
+ * it exits. Our ipsq_xop is guaranteed to be stable
+ * because we're still holding ipsq_lock.
+ */
+ ipx = ipsq->ipsq_xop;
+ mutex_enter(&ipx->ipx_lock);
+ if (ipx->ipx_writer != NULL ||
+ ipx->ipx_current_ipif != NULL) {
+ goto out;
+ }
+ }
+
+ /*
+ * Fourth, become writer on our new ipx before we continue
+ * with the drain. Note that we never dropped ipsq_lock
+ * above, so no other thread could've raced with us to
+ * become writer first. Also, we're holding ipx_lock, so
+ * no other thread can examine the ipx right now.
+ */
+ ASSERT(ipx->ipx_current_ipif == NULL);
+ ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
+ VERIFY(ipx->ipx_reentry_cnt++ == 0);
+ ipx->ipx_writer = curthread;
+ ipx->ipx_forced = B_FALSE;
+#ifdef DEBUG
+ ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
+#endif
}
- return (NULL);
+
+ xopipsq = ipsq;
+ do {
+ /*
+ * So that other operations operate on a consistent and
+ * complete phyint, a switch message on an IPSQ must be
+ * handled prior to any other operations on that IPSQ.
+ */
+ if ((mp = xopipsq->ipsq_switch_mp) != NULL) {
+ xopipsq->ipsq_switch_mp = NULL;
+ ASSERT(mp->b_next == NULL);
+ mp->b_next = (void *)xopipsq;
+ goto out;
+ }
+
+ if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) {
+ xopipsq->ipsq_xopq_mphead = mp->b_next;
+ if (xopipsq->ipsq_xopq_mphead == NULL)
+ xopipsq->ipsq_xopq_mptail = NULL;
+ mp->b_next = (void *)xopipsq;
+ goto out;
+ }
+ } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
+empty:
+ /*
+ * There are no messages. Further, we are holding ipx_lock, hence no
+ * new messages can end up on any IPSQ in the xop.
+ */
+ ipx->ipx_writer = NULL;
+ ipx->ipx_forced = B_FALSE;
+ VERIFY(--ipx->ipx_reentry_cnt == 0);
+ ipx->ipx_ipsq_queued = B_FALSE;
+ emptied = B_TRUE;
+#ifdef DEBUG
+ ipx->ipx_depth = 0;
+#endif
+out:
+ mutex_exit(&ipx->ipx_lock);
+ mutex_exit(&ipsq->ipsq_lock);
+
+ /*
+ * If we completely emptied the xop, then wake up any threads waiting
+	 * to enter any of the IPSQs associated with it.
+ */
+ if (emptied) {
+ xopipsq = ipsq;
+ do {
+ if ((phyi = xopipsq->ipsq_phyint) == NULL)
+ continue;
+
+ illv4 = phyi->phyint_illv4;
+ illv6 = phyi->phyint_illv6;
+
+ GRAB_ILL_LOCKS(illv4, illv6);
+ if (illv4 != NULL)
+ cv_broadcast(&illv4->ill_cv);
+ if (illv6 != NULL)
+ cv_broadcast(&illv6->ill_cv);
+ RELEASE_ILL_LOCKS(illv4, illv6);
+ } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+
+ /*
+ * Now that all locks are dropped, exit the IPSQ we left.
+ */
+ if (leftipsq != NULL)
+ ipsq_exit(leftipsq);
+
+ return (mp);
}
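
Much of the delicacy in the xop switch inside ipsq_dq() is ordinary circular-list surgery on the ipsq_next ring: to leave a group xop, the IPSQ's predecessor is located and pointed past it; to join, the IPSQ is spliced in behind the group's head. The following self-contained sketch shows just that ring manipulation with a generic node type rather than ipsq_t.

#include <stdio.h>

/* A minimal circular singly-linked ring, like the ipsq_next chain. */
struct node {
	const char	*name;
	struct node	*next;
};

/* Splice `n' into the ring that `head' belongs to, just behind `head'. */
static void
ring_join(struct node *head, struct node *n)
{
	struct node *prev = head;

	while (prev->next != head)	/* find head's predecessor */
		prev = prev->next;
	prev->next = n;
	n->next = head;
}

/* Remove `n' from its ring, leaving it as a singleton ring. */
static void
ring_leave(struct node *n)
{
	struct node *prev = n;

	while (prev->next != n)		/* find n's predecessor */
		prev = prev->next;
	prev->next = n->next;
	n->next = n;
}

static void
ring_print(struct node *start)
{
	struct node *n = start;

	do {
		(void) printf("%s ", n->name);
	} while ((n = n->next) != start);
	(void) printf("\n");
}

int
main(void)
{
	struct node a = { "ipmp0", &a }, b = { "net0", &b }, c = { "net1", &c };

	ring_join(&a, &b);	/* net0 joins ipmp0's ring */
	ring_join(&a, &c);	/* net1 joins as well */
	ring_print(&a);		/* ipmp0 net0 net1 */
	ring_leave(&b);		/* net0 leaves the group */
	ring_print(&a);		/* ipmp0 net1 */
	ring_print(&b);		/* net0 is now a singleton ring */
	return (0);
}
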
/*
* Enter the ipsq corresponding to ill, by waiting synchronously till
* we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
* will have to drain completely before ipsq_enter returns success.
- * ipsq_current_ipif will be set if some exclusive ioctl is in progress,
- * and the ipsq_exit logic will start the next enqueued ioctl after
- * completion of the current ioctl. If 'force' is used, we don't wait
- * for the enqueued ioctls. This is needed when a conn_close wants to
+ * ipx_current_ipif will be set if some exclusive op is in progress,
+ * and the ipsq_exit logic will start the next enqueued op after
+ * completion of the current op. If 'force' is used, we don't wait
+ * for the enqueued ops. This is needed when a conn_close wants to
* enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
 * of an ill can also use this option. But we don't use it currently.
*/
@@ -7449,13 +7769,16 @@ boolean_t
ipsq_enter(ill_t *ill, boolean_t force, int type)
{
ipsq_t *ipsq;
+ ipxop_t *ipx;
boolean_t waited_enough = B_FALSE;
/*
- * Holding the ill_lock prevents <ill-ipsq> assocs from changing.
- * Since the <ill-ipsq> assocs could change while we wait for the
- * writer, it is easier to wait on a fixed global rather than try to
- * cv_wait on a changing ipsq.
+ * Note that the relationship between ill and ipsq is fixed as long as
+ * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the
+ * relationship between the IPSQ and xop cannot change. However,
+ * since we cannot hold ipsq_lock across the cv_wait(), it may change
+ * while we're waiting. We wait on ill_cv and rely on ipsq_exit()
+ * waking up all ills in the xop when it becomes available.
*/
mutex_enter(&ill->ill_lock);
for (;;) {
@@ -7466,34 +7789,35 @@ ipsq_enter(ill_t *ill, boolean_t force, int type)
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
- if (ipsq->ipsq_writer == NULL &&
- (type == CUR_OP || ipsq->ipsq_current_ipif == NULL ||
- waited_enough)) {
+ ipx = ipsq->ipsq_xop;
+ mutex_enter(&ipx->ipx_lock);
+
+ if (ipx->ipx_writer == NULL && (type == CUR_OP ||
+ ipx->ipx_current_ipif == NULL || waited_enough))
break;
- } else if (ipsq->ipsq_writer != NULL) {
+
+ if (!force || ipx->ipx_writer != NULL) {
+ mutex_exit(&ipx->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
cv_wait(&ill->ill_cv, &ill->ill_lock);
} else {
+ mutex_exit(&ipx->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
- if (force) {
- (void) cv_timedwait(&ill->ill_cv,
- &ill->ill_lock,
- lbolt + ENTER_SQ_WAIT_TICKS);
- waited_enough = B_TRUE;
- continue;
- } else {
- cv_wait(&ill->ill_cv, &ill->ill_lock);
- }
+ (void) cv_timedwait(&ill->ill_cv,
+ &ill->ill_lock, lbolt + ENTER_SQ_WAIT_TICKS);
+ waited_enough = B_TRUE;
}
}
- ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL);
- ASSERT(ipsq->ipsq_reentry_cnt == 0);
- ipsq->ipsq_writer = curthread;
- ipsq->ipsq_reentry_cnt++;
+ ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
+ ASSERT(ipx->ipx_reentry_cnt == 0);
+ ipx->ipx_writer = curthread;
+ ipx->ipx_forced = (ipx->ipx_current_ipif != NULL);
+ ipx->ipx_reentry_cnt++;
#ifdef DEBUG
- ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IPSQ_STACK_DEPTH);
+ ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
#endif
+ mutex_exit(&ipx->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
mutex_exit(&ill->ill_lock);
return (B_TRUE);
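
ipsq_enter() has two waiting modes: a normal caller blocks on ill_cv until the writer slot is free and no operation is in progress, while a `force' caller waits only a bounded time (ENTER_SQ_WAIT_TICKS) before proceeding even though an operation is still current; a live writer is never preempted. The pthreads sketch below loosely models that bounded-wait fallback and is illustrative only.

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t	lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t	cv = PTHREAD_COND_INITIALIZER;
static bool		writer_present = false;	/* like ipx_writer != NULL */
static bool		op_in_progress = true;	/* like ipx_current_ipif */

/*
 * A forced caller waits at most `max_secs' for the in-progress operation
 * to finish, then enters anyway; neither caller may preempt a live
 * writer.  This loosely mirrors the waited_enough handling above.
 */
static void
slot_enter(bool force, int max_secs)
{
	bool waited_enough = false;
	struct timespec ts;

	pthread_mutex_lock(&lock);
	while (writer_present || (op_in_progress && !waited_enough)) {
		if (!force || writer_present) {
			pthread_cond_wait(&cv, &lock);
		} else {
			(void) clock_gettime(CLOCK_REALTIME, &ts);
			ts.tv_sec += max_secs;
			if (pthread_cond_timedwait(&cv, &lock, &ts) ==
			    ETIMEDOUT)
				waited_enough = true;
		}
	}
	writer_present = true;		/* we are now the writer */
	pthread_mutex_unlock(&lock);
	(void) printf("entered (forced=%d)\n", force);
}

int
main(void)
{
	slot_enter(true, 1);	/* ~1s timed wait, then forced entry */
	return (0);
}
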
@@ -7513,14 +7837,13 @@ ill_perim_exit(ill_t *ill)
/*
* The ipsq_t (ipsq) is the synchronization data structure used to serialize
- * certain critical operations like plumbing (i.e. most set ioctls),
- * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP
- * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per
- * IPMP group. The ipsq serializes exclusive ioctls issued by applications
- * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple
- * threads executing in the ipsq. Responses from the driver pertain to the
- * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated
- * as part of bringing up the interface) and are enqueued in ipsq_mphead.
+ * certain critical operations like plumbing (i.e. most set ioctls), multicast
+ * joins, igmp/mld timers, etc. There is one ipsq per phyint. The ipsq
+ * serializes exclusive ioctls issued by applications on a per ipsq basis in
+ * ipsq_xopq_mphead. It also protects against multiple threads executing in
+ * the ipsq. Responses from the driver pertain to the current ioctl (say a
+ * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing
+ * up the interface) and are enqueued in ipx_mphead.
*
* If a thread does not want to reenter the ipsq when it is already writer,
* it must make sure that the specified reentry point to be called later
@@ -7528,29 +7851,33 @@ ill_perim_exit(ill_t *ill)
* point must never ever try to enter the ipsq again. Otherwise it can lead
* to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
* When the thread that is currently exclusive finishes, it (ipsq_exit)
- * dequeues the requests waiting to become exclusive in ipsq_mphead and calls
- * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit
+ * dequeues the requests waiting to become exclusive in ipx_mphead and calls
+ * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit
* proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
* ioctl if the current ioctl has completed. If the current ioctl is still
* in progress it simply returns. The current ioctl could be waiting for
- * a response from another module (arp_ or the driver or could be waiting for
- * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp
- * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the
+ * a response from another module (arp or the driver) or could be waiting for
+ * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp
+ * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the
* execution of the ioctl and ipsq_exit does not start the next ioctl unless
- * ipsq_current_ipif is clear which happens only on ioctl completion.
+ * ipx_current_ipif is NULL which happens only once the ioctl is complete and
+ * all associated DLPI operations have completed.
*/
/*
- * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of
- * ipif or ill can be specified). The caller ensures ipif or ill is valid by
- * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued
- * completion.
+ * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif'
+ * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ
+ * on success, or NULL on failure. The caller ensures ipif/ill is valid by
+ * refholding it as necessary. If the IPSQ cannot be entered and `func' is
+ * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ
+ * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
*/
ipsq_t *
ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
ipsq_func_t func, int type, boolean_t reentry_ok)
{
ipsq_t *ipsq;
+ ipxop_t *ipx;
/* Only 1 of ipif or ill can be specified */
ASSERT((ipif != NULL) ^ (ill != NULL));
@@ -7558,13 +7885,15 @@ ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
ill = ipif->ipif_ill;
/*
- * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock
- * ipsq of an ill can't change when ill_lock is held.
+ * lock ordering: conn_lock -> ill_lock -> ipsq_lock -> ipx_lock.
+ * ipx of an ipsq can't change when ipsq_lock is held.
*/
GRAB_CONN_LOCK(q);
mutex_enter(&ill->ill_lock);
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ ipx = ipsq->ipsq_xop;
+ mutex_enter(&ipx->ipx_lock);
/*
* 1. Enter the ipsq if we are already writer and reentry is ok.
@@ -7572,30 +7901,32 @@ ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
* 'func' nor any of its callees must ever attempt to enter the ipsq
* again. Otherwise it can lead to an infinite loop
* 2. Enter the ipsq if there is no current writer and this attempted
- * entry is part of the current ioctl or operation
+ * entry is part of the current operation
* 3. Enter the ipsq if there is no current writer and this is a new
- * ioctl (or operation) and the ioctl (or operation) queue is
- * empty and there is no ioctl (or operation) currently in progress
+ * operation and the operation queue is empty and there is no
+ * operation currently in progress
*/
- if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) ||
- (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL &&
- ipsq->ipsq_current_ipif == NULL))) ||
- (ipsq->ipsq_writer == curthread && reentry_ok)) {
+ if ((ipx->ipx_writer == curthread && reentry_ok) ||
+ (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP &&
+ !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL)))) {
/* Success. */
- ipsq->ipsq_reentry_cnt++;
- ipsq->ipsq_writer = curthread;
+ ipx->ipx_reentry_cnt++;
+ ipx->ipx_writer = curthread;
+ ipx->ipx_forced = B_FALSE;
+ mutex_exit(&ipx->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
mutex_exit(&ill->ill_lock);
RELEASE_CONN_LOCK(q);
#ifdef DEBUG
- ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack,
- IPSQ_STACK_DEPTH);
+ ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
#endif
return (ipsq);
}
- ipsq_enq(ipsq, q, mp, func, type, ill);
+ if (func != NULL)
+ ipsq_enq(ipsq, q, mp, func, type, ill);
+ mutex_exit(&ipx->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
mutex_exit(&ill->ill_lock);
RELEASE_CONN_LOCK(q);
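A minimal caller-side sketch of the pattern this function supports (not taken from the patch; the restart callback name is hypothetical):

	ipsq_t *ipsq;

	/* Try to become exclusive; otherwise queue ourselves for a retry. */
	ipsq = ipsq_try_enter(ipif, NULL, q, mp, ip_restart_ioctl, NEW_OP, B_TRUE);
	if (ipsq == NULL)
		return (EINPROGRESS);	/* ip_restart_ioctl() is later called back with q/mp */

	/* ... perform the operation as exclusive writer ... */

	ipsq_exit(ipsq);		/* drains queued work, then relinquishes the IPSQ */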
@@ -7630,188 +7961,58 @@ qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
}
/*
- * If there are more than ILL_GRP_CNT ills in a group,
- * we use kmem alloc'd buffers, else use the stack
- */
-#define ILL_GRP_CNT 14
-/*
- * Drain the ipsq, if there are messages on it, and then leave the ipsq.
- * Called by a thread that is currently exclusive on this ipsq.
+ * Exit the specified IPSQ. If this is the final exit on it then drain it
+ * prior to exiting. Caller must be writer on the specified IPSQ.
*/
void
ipsq_exit(ipsq_t *ipsq)
{
+ mblk_t *mp;
+ ipsq_t *mp_ipsq;
queue_t *q;
- mblk_t *mp;
- ipsq_func_t func;
- int next;
- ill_t **ill_list = NULL;
- size_t ill_list_size = 0;
- int cnt = 0;
- boolean_t need_ipsq_free = B_FALSE;
- ip_stack_t *ipst = ipsq->ipsq_ipst;
+ phyint_t *phyi;
+ ipsq_func_t func;
ASSERT(IAM_WRITER_IPSQ(ipsq));
- mutex_enter(&ipsq->ipsq_lock);
- ASSERT(ipsq->ipsq_reentry_cnt >= 1);
- if (ipsq->ipsq_reentry_cnt != 1) {
- ipsq->ipsq_reentry_cnt--;
- mutex_exit(&ipsq->ipsq_lock);
+
+ ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1);
+ if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) {
+ ipsq->ipsq_xop->ipx_reentry_cnt--;
return;
}
- mp = ipsq_dq(ipsq);
- while (mp != NULL) {
-again:
- mutex_exit(&ipsq->ipsq_lock);
- func = (ipsq_func_t)mp->b_prev;
- q = (queue_t *)mp->b_queue;
- mp->b_prev = NULL;
- mp->b_queue = NULL;
-
- /*
- * If 'q' is an conn queue, it is valid, since we did a
- * a refhold on the connp, at the start of the ioctl.
- * If 'q' is an ill queue, it is valid, since close of an
- * ill will clean up the 'ipsq'.
- */
- (*func)(ipsq, q, mp, NULL);
-
- mutex_enter(&ipsq->ipsq_lock);
+ for (;;) {
+ phyi = ipsq->ipsq_phyint;
mp = ipsq_dq(ipsq);
- }
-
- mutex_exit(&ipsq->ipsq_lock);
-
- /*
- * Need to grab the locks in the right order. Need to
- * atomically check (under ipsq_lock) that there are no
- * messages before relinquishing the ipsq. Also need to
- * atomically wakeup waiters on ill_cv while holding ill_lock.
- * Holding ill_g_lock ensures that ipsq list of ills is stable.
- * If we need to call ill_split_ipsq and change <ill-ipsq> we need
- * to grab ill_g_lock as writer.
- */
- rw_enter(&ipst->ips_ill_g_lock,
- ipsq->ipsq_split ? RW_WRITER : RW_READER);
+ mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next;
- /* ipsq_refs can't change while ill_g_lock is held as reader */
- if (ipsq->ipsq_refs != 0) {
- /* At most 2 ills v4/v6 per phyint */
- cnt = ipsq->ipsq_refs << 1;
- ill_list_size = cnt * sizeof (ill_t *);
/*
- * If memory allocation fails, we will do the split
- * the next time ipsq_exit is called for whatever reason.
- * As long as the ipsq_split flag is set the need to
- * split is remembered.
+ * If we've changed to a new IPSQ, and the phyint associated
+ * with the old one has gone away, free the old IPSQ. Note
+ * that this cannot happen while the IPSQ is in a group.
*/
- ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP);
- if (ill_list != NULL)
- cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt);
- }
- mutex_enter(&ipsq->ipsq_lock);
- mp = ipsq_dq(ipsq);
- if (mp != NULL) {
- /* oops, some message has landed up, we can't get out */
- if (ill_list != NULL)
- ill_unlock_ills(ill_list, cnt);
- rw_exit(&ipst->ips_ill_g_lock);
- if (ill_list != NULL)
- kmem_free(ill_list, ill_list_size);
- ill_list = NULL;
- ill_list_size = 0;
- cnt = 0;
- goto again;
- }
+ if (mp_ipsq != ipsq && phyi == NULL) {
+ ASSERT(ipsq->ipsq_next == ipsq);
+ ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
+ ipsq_delete(ipsq);
+ }
- /*
- * Split only if no ioctl is pending and if memory alloc succeeded
- * above.
- */
- if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL &&
- ill_list != NULL) {
- /*
- * No new ill can join this ipsq since we are holding the
- * ill_g_lock. Hence ill_split_ipsq can safely traverse the
- * ipsq. ill_split_ipsq may fail due to memory shortage.
- * If so we will retry on the next ipsq_exit.
- */
- ipsq->ipsq_split = ill_split_ipsq(ipsq);
- }
+ if (mp == NULL)
+ break;
- /*
- * We are holding the ipsq lock, hence no new messages can
- * land up on the ipsq, and there are no messages currently.
- * Now safe to get out. Wake up waiters and relinquish ipsq
- * atomically while holding ill locks.
- */
- ipsq->ipsq_writer = NULL;
- ipsq->ipsq_reentry_cnt--;
- ASSERT(ipsq->ipsq_reentry_cnt == 0);
-#ifdef DEBUG
- ipsq->ipsq_depth = 0;
-#endif
- mutex_exit(&ipsq->ipsq_lock);
- /*
- * For IPMP this should wake up all ills in this ipsq.
- * We need to hold the ill_lock while waking up waiters to
- * avoid missed wakeups. But there is no need to acquire all
- * the ill locks and then wakeup. If we have not acquired all
- * the locks (due to memory failure above) ill_signal_ipsq_ills
- * wakes up ills one at a time after getting the right ill_lock
- */
- ill_signal_ipsq_ills(ipsq, ill_list != NULL);
- if (ill_list != NULL)
- ill_unlock_ills(ill_list, cnt);
- if (ipsq->ipsq_refs == 0)
- need_ipsq_free = B_TRUE;
- rw_exit(&ipst->ips_ill_g_lock);
- if (ill_list != 0)
- kmem_free(ill_list, ill_list_size);
+ q = mp->b_queue;
+ func = (ipsq_func_t)mp->b_prev;
+ ipsq = mp_ipsq;
+ mp->b_next = mp->b_prev = NULL;
+ mp->b_queue = NULL;
- if (need_ipsq_free) {
/*
- * Free the ipsq. ipsq_refs can't increase because ipsq can't be
- * looked up. ipsq can be looked up only thru ill or phyint
- * and there are no ills/phyint on this ipsq.
+ * If 'q' is a conn queue, it is valid, since we did a refhold
+ * on the conn at the start of the ioctl.
+ * If 'q' is an ill queue, it is valid, since close of an
+ * ill will clean up its IPSQ.
*/
- ipsq_delete(ipsq);
- }
-
- /*
- * Now that we're outside the IPSQ, start any IGMP/MLD timers. We
- * can't start these inside the IPSQ since e.g. igmp_start_timers() ->
- * untimeout() (inside the IPSQ, waiting for an executing timeout to
- * finish) could deadlock with igmp_timeout_handler() -> ipsq_enter()
- * (executing the timeout, waiting to get inside the IPSQ).
- *
- * However, there is one exception to the above: if this thread *is*
- * the IGMP/MLD timeout handler thread, then we must not start its
- * timer until the current handler is done.
- */
- mutex_enter(&ipst->ips_igmp_timer_lock);
- if (curthread != ipst->ips_igmp_timer_thread) {
- next = ipst->ips_igmp_deferred_next;
- ipst->ips_igmp_deferred_next = INFINITY;
- mutex_exit(&ipst->ips_igmp_timer_lock);
-
- if (next != INFINITY)
- igmp_start_timers(next, ipst);
- } else {
- mutex_exit(&ipst->ips_igmp_timer_lock);
- }
-
- mutex_enter(&ipst->ips_mld_timer_lock);
- if (curthread != ipst->ips_mld_timer_thread) {
- next = ipst->ips_mld_deferred_next;
- ipst->ips_mld_deferred_next = INFINITY;
- mutex_exit(&ipst->ips_mld_timer_lock);
-
- if (next != INFINITY)
- mld_start_timers(next, ipst);
- } else {
- mutex_exit(&ipst->ips_mld_timer_lock);
+ (*func)(ipsq, q, mp, NULL);
}
}
@@ -7822,15 +8023,17 @@ again:
void
ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
{
+ ipxop_t *ipx = ipsq->ipsq_xop;
+
ASSERT(IAM_WRITER_IPSQ(ipsq));
+ ASSERT(ipx->ipx_current_ipif == NULL);
+ ASSERT(ipx->ipx_current_ioctl == 0);
- mutex_enter(&ipsq->ipsq_lock);
- ASSERT(ipsq->ipsq_current_ipif == NULL);
- ASSERT(ipsq->ipsq_current_ioctl == 0);
- ipsq->ipsq_current_done = B_FALSE;
- ipsq->ipsq_current_ipif = ipif;
- ipsq->ipsq_current_ioctl = ioccmd;
- mutex_exit(&ipsq->ipsq_lock);
+ ipx->ipx_current_done = B_FALSE;
+ ipx->ipx_current_ioctl = ioccmd;
+ mutex_enter(&ipx->ipx_lock);
+ ipx->ipx_current_ipif = ipif;
+ mutex_exit(&ipx->ipx_lock);
}
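For context, callers bracket each exclusive operation with this start/finish pair; a hedged sketch of the typical shape (the ioctl value is chosen only for illustration):

	ipsq_current_start(ipsq, ipif, SIOCSLIFFLAGS);	/* record the operation */
	/* ... ioctl work runs here as writer ... */
	ipsq_current_finish(ipsq);	/* clears ipx_current_ipif unless DLPI acks remain */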
/*
@@ -7844,17 +8047,18 @@ ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
void
ipsq_current_finish(ipsq_t *ipsq)
{
- ipif_t *ipif = ipsq->ipsq_current_ipif;
+ ipxop_t *ipx = ipsq->ipsq_xop;
t_uscalar_t dlpi_pending = DL_PRIM_INVAL;
+ ipif_t *ipif = ipx->ipx_current_ipif;
ASSERT(IAM_WRITER_IPSQ(ipsq));
/*
- * For SIOCSLIFREMOVEIF, the ipif has been already been blown away
+ * For SIOCLIFREMOVEIF, the ipif has already been blown away
* (but in that case, IPIF_CHANGING will already be clear and no
* pending DLPI messages can remain).
*/
- if (ipsq->ipsq_current_ioctl != SIOCLIFREMOVEIF) {
+ if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) {
ill_t *ill = ipif->ipif_ill;
mutex_enter(&ill->ill_lock);
@@ -7863,12 +8067,14 @@ ipsq_current_finish(ipsq_t *ipsq)
mutex_exit(&ill->ill_lock);
}
- mutex_enter(&ipsq->ipsq_lock);
- ipsq->ipsq_current_ioctl = 0;
- ipsq->ipsq_current_done = B_TRUE;
- if (dlpi_pending == DL_PRIM_INVAL)
- ipsq->ipsq_current_ipif = NULL;
- mutex_exit(&ipsq->ipsq_lock);
+ ASSERT(!ipx->ipx_current_done);
+ ipx->ipx_current_done = B_TRUE;
+ ipx->ipx_current_ioctl = 0;
+ if (dlpi_pending == DL_PRIM_INVAL) {
+ mutex_enter(&ipx->ipx_lock);
+ ipx->ipx_current_ipif = NULL;
+ mutex_exit(&ipx->ipx_lock);
+ }
}
/*
@@ -7884,123 +8090,38 @@ ipsq_flush(ill_t *ill)
mblk_t *prev;
mblk_t *mp;
mblk_t *mp_next;
- ipsq_t *ipsq;
+ ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
ASSERT(IAM_WRITER_ILL(ill));
- ipsq = ill->ill_phyint->phyint_ipsq;
+
/*
* Flush any messages sent up by the driver.
*/
- mutex_enter(&ipsq->ipsq_lock);
- for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) {
+ mutex_enter(&ipx->ipx_lock);
+ for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) {
mp_next = mp->b_next;
q = mp->b_queue;
if (q == ill->ill_rq || q == ill->ill_wq) {
- /* Remove the mp from the ipsq */
+ /* dequeue mp */
if (prev == NULL)
- ipsq->ipsq_mphead = mp->b_next;
+ ipx->ipx_mphead = mp->b_next;
else
prev->b_next = mp->b_next;
- if (ipsq->ipsq_mptail == mp) {
+ if (ipx->ipx_mptail == mp) {
ASSERT(mp_next == NULL);
- ipsq->ipsq_mptail = prev;
+ ipx->ipx_mptail = prev;
}
inet_freemsg(mp);
} else {
prev = mp;
}
}
- mutex_exit(&ipsq->ipsq_lock);
+ mutex_exit(&ipx->ipx_lock);
(void) ipsq_pending_mp_cleanup(ill, NULL);
ipsq_xopq_mp_cleanup(ill, NULL);
ill_pending_mp_cleanup(ill);
}
-/* ARGSUSED */
-int
-ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
- ip_ioctl_cmd_t *ipip, void *ifreq)
-{
- ill_t *ill;
- struct lifreq *lifr = (struct lifreq *)ifreq;
- boolean_t isv6;
- conn_t *connp;
- ip_stack_t *ipst;
-
- connp = Q_TO_CONN(q);
- ipst = connp->conn_netstack->netstack_ip;
- isv6 = connp->conn_af_isv6;
- /*
- * Set original index.
- * Failover and failback move logical interfaces
- * from one physical interface to another. The
- * original index indicates the parent of a logical
- * interface, in other words, the physical interface
- * the logical interface will be moved back to on
- * failback.
- */
-
- /*
- * Don't allow the original index to be changed
- * for non-failover addresses, autoconfigured
- * addresses, or IPv6 link local addresses.
- */
- if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != NULL) ||
- (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) {
- return (EINVAL);
- }
- /*
- * The new original index must be in use by some
- * physical interface.
- */
- ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL,
- NULL, NULL, ipst);
- if (ill == NULL)
- return (ENXIO);
- ill_refrele(ill);
-
- ipif->ipif_orig_ifindex = lifr->lifr_index;
- /*
- * When this ipif gets failed back, don't
- * preserve the original id, as it is no
- * longer applicable.
- */
- ipif->ipif_orig_ipifid = 0;
- /*
- * For IPv4, change the original index of any
- * multicast addresses associated with the
- * ipif to the new value.
- */
- if (!isv6) {
- ilm_t *ilm;
-
- mutex_enter(&ipif->ipif_ill->ill_lock);
- for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL;
- ilm = ilm->ilm_next) {
- if (ilm->ilm_ipif == ipif) {
- ilm->ilm_orig_ifindex = lifr->lifr_index;
- }
- }
- mutex_exit(&ipif->ipif_ill->ill_lock);
- }
- return (0);
-}
-
-/* ARGSUSED */
-int
-ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
- ip_ioctl_cmd_t *ipip, void *ifreq)
-{
- struct lifreq *lifr = (struct lifreq *)ifreq;
-
- /*
- * Get the original interface index i.e the one
- * before FAILOVER if it ever happened.
- */
- lifr->lifr_index = ipif->ipif_orig_ifindex;
- return (0);
-}
-
/*
* Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls,
* refhold and return the associated ipif
@@ -8087,8 +8208,6 @@ int
ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
cmd_info_t *ci, ipsq_func_t func)
{
- sin_t *sin;
- sin6_t *sin6;
char *name;
struct ifreq *ifr;
struct lifreq *lifr;
@@ -8132,9 +8251,8 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
* be trusted.
*/
ifr->ifr_name[IFNAMSIZ - 1] = '\0';
- sin = (sin_t *)&ifr->ifr_addr;
name = ifr->ifr_name;
- ci->ci_sin = sin;
+ ci->ci_sin = (sin_t *)&ifr->ifr_addr;
ci->ci_sin6 = NULL;
ci->ci_lifr = (struct lifreq *)ifr;
} else {
@@ -8148,14 +8266,8 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
*/
lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
name = lifr->lifr_name;
- sin = (sin_t *)&lifr->lifr_addr;
- sin6 = (sin6_t *)&lifr->lifr_addr;
- if (ipip->ipi_cmd == SIOCSLIFGROUPNAME) {
- (void) strncpy(ci->ci_groupname, lifr->lifr_groupname,
- LIFNAMSIZ);
- }
- ci->ci_sin = sin;
- ci->ci_sin6 = sin6;
+ ci->ci_sin = (sin_t *)&lifr->lifr_addr;
+ ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr;
ci->ci_lifr = lifr;
}
@@ -8181,21 +8293,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
if (ipif == NULL) {
if (err == EINPROGRESS)
return (err);
- if (ipip->ipi_cmd == SIOCLIFFAILOVER ||
- ipip->ipi_cmd == SIOCLIFFAILBACK) {
- /*
- * Need to try both v4 and v6 since this
- * ioctl can come down either v4 or v6
- * socket. The lifreq.lifr_family passed
- * down by this ioctl is AF_UNSPEC.
- */
- ipif = ipif_lookup_on_name(name,
- mi_strlen(name), B_FALSE, &exists, !isv6,
- zoneid, (connp == NULL) ? q :
- CONNP_TO_WQ(connp), mp, func, &err, ipst);
- if (err == EINPROGRESS)
- return (err);
- }
err = 0; /* Ensure we don't use it below */
}
}
@@ -8221,15 +8318,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
if (ipif == NULL)
return (ENXIO);
- /*
- * Allow only GET operations if this ipif has been created
- * temporarily due to a MOVE operation.
- */
- if (ipif->ipif_replace_zero && !(ipip->ipi_flags & IPI_REPL)) {
- ipif_refrele(ipif);
- return (EINVAL);
- }
-
ci->ci_ipif = ipif;
return (0);
}
@@ -8247,15 +8335,15 @@ ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V4(&ctx, ipst);
-
- while (ill != NULL) {
+ for (; ill != NULL; ill = ill_next(&ctx, ill)) {
+ if (IS_UNDER_IPMP(ill))
+ continue;
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
if (ipif->ipif_zoneid == zoneid ||
ipif->ipif_zoneid == ALL_ZONES)
numifs++;
}
- ill = ill_next(&ctx, ill);
}
rw_exit(&ipst->ips_ill_g_lock);
return (numifs);
@@ -8283,6 +8371,9 @@ ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst)
ill = ILL_START_WALK_ALL(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
+ if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP))
+ continue;
+
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
if ((ipif->ipif_flags & IPIF_NOXMIT) &&
@@ -8491,6 +8582,8 @@ ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
+ if (IS_UNDER_IPMP(ill))
+ continue;
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
if (zoneid != ipif->ipif_zoneid &&
@@ -8760,6 +8853,9 @@ ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ill_first(list, list, &ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
+ if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP))
+ continue;
+
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
if ((ipif->ipif_flags & IPIF_NOXMIT) &&
@@ -8795,6 +8891,7 @@ ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
ipif_get_name(ipif, lifr->lifr_name,
sizeof (lifr->lifr_name));
+ lifr->lifr_type = ill->ill_type;
if (ipif->ipif_isv6) {
sin6 = (sin6_t *)&lifr->lifr_addr;
*sin6 = sin6_null;
@@ -8828,23 +8925,6 @@ lif_copydone:
return (0);
}
-/* ARGSUSED */
-int
-ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin,
- queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
-{
- ip_stack_t *ipst;
-
- if (q->q_next == NULL)
- ipst = CONNQ_TO_IPST(q);
- else
- ipst = ILLQ_TO_IPST(q);
-
- /* Existence of b_cont->b_cont checked in ip_wput_nondata */
- ipst->ips_ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr;
- return (0);
-}
-
static void
ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
{
@@ -9038,8 +9118,7 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid);
} else {
src_ipif = ipif_select_source_v6(dst_ill,
- daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT,
- zoneid);
+ daddr, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid);
}
if (src_ipif == NULL)
goto next_dst;
@@ -9325,10 +9404,14 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
struct arpreq *ar;
struct xarpreq *xar;
int flags, alength;
- char *lladdr;
- ip_stack_t *ipst;
+ uchar_t *lladdr;
+ ire_t *ire;
+ ip_stack_t *ipst;
ill_t *ill = ipif->ipif_ill;
+ ill_t *proxy_ill = NULL;
+ ipmp_arpent_t *entp = NULL;
boolean_t if_arp_ioctl = B_FALSE;
+ boolean_t proxyarp = B_FALSE;
ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
connp = Q_TO_CONN(q);
@@ -9340,7 +9423,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ar = NULL;
flags = xar->xarp_flags;
- lladdr = LLADDR(&xar->xarp_ha);
+ lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
/*
* Validate against user's link layer address length
@@ -9359,7 +9442,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
xar = NULL;
flags = ar->arp_flags;
- lladdr = ar->arp_ha.sa_data;
+ lladdr = (uchar_t *)ar->arp_ha.sa_data;
/*
* Theoretically, the sa_family could tell us what link
* layer type this operation is trying to deal with. By
@@ -9379,6 +9462,51 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
}
}
+ ipaddr = sin->sin_addr.s_addr;
+
+ /*
+ * IPMP ARP special handling:
+ *
+ * 1. Since ARP mappings must appear consistent across the group,
+ * prohibit changing ARP mappings on the underlying interfaces.
+ *
+ * 2. Since ARP mappings for IPMP data addresses are maintained by
+ * IP itself, prohibit changing them.
+ *
+ * 3. For proxy ARP, use a functioning hardware address in the group,
+ * provided one exists. If one doesn't, just add the entry as-is;
+ * ipmp_illgrp_refresh_arpent() will refresh it if things change.
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP)
+ return (EPERM);
+ }
+ if (IS_IPMP(ill)) {
+ ipmp_illgrp_t *illg = ill->ill_grp;
+
+ switch (ipip->ipi_cmd) {
+ case SIOCSARP:
+ case SIOCSXARP:
+ proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength);
+ if (proxy_ill != NULL) {
+ proxyarp = B_TRUE;
+ if (!ipmp_ill_is_active(proxy_ill))
+ proxy_ill = ipmp_illgrp_next_ill(illg);
+ if (proxy_ill != NULL)
+ lladdr = proxy_ill->ill_phys_addr;
+ }
+ /* FALLTHRU */
+ case SIOCDARP:
+ case SIOCDXARP:
+ ire = ire_ctable_lookup(ipaddr, 0, IRE_LOCAL, NULL,
+ ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
+ if (ire != NULL) {
+ ire_refrele(ire);
+ return (EPERM);
+ }
+ }
+ }
+
/*
* We are going to pass up to ARP a packet chain that looks
* like:
@@ -9400,8 +9528,6 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
return (ENOMEM);
}
- ipaddr = sin->sin_addr.s_addr;
-
mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template,
(caddr_t)&ipaddr);
if (mp2 == NULL) {
@@ -9481,6 +9607,30 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
area->area_flags |= ACE_F_AUTHORITY;
/*
+ * If this is a permanent AR_ENTRY_ADD on the IPMP interface, track it
+ * so that IP can update ARP as the active ills in the group change.
+ */
+ if (IS_IPMP(ill) && area->area_cmd == AR_ENTRY_ADD &&
+ (area->area_flags & ACE_F_PERMANENT)) {
+ entp = ipmp_illgrp_create_arpent(ill->ill_grp, mp2, proxyarp);
+
+ /*
+ * The second part of the conditional below handles a corner
+ * case: if this is proxy ARP and the IPMP group has no active
+ * interfaces, we can't send the request to ARP now since it
+ * won't be able to build an ACE. So we return success and
+ * notify ARP about the proxy ARP entry once an interface
+ * becomes active.
+ */
+ if (entp == NULL || (proxyarp && proxy_ill == NULL)) {
+ mp2->b_cont = NULL;
+ inet_freemsg(mp1);
+ inet_freemsg(pending_mp);
+ return (entp == NULL ? ENOMEM : 0);
+ }
+ }
+
+ /*
* Before sending 'mp' to ARP, we have to clear the b_next
* and b_prev. Otherwise if STREAMS encounters such a message
* in freemsg(), (because ARP can close any time) it can cause
@@ -9497,7 +9647,12 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
mutex_enter(&connp->conn_lock);
mutex_enter(&ill->ill_lock);
/* conn has not yet started closing, hence this can't fail */
- VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0);
+ if (ipip->ipi_flags & IPI_WR) {
+ VERIFY(ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp),
+ pending_mp, 0) != 0);
+ } else {
+ VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0);
+ }
mutex_exit(&ill->ill_lock);
mutex_exit(&connp->conn_lock);
@@ -9506,6 +9661,13 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
* M_IOCACK, and will be handed to ip_sioctl_iocack() for completion.
*/
putnext(ill->ill_rq, mp1);
+
+ /*
+ * If we created an IPMP ARP entry, mark that we've notified ARP.
+ */
+ if (entp != NULL)
+ ipmp_illgrp_mark_arpent(ill->ill_grp, entp);
+
return (EINPROGRESS);
}
@@ -9564,55 +9726,114 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
mp, func, &err, ipst);
if (ipif == NULL)
return (err);
- if (ipif->ipif_id != 0 ||
- ipif->ipif_net_type != IRE_IF_RESOLVER) {
+ if (ipif->ipif_id != 0) {
ipif_refrele(ipif);
return (ENXIO);
}
} else {
/*
- * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with sdl_nlen ==
- * 0: use the IP address to figure out the ill. In the IPMP
- * case, a simple forwarding table lookup will return the
- * IRE_IF_RESOLVER for the first interface in the group, which
- * might not be the interface on which the requested IP
- * address was resolved due to the ill selection algorithm
- * (see ip_newroute_get_dst_ill()). So we do a cache table
- * lookup first: if the IRE cache entry for the IP address is
- * still there, it will contain the ill pointer for the right
- * interface, so we use that. If the cache entry has been
- * flushed, we fall back to the forwarding table lookup. This
- * should be rare enough since IRE cache entries have a longer
- * life expectancy than ARP cache entries.
+ * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen
+ * of 0: use the IP address to find the ipif. If the IP
+ * address is an IPMP test address, ire_ftable_lookup() will
+ * find the wrong ill, so we first do an ipif_lookup_addr().
*/
- ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL,
- ipst);
- if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) ||
- ((ill = ire_to_ill(ire)) == NULL) ||
- (ill->ill_net_type != IRE_IF_RESOLVER)) {
- if (ire != NULL)
- ire_refrele(ire);
- ire = ire_ftable_lookup(sin->sin_addr.s_addr,
- 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0,
- NULL, MATCH_IRE_TYPE, ipst);
+ ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES,
+ CONNP_TO_WQ(connp), mp, func, &err, ipst);
+ if (ipif == NULL) {
+ ire = ire_ftable_lookup(sin->sin_addr.s_addr, 0, 0,
+ IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, NULL,
+ MATCH_IRE_TYPE, ipst);
if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) {
-
if (ire != NULL)
ire_refrele(ire);
return (ENXIO);
}
+ ipif = ill->ill_ipif;
+ ipif_refhold(ipif);
+ ire_refrele(ire);
}
- ASSERT(ire != NULL && ill != NULL);
- ipif = ill->ill_ipif;
- ipif_refhold(ipif);
- ire_refrele(ire);
}
+
+ if (ipif->ipif_net_type != IRE_IF_RESOLVER) {
+ ipif_refrele(ipif);
+ return (ENXIO);
+ }
+
ci->ci_sin = sin;
ci->ci_ipif = ipif;
return (0);
}
/*
+ * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the
+ * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is
+ * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it
+ * up and thus an ill can join that illgrp.
+ *
+ * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than
+ * open()/close() primarily because close() is not allowed to fail or block
+ * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason
+ * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure
+ * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the
+ * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts
+ * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent
+ * state if I_UNLINK didn't occur.
+ *
+ * Note that for each plumb/unplumb operation, we may end up here more than
+ * once because of the way ifconfig works. However, it's OK to link the same
+ * illgrp more than once, or unlink an illgrp that's already unlinked.
+ */
+static int
+ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd)
+{
+ int err;
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ ASSERT(IS_IPMP(ill));
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ switch (ioccmd) {
+ case I_LINK:
+ return (ENOTSUP);
+
+ case I_PLINK:
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp);
+ rw_exit(&ipst->ips_ipmp_lock);
+ break;
+
+ case I_PUNLINK:
+ /*
+ * Require all UP ipifs be brought down prior to unlinking the
+ * illgrp so any associated IREs (and other state) are torched.
+ */
+ if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
+ return (EBUSY);
+
+ /*
+ * NOTE: We hold ipmp_lock across the unlink to prevent a race
+ * with an SIOCSLIFGROUPNAME request from an ill trying to
+ * join this group. Specifically: ills trying to join grab
+ * ipmp_lock and bump a "pending join" counter checked by
+ * ipmp_illgrp_unlink_grp(). During the unlink no new pending
+ * joins can occur (since we have ipmp_lock). Once we drop
+ * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not
+ * find the illgrp (since we unlinked it) and will return
+ * EAFNOSUPPORT. This will then take them back through the
+ * IPMP meta-interface plumbing logic in ifconfig, and thus
+ * back through I_PLINK above.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ err = ipmp_illgrp_unlink_grp(ill->ill_grp);
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (err);
+ default:
+ break;
+ }
+ return (0);
+}
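For reference, the user-level side that drives this path uses the standard STREAMS persistent-link ioctls, roughly as sketched below. The device paths and error handling are illustrative assumptions, not code from this patch:

	/* Userland sketch: persistently link a stream beneath IP.
	 * Requires <fcntl.h>, <stropts.h>, <stdio.h>. */
	int ip_fd   = open("/dev/udp", O_RDWR);		/* mux with IP on top */
	int ipmp_fd = open("/dev/ipmpstub0", O_RDWR);	/* path is an assumption */
	int muxid   = ioctl(ip_fd, I_PLINK, ipmp_fd);	/* reaches ip_sioctl_plink() */
	if (muxid == -1)
		perror("I_PLINK");
	/* ... later, unlinking lands in ip_sioctl_plink_ipmp() via I_PUNLINK ... */
	if (ioctl(ip_fd, I_PUNLINK, muxid) == -1)
		perror("I_PUNLINK");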
+
+/*
* Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also
* atomically set/clear the muxids. Also complete the ioctl by acking or
* naking it. Note that the code is structured such that the link type,
@@ -9697,7 +9918,7 @@ ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
if (ipsq == NULL) {
ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
- NEW_OP, B_TRUE);
+ NEW_OP, B_FALSE);
if (ipsq == NULL) {
ill_refrele(ill);
return;
@@ -9728,6 +9949,11 @@ ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
err = EINVAL;
goto done;
}
+
+ if (IS_IPMP(ill) &&
+ (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
+ goto done;
+
ill->ill_arp_muxid = islink ? li->l_index : 0;
} else {
/*
@@ -9763,6 +9989,7 @@ static int
ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
struct linkblk *li, boolean_t doconsist)
{
+ int err = 0;
ill_t *ill;
queue_t *ipwq, *dwq;
const char *name;
@@ -9796,7 +10023,7 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
if (ipsq == NULL) {
ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
- NEW_OP, B_TRUE);
+ NEW_OP, B_FALSE);
if (ipsq == NULL)
return (EINPROGRESS);
entered_ipsq = B_TRUE;
@@ -9811,12 +10038,14 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
*/
if ((islink && ill->ill_ip_muxid != 0) ||
(!islink && ill->ill_arp_muxid != 0)) {
- if (entered_ipsq)
- ipsq_exit(ipsq);
- return (EINVAL);
+ err = EINVAL;
+ goto done;
}
}
+ if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
+ goto done;
+
/*
* As part of I_{P}LINKing, stash the number of downstream modules and
* the read queue of the module immediately below IP in the ill.
@@ -9853,11 +10082,11 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
ill_capability_reset(ill, B_FALSE);
}
ipsq_current_finish(ipsq);
-
+done:
if (entered_ipsq)
ipsq_exit(ipsq);
- return (0);
+ return (err);
}
/*
@@ -10124,8 +10353,9 @@ nak:
}
/* ip_wput hands off ARP IOCTL responses to us */
+/* ARGSUSED3 */
void
-ip_sioctl_iocack(queue_t *q, mblk_t *mp)
+ip_sioctl_iocack(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
struct arpreq *ar;
struct xarpreq *xar;
@@ -10136,7 +10366,6 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp)
struct iocblk *orig_iocp;
ill_t *ill;
conn_t *connp = NULL;
- uint_t ioc_id;
mblk_t *pending_mp;
int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE;
int *flagsp;
@@ -10146,6 +10375,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp)
int err;
ip_stack_t *ipst;
+ ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq));
ill = q->q_ptr;
ASSERT(ill != NULL);
ipst = ill->ill_ipst;
@@ -10185,10 +10415,14 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp)
iocp = (struct iocblk *)mp->b_rptr;
/*
- * Pick out the originating queue based on the ioc_id.
+ * Find the pending message; if we're exclusive, it'll be on our IPSQ.
+ * Otherwise, we can find it from our ioc_id.
*/
- ioc_id = iocp->ioc_id;
- pending_mp = ill_pending_mp_get(ill, &connp, ioc_id);
+ if (ipsq != NULL)
+ pending_mp = ipsq_pending_mp_get(ipsq, &connp);
+ else
+ pending_mp = ill_pending_mp_get(ill, &connp, iocp->ioc_id);
+
if (pending_mp == NULL) {
ASSERT(connp == NULL);
inet_freemsg(mp);
@@ -10271,7 +10505,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp)
ire_refrele(ire);
freemsg(mp);
ip_ioctl_finish(q, orig_ioc_mp,
- EINVAL, NO_COPYOUT, NULL);
+ EINVAL, NO_COPYOUT, ipsq);
return;
}
*flagsp |= ATF_COM;
@@ -10297,12 +10531,27 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp)
/* Ditch the internal IOCTL. */
freemsg(mp);
ire_refrele(ire);
- ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL);
+ ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq);
return;
}
}
/*
+ * If this was a failed AR_ENTRY_ADD or a successful AR_ENTRY_DELETE
+ * on the IPMP meta-interface, ensure any ARP entries added in
+ * ip_sioctl_arp() are deleted.
+ */
+ if (IS_IPMP(ill) &&
+ ((iocp->ioc_error != 0 && iocp->ioc_cmd == AR_ENTRY_ADD) ||
+ ((iocp->ioc_error == 0 && iocp->ioc_cmd == AR_ENTRY_DELETE)))) {
+ ipmp_illgrp_t *illg = ill->ill_grp;
+ ipmp_arpent_t *entp;
+
+ if ((entp = ipmp_illgrp_lookup_arpent(illg, &addr)) != NULL)
+ ipmp_illgrp_destroy_arpent(illg, entp);
+ }
+
+ /*
* Delete the coresponding IRE_CACHE if any.
* Reset the error if there was one (in case there was no entry
* in arp.)
@@ -10341,7 +10590,7 @@ errack:
if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) {
err = iocp->ioc_error;
freemsg(mp);
- ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL);
+ ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, ipsq);
return;
}
@@ -10355,7 +10604,7 @@ errack:
sizeof (xar->xarp_ha.sdl_data)) {
freemsg(mp);
ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT,
- NULL);
+ ipsq);
return;
}
}
@@ -10382,7 +10631,7 @@ errack:
/* Ditch the internal IOCTL. */
freemsg(mp);
/* Complete the original. */
- ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL);
+ ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq);
}
/*
@@ -10397,7 +10646,7 @@ errack:
* If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout)
* is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer.
*
- * Executed as a writer on the ill or ill group.
+ * Executed as a writer on the ill.
* So no lock is needed to traverse the ipif chain, or examine the
* phyint flags.
*/
@@ -10423,7 +10672,6 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
boolean_t found_sep = B_FALSE;
conn_t *connp;
zoneid_t zoneid;
- int orig_ifindex = 0;
ip_stack_t *ipst = CONNQ_TO_IPST(q);
ASSERT(q->q_next == NULL);
@@ -10513,61 +10761,10 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
if (ipsq == NULL)
return (EINPROGRESS);
- /*
- * If the interface is failed, inactive or offlined, look for a working
- * interface in the ill group and create the ipif there. If we can't
- * find a good interface, create the ipif anyway so that in.mpathd can
- * move it to the first repaired interface.
- */
- if ((ill->ill_phyint->phyint_flags &
- (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
- ill->ill_phyint->phyint_groupname_len != 0) {
- phyint_t *phyi;
- char *groupname = ill->ill_phyint->phyint_groupname;
-
- /*
- * We're looking for a working interface, but it doesn't matter
- * if it's up or down; so instead of following the group lists,
- * we look at each physical interface and compare the groupname.
- * We're only interested in interfaces with IPv4 (resp. IPv6)
- * plumbed when we're adding an IPv4 (resp. IPv6) ipif.
- * Otherwise we create the ipif on the failed interface.
- */
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- phyi = avl_first(&ipst->ips_phyint_g_list->
- phyint_list_avl_by_index);
- for (; phyi != NULL;
- phyi = avl_walk(&ipst->ips_phyint_g_list->
- phyint_list_avl_by_index,
- phyi, AVL_AFTER)) {
- if (phyi->phyint_groupname_len == 0)
- continue;
- ASSERT(phyi->phyint_groupname != NULL);
- if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 &&
- !(phyi->phyint_flags &
- (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
- (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) :
- (phyi->phyint_illv4 != NULL))) {
- break;
- }
- }
- rw_exit(&ipst->ips_ill_g_lock);
-
- if (phyi != NULL) {
- orig_ifindex = ill->ill_phyint->phyint_ifindex;
- ill = (ill->ill_isv6 ? phyi->phyint_illv6 :
- phyi->phyint_illv4);
- }
- }
-
- /*
- * We are now exclusive on the ipsq, so an ill move will be serialized
- * before or after us.
- */
+ /* We are now exclusive on the IPSQ */
ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(ill->ill_move_in_progress == B_FALSE);
- if (found_sep && orig_ifindex == 0) {
+ if (found_sep) {
/* Now see if there is an IPIF with this unit number. */
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
@@ -10580,14 +10777,11 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
/*
* We use IRE_LOCAL for lo0:1 etc. for "receive only" use
- * of lo0. We never come here when we plumb lo0:0. It
- * happens in ipif_lookup_on_name.
- * The specified unit number is ignored when we create the ipif on a
- * different interface. However, we save it in ipif_orig_ipifid below so
- * that the ipif fails back to the right position.
- */
- if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ?
- id : -1, IRE_LOCAL, B_TRUE)) == NULL) {
+ * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name()
+ * instead.
+ */
+ if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL,
+ B_TRUE, B_TRUE)) == NULL) {
err = ENOBUFS;
goto done;
}
@@ -10604,14 +10798,6 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
&ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr);
}
- /* Set ifindex and unit number for failback */
- if (err == 0 && orig_ifindex != 0) {
- ipif->ipif_orig_ifindex = orig_ifindex;
- if (found_sep) {
- ipif->ipif_orig_ipifid = id;
- }
- }
-
done:
ipsq_exit(ipsq);
return (err);
@@ -10672,7 +10858,6 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ill_delete(ill);
mutex_enter(&connp->conn_lock);
mutex_enter(&ill->ill_lock);
- ASSERT(ill->ill_group == NULL);
/* Are any references to this ill active */
if (ill_is_freeable(ill)) {
@@ -10693,14 +10878,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
}
}
- /*
- * We are exclusive on the ipsq, so an ill move will be serialized
- * before or after us.
- */
- ASSERT(ill->ill_move_in_progress == B_FALSE);
-
if (ipif->ipif_id == 0) {
-
ipsq_t *ipsq;
/* Find based on address */
@@ -10712,35 +10890,15 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
sin6 = (sin6_t *)sin;
/* We are a writer, so we should be able to lookup */
- ipif = ipif_lookup_addr_v6(&sin6->sin6_addr,
- ill, ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
- if (ipif == NULL) {
- /*
- * Maybe the address in on another interface in
- * the same IPMP group? We check this below.
- */
- ipif = ipif_lookup_addr_v6(&sin6->sin6_addr,
- NULL, ALL_ZONES, NULL, NULL, NULL, NULL,
- ipst);
- }
+ ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill,
+ ipst);
} else {
- ipaddr_t addr;
-
if (sin->sin_family != AF_INET)
return (EAFNOSUPPORT);
- addr = sin->sin_addr.s_addr;
/* We are a writer, so we should be able to lookup */
- ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL,
- NULL, NULL, NULL, ipst);
- if (ipif == NULL) {
- /*
- * Maybe the address in on another interface in
- * the same IPMP group? We check this below.
- */
- ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES,
- NULL, NULL, NULL, NULL, ipst);
- }
+ ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill,
+ ipst);
}
if (ipif == NULL) {
return (EADDRNOTAVAIL);
@@ -10750,32 +10908,11 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
* It is possible for a user to send an SIOCLIFREMOVEIF with
* lifr_name of the physical interface but with an ip address
* lifr_addr of a logical interface plumbed over it.
- * So update ipsq_current_ipif once ipif points to the
- * correct interface after doing ipif_lookup_addr().
+ * So update ipx_current_ipif now that ipif points to the
+ * correct one.
*/
ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
- ASSERT(ipsq != NULL);
-
- mutex_enter(&ipsq->ipsq_lock);
- ipsq->ipsq_current_ipif = ipif;
- mutex_exit(&ipsq->ipsq_lock);
-
- /*
- * When the address to be removed is hosted on a different
- * interface, we check if the interface is in the same IPMP
- * group as the specified one; if so we proceed with the
- * removal.
- * ill->ill_group is NULL when the ill is down, so we have to
- * compare the group names instead.
- */
- if (ipif->ipif_ill != ill &&
- (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 ||
- ill->ill_phyint->phyint_groupname_len == 0 ||
- mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname,
- ill->ill_phyint->phyint_groupname) != 0)) {
- ipif_refrele(ipif);
- return (EADDRNOTAVAIL);
- }
+ ipsq->ipsq_xop->ipx_current_ipif = ipif;
/* This is a writer */
ipif_refrele(ipif);
@@ -11072,7 +11209,7 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
if (need_dl_down)
ill_dl_down(ill);
if (need_arp_down)
- ipif_arp_down(ipif);
+ ipif_resolver_down(ipif);
return (err);
}
@@ -11272,9 +11409,9 @@ ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
if (need_dl_down)
ill_dl_down(ill);
-
if (need_arp_down)
- ipif_arp_down(ipif);
+ ipif_resolver_down(ipif);
+
return (err);
}
@@ -11323,144 +11460,8 @@ ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
}
/*
- * part of ipmp, make this func return the active/inactive state and
- * caller can set once atomically instead of multiple mutex_enter/mutex_exit
- */
-/*
- * This function either sets or clears the IFF_INACTIVE flag.
- *
- * As long as there are some addresses or multicast memberships on the
- * IPv4 or IPv6 interface of the "phyi" that does not belong in here, we
- * will consider it to be ACTIVE (clear IFF_INACTIVE) i.e the interface
- * will be used for outbound packets.
- *
- * Caller needs to verify the validity of setting IFF_INACTIVE.
- */
-static void
-phyint_inactive(phyint_t *phyi)
-{
- ill_t *ill_v4;
- ill_t *ill_v6;
- ipif_t *ipif;
- ilm_t *ilm;
-
- ill_v4 = phyi->phyint_illv4;
- ill_v6 = phyi->phyint_illv6;
-
- /*
- * No need for a lock while traversing the list since iam
- * a writer
- */
- if (ill_v4 != NULL) {
- ASSERT(IAM_WRITER_ILL(ill_v4));
- for (ipif = ill_v4->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) {
- mutex_enter(&phyi->phyint_lock);
- phyi->phyint_flags &= ~PHYI_INACTIVE;
- mutex_exit(&phyi->phyint_lock);
- return;
- }
- }
- for (ilm = ill_v4->ill_ilm; ilm != NULL;
- ilm = ilm->ilm_next) {
- if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) {
- mutex_enter(&phyi->phyint_lock);
- phyi->phyint_flags &= ~PHYI_INACTIVE;
- mutex_exit(&phyi->phyint_lock);
- return;
- }
- }
- }
- if (ill_v6 != NULL) {
- ill_v6 = phyi->phyint_illv6;
- for (ipif = ill_v6->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) {
- mutex_enter(&phyi->phyint_lock);
- phyi->phyint_flags &= ~PHYI_INACTIVE;
- mutex_exit(&phyi->phyint_lock);
- return;
- }
- }
- for (ilm = ill_v6->ill_ilm; ilm != NULL;
- ilm = ilm->ilm_next) {
- if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) {
- mutex_enter(&phyi->phyint_lock);
- phyi->phyint_flags &= ~PHYI_INACTIVE;
- mutex_exit(&phyi->phyint_lock);
- return;
- }
- }
- }
- mutex_enter(&phyi->phyint_lock);
- phyi->phyint_flags |= PHYI_INACTIVE;
- mutex_exit(&phyi->phyint_lock);
-}
-
-/*
- * This function is called only when the phyint flags change. Currently
- * called from ip_sioctl_flags. We re-do the broadcast nomination so
- * that we can select a good ill.
- */
-static void
-ip_redo_nomination(phyint_t *phyi)
-{
- ill_t *ill_v4;
-
- ill_v4 = phyi->phyint_illv4;
-
- if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
- ASSERT(IAM_WRITER_ILL(ill_v4));
- if (ill_v4->ill_group->illgrp_ill_count > 1)
- ill_nominate_bcast_rcv(ill_v4->ill_group);
- }
-}
-
-/*
- * Heuristic to check if ill is INACTIVE.
- * Checks if ill has an ipif with an usable ip address.
- *
- * Return values:
- * B_TRUE - ill is INACTIVE; has no usable ipif
- * B_FALSE - ill is not INACTIVE; ill has at least one usable ipif
- */
-static boolean_t
-ill_is_inactive(ill_t *ill)
-{
- ipif_t *ipif;
-
- /* Check whether it is in an IPMP group */
- if (ill->ill_phyint->phyint_groupname == NULL)
- return (B_FALSE);
-
- if (ill->ill_ipif_up_count == 0)
- return (B_TRUE);
-
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- uint64_t flags = ipif->ipif_flags;
-
- /*
- * This ipif is usable if it is IPIF_UP and not a
- * dedicated test address. A dedicated test address
- * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED
- * (note in particular that V6 test addresses are
- * link-local data addresses and thus are marked
- * IPIF_NOFAILOVER but not IPIF_DEPRECATED).
- */
- if ((flags & IPIF_UP) &&
- ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) !=
- (IPIF_DEPRECATED|IPIF_NOFAILOVER)))
- return (B_FALSE);
- }
- return (B_TRUE);
-}
-
-/*
- * Set interface flags.
- * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT,
- * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST,
- * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE.
+ * Set interface flags. Many flags require special handling (e.g.,
+ * bringing the interface down); see below for details.
*
* NOTE : We really don't enforce that ipif_id zero should be used
* for setting any flags other than IFF_LOGINT_FLAGS. This
@@ -11478,17 +11479,16 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
{
uint64_t turn_on;
uint64_t turn_off;
- int err;
+ int err = 0;
phyint_t *phyi;
ill_t *ill;
- uint64_t intf_flags;
+ uint64_t intf_flags, cantchange_flags;
boolean_t phyint_flags_modified = B_FALSE;
uint64_t flags;
struct ifreq *ifr;
struct lifreq *lifr;
boolean_t set_linklocal = B_FALSE;
boolean_t zero_source = B_FALSE;
- ip_stack_t *ipst;
ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
@@ -11497,11 +11497,10 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ill = ipif->ipif_ill;
phyi = ill->ill_phyint;
- ipst = ill->ill_ipst;
if (ipip->ipi_cmd_type == IF_CMD) {
ifr = (struct ifreq *)if_req;
- flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
+ flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
} else {
lifr = (struct lifreq *)if_req;
flags = lifr->lifr_flags;
@@ -11524,25 +11523,60 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
flags |= intf_flags & ~0xFFFF;
/*
- * First check which bits will change and then which will
- * go on and off
+ * Explicitly fail attempts to change flags that are always invalid on
+ * an IPMP meta-interface.
*/
- turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE;
- if (!turn_on)
+ if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID))
+ return (EINVAL);
+
+ /*
+ * Check which flags will change; silently ignore flags which userland
+ * is not allowed to control. (Because these flags may change between
+ * SIOCGLIFFLAGS and SIOCSLIFFLAGS, and that's outside of userland's
+ * control, we need to silently ignore them rather than fail.)
+ */
+ cantchange_flags = IFF_CANTCHANGE;
+ if (IS_IPMP(ill))
+ cantchange_flags |= IFF_IPMP_CANTCHANGE;
+
+ turn_on = (flags ^ intf_flags) & ~cantchange_flags;
+ if (turn_on == 0)
return (0); /* No change */
turn_off = intf_flags & turn_on;
turn_on ^= turn_off;
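	/*
	 * Worked example (illustrative): if intf_flags has only IFF_UP set
	 * and the caller passes flags with IFF_UP clear and IFF_DEPRECATED
	 * set, then (flags ^ intf_flags) yields both bits, turn_off becomes
	 * IFF_UP (the currently-set bit to clear), and turn_on ^= turn_off
	 * leaves IFF_DEPRECATED (the bit to set).
	 */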
- err = 0;
/*
- * Don't allow any bits belonging to the logical interface
- * to be set or cleared on the replacement ipif that was
- * created temporarily during a MOVE.
+ * All test addresses must be IFF_DEPRECATED (to ensure source address
+ * selection avoids them) -- so force IFF_DEPRECATED on, and do not
+ * allow it to be turned off.
*/
- if (ipif->ipif_replace_zero &&
- ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) {
+ if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED &&
+ (turn_on|intf_flags) & IFF_NOFAILOVER)
return (EINVAL);
+
+ if (turn_on & IFF_NOFAILOVER) {
+ turn_on |= IFF_DEPRECATED;
+ flags |= IFF_DEPRECATED;
+ }
+
+ /*
+ * On underlying interfaces, only allow applications to manage test
+ * addresses -- otherwise, they may get confused when the address
+ * moves as part of being brought up. Likewise, prevent an
+ * application-managed test address from being converted to a data
+ * address. To prevent migration of administratively up addresses in
+ * the kernel, we don't allow them to be converted either.
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF;
+
+ if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER))
+ return (EINVAL);
+
+ if ((turn_off & IFF_NOFAILOVER) &&
+ (flags & (appflags | IFF_UP | IFF_DUPLICATE)))
+ return (EINVAL);
}
/*
@@ -11583,16 +11617,6 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
}
/*
- * ILL cannot be part of a usesrc group and and IPMP group at the
- * same time. No need to grab ill_g_usesrc_lock here, see
- * synchronization notes in ip.c
- */
- if (turn_on & PHYI_STANDBY &&
- ipif->ipif_ill->ill_usesrc_grp_next != NULL) {
- return (EINVAL);
- }
-
- /*
* If we modify physical interface flags, we'll potentially need to
* send up two routing socket messages for the changes (one for the
* IPv4 ill, and another for the IPv6 ill). Note that here.
@@ -11601,98 +11625,44 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
phyint_flags_modified = B_TRUE;
/*
- * If we are setting or clearing FAILED or STANDBY or OFFLINE,
- * we need to flush the IRE_CACHES belonging to this ill.
- * We handle this case here without doing the DOWN/UP dance
- * like it is done for other flags. If some other flags are
- * being turned on/off with FAILED/STANDBY/OFFLINE, the code
- * below will handle it by bringing it down and then
- * bringing it UP.
+ * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE
+ * (otherwise, we'd immediately use them, defeating standby). Also,
+ * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not
+ * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already
+ * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We
+ * also don't allow PHYI_STANDBY if VNI is enabled since its semantics
+ * will not be honored.
*/
- if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) {
- ill_t *ill_v4, *ill_v6;
-
- ill_v4 = phyi->phyint_illv4;
- ill_v6 = phyi->phyint_illv6;
-
+ if (turn_on & PHYI_STANDBY) {
/*
- * First set the INACTIVE flag if needed. Then delete the ires.
- * ire_add will atomically prevent creating new IRE_CACHEs
- * unless hidden flag is set.
- * PHYI_FAILED and PHYI_INACTIVE are exclusive
+ * No need to grab ill_g_usesrc_lock here; see the
+ * synchronization notes in ip.c.
*/
- if ((turn_on & PHYI_FAILED) &&
- ((intf_flags & PHYI_STANDBY) ||
- !ipst->ips_ipmp_enable_failback)) {
- /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */
- phyi->phyint_flags &= ~PHYI_INACTIVE;
- }
- if ((turn_off & PHYI_FAILED) &&
- ((intf_flags & PHYI_STANDBY) ||
- (!ipst->ips_ipmp_enable_failback &&
- ill_is_inactive(ill)))) {
- phyint_inactive(phyi);
- }
-
- if (turn_on & PHYI_STANDBY) {
- /*
- * We implicitly set INACTIVE only when STANDBY is set.
- * INACTIVE is also set on non-STANDBY phyint when user
- * disables FAILBACK using configuration file.
- * Do not allow STANDBY to be set on such INACTIVE
- * phyint
- */
- if (phyi->phyint_flags & PHYI_INACTIVE)
- return (EINVAL);
- if (!(phyi->phyint_flags & PHYI_FAILED))
- phyint_inactive(phyi);
- }
- if (turn_off & PHYI_STANDBY) {
- if (ipst->ips_ipmp_enable_failback) {
- /*
- * Reset PHYI_INACTIVE.
- */
- phyi->phyint_flags &= ~PHYI_INACTIVE;
- } else if (ill_is_inactive(ill) &&
- !(phyi->phyint_flags & PHYI_FAILED)) {
- /*
- * Need to set INACTIVE, when user sets
- * STANDBY on a non-STANDBY phyint and
- * later resets STANDBY
- */
- phyint_inactive(phyi);
- }
+ if (ill->ill_usesrc_grp_next != NULL ||
+ intf_flags & PHYI_INACTIVE)
+ return (EINVAL);
+ if (!(flags & PHYI_FAILED)) {
+ flags |= PHYI_INACTIVE;
+ turn_on |= PHYI_INACTIVE;
}
- /*
- * We should always send up a message so that the
- * daemons come to know of it. Note that the zeroth
- * interface can be down and the check below for IPIF_UP
- * will not make sense as we are actually setting
- * a phyint flag here. We assume that the ipif used
- * is always the zeroth ipif. (ip_rts_ifmsg does not
- * send up any message for non-zero ipifs).
- */
- phyint_flags_modified = B_TRUE;
+ }
- if (ill_v4 != NULL) {
- ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
- IRE_CACHE, ill_stq_cache_delete,
- (char *)ill_v4, ill_v4);
- illgrp_reset_schednext(ill_v4);
- }
- if (ill_v6 != NULL) {
- ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
- IRE_CACHE, ill_stq_cache_delete,
- (char *)ill_v6, ill_v6);
- illgrp_reset_schednext(ill_v6);
- }
+ if (turn_off & PHYI_STANDBY) {
+ flags &= ~PHYI_INACTIVE;
+ turn_off |= PHYI_INACTIVE;
}
/*
+ * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both
+ * would end up on.
+ */
+ if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
+ (PHYI_FAILED | PHYI_INACTIVE))
+ return (EINVAL);
+
+ /*
* If ILLF_ROUTER changes, we need to change the ip forwarding
- * status of the interface and, if the interface is part of an IPMP
- * group, all other interfaces that are part of the same IPMP
- * group.
+ * status of the interface.
*/
if ((turn_on | turn_off) & ILLF_ROUTER)
(void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0));
@@ -11718,33 +11688,31 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
mutex_exit(&ill->ill_phyint->phyint_lock);
/*
- * We do the broadcast and nomination here rather
- * than waiting for a FAILOVER/FAILBACK to happen. In
- * the case of FAILBACK from INACTIVE standby to the
- * interface that has been repaired, PHYI_FAILED has not
- * been cleared yet. If there are only two interfaces in
- * that group, all we have is a FAILED and INACTIVE
- * interface. If we do the nomination soon after a failback,
- * the broadcast nomination code would select the
- * INACTIVE interface for receiving broadcasts as FAILED is
- * not yet cleared. As we don't want STANDBY/INACTIVE to
- * receive broadcast packets, we need to redo nomination
- * when the FAILED is cleared here. Thus, in general we
- * always do the nomination here for FAILED, STANDBY
- * and OFFLINE.
+ * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the
+ * same to the kernel: if any of them has been set by
+ * userland, the interface cannot be used for data traffic.
*/
- if (((turn_on | turn_off) &
- (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) {
- ip_redo_nomination(phyi);
+ if ((turn_on|turn_off) &
+ (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
+ ASSERT(!IS_IPMP(ill));
+ /*
+ * It's possible the ill is part of an "anonymous"
+ * IPMP group rather than a real group. In that case,
+ * there are no other interfaces in the group and thus
+ * no need to call ipmp_phyint_refresh_active().
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_phyint_refresh_active(phyi);
}
+
if (phyint_flags_modified) {
if (phyi->phyint_illv4 != NULL) {
ip_rts_ifmsg(phyi->phyint_illv4->
- ill_ipif);
+ ill_ipif, RTSQ_DEFAULT);
}
if (phyi->phyint_illv6 != NULL) {
ip_rts_ifmsg(phyi->phyint_illv6->
- ill_ipif);
+ ill_ipif, RTSQ_DEFAULT);
}
}
return (0);
@@ -11785,15 +11753,17 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
}
/*
- * The only flag changes that we currently take specific action on
- * is IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL,
- * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and
- * IPIF_PREFERRED. This is done by bring the ipif down, changing
- * the flags and bringing it back up again.
+ * The only flag changes that we currently take specific action on are
+ * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP,
+ * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and
+ * IPIF_NOFAILOVER. This is done by bringing the ipif down, changing the
+ * flags and bringing it back up again. For IPIF_NOFAILOVER, the act
+ * of bringing it back up will trigger the address to be moved.
*/
if ((turn_on|turn_off) &
(IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP|
- ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) {
+ ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED|
+ IPIF_NOFAILOVER)) {
/*
* Taking this ipif down, make sure we have
* valid net and subnet bcast ire's for other
@@ -11822,9 +11792,8 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
{
ill_t *ill;
phyint_t *phyi;
- uint64_t turn_on;
- uint64_t turn_off;
- uint64_t intf_flags;
+ uint64_t turn_on, turn_off;
+ uint64_t intf_flags, cantchange_flags;
boolean_t phyint_flags_modified = B_FALSE;
int err = 0;
boolean_t set_linklocal = B_FALSE;
@@ -11839,12 +11808,15 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
phyi = ill->ill_phyint;
intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
- turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP);
+ cantchange_flags = IFF_CANTCHANGE | IFF_UP;
+ if (IS_IPMP(ill))
+ cantchange_flags |= IFF_IPMP_CANTCHANGE;
+ turn_on = (flags ^ intf_flags) & ~cantchange_flags;
turn_off = intf_flags & turn_on;
turn_on ^= turn_off;
- if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))
+ if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
phyint_flags_modified = B_TRUE;
/*
@@ -11870,9 +11842,6 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
mutex_exit(&ill->ill_lock);
mutex_exit(&phyi->phyint_lock);
- if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)))
- ip_redo_nomination(phyi);
-
if (set_linklocal)
(void) ipif_setlinklocal(ipif);
@@ -11881,12 +11850,29 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
else
ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
+ /*
+ * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to
+ * the kernel: if any of them has been set by userland, the interface
+ * cannot be used for data traffic.
+ */
+ if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
+ ASSERT(!IS_IPMP(ill));
+ /*
+ * It's possible the ill is part of an "anonymous" IPMP group
+ * rather than a real group. In that case, there are no other
+ * interfaces in the group and thus no need for us to call
+ * ipmp_phyint_refresh_active().
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_phyint_refresh_active(phyi);
+ }
+
if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) {
/*
* XXX ipif_up really does not know whether a phyint flags
* was modified or not. So, it sends up information on
* only one routing sockets message. As we don't bring up
- * the interface and also set STANDBY/FAILED simultaneously
+ * the interface and also set PHYI_ flags simultaneously
* it should be okay.
*/
err = ipif_up(ipif, q, mp);
@@ -11898,14 +11884,14 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
if (phyint_flags_modified) {
if (phyi->phyint_illv4 != NULL) {
ip_rts_ifmsg(phyi->phyint_illv4->
- ill_ipif);
+ ill_ipif, RTSQ_DEFAULT);
}
if (phyi->phyint_illv6 != NULL) {
ip_rts_ifmsg(phyi->phyint_illv6->
- ill_ipif);
+ ill_ipif, RTSQ_DEFAULT);
}
} else {
- ip_rts_ifmsg(ipif);
+ ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
}
/*
* Update the flags in SCTP's IPIF list, ipif_up() will do
@@ -12101,10 +12087,7 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
* broadcast address makes sense. If it does,
* there should be an IRE for it already.
* Don't match on ipif, only on the ill
- * since we are sharing these now. Don't use
- * MATCH_IRE_ILL_GROUP as we are looking for
- * the broadcast ire on this ill and each ill
- * in the group has its own broadcast ire.
+ * since we are sharing these now.
*/
ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST,
ipif, ALL_ZONES, NULL,
@@ -12302,9 +12285,16 @@ int
ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *if_req)
{
-
ip1dbg(("ip_sioctl_metric(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
+
+ /*
+ * Since no applications should ever be setting metrics on underlying
+ * interfaces, we explicitly fail to smoke 'em out.
+ */
+ if (IS_UNDER_IPMP(ipif->ipif_ill))
+ return (EINVAL);
+
/*
* Set interface metric. We don't use this for
* anything but we keep track of it in case it is
@@ -12332,6 +12322,7 @@ ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
/* Get interface metric. */
ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
+
if (ipip->ipi_cmd_type == IF_CMD) {
struct ifreq *ifr;
@@ -12766,13 +12757,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
nipif->ipif_state_flags |= IPIF_CHANGING;
}
- mutex_exit(&ill->ill_lock);
-
if (lir->lir_maxmtu != 0) {
ill->ill_max_mtu = lir->lir_maxmtu;
- ill->ill_mtu_userspecified = 1;
+ ill->ill_user_mtu = lir->lir_maxmtu;
mtu_walk = B_TRUE;
}
+ mutex_exit(&ill->ill_lock);
if (lir->lir_reachtime != 0)
ill->ill_reachable_time = lir->lir_reachtime;
@@ -12821,6 +12811,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ILL_UNMARK_CHANGING(ill);
mutex_exit(&ill->ill_lock);
+ /*
+ * Refresh IPMP meta-interface MTU if necessary.
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_illgrp_refresh_mtu(ill->ill_grp);
+
return (0);
}
@@ -13032,13 +13028,117 @@ ipif_assign_seqid(ipif_t *ipif)
}
/*
+ * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are
+ * administratively down (i.e., no DAD), of the same type, and locked. Note
+ * that the clone is complete -- including the seqid -- and the expectation is
+ * that the caller will either free or overwrite `sipif' before it's unlocked.
+ */
+static void
+ipif_clone(const ipif_t *sipif, ipif_t *dipif)
+{
+ ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock));
+ ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock));
+ ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
+ ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
+ ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type);
+ ASSERT(sipif->ipif_arp_del_mp == NULL);
+ ASSERT(dipif->ipif_arp_del_mp == NULL);
+ ASSERT(sipif->ipif_igmp_rpt == NULL);
+ ASSERT(dipif->ipif_igmp_rpt == NULL);
+ ASSERT(sipif->ipif_multicast_up == 0);
+ ASSERT(dipif->ipif_multicast_up == 0);
+ ASSERT(sipif->ipif_joined_allhosts == 0);
+ ASSERT(dipif->ipif_joined_allhosts == 0);
+
+ dipif->ipif_mtu = sipif->ipif_mtu;
+ dipif->ipif_flags = sipif->ipif_flags;
+ dipif->ipif_metric = sipif->ipif_metric;
+ dipif->ipif_zoneid = sipif->ipif_zoneid;
+ dipif->ipif_v6subnet = sipif->ipif_v6subnet;
+ dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr;
+ dipif->ipif_v6src_addr = sipif->ipif_v6src_addr;
+ dipif->ipif_v6net_mask = sipif->ipif_v6net_mask;
+ dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr;
+ dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr;
+
+ /*
+ * While dipif is down right now, it might've been up before. Since
+ * it's changing identity, its packet counters need to be reset.
+ */
+ dipif->ipif_ib_pkt_count = 0;
+ dipif->ipif_ob_pkt_count = 0;
+ dipif->ipif_fo_pkt_count = 0;
+
+ /*
+ * As per the comment atop the function, we assume that these sipif
+ * fields will be changed before sipif is unlocked.
+ */
+ dipif->ipif_seqid = sipif->ipif_seqid;
+ dipif->ipif_saved_ire_mp = sipif->ipif_saved_ire_mp;
+ dipif->ipif_saved_ire_cnt = sipif->ipif_saved_ire_cnt;
+ dipif->ipif_state_flags = sipif->ipif_state_flags;
+}
+
+/*
+ * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif'
+ * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin
+ * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then
+ * transfer the xop to `dipif'. Requires that all ipifs are administratively
+ * down (i.e., no DAD), of the same type, and unlocked.
+ */
+static void
+ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif)
+{
+ ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq;
+ int ipx_current_ioctl;
+
+ ASSERT(sipif != dipif);
+ ASSERT(sipif != virgipif);
+
+ /*
+ * Grab all of the locks that protect the ipif in a defined order.
+ */
+ GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
+ if (sipif > dipif) {
+ mutex_enter(&sipif->ipif_saved_ire_lock);
+ mutex_enter(&dipif->ipif_saved_ire_lock);
+ } else {
+ mutex_enter(&dipif->ipif_saved_ire_lock);
+ mutex_enter(&sipif->ipif_saved_ire_lock);
+ }
+
+ ipif_clone(sipif, dipif);
+ if (virgipif != NULL) {
+ ipif_clone(virgipif, sipif);
+ mi_free(virgipif);
+ }
+
+ mutex_exit(&sipif->ipif_saved_ire_lock);
+ mutex_exit(&dipif->ipif_saved_ire_lock);
+ RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
+
+ /*
+ * Transfer ownership of the current xop, if necessary.
+ */
+ if (ipsq->ipsq_xop->ipx_current_ipif == sipif) {
+ ASSERT(ipsq->ipsq_xop->ipx_pending_ipif == NULL);
+ ipx_current_ioctl = ipsq->ipsq_xop->ipx_current_ioctl;
+ ipsq_current_finish(ipsq);
+ ipsq_current_start(ipsq, dipif, ipx_current_ioctl);
+ }
+
+ if (virgipif == NULL)
+ mi_free(sipif);
+}
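
ipif_transfer() takes the two ipif_saved_ire_lock mutexes in address order (higher address first) so that two threads transferring in opposite directions cannot deadlock. A minimal user-level sketch of that ordering idiom, with plain pthreads standing in for the kernel mutex_enter()/mutex_exit() interfaces; the obj_t type and helper names are invented for illustration.

#include <pthread.h>

typedef struct obj {
	pthread_mutex_t	o_lock;
	int		o_val;
} obj_t;

/*
 * Lock two objects in a consistent (address) order, as ipif_transfer()
 * does, so concurrent lock_pair(a, b) and lock_pair(b, a) callers
 * cannot deadlock against each other.
 */
static void
obj_lock_pair(obj_t *a, obj_t *b)
{
	if (a > b) {
		(void) pthread_mutex_lock(&a->o_lock);
		(void) pthread_mutex_lock(&b->o_lock);
	} else {
		(void) pthread_mutex_lock(&b->o_lock);
		(void) pthread_mutex_lock(&a->o_lock);
	}
}

static void
obj_unlock_pair(obj_t *a, obj_t *b)
{
	(void) pthread_mutex_unlock(&a->o_lock);
	(void) pthread_mutex_unlock(&b->o_lock);
}

int
main(void)
{
	obj_t x = { PTHREAD_MUTEX_INITIALIZER, 1 };
	obj_t y = { PTHREAD_MUTEX_INITIALIZER, 2 };

	obj_lock_pair(&x, &y);
	y.o_val = x.o_val;		/* move state while both locks are held */
	obj_unlock_pair(&x, &y);
	return (0);
}
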
+
+/*
* Insert the ipif, so that the list of ipifs on the ill will be sorted
* with respect to ipif_id. Note that an ipif with an ipif_id of -1 will
* be inserted into the first space available in the list. The value of
* ipif_id will then be set to the appropriate value for its position.
*/
static int
-ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock)
+ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock)
{
ill_t *ill;
ipif_t *tipif;
@@ -13056,12 +13156,11 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock)
/*
* In the case of lo0:0 we already hold the ill_g_lock.
* ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
- * ipif_insert. Another such caller is ipif_move.
+ * ipif_insert.
*/
if (acquire_g_lock)
rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
- if (acquire_ill_lock)
- mutex_enter(&ill->ill_lock);
+ mutex_enter(&ill->ill_lock);
id = ipif->ipif_id;
tipifp = &(ill->ill_ipif);
if (id == -1) { /* need to find a real id */
@@ -13075,8 +13174,7 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock)
}
/* limit number of logical interfaces */
if (id >= ipst->ips_ip_addrs_per_if) {
- if (acquire_ill_lock)
- mutex_exit(&ill->ill_lock);
+ mutex_exit(&ill->ill_lock);
if (acquire_g_lock)
rw_exit(&ipst->ips_ill_g_lock);
return (-1);
@@ -13091,8 +13189,7 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock)
tipifp = &(tipif->ipif_next);
}
} else {
- if (acquire_ill_lock)
- mutex_exit(&ill->ill_lock);
+ mutex_exit(&ill->ill_lock);
if (acquire_g_lock)
rw_exit(&ipst->ips_ill_g_lock);
return (-1);
@@ -13102,25 +13199,22 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock)
ipif->ipif_next = tipif;
*tipifp = ipif;
- if (acquire_ill_lock)
- mutex_exit(&ill->ill_lock);
+ mutex_exit(&ill->ill_lock);
if (acquire_g_lock)
rw_exit(&ipst->ips_ill_g_lock);
+
return (0);
}
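
ipif_insert() keeps the ill's ipif list sorted by ipif_id, and an id of -1 means "take the first free slot". A small stand-alone sketch of that list discipline; node_t and the helper name are invented for illustration, and error handling is reduced to a return code.

#include <stdio.h>

typedef struct node {
	int		n_id;
	struct node	*n_next;
} node_t;

/*
 * Insert `np' into the id-sorted list headed by `*headp'.  If n_id is -1,
 * assign the lowest id not already in use; otherwise reject a duplicate id.
 */
static int
node_insert(node_t **headp, node_t *np)
{
	node_t **pp = headp;

	if (np->n_id == -1) {
		int id = 0;

		while (*pp != NULL && (*pp)->n_id == id) {
			pp = &(*pp)->n_next;
			id++;
		}
		np->n_id = id;
	} else {
		while (*pp != NULL && (*pp)->n_id < np->n_id)
			pp = &(*pp)->n_next;
		if (*pp != NULL && (*pp)->n_id == np->n_id)
			return (-1);	/* id already in use */
	}
	np->n_next = *pp;
	*pp = np;
	return (0);
}

int
main(void)
{
	node_t *head = NULL;
	node_t a = { 0, NULL }, b = { 2, NULL }, c = { -1, NULL };

	(void) node_insert(&head, &a);
	(void) node_insert(&head, &b);
	(void) node_insert(&head, &c);	/* picks up id 1 */

	for (node_t *np = head; np != NULL; np = np->n_next)
		(void) printf("%d\n", np->n_id);	/* prints 0, 1, 2 */
	return (0);
}
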
static void
-ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock)
+ipif_remove(ipif_t *ipif)
{
ipif_t **ipifp;
ill_t *ill = ipif->ipif_ill;
ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));
- if (acquire_ill_lock)
- mutex_enter(&ill->ill_lock);
- else
- ASSERT(MUTEX_HELD(&ill->ill_lock));
+ mutex_enter(&ill->ill_lock);
ipifp = &ill->ill_ipif;
for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
if (*ipifp == ipif) {
@@ -13128,9 +13222,7 @@ ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock)
break;
}
}
-
- if (acquire_ill_lock)
- mutex_exit(&ill->ill_lock);
+ mutex_exit(&ill->ill_lock);
}
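
ipif_remove() unlinks the ipif by walking a pointer-to-pointer over the list rather than tracking a separate "previous" node. Reusing the node_t type from the sketch above, the same idiom looks like this (purely illustrative):

/* Unlink `np' from the list headed by `*headp'; no-op if it isn't there. */
static void
node_remove(node_t **headp, node_t *np)
{
	node_t **pp;

	for (pp = headp; *pp != NULL; pp = &(*pp)->n_next) {
		if (*pp == np) {
			*pp = np->n_next;
			break;
		}
	}
}
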
/*
@@ -13149,10 +13241,12 @@ ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock)
* second DL_INFO_ACK comes in from the driver.
*/
static ipif_t *
-ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
+ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
+ boolean_t insert)
{
ipif_t *ipif;
- phyint_t *phyi;
+ phyint_t *phyi = ill->ill_phyint;
+ ip_stack_t *ipst = ill->ill_ipst;
ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
ill->ill_name, id, (void *)ill));
@@ -13175,23 +13269,61 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
ipif->ipif_refcnt = 0;
ipif->ipif_saved_ire_cnt = 0;
- if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) {
- mi_free(ipif);
- return (NULL);
+ if (insert) {
+ if (ipif_insert(ipif, ire_type != IRE_LOOPBACK) != 0) {
+ mi_free(ipif);
+ return (NULL);
+ }
+ /* -1 id should have been replaced by real id */
+ id = ipif->ipif_id;
+ ASSERT(id >= 0);
}
- /* -1 id should have been replaced by real id */
- id = ipif->ipif_id;
- ASSERT(id >= 0);
if (ill->ill_name[0] != '\0')
ipif_assign_seqid(ipif);
/*
- * Keep a copy of original id in ipif_orig_ipifid. Failback
- * will attempt to restore the original id. The SIOCSLIFOINDEX
- * ioctl sets ipif_orig_ipifid to zero.
+ * If this is ipif zero, configure ill/phyint-wide information.
+ * Defer most configuration until we're guaranteed we're attached.
*/
- ipif->ipif_orig_ipifid = id;
+ if (id == 0) {
+ if (ill->ill_mactype == SUNW_DL_IPMP) {
+ /*
+ * Set PHYI_IPMP and also set PHYI_FAILED since there
+ * are no active interfaces. Similarly, PHYI_RUNNING
+ * isn't set until the group has an active interface.
+ */
+ mutex_enter(&phyi->phyint_lock);
+ phyi->phyint_flags |= (PHYI_IPMP | PHYI_FAILED);
+ mutex_exit(&phyi->phyint_lock);
+
+ /*
+ * Create the illgrp (which must not exist yet because
+ * the zeroth ipif is created once per ill). However,
+ * do not link it to the ipmp_grp_t until I_PLINK
+ * is called; see ip_sioctl_plink_ipmp() for details.
+ */
+ if (ipmp_illgrp_create(ill) == NULL) {
+ if (insert) {
+ rw_enter(&ipst->ips_ill_g_lock,
+ RW_WRITER);
+ ipif_remove(ipif);
+ rw_exit(&ipst->ips_ill_g_lock);
+ }
+ mi_free(ipif);
+ return (NULL);
+ }
+ } else {
+ /*
+ * By default, PHYI_RUNNING is set when the zeroth
+ * ipif is created. For other ipifs, we don't touch
+ * it since DLPI notifications may have changed it.
+ */
+ mutex_enter(&phyi->phyint_lock);
+ phyi->phyint_flags |= PHYI_RUNNING;
+ mutex_exit(&phyi->phyint_lock);
+ }
+ }
/*
* We grab the ill_lock and phyint_lock to protect the flag changes.
@@ -13199,18 +13331,9 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
* ioctl completes and the IPIF_CHANGING flag is cleared.
*/
mutex_enter(&ill->ill_lock);
- mutex_enter(&ill->ill_phyint->phyint_lock);
- /*
- * Set the running flag when logical interface zero is created.
- * For subsequent logical interfaces, a DLPI link down
- * notification message may have cleared the running flag to
- * indicate the link is down, so we shouldn't just blindly set it.
- */
- if (id == 0)
- ill->ill_phyint->phyint_flags |= PHYI_RUNNING;
+ mutex_enter(&phyi->phyint_lock);
+
ipif->ipif_ire_type = ire_type;
- phyi = ill->ill_phyint;
- ipif->ipif_orig_ifindex = phyi->phyint_ifindex;
if (ipif->ipif_isv6) {
ill->ill_flags |= ILLF_IPV6;
@@ -13238,14 +13361,18 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
* Don't set the interface flags etc. now, will do it in
* ip_ll_subnet_defaults.
*/
- if (!initialize) {
- mutex_exit(&ill->ill_lock);
- mutex_exit(&ill->ill_phyint->phyint_lock);
- return (ipif);
- }
+ if (!initialize)
+ goto out;
+
ipif->ipif_mtu = ill->ill_max_mtu;
- if (ill->ill_bcast_addr_length != 0) {
+ /*
+ * NOTE: The IPMP meta-interface is special-cased because it starts
+ * with no underlying interfaces (and thus an unknown broadcast
+ * address length), but all interfaces that can be placed into an IPMP
+ * group are required to be broadcast-capable.
+ */
+ if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
/*
* Later detect lack of DLPI driver multicast
* capability by catching DL_ENABMULTI errors in
@@ -13269,8 +13396,7 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
ill->ill_flags |= ILLF_NOARP;
}
if (ill->ill_phys_addr_length == 0) {
- if (ill->ill_media &&
- ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) {
+ if (ill->ill_mactype == SUNW_DL_VNI) {
ipif->ipif_flags |= IPIF_NOXMIT;
phyi->phyint_flags |= PHYI_VIRTUAL;
} else {
@@ -13285,8 +13411,9 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
}
}
}
+out:
+ mutex_exit(&phyi->phyint_lock);
mutex_exit(&ill->ill_lock);
- mutex_exit(&ill->ill_phyint->phyint_lock);
return (ipif);
}
@@ -13300,34 +13427,49 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
* for details.
*/
void
-ipif_arp_down(ipif_t *ipif)
+ipif_resolver_down(ipif_t *ipif)
{
mblk_t *mp;
ill_t *ill = ipif->ipif_ill;
- ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
+ ip1dbg(("ipif_resolver_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
ASSERT(IAM_WRITER_IPIF(ipif));
+ if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))
+ return;
+
/* Delete the mapping for the local address */
mp = ipif->ipif_arp_del_mp;
if (mp != NULL) {
- ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
+ ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
*(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
putnext(ill->ill_rq, mp);
ipif->ipif_arp_del_mp = NULL;
}
/*
+ * Make IPMP aware of the deleted data address.
+ */
+ if (IS_IPMP(ill))
+ ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
+
+ /*
* If this is the last ipif that is going down and there are no
* duplicate addresses we may yet attempt to re-probe, then we need to
* clean up ARP completely.
*/
if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) {
+ /*
+ * If this was the last ipif on an IPMP interface, purge any
+ * IPMP ARP entries associated with it.
+ */
+ if (IS_IPMP(ill))
+ ipmp_illgrp_refresh_arpent(ill->ill_grp);
/* Send up AR_INTERFACE_DOWN message */
mp = ill->ill_arp_down_mp;
if (mp != NULL) {
- ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
+ ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
*(unsigned *)mp->b_rptr, ill->ill_name,
ipif->ipif_id));
putnext(ill->ill_rq, mp);
@@ -13337,7 +13479,7 @@ ipif_arp_down(ipif_t *ipif)
/* Tell ARP to delete the multicast mappings */
mp = ill->ill_arp_del_mapping_mp;
if (mp != NULL) {
- ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
+ ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
*(unsigned *)mp->b_rptr, ill->ill_name,
ipif->ipif_id));
putnext(ill->ill_rq, mp);
@@ -13377,6 +13519,13 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
return (0);
/*
+ * IPMP meta-interfaces don't have any inherent multicast mappings,
+ * and instead use the ones on the underlying interfaces.
+ */
+ if (IS_IPMP(ill))
+ return (0);
+
+ /*
* Delete the existing mapping from ARP. Normally ipif_down
* -> ipif_arp_down should send this up to ARP. The only
* reason we would find this when we are switching from
@@ -13473,26 +13622,23 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
}
/*
- * Get the resolver set up for a new interface address.
- * (Always called as writer.)
- * Called both for IPv4 and IPv6 interfaces,
- * though it only sets up the resolver for v6
- * if it's an xresolv interface (one using an external resolver).
- * Honors ILLF_NOARP.
- * The enumerated value res_act is used to tune the behavior.
- * If set to Res_act_initial, then we set up all the resolver
- * structures for a new interface. If set to Res_act_move, then
- * we just send an AR_ENTRY_ADD message up to ARP for IPv4
- * interfaces; this is called by ip_rput_dlpi_writer() to handle
- * asynchronous hardware address change notification. If set to
- * Res_act_defend, then we tell ARP that it needs to send a single
- * gratuitous message in defense of the address.
+ * Get the resolver set up for a new IP address. (Always called as writer.)
+ * Called both for IPv4 and IPv6 interfaces, though it only sets up the
+ * resolver for v6 if it's an ILLF_XRESOLV interface. Honors ILLF_NOARP.
+ *
+ * The enumerated value res_act tunes the behavior:
+ * * Res_act_initial: set up all the resolver structures for a new
+ * IP address.
+ * * Res_act_defend: tell ARP that it needs to send a single gratuitous
+ * ARP message in defense of the address.
+ * * Res_act_rebind: tell ARP to change the hardware address for an IP
+ * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif().
+ *
* Returns error on failure.
*/
int
ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
{
- caddr_t addr;
mblk_t *arp_up_mp = NULL;
mblk_t *arp_down_mp = NULL;
mblk_t *arp_add_mp = NULL;
@@ -13500,9 +13646,9 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
mblk_t *arp_add_mapping_mp = NULL;
mblk_t *arp_del_mapping_mp = NULL;
ill_t *ill = ipif->ipif_ill;
- uchar_t *area_p = NULL;
- uchar_t *ared_p = NULL;
int err = ENOMEM;
+ boolean_t added_ipif = B_FALSE;
+ boolean_t publish;
boolean_t was_dup;
ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
@@ -13540,11 +13686,7 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
* External resolver for IPv6
*/
ASSERT(res_act == Res_act_initial);
- if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
- addr = (caddr_t)&ipif->ipif_v6lcl_addr;
- area_p = (uchar_t *)&ip6_area_template;
- ared_p = (uchar_t *)&ip6_ared_template;
- }
+ publish = !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr);
} else {
/*
* IPv4 arp case. If the ARP stream has already started
@@ -13562,41 +13704,39 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
ill->ill_arp_bringup_pending = 1;
mutex_exit(&ill->ill_lock);
}
- if (ipif->ipif_lcl_addr != INADDR_ANY) {
- addr = (caddr_t)&ipif->ipif_lcl_addr;
- area_p = (uchar_t *)&ip_area_template;
- ared_p = (uchar_t *)&ip_ared_template;
+ publish = (ipif->ipif_lcl_addr != INADDR_ANY);
+ }
+
+ if (IS_IPMP(ill) && publish) {
+ /*
+ * If we're here via ipif_up(), then the ipif won't be bound
+ * yet -- add it to the group, which will bind it if possible.
+ * (We would add it in ipif_up(), but deleting on failure
+ * there is gruesome.) If we're here via ipmp_ill_bind_ipif(),
+ * then the ipif has already been added to the group and we
+ * just need to use the binding.
+ */
+ if (ipmp_ipif_bound_ill(ipif) == NULL) {
+ if (ipmp_illgrp_add_ipif(ill->ill_grp, ipif) == NULL) {
+ /*
+ * We couldn't bind the ipif to an ill yet,
+ * so we have nothing to publish.
+ */
+ publish = B_FALSE;
+ }
+ added_ipif = B_TRUE;
}
}
/*
* Add an entry for the local address in ARP only if it
- * is not UNNUMBERED and the address is not INADDR_ANY.
+ * is not UNNUMBERED and it is suitable for publishing.
*/
- if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) {
- area_t *area;
-
- /* Now ask ARP to publish our address. */
- arp_add_mp = ill_arp_alloc(ill, area_p, addr);
- if (arp_add_mp == NULL)
- goto failed;
- area = (area_t *)arp_add_mp->b_rptr;
- if (res_act != Res_act_initial) {
- /*
- * Copy the new hardware address and length into
- * arp_add_mp to be sent to ARP.
- */
- area->area_hw_addr_length = ill->ill_phys_addr_length;
- bcopy(ill->ill_phys_addr,
- ((char *)area + area->area_hw_addr_offset),
- area->area_hw_addr_length);
- }
-
- area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH |
- ACE_F_MYADDR;
-
+ if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && publish) {
if (res_act == Res_act_defend) {
- area->area_flags |= ACE_F_DEFEND;
+ arp_add_mp = ipif_area_alloc(ipif, ACE_F_DEFEND);
+ if (arp_add_mp == NULL)
+ goto failed;
/*
* If we're just defending our address now, then
* there's no need to set up ARP multicast mappings.
@@ -13605,17 +13745,18 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
goto done;
}
- if (res_act != Res_act_initial)
- goto arp_setup_multicast;
-
/*
- * Allocate an ARP deletion message so we know we can tell ARP
- * when the interface goes down.
+ * Allocate an ARP add message and an ARP delete message (the
+ * latter is saved for use when the address goes down).
*/
- arp_del_mp = ill_arp_alloc(ill, ared_p, addr);
- if (arp_del_mp == NULL)
+ if ((arp_add_mp = ipif_area_alloc(ipif, 0)) == NULL)
+ goto failed;
+
+ if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL)
goto failed;
+ if (res_act != Res_act_initial)
+ goto arp_setup_multicast;
} else {
if (res_act != Res_act_initial)
goto done;
@@ -13624,14 +13765,11 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
* Need to bring up ARP or setup multicast mapping only
* when the first interface is coming UP.
*/
- if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
- was_dup) {
+ if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0 || was_dup)
goto done;
- }
/*
- * Allocate an ARP down message (to be saved) and an ARP up
- * message.
+ * Allocate an ARP down message (to be saved) and an ARP up message.
*/
arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0);
if (arp_down_mp == NULL)
@@ -13648,33 +13786,21 @@ arp_setup_multicast:
/*
* Setup the multicast mappings. This function initializes
* ill_arp_del_mapping_mp also. This does not need to be done for
- * IPv6.
+ * IPv6, or for the IPMP interface (since it has no link-layer).
*/
- if (!ill->ill_isv6) {
+ if (!ill->ill_isv6 && !IS_IPMP(ill)) {
err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp);
if (err != 0)
goto failed;
ASSERT(ill->ill_arp_del_mapping_mp != NULL);
ASSERT(arp_add_mapping_mp != NULL);
}
-
done:
- if (arp_del_mp != NULL) {
- ASSERT(ipif->ipif_arp_del_mp == NULL);
- ipif->ipif_arp_del_mp = arp_del_mp;
- }
- if (arp_down_mp != NULL) {
- ASSERT(ill->ill_arp_down_mp == NULL);
- ill->ill_arp_down_mp = arp_down_mp;
- }
- if (arp_del_mapping_mp != NULL) {
- ASSERT(ill->ill_arp_del_mapping_mp == NULL);
- ill->ill_arp_del_mapping_mp = arp_del_mapping_mp;
- }
if (arp_up_mp != NULL) {
ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n",
ill->ill_name, ipif->ipif_id));
putnext(ill->ill_rq, arp_up_mp);
+ arp_up_mp = NULL;
}
if (arp_add_mp != NULL) {
ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n",
@@ -13686,6 +13812,7 @@ done:
if (!ill->ill_arp_extend)
ipif->ipif_addr_ready = 1;
putnext(ill->ill_rq, arp_add_mp);
+ arp_add_mp = NULL;
} else {
ipif->ipif_addr_ready = 1;
}
@@ -13693,29 +13820,40 @@ done:
ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n",
ill->ill_name, ipif->ipif_id));
putnext(ill->ill_rq, arp_add_mapping_mp);
+ arp_add_mapping_mp = NULL;
}
- if (res_act != Res_act_initial)
- return (0);
- if (ill->ill_flags & ILLF_NOARP)
- err = ill_arp_off(ill);
- else
- err = ill_arp_on(ill);
- if (err != 0) {
- ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err));
- freemsg(ipif->ipif_arp_del_mp);
- freemsg(ill->ill_arp_down_mp);
- freemsg(ill->ill_arp_del_mapping_mp);
- ipif->ipif_arp_del_mp = NULL;
- ill->ill_arp_down_mp = NULL;
- ill->ill_arp_del_mapping_mp = NULL;
- return (err);
+ if (res_act == Res_act_initial) {
+ if (ill->ill_flags & ILLF_NOARP)
+ err = ill_arp_off(ill);
+ else
+ err = ill_arp_on(ill);
+ if (err != 0) {
+ ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n",
+ err));
+ goto failed;
+ }
}
+
+ if (arp_del_mp != NULL) {
+ ASSERT(ipif->ipif_arp_del_mp == NULL);
+ ipif->ipif_arp_del_mp = arp_del_mp;
+ }
+ if (arp_down_mp != NULL) {
+ ASSERT(ill->ill_arp_down_mp == NULL);
+ ill->ill_arp_down_mp = arp_down_mp;
+ }
+ if (arp_del_mapping_mp != NULL) {
+ ASSERT(ill->ill_arp_del_mapping_mp == NULL);
+ ill->ill_arp_del_mapping_mp = arp_del_mapping_mp;
+ }
+
return ((ill->ill_ipif_up_count != 0 || was_dup ||
ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS);
-
failed:
ip1dbg(("ipif_resolver_up: FAILED\n"));
+ if (added_ipif)
+ ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
freemsg(arp_add_mp);
freemsg(arp_del_mp);
freemsg(arp_add_mapping_mp);
@@ -13734,13 +13872,12 @@ ipif_arp_start_dad(ipif_t *ipif)
{
ill_t *ill = ipif->ipif_ill;
mblk_t *arp_add_mp;
- area_t *area;
+ /* ACE_F_UNVERIFIED restarts DAD */
if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing ||
(ipif->ipif_flags & IPIF_UNNUMBERED) ||
ipif->ipif_lcl_addr == INADDR_ANY ||
- (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template,
- (char *)&ipif->ipif_lcl_addr)) == NULL) {
+ (arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) {
/*
* If we can't contact ARP for some reason, that's not really a
* problem. Just send out the routing socket notification that
@@ -13752,10 +13889,6 @@ ipif_arp_start_dad(ipif_t *ipif)
return;
}
- /* Setting the 'unverified' flag restarts DAD */
- area = (area_t *)arp_add_mp->b_rptr;
- area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR |
- ACE_F_UNVERIFIED;
putnext(ill->ill_rq, arp_add_mp);
}
@@ -13764,7 +13897,8 @@ ipif_ndp_start_dad(ipif_t *ipif)
{
nce_t *nce;
- nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE);
+ nce = ndp_lookup_v6(ipif->ipif_ill, B_TRUE, &ipif->ipif_v6lcl_addr,
+ B_FALSE);
if (nce == NULL)
return;
@@ -13805,7 +13939,7 @@ ill_restart_dad(ill_t *ill, boolean_t went_up)
*/
if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) ||
(!ill->ill_isv6 && !ill->ill_arp_extend)) {
- ip_rts_ifmsg(ill->ill_ipif);
+ ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
return;
}
@@ -13838,8 +13972,10 @@ ill_restart_dad(ill_t *ill, boolean_t went_up)
* we'll handle eventual routing socket
* notification via DAD completion.)
*/
- if (ipif == ill->ill_ipif)
- ip_rts_ifmsg(ill->ill_ipif);
+ if (ipif == ill->ill_ipif) {
+ ip_rts_ifmsg(ill->ill_ipif,
+ RTSQ_DEFAULT);
+ }
}
} else {
/*
@@ -13855,285 +13991,30 @@ ill_restart_dad(ill_t *ill, boolean_t went_up)
* If we've torn down links, then notify the user right away.
*/
if (!went_up)
- ip_rts_ifmsg(ill->ill_ipif);
-}
-
-/*
- * Wakeup all threads waiting to enter the ipsq, and sleeping
- * on any of the ills in this ipsq. The ill_lock of the ill
- * must be held so that waiters don't miss wakeups
- */
-static void
-ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock)
-{
- phyint_t *phyint;
-
- phyint = ipsq->ipsq_phyint_list;
- while (phyint != NULL) {
- if (phyint->phyint_illv4) {
- if (!caller_holds_lock)
- mutex_enter(&phyint->phyint_illv4->ill_lock);
- ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
- cv_broadcast(&phyint->phyint_illv4->ill_cv);
- if (!caller_holds_lock)
- mutex_exit(&phyint->phyint_illv4->ill_lock);
- }
- if (phyint->phyint_illv6) {
- if (!caller_holds_lock)
- mutex_enter(&phyint->phyint_illv6->ill_lock);
- ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
- cv_broadcast(&phyint->phyint_illv6->ill_cv);
- if (!caller_holds_lock)
- mutex_exit(&phyint->phyint_illv6->ill_lock);
- }
- phyint = phyint->phyint_ipsq_next;
- }
-}
-
-static ipsq_t *
-ipsq_create(char *groupname, ip_stack_t *ipst)
-{
- ipsq_t *ipsq;
-
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
- ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP);
- if (ipsq == NULL) {
- return (NULL);
- }
-
- if (groupname != NULL)
- (void) strcpy(ipsq->ipsq_name, groupname);
- else
- ipsq->ipsq_name[0] = '\0';
-
- mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL);
- ipsq->ipsq_flags |= IPSQ_GROUP;
- ipsq->ipsq_next = ipst->ips_ipsq_g_head;
- ipst->ips_ipsq_g_head = ipsq;
- ipsq->ipsq_ipst = ipst; /* No netstack_hold */
- return (ipsq);
-}
-
-/*
- * Return an ipsq correspoding to the groupname. If 'create' is true
- * allocate a new ipsq if one does not exist. Usually an ipsq is associated
- * uniquely with an IPMP group. However during IPMP groupname operations,
- * multiple IPMP groups may be associated with a single ipsq. But no
- * IPMP group can be associated with more than 1 ipsq at any time.
- * For example
- * Interfaces IPMP grpname ipsq ipsq_name ipsq_refs
- * hme1, hme2 mpk17-84 ipsq1 mpk17-84 2
- * hme3, hme4 mpk17-85 ipsq2 mpk17-85 2
- *
- * Now the command ifconfig hme3 group mpk17-84 results in the temporary
- * status shown below during the execution of the above command.
- * hme1, hme2, hme3, hme4 mpk17-84, mpk17-85 ipsq1 mpk17-84 4
- *
- * After the completion of the above groupname command we return to the stable
- * state shown below.
- * hme1, hme2, hme3 mpk17-84 ipsq1 mpk17-84 3
- * hme4 mpk17-85 ipsq2 mpk17-85 1
- *
- * Because of the above, we don't search based on the ipsq_name since that
- * would miss the correct ipsq during certain windows as shown above.
- * The ipsq_name is only used during split of an ipsq to return the ipsq to its
- * natural state.
- */
-static ipsq_t *
-ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq,
- ip_stack_t *ipst)
-{
- ipsq_t *ipsq;
- int group_len;
- phyint_t *phyint;
-
- ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
-
- group_len = strlen(groupname);
- ASSERT(group_len != 0);
- group_len++;
-
- for (ipsq = ipst->ips_ipsq_g_head;
- ipsq != NULL;
- ipsq = ipsq->ipsq_next) {
- /*
- * When an ipsq is being split, and ill_split_ipsq
- * calls this function, we exclude it from being considered.
- */
- if (ipsq == exclude_ipsq)
- continue;
-
- /*
- * Compare against the ipsq_name. The groupname change happens
- * in 2 phases. The 1st phase merges the from group into
- * the to group's ipsq, by calling ill_merge_groups and restarts
- * the ioctl. The 2nd phase then locates the ipsq again thru
- * ipsq_name. At this point the phyint_groupname has not been
- * updated.
- */
- if ((group_len == strlen(ipsq->ipsq_name) + 1) &&
- (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) {
- /*
- * Verify that an ipmp groupname is exactly
- * part of 1 ipsq and is not found in any other
- * ipsq.
- */
- ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, ipst) ==
- NULL);
- return (ipsq);
- }
-
- /*
- * Comparison against ipsq_name alone is not sufficient.
- * In the case when groups are currently being
- * merged, the ipsq could hold other IPMP groups temporarily.
- * so we walk the phyint list and compare against the
- * phyint_groupname as well.
- */
- phyint = ipsq->ipsq_phyint_list;
- while (phyint != NULL) {
- if ((group_len == phyint->phyint_groupname_len) &&
- (bcmp(phyint->phyint_groupname, groupname,
- group_len) == 0)) {
- /*
- * Verify that an ipmp groupname is exactly
- * part of 1 ipsq and is not found in any other
- * ipsq.
- */
- ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq,
- ipst) == NULL);
- return (ipsq);
- }
- phyint = phyint->phyint_ipsq_next;
- }
- }
- if (create)
- ipsq = ipsq_create(groupname, ipst);
- return (ipsq);
+ ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
}
static void
ipsq_delete(ipsq_t *ipsq)
{
- ipsq_t *nipsq;
- ipsq_t *pipsq = NULL;
- ip_stack_t *ipst = ipsq->ipsq_ipst;
-
- /*
- * We don't hold the ipsq lock, but we are sure no new
- * messages can land up, since the ipsq_refs is zero.
- * i.e. this ipsq is unnamed and no phyint or phyint group
- * is associated with this ipsq. (Lookups are based on ill_name
- * or phyint_groupname)
- */
- ASSERT(ipsq->ipsq_refs == 0);
- ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL);
- ASSERT(ipsq->ipsq_pending_mp == NULL);
- if (!(ipsq->ipsq_flags & IPSQ_GROUP)) {
- /*
- * This is not the ipsq of an IPMP group.
- */
- ipsq->ipsq_ipst = NULL;
- kmem_free(ipsq, sizeof (ipsq_t));
- return;
- }
-
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
-
- /*
- * Locate the ipsq before we can remove it from
- * the singly linked list of ipsq's.
- */
- for (nipsq = ipst->ips_ipsq_g_head; nipsq != NULL;
- nipsq = nipsq->ipsq_next) {
- if (nipsq == ipsq) {
- break;
- }
- pipsq = nipsq;
- }
-
- ASSERT(nipsq == ipsq);
+ ipxop_t *ipx = ipsq->ipsq_xop;
- /* unlink ipsq from the list */
- if (pipsq != NULL)
- pipsq->ipsq_next = ipsq->ipsq_next;
- else
- ipst->ips_ipsq_g_head = ipsq->ipsq_next;
ipsq->ipsq_ipst = NULL;
+ ASSERT(ipsq->ipsq_phyint == NULL);
+ ASSERT(ipsq->ipsq_xop != NULL);
+ ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL);
+ ASSERT(ipx->ipx_pending_mp == NULL);
kmem_free(ipsq, sizeof (ipsq_t));
- rw_exit(&ipst->ips_ill_g_lock);
-}
-
-static void
-ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp,
- queue_t *q)
-{
- ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock));
- ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL);
- ASSERT(old_ipsq->ipsq_pending_ipif == NULL);
- ASSERT(old_ipsq->ipsq_pending_mp == NULL);
- ASSERT(current_mp != NULL);
-
- ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl,
- NEW_OP, NULL);
-
- ASSERT(new_ipsq->ipsq_xopq_mptail != NULL &&
- new_ipsq->ipsq_xopq_mphead != NULL);
-
- /*
- * move from old ipsq to the new ipsq.
- */
- new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead;
- if (old_ipsq->ipsq_xopq_mphead != NULL)
- new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail;
-
- old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL;
}
-void
-ill_group_cleanup(ill_t *ill)
-{
- ill_t *ill_v4;
- ill_t *ill_v6;
- ipif_t *ipif;
-
- ill_v4 = ill->ill_phyint->phyint_illv4;
- ill_v6 = ill->ill_phyint->phyint_illv6;
-
- if (ill_v4 != NULL) {
- mutex_enter(&ill_v4->ill_lock);
- for (ipif = ill_v4->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- IPIF_UNMARK_MOVING(ipif);
- }
- ill_v4->ill_up_ipifs = B_FALSE;
- mutex_exit(&ill_v4->ill_lock);
- }
-
- if (ill_v6 != NULL) {
- mutex_enter(&ill_v6->ill_lock);
- for (ipif = ill_v6->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- IPIF_UNMARK_MOVING(ipif);
- }
- ill_v6->ill_up_ipifs = B_FALSE;
- mutex_exit(&ill_v6->ill_lock);
- }
-}
-/*
- * This function is called when an ill has had a change in its group status
- * to bring up all the ipifs that were up before the change.
- */
-int
-ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
+static int
+ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp)
{
+ int err;
ipif_t *ipif;
- ill_t *ill_v4;
- ill_t *ill_v6;
- ill_t *from_ill;
- int err = 0;
- ASSERT(IAM_WRITER_ILL(ill));
+ if (ill == NULL)
+ return (0);
/*
* Except for ipif_state_flags and ill_state_flags the other
@@ -14142,389 +14023,86 @@ ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
* even an ipif that was already down, in ill_down_ipifs. So we
* just blindly clear the IPIF_CHANGING flag here on all ipifs.
*/
- ill_v4 = ill->ill_phyint->phyint_illv4;
- ill_v6 = ill->ill_phyint->phyint_illv6;
- if (ill_v4 != NULL) {
- ill_v4->ill_up_ipifs = B_TRUE;
- for (ipif = ill_v4->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- mutex_enter(&ill_v4->ill_lock);
- ipif->ipif_state_flags &= ~IPIF_CHANGING;
- IPIF_UNMARK_MOVING(ipif);
- mutex_exit(&ill_v4->ill_lock);
- if (ipif->ipif_was_up) {
- if (!(ipif->ipif_flags & IPIF_UP))
- err = ipif_up(ipif, q, mp);
- ipif->ipif_was_up = B_FALSE;
- if (err != 0) {
- /*
- * Can there be any other error ?
- */
- ASSERT(err == EINPROGRESS);
- return (err);
- }
- }
- }
- mutex_enter(&ill_v4->ill_lock);
- ill_v4->ill_state_flags &= ~ILL_CHANGING;
- mutex_exit(&ill_v4->ill_lock);
- ill_v4->ill_up_ipifs = B_FALSE;
- if (ill_v4->ill_move_in_progress) {
- ASSERT(ill_v4->ill_move_peer != NULL);
- ill_v4->ill_move_in_progress = B_FALSE;
- from_ill = ill_v4->ill_move_peer;
- from_ill->ill_move_in_progress = B_FALSE;
- from_ill->ill_move_peer = NULL;
- mutex_enter(&from_ill->ill_lock);
- from_ill->ill_state_flags &= ~ILL_CHANGING;
- mutex_exit(&from_ill->ill_lock);
- if (ill_v6 == NULL) {
- if (from_ill->ill_phyint->phyint_flags &
- PHYI_STANDBY) {
- phyint_inactive(from_ill->ill_phyint);
- }
- if (ill_v4->ill_phyint->phyint_flags &
- PHYI_STANDBY) {
- phyint_inactive(ill_v4->ill_phyint);
- }
- }
- ill_v4->ill_move_peer = NULL;
- }
- }
+ ASSERT(IAM_WRITER_ILL(ill));
- if (ill_v6 != NULL) {
- ill_v6->ill_up_ipifs = B_TRUE;
- for (ipif = ill_v6->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- mutex_enter(&ill_v6->ill_lock);
- ipif->ipif_state_flags &= ~IPIF_CHANGING;
- IPIF_UNMARK_MOVING(ipif);
- mutex_exit(&ill_v6->ill_lock);
- if (ipif->ipif_was_up) {
- if (!(ipif->ipif_flags & IPIF_UP))
- err = ipif_up(ipif, q, mp);
- ipif->ipif_was_up = B_FALSE;
- if (err != 0) {
- /*
- * Can there be any other error ?
- */
- ASSERT(err == EINPROGRESS);
- return (err);
- }
- }
- }
- mutex_enter(&ill_v6->ill_lock);
- ill_v6->ill_state_flags &= ~ILL_CHANGING;
- mutex_exit(&ill_v6->ill_lock);
- ill_v6->ill_up_ipifs = B_FALSE;
- if (ill_v6->ill_move_in_progress) {
- ASSERT(ill_v6->ill_move_peer != NULL);
- ill_v6->ill_move_in_progress = B_FALSE;
- from_ill = ill_v6->ill_move_peer;
- from_ill->ill_move_in_progress = B_FALSE;
- from_ill->ill_move_peer = NULL;
- mutex_enter(&from_ill->ill_lock);
- from_ill->ill_state_flags &= ~ILL_CHANGING;
- mutex_exit(&from_ill->ill_lock);
- if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
- phyint_inactive(from_ill->ill_phyint);
- }
- if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) {
- phyint_inactive(ill_v6->ill_phyint);
+ ill->ill_up_ipifs = B_TRUE;
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
+ mutex_enter(&ill->ill_lock);
+ ipif->ipif_state_flags &= ~IPIF_CHANGING;
+ mutex_exit(&ill->ill_lock);
+ if (ipif->ipif_was_up) {
+ if (!(ipif->ipif_flags & IPIF_UP))
+ err = ipif_up(ipif, q, mp);
+ ipif->ipif_was_up = B_FALSE;
+ if (err != 0) {
+ ASSERT(err == EINPROGRESS);
+ return (err);
}
- ill_v6->ill_move_peer = NULL;
}
}
+ mutex_enter(&ill->ill_lock);
+ ill->ill_state_flags &= ~ILL_CHANGING;
+ mutex_exit(&ill->ill_lock);
+ ill->ill_up_ipifs = B_FALSE;
return (0);
}
/*
- * bring down all the approriate ipifs.
+ * This function is called to bring up all the ipifs that were up before
+ * bringing the ill down via ill_down_ipifs().
*/
-/* ARGSUSED */
-static void
-ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover)
+int
+ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
{
- ipif_t *ipif;
+ int err;
ASSERT(IAM_WRITER_ILL(ill));
- /*
- * Except for ipif_state_flags the other fields of the ipif/ill that
- * are modified below are protected implicitly since we are a writer
- */
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER))
- continue;
- /*
- * Don't bring down the LINK LOCAL addresses as they are tied
- * to physical interface and they don't move. Treat them as
- * IPIF_NOFAILOVER.
- */
- if (chk_nofailover && ill->ill_isv6 &&
- IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))
- continue;
- if (index == 0 || index == ipif->ipif_orig_ifindex) {
- /*
- * We go through the ipif_down logic even if the ipif
- * is already down, since routes can be added based
- * on down ipifs. Going through ipif_down once again
- * will delete any IREs created based on these routes.
- */
- if (ipif->ipif_flags & IPIF_UP)
- ipif->ipif_was_up = B_TRUE;
- /*
- * If called with chk_nofailover true ipif is moving.
- */
- mutex_enter(&ill->ill_lock);
- if (chk_nofailover) {
- ipif->ipif_state_flags |=
- IPIF_MOVING | IPIF_CHANGING;
- } else {
- ipif->ipif_state_flags |= IPIF_CHANGING;
- }
- mutex_exit(&ill->ill_lock);
- /*
- * Need to re-create net/subnet bcast ires if
- * they are dependent on ipif.
- */
- if (!ipif->ipif_isv6)
- ipif_check_bcast_ires(ipif);
- (void) ipif_logical_down(ipif, NULL, NULL);
- ipif_non_duplicate(ipif);
- ipif_down_tail(ipif);
- }
- }
-}
-
-#define IPSQ_INC_REF(ipsq, ipst) { \
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \
- (ipsq)->ipsq_refs++; \
-}
+ err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp);
+ if (err != 0)
+ return (err);
-#define IPSQ_DEC_REF(ipsq, ipst) { \
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \
- (ipsq)->ipsq_refs--; \
- if ((ipsq)->ipsq_refs == 0) \
- (ipsq)->ipsq_name[0] = '\0'; \
+ return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp));
}
/*
- * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to
- * new_ipsq.
+ * Bring down any IPIF_UP ipifs on ill.
*/
static void
-ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq, ip_stack_t *ipst)
+ill_down_ipifs(ill_t *ill)
{
- phyint_t *phyint;
- phyint_t *next_phyint;
-
- /*
- * To change the ipsq of an ill, we need to hold the ill_g_lock as
- * writer and the ill_lock of the ill in question. Also the dest
- * ipsq can't vanish while we hold the ill_g_lock as writer.
- */
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
-
- phyint = cur_ipsq->ipsq_phyint_list;
- cur_ipsq->ipsq_phyint_list = NULL;
- while (phyint != NULL) {
- next_phyint = phyint->phyint_ipsq_next;
- IPSQ_DEC_REF(cur_ipsq, ipst);
- phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list;
- new_ipsq->ipsq_phyint_list = phyint;
- IPSQ_INC_REF(new_ipsq, ipst);
- phyint->phyint_ipsq = new_ipsq;
- phyint = next_phyint;
- }
-}
-
-#define SPLIT_SUCCESS 0
-#define SPLIT_NOT_NEEDED 1
-#define SPLIT_FAILED 2
-
-int
-ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry,
- ip_stack_t *ipst)
-{
- ipsq_t *newipsq = NULL;
-
- /*
- * Assertions denote pre-requisites for changing the ipsq of
- * a phyint
- */
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
- /*
- * <ill-phyint> assocs can't change while ill_g_lock
- * is held as writer. See ill_phyint_reinit()
- */
- ASSERT(phyint->phyint_illv4 == NULL ||
- MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
- ASSERT(phyint->phyint_illv6 == NULL ||
- MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
-
- if ((phyint->phyint_groupname_len !=
- (strlen(cur_ipsq->ipsq_name) + 1) ||
- bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name,
- phyint->phyint_groupname_len) != 0)) {
- /*
- * Once we fail in creating a new ipsq due to memory shortage,
- * don't attempt to create new ipsq again, based on another
- * phyint, since we want all phyints belonging to an IPMP group
- * to be in the same ipsq even in the event of mem alloc fails.
- */
- newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry,
- cur_ipsq, ipst);
- if (newipsq == NULL) {
- /* Memory allocation failure */
- return (SPLIT_FAILED);
- } else {
- /* ipsq_refs protected by ill_g_lock (writer) */
- IPSQ_DEC_REF(cur_ipsq, ipst);
- phyint->phyint_ipsq = newipsq;
- phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list;
- newipsq->ipsq_phyint_list = phyint;
- IPSQ_INC_REF(newipsq, ipst);
- return (SPLIT_SUCCESS);
- }
- }
- return (SPLIT_NOT_NEEDED);
-}
+ ipif_t *ipif;
-/*
- * The ill locks of the phyint and the ill_g_lock (writer) must be held
- * to do this split
- */
-static int
-ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, ip_stack_t *ipst)
-{
- ipsq_t *newipsq;
+ ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
/*
- * <ill-phyint> assocs can't change while ill_g_lock
- * is held as writer. See ill_phyint_reinit()
+ * Except for ipif_state_flags the other fields of the ipif/ill that
+ * are modified below are protected implicitly since we are a writer
*/
-
- ASSERT(phyint->phyint_illv4 == NULL ||
- MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
- ASSERT(phyint->phyint_illv6 == NULL ||
- MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
-
- if (!ipsq_init((phyint->phyint_illv4 != NULL) ?
- phyint->phyint_illv4: phyint->phyint_illv6)) {
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
/*
- * ipsq_init failed due to no memory
- * caller will use the same ipsq
+ * We go through the ipif_down logic even if the ipif
+ * is already down, since routes can be added based
+ * on down ipifs. Going through ipif_down once again
+ * will delete any IREs created based on these routes.
*/
- return (SPLIT_FAILED);
- }
-
- /* ipsq_ref is protected by ill_g_lock (writer) */
- IPSQ_DEC_REF(cur_ipsq, ipst);
-
- /*
- * This is a new ipsq that is unknown to the world.
- * So we don't need to hold ipsq_lock,
- */
- newipsq = phyint->phyint_ipsq;
- newipsq->ipsq_writer = NULL;
- newipsq->ipsq_reentry_cnt--;
- ASSERT(newipsq->ipsq_reentry_cnt == 0);
-#ifdef DEBUG
- newipsq->ipsq_depth = 0;
-#endif
-
- return (SPLIT_SUCCESS);
-}
+ if (ipif->ipif_flags & IPIF_UP)
+ ipif->ipif_was_up = B_TRUE;
-/*
- * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to
- * ipsq's representing their individual groups or themselves. Return
- * whether split needs to be retried again later.
- */
-static boolean_t
-ill_split_ipsq(ipsq_t *cur_ipsq)
-{
- phyint_t *phyint;
- phyint_t *next_phyint;
- int error;
- boolean_t need_retry = B_FALSE;
- ip_stack_t *ipst = cur_ipsq->ipsq_ipst;
+ mutex_enter(&ill->ill_lock);
+ ipif->ipif_state_flags |= IPIF_CHANGING;
+ mutex_exit(&ill->ill_lock);
- phyint = cur_ipsq->ipsq_phyint_list;
- cur_ipsq->ipsq_phyint_list = NULL;
- while (phyint != NULL) {
- next_phyint = phyint->phyint_ipsq_next;
/*
- * 'created' will tell us whether the callee actually
- * created an ipsq. Lack of memory may force the callee
- * to return without creating an ipsq.
+ * Need to re-create net/subnet bcast ires if
+ * they are dependent on ipif.
*/
- if (phyint->phyint_groupname == NULL) {
- error = ill_split_to_own_ipsq(phyint, cur_ipsq, ipst);
- } else {
- error = ill_split_to_grp_ipsq(phyint, cur_ipsq,
- need_retry, ipst);
- }
-
- switch (error) {
- case SPLIT_FAILED:
- need_retry = B_TRUE;
- /* FALLTHRU */
- case SPLIT_NOT_NEEDED:
- /*
- * Keep it on the list.
- */
- phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list;
- cur_ipsq->ipsq_phyint_list = phyint;
- break;
- case SPLIT_SUCCESS:
- break;
- default:
- ASSERT(0);
- }
-
- phyint = next_phyint;
- }
- return (need_retry);
-}
-
-/*
- * given an ipsq 'ipsq' lock all ills associated with this ipsq.
- * and return the ills in the list. This list will be
- * needed to unlock all the ills later on by the caller.
- * The <ill-ipsq> associations could change between the
- * lock and unlock. Hence the unlock can't traverse the
- * ipsq to get the list of ills.
- */
-static int
-ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max)
-{
- int cnt = 0;
- phyint_t *phyint;
- ip_stack_t *ipst = ipsq->ipsq_ipst;
-
- /*
- * The caller holds ill_g_lock to ensure that the ill memberships
- * of the ipsq don't change
- */
- ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
-
- phyint = ipsq->ipsq_phyint_list;
- while (phyint != NULL) {
- if (phyint->phyint_illv4 != NULL) {
- ASSERT(cnt < list_max);
- list[cnt++] = phyint->phyint_illv4;
- }
- if (phyint->phyint_illv6 != NULL) {
- ASSERT(cnt < list_max);
- list[cnt++] = phyint->phyint_illv6;
- }
- phyint = phyint->phyint_ipsq_next;
+ if (!ipif->ipif_isv6)
+ ipif_check_bcast_ires(ipif);
+ (void) ipif_logical_down(ipif, NULL, NULL);
+ ipif_non_duplicate(ipif);
+ ipif_down_tail(ipif);
}
- ill_lock_ills(list, cnt);
- return (cnt);
}
void
@@ -14577,3504 +14155,251 @@ ill_unlock_ills(ill_t **list, int cnt)
}
/*
- * Merge all the ills from 1 ipsq group into another ipsq group.
- * The source ipsq group is specified by the ipsq associated with
- * 'from_ill'. The destination ipsq group is specified by the ipsq
- * associated with 'to_ill' or 'groupname' respectively.
- * Note that ipsq itself does not have a reference count mechanism
- * and functions don't look up an ipsq and pass it around. Instead
- * functions pass around an ill or groupname, and the ipsq is looked
- * up from the ill or groupname and the required operation performed
- * atomically with the lookup on the ipsq.
+ * Redo source address selection. This is called when a
+ * non-NOLOCAL/DEPRECATED/ANYCAST ipif comes up.
*/
-static int
-ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp,
- queue_t *q)
-{
- ipsq_t *old_ipsq;
- ipsq_t *new_ipsq;
- ill_t **ill_list;
- int cnt;
- size_t ill_list_size;
- boolean_t became_writer_on_new_sq = B_FALSE;
- ip_stack_t *ipst = from_ill->ill_ipst;
-
- ASSERT(to_ill == NULL || ipst == to_ill->ill_ipst);
- /* Exactly 1 of 'to_ill' and groupname can be specified. */
- ASSERT((to_ill != NULL) ^ (groupname != NULL));
-
- /*
- * Need to hold ill_g_lock as writer and also the ill_lock to
- * change the <ill-ipsq> assoc of an ill. Need to hold the
- * ipsq_lock to prevent new messages from landing on an ipsq.
- */
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
-
- old_ipsq = from_ill->ill_phyint->phyint_ipsq;
- if (groupname != NULL)
- new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL, ipst);
- else {
- new_ipsq = to_ill->ill_phyint->phyint_ipsq;
- }
-
- ASSERT(old_ipsq != NULL && new_ipsq != NULL);
-
- /*
- * both groups are on the same ipsq.
- */
- if (old_ipsq == new_ipsq) {
- rw_exit(&ipst->ips_ill_g_lock);
- return (0);
- }
-
- cnt = old_ipsq->ipsq_refs << 1;
- ill_list_size = cnt * sizeof (ill_t *);
- ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP);
- if (ill_list == NULL) {
- rw_exit(&ipst->ips_ill_g_lock);
- return (ENOMEM);
- }
- cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt);
-
- /* Need ipsq lock to enque messages on new ipsq or to become writer */
- mutex_enter(&new_ipsq->ipsq_lock);
- if ((new_ipsq->ipsq_writer == NULL &&
- new_ipsq->ipsq_current_ipif == NULL) ||
- (new_ipsq->ipsq_writer == curthread)) {
- new_ipsq->ipsq_writer = curthread;
- new_ipsq->ipsq_reentry_cnt++;
- became_writer_on_new_sq = B_TRUE;
- }
-
- /*
- * We are holding ill_g_lock as writer and all the ill locks of
- * the old ipsq. So the old_ipsq can't be looked up, and hence no new
- * message can land up on the old ipsq even though we don't hold the
- * ipsq_lock of the old_ipsq. Now move all messages to the newipsq.
- */
- ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q);
-
- /*
- * now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'.
- * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq>
- * assocs. till we release the ill_g_lock, and hence it can't vanish.
- */
- ill_merge_ipsq(old_ipsq, new_ipsq, ipst);
-
- /*
- * Mark the new ipsq as needing a split since it is currently
- * being shared by more than 1 IPMP group. The split will
- * occur at the end of ipsq_exit
- */
- new_ipsq->ipsq_split = B_TRUE;
-
- /* Now release all the locks */
- mutex_exit(&new_ipsq->ipsq_lock);
- ill_unlock_ills(ill_list, cnt);
- rw_exit(&ipst->ips_ill_g_lock);
-
- kmem_free(ill_list, ill_list_size);
-
- /*
- * If we succeeded in becoming writer on the new ipsq, then
- * drain the new ipsq and start processing all enqueued messages
- * including the current ioctl we are processing which is either
- * a set groupname or failover/failback.
- */
- if (became_writer_on_new_sq)
- ipsq_exit(new_ipsq);
-
- /*
- * syncq has been changed and all the messages have been moved.
- */
- mutex_enter(&old_ipsq->ipsq_lock);
- old_ipsq->ipsq_current_ipif = NULL;
- old_ipsq->ipsq_current_ioctl = 0;
- old_ipsq->ipsq_current_done = B_TRUE;
- mutex_exit(&old_ipsq->ipsq_lock);
- return (EINPROGRESS);
-}
-
-/*
- * Delete and add the loopback copy and non-loopback copy of
- * the BROADCAST ire corresponding to ill and addr. Used to
- * group broadcast ires together when ill becomes part of
- * a group.
- *
- * This function is also called when ill is leaving the group
- * so that the ires belonging to the group gets re-grouped.
- */
-static void
-ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr)
-{
- ire_t *ire, *nire, *nire_next, *ire_head = NULL;
- ire_t **ire_ptpn = &ire_head;
- ip_stack_t *ipst = ill->ill_ipst;
-
- /*
- * The loopback and non-loopback IREs are inserted in the order in which
- * they're found, on the basis that they are correctly ordered (loopback
- * first).
- */
- for (;;) {
- ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
- ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
- if (ire == NULL)
- break;
-
- /*
- * we are passing in KM_SLEEP because it is not easy to
- * go back to a sane state in case of memory failure.
- */
- nire = kmem_cache_alloc(ire_cache, KM_SLEEP);
- ASSERT(nire != NULL);
- bzero(nire, sizeof (ire_t));
- /*
- * Don't use ire_max_frag directly since we don't
- * hold on to 'ire' until we add the new ire 'nire' and
- * we don't want the new ire to have a dangling reference
- * to 'ire'. The ire_max_frag of a broadcast ire must
- * be in sync with the ipif_mtu of the associate ipif.
- * For eg. this happens as a result of SIOCSLIFNAME,
- * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE inititated by
- * the driver. A change in ire_max_frag triggered as
- * as a result of path mtu discovery, or due to an
- * IP_IOC_IRE_ADVISE_NOREPLY from the transport or due a
- * route change -mtu command does not apply to broadcast ires.
- *
- * XXX We need a recovery strategy here if ire_init fails
- */
- if (ire_init(nire,
- (uchar_t *)&ire->ire_addr,
- (uchar_t *)&ire->ire_mask,
- (uchar_t *)&ire->ire_src_addr,
- (uchar_t *)&ire->ire_gateway_addr,
- ire->ire_stq == NULL ? &ip_loopback_mtu :
- &ire->ire_ipif->ipif_mtu,
- ire->ire_nce,
- ire->ire_rfq,
- ire->ire_stq,
- ire->ire_type,
- ire->ire_ipif,
- ire->ire_cmask,
- ire->ire_phandle,
- ire->ire_ihandle,
- ire->ire_flags,
- &ire->ire_uinfo,
- NULL,
- NULL,
- ipst) == NULL) {
- cmn_err(CE_PANIC, "ire_init() failed");
- }
- ire_delete(ire);
- ire_refrele(ire);
-
- /*
- * The newly created IREs are inserted at the tail of the list
- * starting with ire_head. As we've just allocated them no one
- * knows about them so it's safe.
- */
- *ire_ptpn = nire;
- ire_ptpn = &nire->ire_next;
- }
-
- for (nire = ire_head; nire != NULL; nire = nire_next) {
- int error;
- ire_t *oire;
- /* unlink the IRE from our list before calling ire_add() */
- nire_next = nire->ire_next;
- nire->ire_next = NULL;
-
- /* ire_add adds the ire at the right place in the list */
- oire = nire;
- error = ire_add(&nire, NULL, NULL, NULL, B_FALSE);
- ASSERT(error == 0);
- ASSERT(oire == nire);
- ire_refrele(nire); /* Held in ire_add */
- }
-}
-
-/*
- * This function is usually called when an ill is inserted in
- * a group and all the ipifs are already UP. As all the ipifs
- * are already UP, the broadcast ires have already been created
- * and been inserted. But, ire_add_v4 would not have grouped properly.
- * We need to re-group for the benefit of ip_wput_ire which
- * expects BROADCAST ires to be grouped properly to avoid sending
- * more than one copy of the broadcast packet per group.
- *
- * NOTE : We don't check for ill_ipif_up_count to be non-zero here
- * because when ipif_up_done ends up calling this, ires have
- * already been added before illgrp_insert i.e before ill_group
- * has been initialized.
- */
-static void
-ill_group_bcast_for_xmit(ill_t *ill)
+void
+ill_update_source_selection(ill_t *ill)
{
- ill_group_t *illgrp;
ipif_t *ipif;
- ipaddr_t addr;
- ipaddr_t net_mask;
- ipaddr_t subnet_netmask;
- illgrp = ill->ill_group;
+ ASSERT(IAM_WRITER_ILL(ill));
/*
- * This function is called even when an ill is deleted from
- * the group. Hence, illgrp could be null.
+ * Underlying interfaces are only used for test traffic and thus
+ * should always send with their (deprecated) source addresses.
*/
- if (illgrp != NULL && illgrp->illgrp_ill_count == 1)
+ if (IS_UNDER_IPMP(ill))
return;
- /*
- * Delete all the BROADCAST ires matching this ill and add
- * them back. This time, ire_add_v4 should take care of
- * grouping them with others because ill is part of the
- * group.
- */
- ill_bcast_delete_and_add(ill, 0);
- ill_bcast_delete_and_add(ill, INADDR_BROADCAST);
-
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
-
- if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
- !(ipif->ipif_flags & IPIF_NOLOCAL)) {
- net_mask = ip_net_mask(ipif->ipif_lcl_addr);
- } else {
- net_mask = htonl(IN_CLASSA_NET);
- }
- addr = net_mask & ipif->ipif_subnet;
- ill_bcast_delete_and_add(ill, addr);
- ill_bcast_delete_and_add(ill, ~net_mask | addr);
-
- subnet_netmask = ipif->ipif_net_mask;
- addr = ipif->ipif_subnet;
- ill_bcast_delete_and_add(ill, addr);
- ill_bcast_delete_and_add(ill, ~subnet_netmask | addr);
- }
-}
-
-/*
- * This function is called from illgrp_delete when ill is being deleted
- * from the group.
- *
- * As ill is not there in the group anymore, any address belonging
- * to this ill should be cleared of IRE_MARK_NORECV.
- */
-static void
-ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr)
-{
- ire_t *ire;
- irb_t *irb;
- ip_stack_t *ipst = ill->ill_ipst;
-
- ASSERT(ill->ill_group == NULL);
-
- ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
- ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
-
- if (ire != NULL) {
- /*
- * IPMP and plumbing operations are serialized on the ipsq, so
- * no one will insert or delete a broadcast ire under our feet.
- */
- irb = ire->ire_bucket;
- rw_enter(&irb->irb_lock, RW_READER);
- ire_refrele(ire);
-
- for (; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_addr != addr)
- break;
- if (ire_to_ill(ire) != ill)
- continue;
-
- ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED));
- ire->ire_marks &= ~IRE_MARK_NORECV;
- }
- rw_exit(&irb->irb_lock);
- }
-}
-
-ire_t *
-irep_insert(ill_group_t *illgrp, ipaddr_t addr, ire_t *ire, ire_t ***pirep)
-{
- boolean_t first = B_TRUE;
- ire_t *clear_ire = NULL;
- ire_t *start_ire = NULL;
- uint64_t match_flags;
- uint64_t phyi_flags;
- boolean_t fallback = B_FALSE;
-
- /*
- * irb_lock must be held by the caller.
- * Get to the first ire matching the address and the
- * group. If the address does not match we are done
- * as we could not find the IRE. If the address matches
- * we should get to the first one matching the group.
- */
- while (ire != NULL) {
- if (ire->ire_addr != addr ||
- ire->ire_ipif->ipif_ill->ill_group == illgrp) {
- break;
- }
- ire = ire->ire_next;
- }
- match_flags = PHYI_FAILED | PHYI_INACTIVE;
- start_ire = ire;
-redo:
- while (ire != NULL && ire->ire_addr == addr &&
- ire->ire_ipif->ipif_ill->ill_group == illgrp) {
- /*
- * The first ire for any address within a group
- * should always be the one with IRE_MARK_NORECV cleared
- * so that ip_wput_ire can avoid searching for one.
- * Note down the insertion point which will be used
- * later.
- */
- if (first && (*pirep == NULL))
- *pirep = ire->ire_ptpn;
- /*
- * PHYI_FAILED is set when the interface fails.
-		 * This interface might have become good, but the
-		 * daemon has not yet detected it. We should still
-		 * not receive on this. PHYI_OFFLINE should never
-		 * be picked as this has been offlined and will soon
-		 * be removed.
- */
- phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags;
- if (phyi_flags & PHYI_OFFLINE) {
- ire->ire_marks |= IRE_MARK_NORECV;
- ire = ire->ire_next;
- continue;
- }
- if (phyi_flags & match_flags) {
- ire->ire_marks |= IRE_MARK_NORECV;
- ire = ire->ire_next;
- if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
- PHYI_INACTIVE) {
- fallback = B_TRUE;
- }
- continue;
- }
- if (first) {
- /*
- * We will move this to the front of the list later
- * on.
- */
- clear_ire = ire;
- ire->ire_marks &= ~IRE_MARK_NORECV;
- } else {
- ire->ire_marks |= IRE_MARK_NORECV;
- }
- first = B_FALSE;
- ire = ire->ire_next;
- }
- /*
- * If we never nominated anybody, try nominating at least
- * an INACTIVE, if we found one. Do it only once though.
- */
- if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) &&
- fallback) {
- match_flags = PHYI_FAILED;
- ire = start_ire;
- *pirep = NULL;
- goto redo;
- }
- return (clear_ire);
-}
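/*
 * Illustrative user-level sketch of the nomination pass above (simplified
 * flags, not the kernel ire_t): walk the candidates for one broadcast
 * address, clear NORECV on the first one that is neither FAILED, INACTIVE
 * nor OFFLINE, and set NORECV on the rest. If nothing qualified but an
 * INACTIVE candidate was seen, redo the pass accepting INACTIVE interfaces.
 */
#include <stdbool.h>

#define	C_FAILED	0x1
#define	C_INACTIVE	0x2
#define	C_OFFLINE	0x4
#define	C_NORECV	0x8

struct cand {
	unsigned int flags;
};

static void
nominate(struct cand *c, int ncand)
{
	unsigned int skip = C_FAILED | C_INACTIVE;
	bool found, saw_inactive;
	int i;

redo:
	found = false;
	saw_inactive = false;
	for (i = 0; i < ncand; i++) {
		if (c[i].flags & C_OFFLINE) {
			c[i].flags |= C_NORECV;	/* never nominate OFFLINE */
			continue;
		}
		if (c[i].flags & skip) {
			c[i].flags |= C_NORECV;
			if ((c[i].flags & (C_FAILED | C_INACTIVE)) ==
			    C_INACTIVE)
				saw_inactive = true;
			continue;
		}
		if (!found) {
			c[i].flags &= ~C_NORECV;	/* the nominee */
			found = true;
		} else {
			c[i].flags |= C_NORECV;
		}
	}
	if (!found && saw_inactive && skip == (C_FAILED | C_INACTIVE)) {
		skip = C_FAILED;	/* accept INACTIVE as a last resort */
		goto redo;
	}
}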
-
-/*
- * This function must be called only after the broadcast ires
- * have been grouped together. For a given address addr, nominate
- * only one of the ires whose interface is not FAILED or OFFLINE.
- *
- * This is also called when an ipif goes down, so that we can nominate
- * a different ire with the same address for receiving.
- */
-static void
-ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr, ip_stack_t *ipst)
-{
- irb_t *irb;
- ire_t *ire;
- ire_t *ire1;
- ire_t *save_ire;
- ire_t **irep = NULL;
- ire_t *clear_ire = NULL;
- ire_t *new_lb_ire;
- ire_t *new_nlb_ire;
- boolean_t new_lb_ire_used = B_FALSE;
- boolean_t new_nlb_ire_used = B_FALSE;
- boolean_t refrele_lb_ire = B_FALSE;
- boolean_t refrele_nlb_ire = B_FALSE;
- uint_t max_frag;
-
- ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES,
- NULL, MATCH_IRE_TYPE, ipst);
- /*
- * We may not be able to find some ires if a previous
- * ire_create failed. This happens when an ipif goes
- * down and we are unable to create BROADCAST ires due
- * to memory failure. Thus, we have to check for NULL
- * below. This should handle the case for LOOPBACK,
- * POINTOPOINT and interfaces with some POINTOPOINT
- * logicals for which there are no BROADCAST ires.
- */
- if (ire == NULL)
- return;
- /*
- * Currently IRE_BROADCASTS are deleted when an ipif
-	 * goes down, which runs exclusively. Thus, setting
-	 * IRE_MARK_NORECV should not race with ire_delete marking
-	 * IRE_MARK_CONDEMNED. We grab the lock below just to
-	 * be consistent with other parts of the code that walk
- * a given bucket.
- */
- save_ire = ire;
- irb = ire->ire_bucket;
- new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
- if (new_lb_ire == NULL) {
- ire_refrele(ire);
- return;
- }
- new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
- if (new_nlb_ire == NULL) {
- ire_refrele(ire);
- kmem_cache_free(ire_cache, new_lb_ire);
- return;
- }
- IRB_REFHOLD(irb);
- rw_enter(&irb->irb_lock, RW_WRITER);
- clear_ire = irep_insert(illgrp, addr, ire, &irep);
-
- /*
- * irep non-NULL indicates that we entered the while loop
- * above. If clear_ire is at the insertion point, we don't
- * have to do anything. clear_ire will be NULL if all the
- * interfaces are failed.
- *
- * We cannot unlink and reinsert the ire at the right place
- * in the list since there can be other walkers of this bucket.
- * Instead we delete and recreate the ire
- */
- if (clear_ire != NULL && irep != NULL && *irep != clear_ire) {
- ire_t *clear_ire_stq = NULL;
- ire_t *clr_ire = NULL;
- ire_t *ire_next = NULL;
-
- if (clear_ire->ire_stq == NULL)
- ire_next = clear_ire->ire_next;
-
- rw_exit(&irb->irb_lock);
-
- bzero(new_lb_ire, sizeof (ire_t));
- /* XXX We need a recovery strategy here. */
- if (ire_init(new_lb_ire,
- (uchar_t *)&clear_ire->ire_addr,
- (uchar_t *)&clear_ire->ire_mask,
- (uchar_t *)&clear_ire->ire_src_addr,
- (uchar_t *)&clear_ire->ire_gateway_addr,
- &clear_ire->ire_max_frag,
- NULL, /* let ire_nce_init derive the resolver info */
- clear_ire->ire_rfq,
- clear_ire->ire_stq,
- clear_ire->ire_type,
- clear_ire->ire_ipif,
- clear_ire->ire_cmask,
- clear_ire->ire_phandle,
- clear_ire->ire_ihandle,
- clear_ire->ire_flags,
- &clear_ire->ire_uinfo,
- NULL,
- NULL,
- ipst) == NULL)
- cmn_err(CE_PANIC, "ire_init() failed");
-
- refrele_lb_ire = B_TRUE;
-
- if (ire_next != NULL &&
- ire_next->ire_stq != NULL &&
- ire_next->ire_addr == clear_ire->ire_addr &&
- ire_next->ire_ipif->ipif_ill ==
- clear_ire->ire_ipif->ipif_ill) {
- clear_ire_stq = ire_next;
-
- bzero(new_nlb_ire, sizeof (ire_t));
- /* XXX We need a recovery strategy here. */
- if (ire_init(new_nlb_ire,
- (uchar_t *)&clear_ire_stq->ire_addr,
- (uchar_t *)&clear_ire_stq->ire_mask,
- (uchar_t *)&clear_ire_stq->ire_src_addr,
- (uchar_t *)&clear_ire_stq->ire_gateway_addr,
- &clear_ire_stq->ire_max_frag,
- NULL,
- clear_ire_stq->ire_rfq,
- clear_ire_stq->ire_stq,
- clear_ire_stq->ire_type,
- clear_ire_stq->ire_ipif,
- clear_ire_stq->ire_cmask,
- clear_ire_stq->ire_phandle,
- clear_ire_stq->ire_ihandle,
- clear_ire_stq->ire_flags,
- &clear_ire_stq->ire_uinfo,
- NULL,
- NULL,
- ipst) == NULL)
- cmn_err(CE_PANIC, "ire_init() failed");
-
- refrele_nlb_ire = B_TRUE;
- }
-
- rw_enter(&irb->irb_lock, RW_WRITER);
- /*
-		 * irb_lock was dropped across the call to ire_init() due to
-		 * a lock ordering issue with the ipst->ips_ndp{4,6}->ndp_g_lock
-		 * mutex. Therefore irep could have changed. Call
- * irep_insert() to get the new insertion point (irep) and
- * recheck all known conditions.
- */
- irep = NULL;
- clr_ire = irep_insert(illgrp, addr, save_ire, &irep);
- if ((irep != NULL) && (*irep != clear_ire) &&
- (clr_ire == clear_ire)) {
- if ((clear_ire_stq != NULL) &&
- (clr_ire->ire_next != clear_ire_stq))
- clear_ire_stq = NULL;
- /*
- * Delete the ire. We can't call ire_delete() since
- * we are holding the bucket lock. We can't release the
- * bucket lock since we can't allow irep to change.
- * So just mark it CONDEMNED.
- * The IRB_REFRELE will delete the ire from the list
- * and do the refrele.
- */
- clear_ire->ire_marks |= IRE_MARK_CONDEMNED;
- irb->irb_marks |= IRB_MARK_CONDEMNED;
-
- if (clear_ire_stq != NULL &&
- clear_ire_stq->ire_nce != NULL) {
- nce_fastpath_list_delete(
- clear_ire_stq->ire_nce);
- clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED;
- }
-
- /*
-			 * Also take care of other fields like ib/ob pkt count,
-			 * etc. Need to dup them;
-			 * ditto in ill_bcast_delete_and_add.
- */
-
- /* Set the max_frag before adding the ire */
- max_frag = *new_lb_ire->ire_max_fragp;
- new_lb_ire->ire_max_fragp = NULL;
- new_lb_ire->ire_max_frag = max_frag;
-
- /* Add the new ire's. Insert at *irep */
- new_lb_ire->ire_bucket = clear_ire->ire_bucket;
- ire1 = *irep;
- if (ire1 != NULL)
- ire1->ire_ptpn = &new_lb_ire->ire_next;
- new_lb_ire->ire_next = ire1;
- /* Link the new one in. */
- new_lb_ire->ire_ptpn = irep;
- membar_producer();
- *irep = new_lb_ire;
- new_lb_ire_used = B_TRUE;
- BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
- ire_stats_inserted);
- new_lb_ire->ire_bucket->irb_ire_cnt++;
- DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *),
- new_lb_ire->ire_ipif,
- (char *), "ire", (void *), new_lb_ire);
- new_lb_ire->ire_ipif->ipif_ire_cnt++;
-
- if (clear_ire_stq != NULL) {
- ill_t *ire_ill;
- /* Set the max_frag before adding the ire */
- max_frag = *new_nlb_ire->ire_max_fragp;
- new_nlb_ire->ire_max_fragp = NULL;
- new_nlb_ire->ire_max_frag = max_frag;
-
- new_nlb_ire->ire_bucket = clear_ire->ire_bucket;
- irep = &new_lb_ire->ire_next;
- /* Add the new ire. Insert at *irep */
- ire1 = *irep;
- if (ire1 != NULL)
- ire1->ire_ptpn = &new_nlb_ire->ire_next;
- new_nlb_ire->ire_next = ire1;
- /* Link the new one in. */
- new_nlb_ire->ire_ptpn = irep;
- membar_producer();
- *irep = new_nlb_ire;
- new_nlb_ire_used = B_TRUE;
- BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
- ire_stats_inserted);
- new_nlb_ire->ire_bucket->irb_ire_cnt++;
- DTRACE_PROBE3(ipif__incr__cnt,
- (ipif_t *), new_nlb_ire->ire_ipif,
- (char *), "ire", (void *), new_nlb_ire);
- new_nlb_ire->ire_ipif->ipif_ire_cnt++;
- DTRACE_PROBE3(ill__incr__cnt,
- (ill_t *), new_nlb_ire->ire_stq->q_ptr,
- (char *), "ire", (void *), new_nlb_ire);
- ire_ill = (ill_t *)new_nlb_ire->ire_stq->q_ptr;
- ire_ill->ill_ire_cnt++;
- }
- }
- }
- ire_refrele(save_ire);
- rw_exit(&irb->irb_lock);
- /*
- * Since we dropped the irb_lock across call to ire_init()
- * and rechecking known conditions, it is possible that
- * the checks might fail, therefore undo the work done by
- * ire_init() by calling ire_refrele() on the newly created ire.
- */
- if (!new_lb_ire_used) {
- if (refrele_lb_ire) {
- ire_refrele(new_lb_ire);
- } else {
- kmem_cache_free(ire_cache, new_lb_ire);
- }
- }
- if (!new_nlb_ire_used) {
- if (refrele_nlb_ire) {
- ire_refrele(new_nlb_ire);
- } else {
- kmem_cache_free(ire_cache, new_nlb_ire);
- }
- }
- IRB_REFRELE(irb);
-}
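/*
 * Illustrative sketch of the deferred-deletion idiom used above: while other
 * walkers may be traversing the bucket, an entry cannot be unlinked in place,
 * so it is only marked CONDEMNED; the last release (IRB_REFRELE in the
 * kernel) reaps condemned entries once no walkers remain. Simplified,
 * single-threaded model with hypothetical types:
 */
#include <stdlib.h>

#define	E_CONDEMNED	0x1

struct entry {
	struct entry *next;
	unsigned int marks;
};

struct bucket {
	struct entry *head;
	int walkers;		/* active traversals of this bucket */
};

static void
entry_delete(struct bucket *b, struct entry *e)
{
	struct entry **pp;

	if (b->walkers > 0) {
		e->marks |= E_CONDEMNED;	/* defer the unlink */
		return;
	}
	for (pp = &b->head; *pp != NULL; pp = &(*pp)->next) {
		if (*pp == e) {
			*pp = e->next;
			free(e);
			return;
		}
	}
}

/* Called when a walker finishes; reaps condemned entries if it was last. */
static void
bucket_rele(struct bucket *b)
{
	struct entry **pp, *e;

	if (--b->walkers > 0)
		return;
	pp = &b->head;
	while ((e = *pp) != NULL) {
		if (e->marks & E_CONDEMNED) {
			*pp = e->next;
			free(e);
		} else {
			pp = &e->next;
		}
	}
}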
-
-/*
- * Whenever an ipif goes down we have to renominate a different
- * broadcast ire to receive. Whenever an ipif comes up, we need
- * to make sure that we have only one nominated to receive.
- */
-static void
-ipif_renominate_bcast(ipif_t *ipif)
-{
- ill_t *ill = ipif->ipif_ill;
- ipaddr_t subnet_addr;
- ipaddr_t net_addr;
- ipaddr_t net_mask = 0;
- ipaddr_t subnet_netmask;
- ipaddr_t addr;
- ill_group_t *illgrp;
- ip_stack_t *ipst = ill->ill_ipst;
-
- illgrp = ill->ill_group;
- /*
- * If this is the last ipif going down, it might take
- * the ill out of the group. In that case ipif_down ->
- * illgrp_delete takes care of doing the nomination.
-	 * ipif_down does not call this function in that case.
- */
- ASSERT(illgrp != NULL);
-
- /* There could not have been any ires associated with this */
- if (ipif->ipif_subnet == 0)
- return;
-
- ill_mark_bcast(illgrp, 0, ipst);
- ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst);
-
- if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
- !(ipif->ipif_flags & IPIF_NOLOCAL)) {
- net_mask = ip_net_mask(ipif->ipif_lcl_addr);
- } else {
- net_mask = htonl(IN_CLASSA_NET);
- }
- addr = net_mask & ipif->ipif_subnet;
- ill_mark_bcast(illgrp, addr, ipst);
-
- net_addr = ~net_mask | addr;
- ill_mark_bcast(illgrp, net_addr, ipst);
-
- subnet_netmask = ipif->ipif_net_mask;
- addr = ipif->ipif_subnet;
- ill_mark_bcast(illgrp, addr, ipst);
-
- subnet_addr = ~subnet_netmask | addr;
- ill_mark_bcast(illgrp, subnet_addr, ipst);
-}
-
-/*
- * Whenever we form or delete ill groups, we need to nominate one set of
- * BROADCAST ires for receiving in the group.
- *
- * 1) When ipif_up_done -> ilgrp_insert calls this function, BROADCAST ires
- * have been added, but ill_ipif_up_count is 0. Thus, we don't assert
- * for ill_ipif_up_count to be non-zero. This is the only case where
- * ill_ipif_up_count is zero and we would still find the ires.
- *
- * 2) ip_sioctl_group_name/ifgrp_insert calls this function, at least one
- * ipif is UP and we just have to do the nomination.
- *
- * 3) When ill_handoff_responsibility calls us, some ill has been removed
- * from the group. So, we have to do the nomination.
- *
- * Because of (3), there could be just one ill in the group. But we have
- * to still nominate, as IRE_MARK_NORECV may have been marked on this.
- * Thus, this function does not optimize when there is only one ill as
- * it is not correct for (3).
- */
-static void
-ill_nominate_bcast_rcv(ill_group_t *illgrp)
-{
- ill_t *ill;
- ipif_t *ipif;
- ipaddr_t subnet_addr;
- ipaddr_t prev_subnet_addr = 0;
- ipaddr_t net_addr;
- ipaddr_t prev_net_addr = 0;
- ipaddr_t net_mask = 0;
- ipaddr_t subnet_netmask;
- ipaddr_t addr;
- ip_stack_t *ipst;
-
- /*
-	 * When the last member is leaving, there is nothing to
- * nominate.
- */
- if (illgrp->illgrp_ill_count == 0) {
- ASSERT(illgrp->illgrp_ill == NULL);
- return;
- }
-
- ill = illgrp->illgrp_ill;
- ASSERT(!ill->ill_isv6);
- ipst = ill->ill_ipst;
- /*
-	 * We assume that ires with the same address and belonging to the
-	 * same group have been grouped together. Nominating a *single*
- * ill in the group for sending and receiving broadcast is done
- * by making sure that the first BROADCAST ire (which will be
- * the one returned by ire_ctable_lookup for ip_rput and the
- * one that will be used in ip_wput_ire) will be the one that
- * will not have IRE_MARK_NORECV set.
- *
- * 1) ip_rput checks and discards packets received on ires marked
- * with IRE_MARK_NORECV. Thus, we don't send up duplicate
- * broadcast packets. We need to clear IRE_MARK_NORECV on the
- * first ire in the group for every broadcast address in the group.
- * ip_rput will accept packets only on the first ire i.e only
- * one copy of the ill.
- *
- * 2) ip_wput_ire needs to send out just one copy of the broadcast
- * packet for the whole group. It needs to send out on the ill
- * whose ire has not been marked with IRE_MARK_NORECV. If it sends
- * on the one marked with IRE_MARK_NORECV, ip_rput will accept
- * the copy echoed back on other port where the ire is not marked
- * with IRE_MARK_NORECV.
- *
- * Note that we just need to have the first IRE either loopback or
- * non-loopback (either of them may not exist if ire_create failed
- * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will
- * always hit the first one and hence will always accept one copy.
- *
- * We have a broadcast ire per ill for all the unique prefixes
- * hosted on that ill. As we don't have a way of knowing the
- * unique prefixes on a given ill and hence in the whole group,
- * we just call ill_mark_bcast on all the prefixes that exist
- * in the group. For the common case of one prefix, the code
-	 * below optimizes by remembering the last address used for
-	 * marking. In the case of multiple prefixes, this will still
-	 * optimize depending on the order of prefixes.
- *
- * The only unique address across the whole group is 0.0.0.0 and
- * 255.255.255.255 and thus we call only once. ill_mark_bcast enables
- * the first ire in the bucket for receiving and disables the
- * others.
- */
- ill_mark_bcast(illgrp, 0, ipst);
- ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst);
- for (; ill != NULL; ill = ill->ill_group_next) {
-
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
-
- if (!(ipif->ipif_flags & IPIF_UP) ||
- ipif->ipif_subnet == 0) {
- continue;
- }
- if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
- !(ipif->ipif_flags & IPIF_NOLOCAL)) {
- net_mask = ip_net_mask(ipif->ipif_lcl_addr);
- } else {
- net_mask = htonl(IN_CLASSA_NET);
- }
- addr = net_mask & ipif->ipif_subnet;
- if (prev_net_addr == 0 || prev_net_addr != addr) {
- ill_mark_bcast(illgrp, addr, ipst);
- net_addr = ~net_mask | addr;
- ill_mark_bcast(illgrp, net_addr, ipst);
- }
- prev_net_addr = addr;
-
- subnet_netmask = ipif->ipif_net_mask;
- addr = ipif->ipif_subnet;
- if (prev_subnet_addr == 0 ||
- prev_subnet_addr != addr) {
- ill_mark_bcast(illgrp, addr, ipst);
- subnet_addr = ~subnet_netmask | addr;
- ill_mark_bcast(illgrp, subnet_addr, ipst);
- }
- prev_subnet_addr = addr;
- }
- }
-}
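/*
 * Illustrative sketch of the consumer side of this nomination (simplified
 * table, not the kernel ire cache): the receive path accepts a broadcast
 * copy only on the entry left unmarked, and the send path transmits only on
 * that entry, so the group handles exactly one copy per broadcast address
 * regardless of how many members share the prefix.
 */
#include <stdbool.h>
#include <stdint.h>

#define	B_NORECV	0x1

struct bcast_entry {
	uint32_t addr;
	unsigned int marks;
};

/* ip_rput() analogue: drop duplicates arriving on non-nominated members. */
static bool
accept_broadcast(const struct bcast_entry *e)
{
	return ((e->marks & B_NORECV) == 0);
}

/* ip_wput_ire() analogue: send on the first non-NORECV entry for `addr'. */
static const struct bcast_entry *
pick_xmit_entry(const struct bcast_entry *tab, int n, uint32_t addr)
{
	int i;

	for (i = 0; i < n; i++) {
		if (tab[i].addr == addr && !(tab[i].marks & B_NORECV))
			return (&tab[i]);
	}
	return (NULL);
}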
-
-/*
- * This function is called while forming ill groups.
- *
- * Currently, we handle only allmulti groups. We want to join
- * allmulti on only one of the ills in the groups. In future,
- * when we have link aggregation, we may have to join normal
- * multicast groups on multiple ills as the switch does inbound load
- * balancing. Following are the functions that call this
- * function :
- *
- * 1) ill_recover_multicast : Interface is coming back UP.
- * When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6
- * will call ill_recover_multicast to recover all the multicast
- * groups. We need to make sure that only one member is joined
- * in the ill group.
- *
- * 2) ip_addmulti/ip_addmulti_v6 : ill groups has already been formed.
- * Somebody is joining allmulti. We need to make sure that only one
- * member is joined in the group.
- *
- * 3) illgrp_insert : If allmulti has already joined, we need to make
- * sure that only one member is joined in the group.
- *
- * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving
- *    allmulti whom we have nominated. We need to pick some other ill.
- *
- * 5) illgrp_delete : The ill we nominated is leaving the group,
- * we need to pick a new ill to join the group.
- *
- * For (1), (2), (5) - we just have to check whether there is
- * a good ill joined in the group. If we could not find any ills
- * joined in the group, we should join.
- *
- * For (4), the one that was nominated to receive, left the group.
- * There could be nobody joined in the group when this function is
- * called.
- *
- * For (3) - we need to explicitly check whether there are multiple
- * ills joined in the group.
- *
- * For simplicity, we don't differentiate any of the above cases. We
- * just leave the group if it is joined on any of them and join on
- * the first good ill.
- */
-int
-ill_nominate_mcast_rcv(ill_group_t *illgrp)
-{
- ilm_t *ilm;
- ill_t *ill;
- ill_t *fallback_inactive_ill = NULL;
- ill_t *fallback_failed_ill = NULL;
- int ret = 0;
-
- /*
- * Leave the allmulti on all the ills and start fresh.
- */
- for (ill = illgrp->illgrp_ill; ill != NULL;
- ill = ill->ill_group_next) {
- if (ill->ill_join_allmulti)
- ill_leave_allmulti(ill);
- }
-
- /*
- * Choose a good ill. Fallback to inactive or failed if
- * none available. We need to fallback to FAILED in the
- * case where we have 2 interfaces in a group - where
- * one of them is failed and another is a good one and
- * the good one (not marked inactive) is leaving the group.
- */
- for (ill = illgrp->illgrp_ill; ill != NULL; ill = ill->ill_group_next) {
- if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE)
- continue;
- if (ill->ill_phyint->phyint_flags & PHYI_FAILED) {
- fallback_failed_ill = ill;
- continue;
- }
- if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) {
- fallback_inactive_ill = ill;
- continue;
- }
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
- if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
- ret = ill_join_allmulti(ill);
- /*
- * ill_join_allmulti() can fail because of
- * memory failures so make sure we join at
- * least on one ill.
- */
- if (ill->ill_join_allmulti)
- return (0);
- }
- }
- }
- if (ret != 0) {
- /*
- * If we tried nominating above and failed to do so,
- * return error. We might have tried multiple times.
- * But, return the latest error.
- */
- return (ret);
- }
- if ((ill = fallback_inactive_ill) != NULL) {
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
- if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr))
- return (ill_join_allmulti(ill));
- }
- } else if ((ill = fallback_failed_ill) != NULL) {
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
- if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr))
- return (ill_join_allmulti(ill));
- }
- }
- return (0);
-}
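/*
 * Illustrative sketch of the preference order used above when (re)nominating
 * the ill that joins allmulti for the group: a fully healthy interface is
 * preferred, then an INACTIVE one, and a FAILED one only as a last resort;
 * OFFLINE interfaces are never chosen. Simplified, user-level model:
 */
#define	P_FAILED	0x1
#define	P_INACTIVE	0x2
#define	P_OFFLINE	0x4

struct iface {
	unsigned int flags;
};

/* Returns the index of the interface to nominate, or -1 if none qualifies. */
static int
pick_allmulti(const struct iface *ifs, int n)
{
	int fallback_inactive = -1, fallback_failed = -1;
	int i;

	for (i = 0; i < n; i++) {
		if (ifs[i].flags & P_OFFLINE)
			continue;
		if (ifs[i].flags & P_FAILED) {
			if (fallback_failed == -1)
				fallback_failed = i;
			continue;
		}
		if (ifs[i].flags & P_INACTIVE) {
			if (fallback_inactive == -1)
				fallback_inactive = i;
			continue;
		}
		return (i);		/* first healthy interface wins */
	}
	if (fallback_inactive != -1)
		return (fallback_inactive);
	return (fallback_failed);
}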
-
-/*
- * This function is called from illgrp_delete after it is
- * deleted from the group to reschedule responsibilities
- * to a different ill.
- */
-static void
-ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp)
-{
- ilm_t *ilm;
- ipif_t *ipif;
- ipaddr_t subnet_addr;
- ipaddr_t net_addr;
- ipaddr_t net_mask = 0;
- ipaddr_t subnet_netmask;
- ipaddr_t addr;
- ip_stack_t *ipst = ill->ill_ipst;
-
- ASSERT(ill->ill_group == NULL);
- /*
- * Broadcast Responsibility:
- *
- * 1. If this ill has been nominated for receiving broadcast
- * packets, we need to find a new one. Before we find a new
- * one, we need to re-group the ires that are part of this new
- * group (assumed by ill_nominate_bcast_rcv). We do this by
- * calling ill_group_bcast_for_xmit(ill) which will do the right
- * thing for us.
- *
- * 2. If this ill was not nominated for receiving broadcast
- * packets, we need to clear the IRE_MARK_NORECV flag
- * so that we continue to send up broadcast packets.
- */
- if (!ill->ill_isv6) {
- /*
- * Case 1 above : No optimization here. Just redo the
- * nomination.
- */
- ill_group_bcast_for_xmit(ill);
- ill_nominate_bcast_rcv(illgrp);
-
- /*
- * Case 2 above : Lookup and clear IRE_MARK_NORECV.
- */
- ill_clear_bcast_mark(ill, 0);
- ill_clear_bcast_mark(ill, INADDR_BROADCAST);
-
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
-
- if (!(ipif->ipif_flags & IPIF_UP) ||
- ipif->ipif_subnet == 0) {
- continue;
- }
- if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
- !(ipif->ipif_flags & IPIF_NOLOCAL)) {
- net_mask = ip_net_mask(ipif->ipif_lcl_addr);
- } else {
- net_mask = htonl(IN_CLASSA_NET);
- }
- addr = net_mask & ipif->ipif_subnet;
- ill_clear_bcast_mark(ill, addr);
-
- net_addr = ~net_mask | addr;
- ill_clear_bcast_mark(ill, net_addr);
-
- subnet_netmask = ipif->ipif_net_mask;
- addr = ipif->ipif_subnet;
- ill_clear_bcast_mark(ill, addr);
-
- subnet_addr = ~subnet_netmask | addr;
- ill_clear_bcast_mark(ill, subnet_addr);
- }
- }
-
- /*
- * Multicast Responsibility.
- *
- * If we have joined allmulti on this one, find a new member
- * in the group to join allmulti. As this ill is already part
- * of allmulti, we don't have to join on this one.
- *
- * If we have not joined allmulti on this one, there is no
-	 * responsibility to hand off. But we need to take on new
-	 * responsibility, i.e., join allmulti on this one if we need
- * to.
- */
- if (ill->ill_join_allmulti) {
- (void) ill_nominate_mcast_rcv(illgrp);
- } else {
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
- if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
- (void) ill_join_allmulti(ill);
- break;
- }
- }
- }
-
- /*
- * We intentionally do the flushing of IRE_CACHES only matching
- * on the ill and not on groups. Note that we are already deleted
- * from the group.
- *
- * This will make sure that all IRE_CACHES whose stq is pointing
- * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get
- * deleted and IRE_CACHES that are not pointing at this ill will
- * be left alone.
- */
- ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
- illgrp_cache_delete, ill, ill);
-
- /*
- * Some conn may have cached one of the IREs deleted above. By removing
- * the ire reference, we clean up the extra reference to the ill held in
- * ire->ire_stq.
- */
- ipcl_walk(conn_cleanup_stale_ire, NULL, ipst);
-
- /*
- * Re-do source address selection for all the members in the
- * group, if they borrowed source address from one of the ipifs
- * in this ill.
- */
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if (ill->ill_isv6) {
- ipif_update_other_ipifs_v6(ipif, illgrp);
- } else {
- ipif_update_other_ipifs(ipif, illgrp);
- }
+ if (ill->ill_isv6)
+ ipif_recreate_interface_routes_v6(NULL, ipif);
+ else
+ ipif_recreate_interface_routes(NULL, ipif);
}
}
/*
- * Delete the ill from the group. The caller makes sure that it is
- * in a group and it okay to delete from the group. So, we always
- * delete here.
+ * Finish the group join started in ip_sioctl_groupname().
*/
+/* ARGSUSED */
static void
-illgrp_delete(ill_t *ill)
-{
- ill_group_t *illgrp;
- ill_group_t *tmpg;
- ill_t *tmp_ill;
- ip_stack_t *ipst = ill->ill_ipst;
-
- /*
- * Reset illgrp_ill_schednext if it was pointing at us.
- * We need to do this before we set ill_group to NULL.
- */
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
- mutex_enter(&ill->ill_lock);
-
- illgrp_reset_schednext(ill);
-
- illgrp = ill->ill_group;
-
- /* Delete the ill from illgrp. */
- if (illgrp->illgrp_ill == ill) {
- illgrp->illgrp_ill = ill->ill_group_next;
- } else {
- tmp_ill = illgrp->illgrp_ill;
- while (tmp_ill->ill_group_next != ill) {
- tmp_ill = tmp_ill->ill_group_next;
- ASSERT(tmp_ill != NULL);
- }
- tmp_ill->ill_group_next = ill->ill_group_next;
- }
- ill->ill_group = NULL;
- ill->ill_group_next = NULL;
-
- illgrp->illgrp_ill_count--;
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
-
- /*
- * As this ill is leaving the group, we need to hand off
- * the responsibilities to the other ills in the group, if
- * this ill had some responsibilities.
- */
-
- ill_handoff_responsibility(ill, illgrp);
-
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
-
- if (illgrp->illgrp_ill_count == 0) {
-
- ASSERT(illgrp->illgrp_ill == NULL);
- if (ill->ill_isv6) {
- if (illgrp == ipst->ips_illgrp_head_v6) {
- ipst->ips_illgrp_head_v6 = illgrp->illgrp_next;
- } else {
- tmpg = ipst->ips_illgrp_head_v6;
- while (tmpg->illgrp_next != illgrp) {
- tmpg = tmpg->illgrp_next;
- ASSERT(tmpg != NULL);
- }
- tmpg->illgrp_next = illgrp->illgrp_next;
- }
- } else {
- if (illgrp == ipst->ips_illgrp_head_v4) {
- ipst->ips_illgrp_head_v4 = illgrp->illgrp_next;
- } else {
- tmpg = ipst->ips_illgrp_head_v4;
- while (tmpg->illgrp_next != illgrp) {
- tmpg = tmpg->illgrp_next;
- ASSERT(tmpg != NULL);
- }
- tmpg->illgrp_next = illgrp->illgrp_next;
- }
- }
- mutex_destroy(&illgrp->illgrp_lock);
- mi_free(illgrp);
- }
- rw_exit(&ipst->ips_ill_g_lock);
-
- /*
-	 * Even though the ill is out of the group, it's not necessary
-	 * to set ipsq_split to TRUE, as the ipifs could be down temporarily.
-	 * We will split the ipsq when phyint_groupname is set to NULL.
- */
-
- /*
- * Send a routing sockets message if we are deleting from
- * groups with names.
- */
- if (ill->ill_phyint->phyint_groupname_len != 0)
- ip_rts_ifmsg(ill->ill_ipif);
-}
-
-/*
- * Re-do source address selection. This is normally called when
- * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST
- * ipif comes up.
- */
-void
-ill_update_source_selection(ill_t *ill)
+ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
{
- ipif_t *ipif;
-
- ASSERT(IAM_WRITER_ILL(ill));
-
- if (ill->ill_group != NULL)
- ill = ill->ill_group->illgrp_ill;
-
- for (; ill != NULL; ill = ill->ill_group_next) {
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (ill->ill_isv6)
- ipif_recreate_interface_routes_v6(NULL, ipif);
- else
- ipif_recreate_interface_routes(NULL, ipif);
- }
- }
-}
-
-/*
- * Insert ill in a group headed by illgrp_head. The caller can either
- * pass a groupname in which case we search for a group with the
- * same name to insert in or pass a group to insert in. This function
- * would only search groups with names.
- *
- * NOTE : The caller should make sure that there is at least one ipif
- * UP on this ill so that illgrp_scheduler can pick this ill
- * for outbound packets. If ill_ipif_up_count is zero, we have
- * already sent a DL_UNBIND to the driver and we don't want to
- * send anymore packets. We don't assert for ipif_up_count
- * to be greater than zero, because ipif_up_done wants to call
- * this function before bumping up the ipif_up_count. See
- * ipif_up_done() for details.
- */
-int
-illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname,
- ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up)
-{
- ill_group_t *illgrp;
- ill_t *prev_ill;
- phyint_t *phyi;
+ ill_t *ill = q->q_ptr;
+ phyint_t *phyi = ill->ill_phyint;
+ ipmp_grp_t *grp = phyi->phyint_grp;
ip_stack_t *ipst = ill->ill_ipst;
- ASSERT(ill->ill_group == NULL);
-
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
- mutex_enter(&ill->ill_lock);
-
- if (groupname != NULL) {
- /*
- * Look for a group with a matching groupname to insert.
- */
- for (illgrp = *illgrp_head; illgrp != NULL;
- illgrp = illgrp->illgrp_next) {
-
- ill_t *tmp_ill;
-
- /*
- * If we have an ill_group_t in the list which has
- * no ill_t assigned then we must be in the process of
- * removing this group. We skip this as illgrp_delete()
- * will remove it from the list.
- */
- if ((tmp_ill = illgrp->illgrp_ill) == NULL) {
- ASSERT(illgrp->illgrp_ill_count == 0);
- continue;
- }
-
- ASSERT(tmp_ill->ill_phyint != NULL);
- phyi = tmp_ill->ill_phyint;
- /*
-			 * Look only at groups which have names.
- */
- if (phyi->phyint_groupname_len == 0)
- continue;
- /*
- * Names are stored in the phyint common to both
- * IPv4 and IPv6.
- */
- if (mi_strcmp(phyi->phyint_groupname,
- groupname) == 0) {
- break;
- }
- }
- } else {
- /*
- * If the caller passes in a NULL "grp_to_insert", we
- * allocate one below and insert this singleton.
- */
- illgrp = grp_to_insert;
- }
-
- ill->ill_group_next = NULL;
-
- if (illgrp == NULL) {
- illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t));
- if (illgrp == NULL) {
- return (ENOMEM);
- }
- illgrp->illgrp_next = *illgrp_head;
- *illgrp_head = illgrp;
- illgrp->illgrp_ill = ill;
- illgrp->illgrp_ill_count = 1;
- ill->ill_group = illgrp;
- /*
- * Used in illgrp_scheduler to protect multiple threads
- * from traversing the list.
- */
- mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0);
- } else {
- ASSERT(ill->ill_net_type ==
- illgrp->illgrp_ill->ill_net_type);
- ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type);
-
- /* Insert ill at tail of this group */
- prev_ill = illgrp->illgrp_ill;
- while (prev_ill->ill_group_next != NULL)
- prev_ill = prev_ill->ill_group_next;
- prev_ill->ill_group_next = ill;
- ill->ill_group = illgrp;
- illgrp->illgrp_ill_count++;
- /*
- * Inherit group properties. Currently only forwarding
- * is the property we try to keep the same with all the
- * ills. When there are more, we will abstract this into
- * a function.
- */
- ill->ill_flags &= ~ILLF_ROUTER;
- ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER);
- }
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
-
- /*
- * 1) When ipif_up_done() calls this function, ipif_up_count
- * may be zero as it has not yet been bumped. But the ires
- * have already been added. So, we do the nomination here
- * itself. But, when ip_sioctl_groupname calls this, it checks
- * for ill_ipif_up_count != 0. Thus we don't check for
- * ill_ipif_up_count here while nominating broadcast ires for
- * receive.
- *
- * 2) Similarly, we need to call ill_group_bcast_for_xmit here
- * to group them properly as ire_add() has already happened
- * in the ipif_up_done() case. For ip_sioctl_groupname/ifgrp_insert
- * case, we need to do it here anyway.
- */
- if (!ill->ill_isv6) {
- ill_group_bcast_for_xmit(ill);
- ill_nominate_bcast_rcv(illgrp);
- }
-
- if (!ipif_is_coming_up) {
- /*
- * When ipif_up_done() calls this function, the multicast
- * groups have not been joined yet. So, there is no point in
- * nomination. ill_join_allmulti() will handle groups when
- * ill_recover_multicast() is called from ipif_up_done() later.
- */
- (void) ill_nominate_mcast_rcv(illgrp);
- /*
- * ipif_up_done calls ill_update_source_selection
- * anyway. Moreover, we don't want to re-create
- * interface routes while ipif_up_done() still has reference
- * to them. Refer to ipif_up_done() for more details.
- */
- ill_update_source_selection(ill);
- }
-
- /*
- * Send a routing sockets message if we are inserting into
- * groups with names.
- */
- if (groupname != NULL)
- ip_rts_ifmsg(ill->ill_ipif);
- return (0);
-}
-
-/*
- * Return the first phyint matching the groupname. There could
- * be more than one when there are ill groups.
- *
- * If 'usable' is set, then we exclude ones that are marked with any of
- * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE).
- * Needs work: called only from ip_sioctl_groupname and from the ipmp/netinfo
- * emulation of ipmp.
- */
-phyint_t *
-phyint_lookup_group(char *groupname, boolean_t usable, ip_stack_t *ipst)
-{
- phyint_t *phyi;
-
- ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
- /*
- * Group names are stored in the phyint - a common structure
- * to both IPv4 and IPv6.
- */
- phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
- for (; phyi != NULL;
- phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
- phyi, AVL_AFTER)) {
- if (phyi->phyint_groupname_len == 0)
- continue;
- /*
- * Skip the ones that should not be used since the callers
- * sometime use this for sending packets.
- */
- if (usable && (phyi->phyint_flags &
- (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE)))
- continue;
+ /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */
+ ASSERT(!IS_IPMP(ill) && grp != NULL);
+ ASSERT(IAM_WRITER_IPSQ(ipsq));
- ASSERT(phyi->phyint_groupname != NULL);
- if (mi_strcmp(groupname, phyi->phyint_groupname) == 0)
- return (phyi);
+ if (phyi->phyint_illv4 != NULL) {
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ VERIFY(grp->gr_pendv4-- > 0);
+ rw_exit(&ipst->ips_ipmp_lock);
+ ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4);
}
- return (NULL);
-}
-
-
-/*
- * Return the first usable phyint matching the group index. By 'usable'
- * we exclude ones that are marked unusable with any of
- * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE).
- *
- * Used only for the ipmp/netinfo emulation of ipmp.
- */
-phyint_t *
-phyint_lookup_group_ifindex(uint_t group_ifindex, ip_stack_t *ipst)
-{
- phyint_t *phyi;
-
- ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
-
- if (!ipst->ips_ipmp_hook_emulation)
- return (NULL);
-
- /*
-	 * Group indices are stored in the phyint - a common structure
- * to both IPv4 and IPv6.
- */
- phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
- for (; phyi != NULL;
- phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
- phyi, AVL_AFTER)) {
- /* Ignore the ones that do not have a group */
- if (phyi->phyint_groupname_len == 0)
- continue;
-
- ASSERT(phyi->phyint_group_ifindex != 0);
- /*
- * Skip the ones that should not be used since the callers
- * sometime use this for sending packets.
- */
- if (phyi->phyint_flags &
- (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE))
- continue;
- if (phyi->phyint_group_ifindex == group_ifindex)
- return (phyi);
+ if (phyi->phyint_illv6 != NULL) {
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ VERIFY(grp->gr_pendv6-- > 0);
+ rw_exit(&ipst->ips_ipmp_lock);
+ ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6);
}
- return (NULL);
+ freemsg(mp);
}
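/*
 * Illustrative user-level sketch (pthread mutex standing in for the kernel
 * rwlock) of the pending-work counter pattern used above: the requester bumps
 * a counter under the lock before scheduling the deferred join, and the
 * callback decrements it -- asserting it was non-zero -- once the work is
 * done, so the group cannot disappear in between.
 */
#include <assert.h>
#include <pthread.h>

struct grp {
	pthread_mutex_t lock;
	int pending;		/* deferred joins not yet completed */
};

static void
grp_defer(struct grp *g)
{
	(void) pthread_mutex_lock(&g->lock);
	g->pending++;		/* keep the group alive until the callback */
	(void) pthread_mutex_unlock(&g->lock);
	/* ... schedule the deferred work here ... */
}

static void
grp_callback(struct grp *g)
{
	(void) pthread_mutex_lock(&g->lock);
	assert(g->pending > 0);
	g->pending--;
	(void) pthread_mutex_unlock(&g->lock);
	/* ... perform the join here ... */
}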
/*
- * MT notes on creation and deletion of IPMP groups
- *
- * Creation and deletion of IPMP groups introduce the need to merge or
- * split the associated serialization objects i.e the ipsq's. Normally all
- * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled
- * an ill-pair(v4, v6) i.e. phyint would map to a single ipsq. However during
- * the execution of the SIOCSLIFGROUPNAME command the picture changes. There
- * is a need to change the <ill-ipsq> association and we have to operate on both
- * the source and destination IPMP groups. For eg. attempting to set the
- * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to
- * handle 2 IPMP groups and 2 ipsqs. All the ills belonging to either of the
- * source or destination IPMP group are mapped to a single ipsq for executing
- * the SIOCSLIFGROUPNAME command. This is termed as a merge of the ipsq's.
- * The <ill-ipsq> mapping is restored back to normal at a later point. This is
- * termed as a split of the ipsq. The converse of the merge i.e. a split of the
- * ipsq happens while unwinding from ipsq_exit. If at least 1 set groupname
- * occurred on the ipsq, then the ipsq_split flag is set. This indicates the
- * ipsq has to be examined for redoing the <ill-ipsq> associations.
- *
- * In the above example the ioctl handling code locates the current ipsq of hme0
- * which is ipsq(mpk17-84). It then enters the above ipsq immediately or
- * eventually (after queueing the ioctl in ipsq(mpk17-84)). Then it locates
- * the destination ipsq which is ipsq(mpk17-85) and merges the source ipsq into
- * the destination ipsq. If the destination ipsq is not busy, it also enters
- * the destination ipsq exclusively. Now the actual groupname setting operation
- * can proceed. If the destination ipsq is busy, the operation is enqueued
- * on the destination (merged) ipsq and will be handled in the unwind from
- * ipsq_exit.
- *
- * To prevent other threads accessing the ill while the group name change is
- * in progress, we bring down the ipifs, which also removes the ill from the
- * group. The group is changed in phyint and when the first ipif on the ill
- * is brought up, the ill is inserted into the right IPMP group by
- * illgrp_insert.
+ * Process an SIOCSLIFGROUPNAME request.
*/
/* ARGSUSED */
int
ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *ifreq)
{
- int i;
- char *tmp;
- int namelen;
- ill_t *ill = ipif->ipif_ill;
- ill_t *ill_v4, *ill_v6;
- int err = 0;
- phyint_t *phyi;
- phyint_t *phyi_tmp;
- struct lifreq *lifr;
- mblk_t *mp1;
- char *groupname;
- ipsq_t *ipsq;
+ struct lifreq *lifr = ifreq;
+ ill_t *ill = ipif->ipif_ill;
ip_stack_t *ipst = ill->ill_ipst;
-
- ASSERT(IAM_WRITER_IPIF(ipif));
-
-	/* Existence verified in ip_wput_nondata */
- mp1 = mp->b_cont->b_cont;
- lifr = (struct lifreq *)mp1->b_rptr;
- groupname = lifr->lifr_groupname;
-
- if (ipif->ipif_id != 0)
- return (EINVAL);
-
- phyi = ill->ill_phyint;
- ASSERT(phyi != NULL);
-
- if (phyi->phyint_flags & PHYI_VIRTUAL)
- return (EINVAL);
-
- tmp = groupname;
- for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++)
- ;
-
- if (i == LIFNAMSIZ) {
- /* no null termination */
- return (EINVAL);
- }
+ phyint_t *phyi = ill->ill_phyint;
+ ipmp_grp_t *grp = phyi->phyint_grp;
+ mblk_t *ipsq_mp;
+ int err = 0;
/*
- * Calculate the namelen exclusive of the null
- * termination character.
+ * Note that phyint_grp can only change here, where we're exclusive.
*/
- namelen = tmp - groupname;
-
- ill_v4 = phyi->phyint_illv4;
- ill_v6 = phyi->phyint_illv6;
+ ASSERT(IAM_WRITER_ILL(ill));
- /*
-	 * ILL cannot be part of a usesrc group and an IPMP group at the
- * same time. No need to grab the ill_g_usesrc_lock here, see
- * synchronization notes in ip.c
- */
- if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) {
+ if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL ||
+ (phyi->phyint_flags & PHYI_VIRTUAL))
return (EINVAL);
- }
-
- /*
- * mark the ill as changing.
- * this should queue all new requests on the syncq.
- */
- GRAB_ILL_LOCKS(ill_v4, ill_v6);
-
- if (ill_v4 != NULL)
- ill_v4->ill_state_flags |= ILL_CHANGING;
- if (ill_v6 != NULL)
- ill_v6->ill_state_flags |= ILL_CHANGING;
- RELEASE_ILL_LOCKS(ill_v4, ill_v6);
-
- if (namelen == 0) {
- /*
- * Null string means remove this interface from the
- * existing group.
- */
- if (phyi->phyint_groupname_len == 0) {
- /*
- * Never was in a group.
- */
- err = 0;
- goto done;
- }
-
- /*
- * IPv4 or IPv6 may be temporarily out of the group when all
- * the ipifs are down. Thus, we need to check for ill_group to
- * be non-NULL.
- */
- if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
- ill_down_ipifs(ill_v4, mp, 0, B_FALSE);
- mutex_enter(&ill_v4->ill_lock);
- if (!ill_is_quiescent(ill_v4)) {
- /*
- * ipsq_pending_mp_add will not fail since
- * connp is NULL
- */
- (void) ipsq_pending_mp_add(NULL,
- ill_v4->ill_ipif, q, mp, ILL_DOWN);
- mutex_exit(&ill_v4->ill_lock);
- err = EINPROGRESS;
- goto done;
- }
- mutex_exit(&ill_v4->ill_lock);
- }
-
- if (ill_v6 != NULL && ill_v6->ill_group != NULL) {
- ill_down_ipifs(ill_v6, mp, 0, B_FALSE);
- mutex_enter(&ill_v6->ill_lock);
- if (!ill_is_quiescent(ill_v6)) {
- (void) ipsq_pending_mp_add(NULL,
- ill_v6->ill_ipif, q, mp, ILL_DOWN);
- mutex_exit(&ill_v6->ill_lock);
- err = EINPROGRESS;
- goto done;
- }
- mutex_exit(&ill_v6->ill_lock);
- }
-
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
- GRAB_ILL_LOCKS(ill_v4, ill_v6);
- mutex_enter(&phyi->phyint_lock);
- ASSERT(phyi->phyint_groupname != NULL);
- mi_free(phyi->phyint_groupname);
- phyi->phyint_groupname = NULL;
- phyi->phyint_groupname_len = 0;
-
- /* Restore the ifindex used to be the per interface one */
- phyi->phyint_group_ifindex = 0;
- phyi->phyint_hook_ifindex = phyi->phyint_ifindex;
- mutex_exit(&phyi->phyint_lock);
- RELEASE_ILL_LOCKS(ill_v4, ill_v6);
- rw_exit(&ipst->ips_ill_g_lock);
- err = ill_up_ipifs(ill, q, mp);
- /*
- * set the split flag so that the ipsq can be split
- */
- mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
- phyi->phyint_ipsq->ipsq_split = B_TRUE;
- mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
+ lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0';
- } else {
- if (phyi->phyint_groupname_len != 0) {
- ASSERT(phyi->phyint_groupname != NULL);
- /* Are we inserting in the same group ? */
- if (mi_strcmp(groupname,
- phyi->phyint_groupname) == 0) {
- err = 0;
- goto done;
- }
- }
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- /*
-		 * Merge the ipsqs for the groups.
-		 * This check is here as multiple groups/ills might be
-		 * sharing the same ipsq.
-		 * If we have to merge, then the operation is restarted
- * on the new ipsq.
- */
- ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL, ipst);
- if (phyi->phyint_ipsq != ipsq) {
- rw_exit(&ipst->ips_ill_g_lock);
- err = ill_merge_groups(ill, NULL, groupname, mp, q);
- goto done;
- }
- /*
- * Running exclusive on new ipsq.
- */
-
- ASSERT(ipsq != NULL);
- ASSERT(ipsq->ipsq_writer == curthread);
-
- /*
- * Check whether the ill_type and ill_net_type matches before
- * we allocate any memory so that the cleanup is easier.
- *
- * We can't group dissimilar ones as we can't load spread
- * packets across the group because of potential link-level
- * header differences.
- */
- phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst);
- if (phyi_tmp != NULL) {
- if ((ill_v4 != NULL &&
- phyi_tmp->phyint_illv4 != NULL) &&
- ((ill_v4->ill_net_type !=
- phyi_tmp->phyint_illv4->ill_net_type) ||
- (ill_v4->ill_type !=
- phyi_tmp->phyint_illv4->ill_type))) {
- mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
- phyi->phyint_ipsq->ipsq_split = B_TRUE;
- mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- return (EINVAL);
- }
- if ((ill_v6 != NULL &&
- phyi_tmp->phyint_illv6 != NULL) &&
- ((ill_v6->ill_net_type !=
- phyi_tmp->phyint_illv6->ill_net_type) ||
- (ill_v6->ill_type !=
- phyi_tmp->phyint_illv6->ill_type))) {
- mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
- phyi->phyint_ipsq->ipsq_split = B_TRUE;
- mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- return (EINVAL);
- }
- }
-
- rw_exit(&ipst->ips_ill_g_lock);
-
- /*
- * bring down all v4 ipifs.
- */
- if (ill_v4 != NULL) {
- ill_down_ipifs(ill_v4, mp, 0, B_FALSE);
- }
-
- /*
- * bring down all v6 ipifs.
- */
- if (ill_v6 != NULL) {
- ill_down_ipifs(ill_v6, mp, 0, B_FALSE);
- }
-
- /*
- * make sure all ipifs are down and there are no active
- * references. Call to ipsq_pending_mp_add will not fail
- * since connp is NULL.
- */
- if (ill_v4 != NULL) {
- mutex_enter(&ill_v4->ill_lock);
- if (!ill_is_quiescent(ill_v4)) {
- (void) ipsq_pending_mp_add(NULL,
- ill_v4->ill_ipif, q, mp, ILL_DOWN);
- mutex_exit(&ill_v4->ill_lock);
- err = EINPROGRESS;
- goto done;
- }
- mutex_exit(&ill_v4->ill_lock);
- }
-
- if (ill_v6 != NULL) {
- mutex_enter(&ill_v6->ill_lock);
- if (!ill_is_quiescent(ill_v6)) {
- (void) ipsq_pending_mp_add(NULL,
- ill_v6->ill_ipif, q, mp, ILL_DOWN);
- mutex_exit(&ill_v6->ill_lock);
- err = EINPROGRESS;
- goto done;
- }
- mutex_exit(&ill_v6->ill_lock);
- }
-
- /*
- * allocate including space for null terminator
- * before we insert.
- */
- tmp = (char *)mi_alloc(namelen + 1, BPRI_MED);
- if (tmp == NULL)
- return (ENOMEM);
-
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
- GRAB_ILL_LOCKS(ill_v4, ill_v6);
- mutex_enter(&phyi->phyint_lock);
- if (phyi->phyint_groupname_len != 0) {
- ASSERT(phyi->phyint_groupname != NULL);
- mi_free(phyi->phyint_groupname);
- }
-
- /*
- * setup the new group name.
- */
- phyi->phyint_groupname = tmp;
- bcopy(groupname, phyi->phyint_groupname, namelen + 1);
- phyi->phyint_groupname_len = namelen + 1;
-
- if (ipst->ips_ipmp_hook_emulation) {
- /*
- * If the group already exists we use the existing
- * group_ifindex, otherwise we pick a new index here.
- */
- if (phyi_tmp != NULL) {
- phyi->phyint_group_ifindex =
- phyi_tmp->phyint_group_ifindex;
- } else {
- /* XXX We need a recovery strategy here. */
- if (!ip_assign_ifindex(
- &phyi->phyint_group_ifindex, ipst))
- cmn_err(CE_PANIC,
- "ip_assign_ifindex() failed");
- }
- }
- /*
- * Select whether the netinfo and hook use the per-interface
- * or per-group ifindex.
- */
- if (ipst->ips_ipmp_hook_emulation)
- phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex;
- else
- phyi->phyint_hook_ifindex = phyi->phyint_ifindex;
-
- if (ipst->ips_ipmp_hook_emulation &&
- phyi_tmp != NULL) {
- /* First phyint in group - group PLUMB event */
- ill_nic_event_plumb(ill, B_TRUE);
- }
- mutex_exit(&phyi->phyint_lock);
- RELEASE_ILL_LOCKS(ill_v4, ill_v6);
- rw_exit(&ipst->ips_ill_g_lock);
-
- err = ill_up_ipifs(ill, q, mp);
- }
-
-done:
/*
- * normally ILL_CHANGING is cleared in ill_up_ipifs.
+ * If the name hasn't changed, there's nothing to do.
*/
- if (err != EINPROGRESS) {
- GRAB_ILL_LOCKS(ill_v4, ill_v6);
- if (ill_v4 != NULL)
- ill_v4->ill_state_flags &= ~ILL_CHANGING;
- if (ill_v6 != NULL)
- ill_v6->ill_state_flags &= ~ILL_CHANGING;
- RELEASE_ILL_LOCKS(ill_v4, ill_v6);
- }
- return (err);
-}
+ if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0)
+ goto unlock;
-/* ARGSUSED */
-int
-ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
- mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
-{
- ill_t *ill;
- phyint_t *phyi;
- struct lifreq *lifr;
- mblk_t *mp1;
-
- /* Existence verified in ip_wput_nondata */
- mp1 = mp->b_cont->b_cont;
- lifr = (struct lifreq *)mp1->b_rptr;
- ill = ipif->ipif_ill;
- phyi = ill->ill_phyint;
-
- lifr->lifr_groupname[0] = '\0';
/*
- * ill_group may be null if all the interfaces
- * are down. But still, the phyint should always
- * hold the name.
- */
- if (phyi->phyint_groupname_len != 0) {
- bcopy(phyi->phyint_groupname, lifr->lifr_groupname,
- phyi->phyint_groupname_len);
- }
-
- return (0);
-}
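/*
 * Illustrative userland sketch (Solaris/illumos headers assumed; error
 * handling kept minimal): the get side of this ioctl pair simply copies the
 * group name out of the phyint, so a consumer can query it with
 * SIOCGLIFGROUPNAME through a plain datagram socket and a struct lifreq.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <stropts.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	struct lifreq lifr;
	int s;

	if (argc != 2) {
		(void) fprintf(stderr, "usage: %s <interface>\n", argv[0]);
		return (1);
	}
	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1) {
		perror("socket");
		return (1);
	}
	(void) memset(&lifr, 0, sizeof (lifr));
	(void) strncpy(lifr.lifr_name, argv[1], sizeof (lifr.lifr_name) - 1);
	if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) == -1) {
		perror("SIOCGLIFGROUPNAME");
		(void) close(s);
		return (1);
	}
	(void) printf("%s is in group \"%s\"\n", argv[1],
	    lifr.lifr_groupname);
	(void) close(s);
	return (0);
}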
-
-
-typedef struct conn_move_s {
- ill_t *cm_from_ill;
- ill_t *cm_to_ill;
- int cm_ifindex;
-} conn_move_t;
-
-/*
- * ipcl_walk function for moving conn_multicast_ill for a given ill.
- */
-static void
-conn_move(conn_t *connp, caddr_t arg)
-{
- conn_move_t *connm;
- int ifindex;
- int i;
- ill_t *from_ill;
- ill_t *to_ill;
- ilg_t *ilg;
- ilm_t *ret_ilm;
-
- connm = (conn_move_t *)arg;
- ifindex = connm->cm_ifindex;
- from_ill = connm->cm_from_ill;
- to_ill = connm->cm_to_ill;
-
- /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. */
-
- /* All multicast fields protected by conn_lock */
- mutex_enter(&connp->conn_lock);
- ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill);
- if ((connp->conn_outgoing_ill == from_ill) &&
- (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) {
- connp->conn_outgoing_ill = to_ill;
- connp->conn_incoming_ill = to_ill;
- }
-
- /* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */
-
- if ((connp->conn_multicast_ill == from_ill) &&
- (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) {
- connp->conn_multicast_ill = connm->cm_to_ill;
- }
-
- /*
- * Change the ilg_ill to point to the new one. This assumes
- * ilm_move_v6 has moved the ilms to new_ill and the driver
- * has been told to receive packets on this interface.
-	 * ilm_move_v6 always FAILBACKs all the ilms successfully.
- * But when doing a FAILOVER, it might fail with ENOMEM and so
- * some ilms may not have moved. We check to see whether
- * the ilms have moved to to_ill. We can't check on from_ill
- * as in the process of moving, we could have split an ilm
-	 * into two - which has the same orig_ifindex and v6group.
+ * Handle requests to rename an IPMP meta-interface.
*
- * For IPv4, ilg_ipif moves implicitly. The code below really
- * does not do anything for IPv4 as ilg_ill is NULL for IPv4.
- */
- for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
- ilg = &connp->conn_ilg[i];
- if ((ilg->ilg_ill == from_ill) &&
- (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) {
- /* ifindex != 0 indicates failback */
- if (ifindex != 0) {
- connp->conn_ilg[i].ilg_ill = to_ill;
- continue;
- }
-
- mutex_enter(&to_ill->ill_lock);
- ret_ilm = ilm_lookup_ill_index_v6(to_ill,
- &ilg->ilg_v6group, ilg->ilg_orig_ifindex,
- connp->conn_zoneid);
- mutex_exit(&to_ill->ill_lock);
-
- if (ret_ilm != NULL)
- connp->conn_ilg[i].ilg_ill = to_ill;
- }
+ * Note that creation of the IPMP meta-interface is handled in
+	 * userland through the standard plumbing sequence. As part of
+	 * plumbing the IPMP meta-interface, its initial groupname is set to
+ * the name of the interface (see ipif_set_values_tail()).
+ */
+ if (IS_IPMP(ill)) {
+ err = ipmp_grp_rename(grp, lifr->lifr_groupname);
+ goto unlock;
}
- mutex_exit(&connp->conn_lock);
-}
-
-static void
-conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex)
-{
- conn_move_t connm;
- ip_stack_t *ipst = from_ill->ill_ipst;
-
- connm.cm_from_ill = from_ill;
- connm.cm_to_ill = to_ill;
- connm.cm_ifindex = ifindex;
-
- ipcl_walk(conn_move, (caddr_t)&connm, ipst);
-}
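/*
 * Illustrative sketch of the walker-plus-context idiom above (simplified
 * conn type, not the kernel conn_t): the state the callback needs is packed
 * into a small struct, a pointer to it is passed as the opaque walker
 * argument, and the callback casts it back.
 */
typedef struct move_ctx {
	int from_id;
	int to_id;
} move_ctx_t;

struct conn {
	struct conn *next;
	int bound_id;
};

static void
move_cb(struct conn *cp, void *arg)
{
	move_ctx_t *ctx = arg;

	if (cp->bound_id == ctx->from_id)
		cp->bound_id = ctx->to_id;
}

static void
walk_conns(struct conn *head, void (*cb)(struct conn *, void *), void *arg)
{
	struct conn *cp;

	for (cp = head; cp != NULL; cp = cp->next)
		cb(cp, arg);
}

/* Usage mirrors conn_move_ill(): bundle the arguments and walk once. */
static void
move_all(struct conn *head, int from_id, int to_id)
{
	move_ctx_t ctx = { from_id, to_id };

	walk_conns(head, move_cb, &ctx);
}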
-
-/*
- * ilm has been moved from from_ill to to_ill.
- * Send DL_DISABMULTI_REQ to ill and DL_ENABMULTI_REQ on to_ill.
- * appropriately.
- *
- * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because
- * the code there de-references ipif_ill to get the ill to
- * send multicast requests. It does not work as ipif is on its
- * move and already moved when this function is called.
- * Thus, we need to use from_ill and to_ill send down multicast
- * requests.
- */
-static void
-ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill)
-{
- ipif_t *ipif;
- ilm_t *ilm;
/*
- * See whether we need to send down DL_ENABMULTI_REQ on
- * to_ill as ilm has just been added.
+ * Handle requests to add or remove an IP interface from a group.
*/
- ASSERT(IAM_WRITER_ILL(to_ill));
- ASSERT(IAM_WRITER_ILL(from_ill));
-
- ILM_WALKER_HOLD(to_ill);
- for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
-
- if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED))
- continue;
- /*
-		 * No locks held; ill/ipif cannot disappear as long
-		 * as we are the writer.
- */
- ipif = to_ill->ill_ipif;
+ if (lifr->lifr_groupname[0] != '\0') { /* add */
/*
- * No need to hold any lock as we are the writer and this
- * can only be changed by a writer.
+ * Moves are handled by first removing the interface from
+ * its existing group, and then adding it to another group.
+ * So, fail if it's already in a group.
*/
- ilm->ilm_is_new = B_FALSE;
-
- if (to_ill->ill_net_type != IRE_IF_RESOLVER ||
- ipif->ipif_flags & IPIF_POINTOPOINT) {
- ip1dbg(("ilm_send_multicast_reqs: to_ill not "
- "resolver\n"));
- continue; /* Must be IRE_IF_NORESOLVER */
- }
-
- if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
- ip1dbg(("ilm_send_multicast_reqs: "
- "to_ill MULTI_BCAST\n"));
- goto from;
+ if (IS_UNDER_IPMP(ill)) {
+ err = EALREADY;
+ goto unlock;
}
- if (to_ill->ill_isv6)
- mld_joingroup(ilm);
- else
- igmp_joingroup(ilm);
-
- if (to_ill->ill_ipif_up_count == 0) {
- /*
- * Nobody there. All multicast addresses will be
- * re-joined when we get the DL_BIND_ACK bringing the
- * interface up.
- */
- ilm->ilm_notify_driver = B_FALSE;
- ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n"));
- goto from;
+ grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst);
+ if (grp == NULL) {
+ err = ENOENT;
+ goto unlock;
}
/*
- * For allmulti address, we want to join on only one interface.
- * Checking for ilm_numentries_v6 is not correct as you may
- * find an ilm with zero address on to_ill, but we may not
- * have nominated to_ill for receiving. Thus, if we have
- * nominated from_ill (ill_join_allmulti is set), nominate
- * only if to_ill is not already nominated (to_ill normally
- * should not have been nominated if "from_ill" has already
- * been nominated. As we don't prevent failovers from happening
- * across groups, we don't assert).
+ * Check if the phyint and its ills are suitable for
+ * inclusion into the group.
*/
- if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
- /*
- * There is no need to hold ill locks as we are
- * writer on both ills and when ill_join_allmulti()
- * is called the thread is always a writer.
- */
- if (from_ill->ill_join_allmulti &&
- !to_ill->ill_join_allmulti) {
- (void) ill_join_allmulti(to_ill);
- }
- } else if (ilm->ilm_notify_driver) {
-
- /*
- * This is a newly moved ilm so we need to tell the
- * driver about the new group. There can be more than
- * one ilm's for the same group in the list each with a
- * different orig_ifindex. We have to inform the driver
- * once. In ilm_move_v[4,6] we only set the flag
- * ilm_notify_driver for the first ilm.
- */
-
- (void) ip_ll_send_enabmulti_req(to_ill,
- &ilm->ilm_v6addr);
- }
-
- ilm->ilm_notify_driver = B_FALSE;
+ if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0)
+ goto unlock;
/*
- * See whether we need to send down DL_DISABMULTI_REQ on
- * from_ill as ilm has just been removed.
+ * Checks pass; join the group, and enqueue the remaining
+ * illgrp joins for when we've become part of the group xop
+ * and are exclusive across its IPSQs. Since qwriter_ip()
+ * requires an mblk_t to scribble on, and since `mp' will be
+ * freed as part of completing the ioctl, allocate another.
*/
-from:
- ipif = from_ill->ill_ipif;
- if (from_ill->ill_net_type != IRE_IF_RESOLVER ||
- ipif->ipif_flags & IPIF_POINTOPOINT) {
- ip1dbg(("ilm_send_multicast_reqs: "
- "from_ill not resolver\n"));
- continue; /* Must be IRE_IF_NORESOLVER */
- }
-
- if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
- ip1dbg(("ilm_send_multicast_reqs: "
- "from_ill MULTI_BCAST\n"));
- continue;
- }
-
- if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
- if (from_ill->ill_join_allmulti)
- ill_leave_allmulti(from_ill);
- } else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) {
- (void) ip_ll_send_disabmulti_req(from_ill,
- &ilm->ilm_v6addr);
+ if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) {
+ err = ENOMEM;
+ goto unlock;
}
- }
- ILM_WALKER_RELE(to_ill);
-}
-
-/*
- * This function is called when all multicast memberships need
- * to be moved from "from_ill" to "to_ill" for IPv6. This function is
- * called only once, unlike the IPv4 counterpart where it is called after
- * every logical interface is moved. The reason is that multicast
- * memberships are joined using an interface address in IPv4, while in
- * IPv6 an interface index is used.
- */
-static void
-ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex)
-{
- ilm_t *ilm;
- ilm_t *ilm_next;
- ilm_t *new_ilm;
- ilm_t **ilmp;
- int count;
- char buf[INET6_ADDRSTRLEN];
- in6_addr_t ipv6_snm = ipv6_solicited_node_mcast;
- ip_stack_t *ipst = from_ill->ill_ipst;
-
- ASSERT(MUTEX_HELD(&to_ill->ill_lock));
- ASSERT(MUTEX_HELD(&from_ill->ill_lock));
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
- if (ifindex == 0) {
/*
- * Form the solicited node mcast address which is used later.
+ * Before we drop ipmp_lock, bump gr_pend* to ensure that the
+ * IPMP meta-interface ills needed by `phyi' cannot go away
+ * before ip_join_illgrps() is called back. See the comments
+ * in ip_sioctl_plink_ipmp() for more.
*/
- ipif_t *ipif;
-
- ipif = from_ill->ill_ipif;
- ASSERT(ipif->ipif_id == 0);
-
- ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];
- }
-
- ilmp = &from_ill->ill_ilm;
- for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) {
- ilm_next = ilm->ilm_next;
-
- if (ilm->ilm_flags & ILM_DELETED) {
- ilmp = &ilm->ilm_next;
- continue;
- }
+ if (phyi->phyint_illv4 != NULL)
+ grp->gr_pendv4++;
+ if (phyi->phyint_illv6 != NULL)
+ grp->gr_pendv6++;
- new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr,
- ilm->ilm_orig_ifindex, ilm->ilm_zoneid);
- ASSERT(ilm->ilm_orig_ifindex != 0);
- if (ilm->ilm_orig_ifindex == ifindex) {
- /*
- * We are failing back multicast memberships.
- * If the same ilm exists in to_ill, it means somebody
- * has joined the same group there e.g. ff02::1
- * is joined within the kernel when the interfaces
- * came UP.
- */
- ASSERT(ilm->ilm_ipif == NULL);
- if (new_ilm != NULL) {
- new_ilm->ilm_refcnt += ilm->ilm_refcnt;
- if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
- !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
- new_ilm->ilm_is_new = B_TRUE;
- }
- } else {
- /*
- * check if we can just move the ilm
- */
- if (from_ill->ill_ilm_walker_cnt != 0) {
- /*
-				 * We have walkers, so we cannot move
-				 * the ilm; allocate a new ilm instead.
-				 * This (old) ilm will be marked
- * ILM_DELETED at the end of the loop
- * and will be freed when the
- * last walker exits.
- */
- new_ilm = (ilm_t *)mi_zalloc
- (sizeof (ilm_t));
- if (new_ilm == NULL) {
- ip0dbg(("ilm_move_v6: "
- "FAILBACK of IPv6"
- " multicast address %s : "
- "from %s to"
- " %s failed : ENOMEM \n",
- inet_ntop(AF_INET6,
- &ilm->ilm_v6addr, buf,
- sizeof (buf)),
- from_ill->ill_name,
- to_ill->ill_name));
-
- ilmp = &ilm->ilm_next;
- continue;
- }
- *new_ilm = *ilm;
- /*
- * we don't want new_ilm linked to
- * ilm's filter list.
- */
- new_ilm->ilm_filter = NULL;
- } else {
- /*
-				 * No walkers, so we can move the ilm;
-				 * let's take it out of the list.
- */
- *ilmp = ilm->ilm_next;
- ilm->ilm_next = NULL;
- DTRACE_PROBE3(ill__decr__cnt,
- (ill_t *), from_ill,
- (char *), "ilm", (void *), ilm);
- ASSERT(from_ill->ill_ilm_cnt > 0);
- from_ill->ill_ilm_cnt--;
-
- new_ilm = ilm;
- }
+ rw_exit(&ipst->ips_ipmp_lock);
- /*
- * if this is the first ilm for the group
- * set ilm_notify_driver so that we notify the
- * driver in ilm_send_multicast_reqs.
- */
- if (ilm_lookup_ill_v6(to_ill,
- &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
- new_ilm->ilm_notify_driver = B_TRUE;
-
- DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill,
- (char *), "ilm", (void *), new_ilm);
- new_ilm->ilm_ill = to_ill;
- to_ill->ill_ilm_cnt++;
-
- /* Add to the to_ill's list */
- new_ilm->ilm_next = to_ill->ill_ilm;
- to_ill->ill_ilm = new_ilm;
- /*
- * set the flag so that mld_joingroup is
- * called in ilm_send_multicast_reqs().
- */
- new_ilm->ilm_is_new = B_TRUE;
- }
- goto bottom;
- } else if (ifindex != 0) {
- /*
- * If this is FAILBACK (ifindex != 0) and the ifindex
- * has not matched above, look at the next ilm.
- */
- ilmp = &ilm->ilm_next;
- continue;
- }
- /*
- * If we are here, it means ifindex is 0. Failover
- * everything.
- *
- * We need to handle solicited node mcast address
- * and all_nodes mcast address differently as they
-		 * are joined within the kernel (ipif_multicast_up)
- * and potentially from the userland. We are called
- * after the ipifs of from_ill has been moved.
- * If we still find ilms on ill with solicited node
- * mcast address or all_nodes mcast address, it must
- * belong to the UP interface that has not moved e.g.
- * ipif_id 0 with the link local prefix does not move.
- * We join this on the new ill accounting for all the
- * userland memberships so that applications don't
- * see any failure.
- *
- * We need to make sure that we account only for the
- * solicited node and all node multicast addresses
-		 * that were brought UP on these. In the case of
- * a failover from A to B, we might have ilms belonging
- * to A (ilm_orig_ifindex pointing at A) on B accounting
- * for the membership from the userland. If we are failing
- * over from B to C now, we will find the ones belonging
- * to A on B. These don't account for the ill_ipif_up_count.
- * They just move from B to C. The check below on
- * ilm_orig_ifindex ensures that.
- */
- if ((ilm->ilm_orig_ifindex ==
- from_ill->ill_phyint->phyint_ifindex) &&
- (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) ||
- IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast,
- &ilm->ilm_v6addr))) {
- ASSERT(ilm->ilm_refcnt > 0);
- count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count;
- /*
- * For indentation reasons, we are not using a
- * "else" here.
- */
- if (count == 0) {
- ilmp = &ilm->ilm_next;
- continue;
- }
- ilm->ilm_refcnt -= count;
- if (new_ilm != NULL) {
- /*
- * Can find one with the same
- * ilm_orig_ifindex, if we are failing
- * over to a STANDBY. This happens
- * when somebody wants to join a group
- * on a STANDBY interface and we
- * internally join on a different one.
- * If we had joined on from_ill then, a
- * failover now will find a new ilm
- * with this index.
- */
- ip1dbg(("ilm_move_v6: FAILOVER, found"
- " new ilm on %s, group address %s\n",
- to_ill->ill_name,
- inet_ntop(AF_INET6,
- &ilm->ilm_v6addr, buf,
- sizeof (buf))));
- new_ilm->ilm_refcnt += count;
- if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
- !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
- new_ilm->ilm_is_new = B_TRUE;
- }
- } else {
- new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t));
- if (new_ilm == NULL) {
- ip0dbg(("ilm_move_v6: FAILOVER of IPv6"
- " multicast address %s : from %s to"
- " %s failed : ENOMEM \n",
- inet_ntop(AF_INET6,
- &ilm->ilm_v6addr, buf,
- sizeof (buf)), from_ill->ill_name,
- to_ill->ill_name));
- ilmp = &ilm->ilm_next;
- continue;
- }
- *new_ilm = *ilm;
- new_ilm->ilm_filter = NULL;
- new_ilm->ilm_refcnt = count;
- new_ilm->ilm_timer = INFINITY;
- new_ilm->ilm_rtx.rtx_timer = INFINITY;
- new_ilm->ilm_is_new = B_TRUE;
- /*
- * If the to_ill has not joined this
- * group we need to tell the driver in
- * ill_send_multicast_reqs.
- */
- if (ilm_lookup_ill_v6(to_ill,
- &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
- new_ilm->ilm_notify_driver = B_TRUE;
-
- new_ilm->ilm_ill = to_ill;
- DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill,
- (char *), "ilm", (void *), new_ilm);
- to_ill->ill_ilm_cnt++;
-
- /* Add to the to_ill's list */
- new_ilm->ilm_next = to_ill->ill_ilm;
- to_ill->ill_ilm = new_ilm;
- ASSERT(new_ilm->ilm_ipif == NULL);
- }
- if (ilm->ilm_refcnt == 0) {
- goto bottom;
- } else {
- new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
- CLEAR_SLIST(new_ilm->ilm_filter);
- ilmp = &ilm->ilm_next;
- }
- continue;
- } else {
- /*
- * ifindex = 0 means, move everything pointing at
-		 * from_ill. We are doing this because ill has
- * either FAILED or became INACTIVE.
- *
- * As we would like to move things later back to
- * from_ill, we want to retain the identity of this
- * ilm. Thus, we don't blindly increment the reference
- * count on the ilms matching the address alone. We
- * need to match on the ilm_orig_index also. new_ilm
- * was obtained by matching ilm_orig_index also.
- */
- if (new_ilm != NULL) {
- /*
- * This is possible only if a previous restore
- * was incomplete i.e restore to
- * ilm_orig_ifindex left some ilms because
- * of some failures. Thus when we are failing
- * again, we might find our old friends there.
- */
- ip1dbg(("ilm_move_v6: FAILOVER, found new ilm"
- " on %s, group address %s\n",
- to_ill->ill_name,
- inet_ntop(AF_INET6,
- &ilm->ilm_v6addr, buf,
- sizeof (buf))));
- new_ilm->ilm_refcnt += ilm->ilm_refcnt;
- if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
- !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
- new_ilm->ilm_is_new = B_TRUE;
- }
- } else {
- if (from_ill->ill_ilm_walker_cnt != 0) {
- new_ilm = (ilm_t *)
- mi_zalloc(sizeof (ilm_t));
- if (new_ilm == NULL) {
- ip0dbg(("ilm_move_v6: "
- "FAILOVER of IPv6"
- " multicast address %s : "
- "from %s to"
- " %s failed : ENOMEM \n",
- inet_ntop(AF_INET6,
- &ilm->ilm_v6addr, buf,
- sizeof (buf)),
- from_ill->ill_name,
- to_ill->ill_name));
-
- ilmp = &ilm->ilm_next;
- continue;
- }
- *new_ilm = *ilm;
- new_ilm->ilm_filter = NULL;
- } else {
- *ilmp = ilm->ilm_next;
- DTRACE_PROBE3(ill__decr__cnt,
- (ill_t *), from_ill,
- (char *), "ilm", (void *), ilm);
- ASSERT(from_ill->ill_ilm_cnt > 0);
- from_ill->ill_ilm_cnt--;
-
- new_ilm = ilm;
- }
- /*
- * If the to_ill has not joined this
- * group we need to tell the driver in
- * ill_send_multicast_reqs.
- */
- if (ilm_lookup_ill_v6(to_ill,
- &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
- new_ilm->ilm_notify_driver = B_TRUE;
-
- /* Add to the to_ill's list */
- new_ilm->ilm_next = to_ill->ill_ilm;
- to_ill->ill_ilm = new_ilm;
- ASSERT(ilm->ilm_ipif == NULL);
- new_ilm->ilm_ill = to_ill;
- DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill,
- (char *), "ilm", (void *), new_ilm);
- to_ill->ill_ilm_cnt++;
- new_ilm->ilm_is_new = B_TRUE;
- }
-
- }
-
-bottom:
- /*
- * Revert multicast filter state to (EXCLUDE, NULL).
- * new_ilm->ilm_is_new should already be set if needed.
- */
- new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
- CLEAR_SLIST(new_ilm->ilm_filter);
+ ipmp_phyint_join_grp(phyi, grp);
+ ill_refhold(ill);
+ qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps,
+ SWITCH_OP, B_FALSE);
+ return (0);
+ } else {
/*
- * We allocated/got a new ilm, free the old one.
+ * Request to remove the interface from a group. If the
+ * interface is not in a group, this trivially succeeds.
*/
- if (new_ilm != ilm) {
- if (from_ill->ill_ilm_walker_cnt == 0) {
- *ilmp = ilm->ilm_next;
-
- ASSERT(ilm->ilm_ipif == NULL); /* ipv6 */
- DTRACE_PROBE3(ill__decr__cnt, (ill_t *),
- from_ill, (char *), "ilm", (void *), ilm);
- ASSERT(from_ill->ill_ilm_cnt > 0);
- from_ill->ill_ilm_cnt--;
-
- ilm_inactive(ilm); /* frees this ilm */
-
- } else {
- ilm->ilm_flags |= ILM_DELETED;
- from_ill->ill_ilm_cleanup_reqd = 1;
- ilmp = &ilm->ilm_next;
- }
- }
+ rw_exit(&ipst->ips_ipmp_lock);
+ if (IS_UNDER_IPMP(ill))
+ ipmp_phyint_leave_grp(phyi);
+ return (0);
}
+unlock:
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (err);
}
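
For reference, a minimal userland sketch of driving the set path above through SIOCSLIFGROUPNAME. This is only an illustrative sketch, not part of the change: the interface and group names are placeholders, and the request needs sufficient privilege.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>

int
main(void)
{
	struct lifreq lifr;
	int s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1) {
		perror("socket");
		return (1);
	}

	(void) memset(&lifr, 0, sizeof (lifr));
	(void) strlcpy(lifr.lifr_name, "e1000g0", sizeof (lifr.lifr_name));
	(void) strlcpy(lifr.lifr_groupname, "ipmp0",
	    sizeof (lifr.lifr_groupname));

	/* An empty lifr_groupname would instead request removal. */
	if (ioctl(s, SIOCSLIFGROUPNAME, &lifr) == -1)
		perror("SIOCSLIFGROUPNAME");

	(void) close(s);
	return (0);
}
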
/*
- * Move all the multicast memberships to to_ill. Called when
- * an ipif moves from "from_ill" to "to_ill". This function is slightly
- * different from IPv6 counterpart as multicast memberships are associated
- * with ills in IPv6. This function is called after every ipif is moved
- * unlike IPv6, where it is moved only once.
+ * Process an SIOCGLIFBINDING request.
*/
-static void
-ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif)
-{
- ilm_t *ilm;
- ilm_t *ilm_next;
- ilm_t *new_ilm;
- ilm_t **ilmp;
- ip_stack_t *ipst = from_ill->ill_ipst;
-
- ASSERT(MUTEX_HELD(&to_ill->ill_lock));
- ASSERT(MUTEX_HELD(&from_ill->ill_lock));
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
-
- ilmp = &from_ill->ill_ilm;
- for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) {
- ilm_next = ilm->ilm_next;
-
- if (ilm->ilm_flags & ILM_DELETED) {
- ilmp = &ilm->ilm_next;
- continue;
- }
-
- ASSERT(ilm->ilm_ipif != NULL);
-
- if (ilm->ilm_ipif != ipif) {
- ilmp = &ilm->ilm_next;
- continue;
- }
-
- if (V4_PART_OF_V6(ilm->ilm_v6addr) ==
- htonl(INADDR_ALLHOSTS_GROUP)) {
- new_ilm = ilm_lookup_ipif(ipif,
- V4_PART_OF_V6(ilm->ilm_v6addr));
- if (new_ilm != NULL) {
- new_ilm->ilm_refcnt += ilm->ilm_refcnt;
- /*
- * We still need to deal with the from_ill.
- */
- new_ilm->ilm_is_new = B_TRUE;
- new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
- CLEAR_SLIST(new_ilm->ilm_filter);
- ASSERT(ilm->ilm_ipif == ipif);
- ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0);
- if (from_ill->ill_ilm_walker_cnt == 0) {
- DTRACE_PROBE3(ill__decr__cnt,
- (ill_t *), from_ill,
- (char *), "ilm", (void *), ilm);
- ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0);
- }
- goto delete_ilm;
- }
- /*
- * If we could not find one e.g. ipif is
- * still down on to_ill, we add this ilm
- * on ill_new to preserve the reference
- * count.
- */
- }
- /*
- * When ipifs move, ilms always move with it
- * to the NEW ill. Thus we should never be
- * able to find ilm till we really move it here.
- */
- ASSERT(ilm_lookup_ipif(ipif,
- V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL);
-
- if (from_ill->ill_ilm_walker_cnt != 0) {
- new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t));
- if (new_ilm == NULL) {
- char buf[INET6_ADDRSTRLEN];
- ip0dbg(("ilm_move_v4: FAILBACK of IPv4"
- " multicast address %s : "
- "from %s to"
- " %s failed : ENOMEM \n",
- inet_ntop(AF_INET,
- &ilm->ilm_v6addr, buf,
- sizeof (buf)),
- from_ill->ill_name,
- to_ill->ill_name));
-
- ilmp = &ilm->ilm_next;
- continue;
- }
- *new_ilm = *ilm;
- DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ipif,
- (char *), "ilm", (void *), ilm);
- new_ilm->ilm_ipif->ipif_ilm_cnt++;
- /* We don't want new_ilm linked to ilm's filter list */
- new_ilm->ilm_filter = NULL;
- } else {
- /* Remove from the list */
- *ilmp = ilm->ilm_next;
- new_ilm = ilm;
- }
-
- /*
- * If we have never joined this group on the to_ill
- * make sure we tell the driver.
- */
- if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr,
- ALL_ZONES) == NULL)
- new_ilm->ilm_notify_driver = B_TRUE;
-
- /* Add to the to_ill's list */
- new_ilm->ilm_next = to_ill->ill_ilm;
- to_ill->ill_ilm = new_ilm;
- new_ilm->ilm_is_new = B_TRUE;
-
- /*
- * Revert multicast filter state to (EXCLUDE, NULL)
- */
- new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
- CLEAR_SLIST(new_ilm->ilm_filter);
-
- /*
- * Delete only if we have allocated a new ilm.
- */
- if (new_ilm != ilm) {
-delete_ilm:
- if (from_ill->ill_ilm_walker_cnt == 0) {
- /* Remove from the list */
- *ilmp = ilm->ilm_next;
- ilm->ilm_next = NULL;
- DTRACE_PROBE3(ipif__decr__cnt,
- (ipif_t *), ilm->ilm_ipif,
- (char *), "ilm", (void *), ilm);
- ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0);
- ilm->ilm_ipif->ipif_ilm_cnt--;
- ilm_inactive(ilm);
- } else {
- ilm->ilm_flags |= ILM_DELETED;
- from_ill->ill_ilm_cleanup_reqd = 1;
- ilmp = &ilm->ilm_next;
- }
- }
- }
-}
-
-static uint_t
-ipif_get_id(ill_t *ill, uint_t id)
-{
- uint_t unit;
- ipif_t *tipif;
- boolean_t found = B_FALSE;
- ip_stack_t *ipst = ill->ill_ipst;
-
- /*
- * During failback, we want to go back to the same id
- * instead of the smallest id so that the original
- * configuration is maintained. id is non-zero in that
- * case.
- */
- if (id != 0) {
- /*
- * While failing back, if we still have an ipif with
- * MAX_ADDRS_PER_IF, it means this will be replaced
-		 * as soon as we return from this function. It was
-		 * set to MAX_ADDRS_PER_IF by the caller so that
- * we can choose the smallest id. Thus we return zero
- * in that case ignoring the hint.
- */
- if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF)
- return (0);
- for (tipif = ill->ill_ipif; tipif != NULL;
- tipif = tipif->ipif_next) {
- if (tipif->ipif_id == id) {
- found = B_TRUE;
- break;
- }
- }
- /*
- * If somebody already plumbed another logical
- * with the same id, we won't be able to find it.
- */
- if (!found)
- return (id);
- }
- for (unit = 0; unit <= ipst->ips_ip_addrs_per_if; unit++) {
- found = B_FALSE;
- for (tipif = ill->ill_ipif; tipif != NULL;
- tipif = tipif->ipif_next) {
- if (tipif->ipif_id == unit) {
- found = B_TRUE;
- break;
- }
- }
- if (!found)
- break;
- }
- return (unit);
-}
-
/* ARGSUSED */
-static int
-ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp,
- ipif_t **rep_ipif_ptr)
+int
+ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
+ ip_ioctl_cmd_t *ipip, void *ifreq)
{
- ill_t *from_ill;
- ipif_t *rep_ipif;
- uint_t unit;
- int err = 0;
- ipif_t *to_ipif;
- struct iocblk *iocp;
- boolean_t failback_cmd;
- boolean_t remove_ipif;
- int rc;
- ip_stack_t *ipst;
-
- ASSERT(IAM_WRITER_ILL(to_ill));
- ASSERT(IAM_WRITER_IPIF(ipif));
-
- iocp = (struct iocblk *)mp->b_rptr;
- failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK);
- remove_ipif = B_FALSE;
-
- from_ill = ipif->ipif_ill;
- ipst = from_ill->ill_ipst;
-
- ASSERT(MUTEX_HELD(&to_ill->ill_lock));
- ASSERT(MUTEX_HELD(&from_ill->ill_lock));
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
-
- /*
- * Don't move LINK LOCAL addresses as they are tied to
- * physical interface.
- */
- if (from_ill->ill_isv6 &&
- IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) {
- ipif->ipif_was_up = B_FALSE;
- IPIF_UNMARK_MOVING(ipif);
- return (0);
- }
-
- /*
- * We set the ipif_id to maximum so that the search for
- * ipif_id will pick the lowest number i.e 0 in the
- * following 2 cases :
- *
- * 1) We have a replacement ipif at the head of to_ill.
- * We can't remove it yet as we can exceed ip_addrs_per_if
- * on to_ill and hence the MOVE might fail. We want to
- * remove it only if we could move the ipif. Thus, by
- * setting it to the MAX value, we make the search in
- * ipif_get_id return the zeroth id.
- *
- * 2) When DR pulls out the NIC and re-plumbs the interface,
- * we might just have a zero address plumbed on the ipif
- * with zero id in the case of IPv4. We remove that while
- * doing the failback. We want to remove it only if we
- * could move the ipif. Thus, by setting it to the MAX
- * value, we make the search in ipif_get_id return the
- * zeroth id.
- *
-	 * Both (1) and (2) are done only when we are moving
- * an ipif (either due to failover/failback) which originally
- * belonged to this interface i.e the ipif_orig_ifindex is
- * the same as to_ill's ifindex. This is needed so that
- * FAILOVER from A -> B ( A failed) followed by FAILOVER
- * from B -> A (B is being removed from the group) and
- * FAILBACK from A -> B restores the original configuration.
- * Without the check for orig_ifindex, the second FAILOVER
- * could make the ipif belonging to B replace the A's zeroth
- * ipif and the subsequent failback re-creating the replacement
- * ipif again.
- *
- * NOTE : We created the replacement ipif when we did a
- * FAILOVER (See below). We could check for FAILBACK and
- * then look for replacement ipif to be removed. But we don't
-	 * want to do that because we want to allow the possibility
- * of a FAILOVER from A -> B (which creates the replacement ipif),
- * followed by a *FAILOVER* from B -> A instead of a FAILBACK
- * from B -> A.
- */
- to_ipif = to_ill->ill_ipif;
- if ((to_ill->ill_phyint->phyint_ifindex ==
- ipif->ipif_orig_ifindex) &&
- to_ipif->ipif_replace_zero) {
- ASSERT(to_ipif->ipif_id == 0);
- remove_ipif = B_TRUE;
- to_ipif->ipif_id = MAX_ADDRS_PER_IF;
- }
- /*
- * Find the lowest logical unit number on the to_ill.
- * If we are failing back, try to get the original id
- * rather than the lowest one so that the original
- * configuration is maintained.
- *
- * XXX need a better scheme for this.
- */
- if (failback_cmd) {
- unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid);
- } else {
- unit = ipif_get_id(to_ill, 0);
- }
-
- /* Reset back to zero in case we fail below */
- if (to_ipif->ipif_id == MAX_ADDRS_PER_IF)
- to_ipif->ipif_id = 0;
+ ill_t *bound_ill;
+ struct lifreq *lifr = ifreq;
- if (unit == ipst->ips_ip_addrs_per_if) {
- ipif->ipif_was_up = B_FALSE;
- IPIF_UNMARK_MOVING(ipif);
+ if (!IS_IPMP(ipif->ipif_ill))
return (EINVAL);
- }
-
- /*
- * ipif is ready to move from "from_ill" to "to_ill".
- *
- * 1) If we are moving ipif with id zero, create a
- * replacement ipif for this ipif on from_ill. If this fails
- * fail the MOVE operation.
- *
- * 2) Remove the replacement ipif on to_ill if any.
- * We could remove the replacement ipif when we are moving
- * the ipif with id zero. But what if somebody already
- * unplumbed it ? Thus we always remove it if it is present.
- * We want to do it only if we are sure we are going to
- * move the ipif to to_ill which is why there are no
- * returns due to error till ipif is linked to to_ill.
- * Note that the first ipif that we failback will always
- * be zero if it is present.
- */
- if (ipif->ipif_id == 0) {
- ipaddr_t inaddr_any = INADDR_ANY;
- rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED);
- if (rep_ipif == NULL) {
- ipif->ipif_was_up = B_FALSE;
- IPIF_UNMARK_MOVING(ipif);
- return (ENOMEM);
- }
- *rep_ipif = ipif_zero;
- /*
- * Before we put the ipif on the list, store the addresses
- * as mapped addresses as some of the ioctls e.g SIOCGIFADDR
- * assumes so. This logic is not any different from what
- * ipif_allocate does.
- */
- IN6_IPADDR_TO_V4MAPPED(inaddr_any,
- &rep_ipif->ipif_v6lcl_addr);
- IN6_IPADDR_TO_V4MAPPED(inaddr_any,
- &rep_ipif->ipif_v6src_addr);
- IN6_IPADDR_TO_V4MAPPED(inaddr_any,
- &rep_ipif->ipif_v6subnet);
- IN6_IPADDR_TO_V4MAPPED(inaddr_any,
- &rep_ipif->ipif_v6net_mask);
- IN6_IPADDR_TO_V4MAPPED(inaddr_any,
- &rep_ipif->ipif_v6brd_addr);
- IN6_IPADDR_TO_V4MAPPED(inaddr_any,
- &rep_ipif->ipif_v6pp_dst_addr);
- /*
- * We mark IPIF_NOFAILOVER so that this can never
- * move.
- */
- rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER;
- rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE;
- rep_ipif->ipif_replace_zero = B_TRUE;
- mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL,
- MUTEX_DEFAULT, NULL);
- rep_ipif->ipif_id = 0;
- rep_ipif->ipif_ire_type = ipif->ipif_ire_type;
- rep_ipif->ipif_ill = from_ill;
- rep_ipif->ipif_orig_ifindex =
- from_ill->ill_phyint->phyint_ifindex;
- /* Insert at head */
- rep_ipif->ipif_next = from_ill->ill_ipif;
- from_ill->ill_ipif = rep_ipif;
- /*
- * We don't really care to let apps know about
- * this interface.
- */
- }
-
- if (remove_ipif) {
- /*
- * We set to a max value above for this case to get
- * id zero. ASSERT that we did get one.
- */
- ASSERT((to_ipif->ipif_id == 0) && (unit == 0));
- rep_ipif = to_ipif;
- to_ill->ill_ipif = rep_ipif->ipif_next;
- rep_ipif->ipif_next = NULL;
- /*
- * If some apps scanned and find this interface,
- * it is time to let them know, so that they can
- * delete it.
- */
-
- *rep_ipif_ptr = rep_ipif;
- }
-
- /* Get it out of the ILL interface list. */
- ipif_remove(ipif, B_FALSE);
-
- /* Assign the new ill */
- ipif->ipif_ill = to_ill;
- ipif->ipif_id = unit;
- /* id has already been checked */
- rc = ipif_insert(ipif, B_FALSE, B_FALSE);
- ASSERT(rc == 0);
- /* Let SCTP update its list */
- sctp_move_ipif(ipif, from_ill, to_ill);
- /*
- * Handle the failover and failback of ipif_t between
- * ill_t that have differing maximum mtu values.
- */
- if (ipif->ipif_mtu > to_ill->ill_max_mtu) {
- if (ipif->ipif_saved_mtu == 0) {
- /*
- * As this ipif_t is moving to an ill_t
- * that has a lower ill_max_mtu, its
- * ipif_mtu needs to be saved so it can
- * be restored during failback or during
- * failover to an ill_t which has a
- * higher ill_max_mtu.
- */
- ipif->ipif_saved_mtu = ipif->ipif_mtu;
- ipif->ipif_mtu = to_ill->ill_max_mtu;
- } else {
- /*
- * The ipif_t is, once again, moving to
- * an ill_t that has a lower maximum mtu
- * value.
- */
- ipif->ipif_mtu = to_ill->ill_max_mtu;
- }
- } else if (ipif->ipif_mtu < to_ill->ill_max_mtu &&
- ipif->ipif_saved_mtu != 0) {
- /*
- * The mtu of this ipif_t had to be reduced
- * during an earlier failover; this is an
- * opportunity for it to be increased (either as
- * part of another failover or a failback).
- */
- if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) {
- ipif->ipif_mtu = ipif->ipif_saved_mtu;
- ipif->ipif_saved_mtu = 0;
- } else {
- ipif->ipif_mtu = to_ill->ill_max_mtu;
- }
+ if ((bound_ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) {
+ lifr->lifr_binding[0] = '\0';
+ return (0);
}
- /*
- * We preserve all the other fields of the ipif including
- * ipif_saved_ire_mp. The routes that are saved here will
- * be recreated on the new interface and back on the old
- * interface when we move back.
- */
- ASSERT(ipif->ipif_arp_del_mp == NULL);
-
- return (err);
-}
-
-static int
-ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp,
- int ifindex, ipif_t **rep_ipif_ptr)
-{
- ipif_t *mipif;
- ipif_t *ipif_next;
- int err;
-
- /*
- * We don't really try to MOVE back things if some of the
- * operations fail. The daemon will take care of moving again
- * later on.
- */
- for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) {
- ipif_next = mipif->ipif_next;
- if (!(mipif->ipif_flags & IPIF_NOFAILOVER) &&
- (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) {
-
- err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr);
-
- /*
- * When the MOVE fails, it is the job of the
- * application to take care of this properly
- * i.e try again if it is ENOMEM.
- */
- if (mipif->ipif_ill != from_ill) {
- /*
- * ipif has moved.
- *
- * Move the multicast memberships associated
- * with this ipif to the new ill. For IPv6, we
- * do it once after all the ipifs are moved
- * (in ill_move) as they are not associated
- * with ipifs.
- *
- * We need to move the ilms as the ipif has
- * already been moved to a new ill even
- * in the case of errors. Neither
- * ilm_free(ipif) will find the ilm
- * when somebody unplumbs this ipif nor
- * ilm_delete(ilm) will be able to find the
- * ilm, if we don't move now.
- */
- if (!from_ill->ill_isv6)
- ilm_move_v4(from_ill, to_ill, mipif);
- }
-
- if (err != 0)
- return (err);
- }
- }
+ (void) strlcpy(lifr->lifr_binding, bound_ill->ill_name, LIFNAMSIZ);
+ ill_refrele(bound_ill);
return (0);
}
-static int
-ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp)
-{
- int ifindex;
- int err;
- struct iocblk *iocp;
- ipif_t *ipif;
- ipif_t *rep_ipif_ptr = NULL;
- ipif_t *from_ipif = NULL;
- boolean_t check_rep_if = B_FALSE;
- ip_stack_t *ipst = from_ill->ill_ipst;
-
- iocp = (struct iocblk *)mp->b_rptr;
- if (iocp->ioc_cmd == SIOCLIFFAILOVER) {
- /*
- * Move everything pointing at from_ill to to_ill.
-		 * We achieve this by passing in 0 as ifindex.
- */
- ifindex = 0;
- } else {
- /*
- * Move everything pointing at from_ill whose original
- * ifindex of connp, ipif, ilm points at to_ill->ill_index.
-		 * We achieve this by passing in ifindex rather than 0.
- * Multicast vifs, ilgs move implicitly because ipifs move.
- */
- ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK);
- ifindex = to_ill->ill_phyint->phyint_ifindex;
- }
-
- /*
- * Determine if there is at least one ipif that would move from
- * 'from_ill' to 'to_ill'. If so, it is possible that the replacement
- * ipif (if it exists) on the to_ill would be consumed as a result of
- * the move, in which case we need to quiesce the replacement ipif also.
- */
- for (from_ipif = from_ill->ill_ipif; from_ipif != NULL;
- from_ipif = from_ipif->ipif_next) {
- if (((ifindex == 0) ||
- (ifindex == from_ipif->ipif_orig_ifindex)) &&
- !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) {
- check_rep_if = B_TRUE;
- break;
- }
- }
-
- ill_down_ipifs(from_ill, mp, ifindex, B_TRUE);
-
- GRAB_ILL_LOCKS(from_ill, to_ill);
- if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) {
- (void) ipsq_pending_mp_add(NULL, ipif, q,
- mp, ILL_MOVE_OK);
- RELEASE_ILL_LOCKS(from_ill, to_ill);
- return (EINPROGRESS);
- }
-
- /* Check if the replacement ipif is quiescent to delete */
- if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif,
- (iocp->ioc_cmd == SIOCLIFFAILBACK))) {
- to_ill->ill_ipif->ipif_state_flags |=
- IPIF_MOVING | IPIF_CHANGING;
- if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) {
- (void) ipsq_pending_mp_add(NULL, ipif, q,
- mp, ILL_MOVE_OK);
- RELEASE_ILL_LOCKS(from_ill, to_ill);
- return (EINPROGRESS);
- }
- }
- RELEASE_ILL_LOCKS(from_ill, to_ill);
-
- ASSERT(!MUTEX_HELD(&to_ill->ill_lock));
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
- GRAB_ILL_LOCKS(from_ill, to_ill);
- err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr);
-
- /* ilm_move is done inside ipif_move for IPv4 */
- if (err == 0 && from_ill->ill_isv6)
- ilm_move_v6(from_ill, to_ill, ifindex);
-
- RELEASE_ILL_LOCKS(from_ill, to_ill);
- rw_exit(&ipst->ips_ill_g_lock);
-
- /*
- * send rts messages and multicast messages.
- */
- if (rep_ipif_ptr != NULL) {
- if (rep_ipif_ptr->ipif_recovery_id != 0) {
- (void) untimeout(rep_ipif_ptr->ipif_recovery_id);
- rep_ipif_ptr->ipif_recovery_id = 0;
- }
- ip_rts_ifmsg(rep_ipif_ptr);
- ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr);
-#ifdef DEBUG
- ipif_trace_cleanup(rep_ipif_ptr);
-#endif
- mi_free(rep_ipif_ptr);
- }
-
- conn_move_ill(from_ill, to_ill, ifindex);
-
- return (err);
-}
-
/*
- * Used to extract arguments for FAILOVER/FAILBACK ioctls.
- * Also checks for the validity of the arguments.
- * Note: We are already exclusive inside the from group.
- * It is up to the caller to release the refcnt on the to_ill's.
+ * Process an SIOCGLIFGROUPNAME request.
*/
-static int
-ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4,
- ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6)
+/* ARGSUSED */
+int
+ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
+ ip_ioctl_cmd_t *ipip, void *ifreq)
{
- int dst_index;
- ipif_t *ipif_v4, *ipif_v6;
- struct lifreq *lifr;
- mblk_t *mp1;
- boolean_t exists;
- sin_t *sin;
- int err = 0;
- ip_stack_t *ipst;
+ ipmp_grp_t *grp;
+ struct lifreq *lifr = ifreq;
+ ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
- if (CONN_Q(q))
- ipst = CONNQ_TO_IPST(q);
+ rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+ if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL)
+ lifr->lifr_groupname[0] = '\0';
else
- ipst = ILLQ_TO_IPST(q);
-
- if ((mp1 = mp->b_cont) == NULL)
- return (EPROTO);
-
- if ((mp1 = mp1->b_cont) == NULL)
- return (EPROTO);
-
- lifr = (struct lifreq *)mp1->b_rptr;
- sin = (sin_t *)&lifr->lifr_addr;
-
- /*
- * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6
- * specific operations.
- */
- if (sin->sin_family != AF_UNSPEC)
- return (EINVAL);
-
- /*
- * Get ipif with id 0. We are writer on the from ill. So we can pass
- * NULLs for the last 4 args and we know the lookup won't fail
- * with EINPROGRESS.
- */
- ipif_v4 = ipif_lookup_on_name(lifr->lifr_name,
- mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE,
- ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
- ipif_v6 = ipif_lookup_on_name(lifr->lifr_name,
- mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE,
- ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
-
- if (ipif_v4 == NULL && ipif_v6 == NULL)
- return (ENXIO);
-
- if (ipif_v4 != NULL) {
- ASSERT(ipif_v4->ipif_refcnt != 0);
- if (ipif_v4->ipif_id != 0) {
- err = EINVAL;
- goto done;
- }
-
- ASSERT(IAM_WRITER_IPIF(ipif_v4));
- *ill_from_v4 = ipif_v4->ipif_ill;
- }
-
- if (ipif_v6 != NULL) {
- ASSERT(ipif_v6->ipif_refcnt != 0);
- if (ipif_v6->ipif_id != 0) {
- err = EINVAL;
- goto done;
- }
-
- ASSERT(IAM_WRITER_IPIF(ipif_v6));
- *ill_from_v6 = ipif_v6->ipif_ill;
- }
-
- err = 0;
- dst_index = lifr->lifr_movetoindex;
- *ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE,
- q, mp, ip_process_ioctl, &err, ipst);
- if (err != 0) {
- /*
- * A move may be in progress, EINPROGRESS looking up the "to"
- * ill means changes already done to the "from" ipsq need to
- * be undone to avoid potential deadlocks.
- *
- * ENXIO will usually be because there is only v6 on the ill,
- * that's not treated as an error unless an ENXIO is also
- * seen when looking up the v6 "to" ill.
- *
- * If EINPROGRESS, the mp has been enqueued and can not be
- * used to look up the v6 "to" ill, but a preemptive clean
- * up of changes to the v6 "from" ipsq is done.
- */
- if (err == EINPROGRESS) {
- if (*ill_from_v4 != NULL) {
- ill_t *from_ill;
- ipsq_t *from_ipsq;
-
- from_ill = ipif_v4->ipif_ill;
- from_ipsq = from_ill->ill_phyint->phyint_ipsq;
-
- mutex_enter(&from_ipsq->ipsq_lock);
- from_ipsq->ipsq_current_ipif = NULL;
- mutex_exit(&from_ipsq->ipsq_lock);
- }
- if (*ill_from_v6 != NULL) {
- ill_t *from_ill;
- ipsq_t *from_ipsq;
-
- from_ill = ipif_v6->ipif_ill;
- from_ipsq = from_ill->ill_phyint->phyint_ipsq;
-
- mutex_enter(&from_ipsq->ipsq_lock);
- from_ipsq->ipsq_current_ipif = NULL;
- mutex_exit(&from_ipsq->ipsq_lock);
- }
- goto done;
- }
- ASSERT(err == ENXIO);
- err = 0;
- }
-
- *ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE,
- q, mp, ip_process_ioctl, &err, ipst);
- if (err != 0) {
- /*
- * A move may be in progress, EINPROGRESS looking up the "to"
- * ill means changes already done to the "from" ipsq need to
- * be undone to avoid potential deadlocks.
- */
- if (err == EINPROGRESS) {
- if (*ill_from_v6 != NULL) {
- ill_t *from_ill;
- ipsq_t *from_ipsq;
-
- from_ill = ipif_v6->ipif_ill;
- from_ipsq = from_ill->ill_phyint->phyint_ipsq;
-
- mutex_enter(&from_ipsq->ipsq_lock);
- from_ipsq->ipsq_current_ipif = NULL;
- mutex_exit(&from_ipsq->ipsq_lock);
- }
- goto done;
- }
- ASSERT(err == ENXIO);
-
- /* Both v4 and v6 lookup failed */
- if (*ill_to_v4 == NULL) {
- err = ENXIO;
- goto done;
- }
- err = 0;
- }
-
- /*
- * If we have something to MOVE i.e "from" not NULL,
- * "to" should be non-NULL.
- */
- if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) ||
- (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) {
- err = EINVAL;
- }
-
-done:
- if (ipif_v4 != NULL)
- ipif_refrele(ipif_v4);
- if (ipif_v6 != NULL)
- ipif_refrele(ipif_v6);
- return (err);
+ (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ);
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (0);
}
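
A matching sketch of the query side: SIOCGLIFGROUPNAME reads back the group name for an interface, and SIOCGLIFBINDING (serviced by ip_sioctl_get_binding() above) reports which underlying ill a data address on the IPMP meta-interface is currently bound to. Again an illustrative sketch only; the interface names are placeholders.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>

int
main(void)
{
	struct lifreq lifr;
	int s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1) {
		perror("socket");
		return (1);
	}

	/* Which IPMP group (if any) is e1000g0 in? */
	(void) memset(&lifr, 0, sizeof (lifr));
	(void) strlcpy(lifr.lifr_name, "e1000g0", sizeof (lifr.lifr_name));
	if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) == 0)
		(void) printf("group: %s\n", lifr.lifr_groupname);

	/* Which underlying interface is ipmp0:1's address bound to? */
	(void) memset(&lifr, 0, sizeof (lifr));
	(void) strlcpy(lifr.lifr_name, "ipmp0:1", sizeof (lifr.lifr_name));
	if (ioctl(s, SIOCGLIFBINDING, &lifr) == 0)
		(void) printf("binding: %s\n", lifr.lifr_binding);

	(void) close(s);
	return (0);
}
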
/*
- * FAILOVER and FAILBACK are modelled as MOVE operations.
- *
- * We don't check whether the MOVE is within the same group or
- * not, because this ioctl can be used as a generic mechanism
- * to failover from interface A to B, though things will function
- * only if they are really part of the same group. Moreover,
- * all ipifs may be down and hence temporarily out of the group.
- *
- * ipif's that need to be moved are first brought down; V4 ipifs are brought
- * down first and then V6. For each we wait for the ipif's to become quiescent.
- * Bringing down the ipifs ensures that all ires pointing to these ipifs's
- * have been deleted and there are no active references. Once quiescent the
- * ipif's are moved and brought up on the new ill.
- *
- * Normally the source ill and destination ill belong to the same IPMP group
- * and hence the same ipsq_t. In the event they don't belong to the
- * same group, the two ipsq's are first merged into one ipsq - that of the
- * to_ill. The multicast memberships on the source and destination ill cannot
- * change during the move operation since multicast joins/leaves also have to
- * execute on the same ipsq and are hence serialized.
+ * Process an SIOCGLIFGROUPINFO request.
*/
/* ARGSUSED */
int
-ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
- ip_ioctl_cmd_t *ipip, void *ifreq)
+ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
+ ip_ioctl_cmd_t *ipip, void *dummy)
{
- ill_t *ill_to_v4 = NULL;
- ill_t *ill_to_v6 = NULL;
- ill_t *ill_from_v4 = NULL;
- ill_t *ill_from_v6 = NULL;
- int err = 0;
-
- /*
- * setup from and to ill's, we can get EINPROGRESS only for
- * to_ill's.
- */
- err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6,
- &ill_to_v4, &ill_to_v6);
-
- if (err != 0) {
- ip0dbg(("ip_sioctl_move: extract args failed\n"));
- goto done;
- }
-
- /*
- * nothing to do.
- */
- if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) {
- goto done;
- }
-
- /*
- * nothing to do.
- */
- if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) {
- goto done;
- }
-
- /*
- * Mark the ill as changing.
- * ILL_CHANGING flag is cleared when the ipif's are brought up
- * in ill_up_ipifs in case of error they are cleared below.
- */
-
- GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
- if (ill_from_v4 != NULL)
- ill_from_v4->ill_state_flags |= ILL_CHANGING;
- if (ill_from_v6 != NULL)
- ill_from_v6->ill_state_flags |= ILL_CHANGING;
- RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);
-
- /*
- * Make sure that both src and dst are
- * in the same syncq group. If not make it happen.
- * We are not holding any locks because we are the writer
- * on the from_ipsq and we will hold locks in ill_merge_groups
- * to protect to_ipsq against changing.
- */
- if (ill_from_v4 != NULL) {
- if (ill_from_v4->ill_phyint->phyint_ipsq !=
- ill_to_v4->ill_phyint->phyint_ipsq) {
- err = ill_merge_groups(ill_from_v4, ill_to_v4,
- NULL, mp, q);
- goto err_ret;
-
- }
- ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock));
- } else {
-
- if (ill_from_v6->ill_phyint->phyint_ipsq !=
- ill_to_v6->ill_phyint->phyint_ipsq) {
- err = ill_merge_groups(ill_from_v6, ill_to_v6,
- NULL, mp, q);
- goto err_ret;
-
- }
- ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock));
- }
-
- /*
- * Now that the ipsq's have been merged and we are the writer
- * lets mark to_ill as changing as well.
- */
-
- GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
- if (ill_to_v4 != NULL)
- ill_to_v4->ill_state_flags |= ILL_CHANGING;
- if (ill_to_v6 != NULL)
- ill_to_v6->ill_state_flags |= ILL_CHANGING;
- RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
-
- /*
-	 * It's ok for us to proceed with the move even if
-	 * ill_pending_mp is non-null on one of the from ill's, as the reply
- * should not be looking at the ipif, it should only care about the
- * ill itself.
- */
-
- /*
- * lets move ipv4 first.
- */
- if (ill_from_v4 != NULL) {
- ASSERT(IAM_WRITER_ILL(ill_to_v4));
- ill_from_v4->ill_move_in_progress = B_TRUE;
- ill_to_v4->ill_move_in_progress = B_TRUE;
- ill_to_v4->ill_move_peer = ill_from_v4;
- ill_from_v4->ill_move_peer = ill_to_v4;
- err = ill_move(ill_from_v4, ill_to_v4, q, mp);
- }
-
- /*
- * Now lets move ipv6.
- */
- if (err == 0 && ill_from_v6 != NULL) {
- ASSERT(IAM_WRITER_ILL(ill_to_v6));
- ill_from_v6->ill_move_in_progress = B_TRUE;
- ill_to_v6->ill_move_in_progress = B_TRUE;
- ill_to_v6->ill_move_peer = ill_from_v6;
- ill_from_v6->ill_move_peer = ill_to_v6;
- err = ill_move(ill_from_v6, ill_to_v6, q, mp);
- }
-
-err_ret:
- /*
- * EINPROGRESS means we are waiting for the ipif's that need to be
- * moved to become quiescent.
- */
- if (err == EINPROGRESS) {
- goto done;
- }
-
- /*
-	 * If err is set, ill_up_ipifs() will not be called,
-	 * so let's clear the flags.
- */
-
- GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
- GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
- /*
-	 * Some of the clearing may be redundant, but it is simpler
-	 * not to make any extra checks.
- */
- if (ill_from_v6 != NULL) {
- ill_from_v6->ill_move_in_progress = B_FALSE;
- ill_from_v6->ill_move_peer = NULL;
- ill_from_v6->ill_state_flags &= ~ILL_CHANGING;
- }
- if (ill_from_v4 != NULL) {
- ill_from_v4->ill_move_in_progress = B_FALSE;
- ill_from_v4->ill_move_peer = NULL;
- ill_from_v4->ill_state_flags &= ~ILL_CHANGING;
- }
- if (ill_to_v6 != NULL) {
- ill_to_v6->ill_move_in_progress = B_FALSE;
- ill_to_v6->ill_move_peer = NULL;
- ill_to_v6->ill_state_flags &= ~ILL_CHANGING;
- }
- if (ill_to_v4 != NULL) {
- ill_to_v4->ill_move_in_progress = B_FALSE;
- ill_to_v4->ill_move_peer = NULL;
- ill_to_v4->ill_state_flags &= ~ILL_CHANGING;
- }
-
- /*
- * Check for setting INACTIVE, if STANDBY is set and FAILED is not set.
- * Do this always to maintain proper state i.e even in case of errors.
- * As phyint_inactive looks at both v4 and v6 interfaces,
- * we need not call on both v4 and v6 interfaces.
- */
- if (ill_from_v4 != NULL) {
- if ((ill_from_v4->ill_phyint->phyint_flags &
- (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
- phyint_inactive(ill_from_v4->ill_phyint);
- }
- } else if (ill_from_v6 != NULL) {
- if ((ill_from_v6->ill_phyint->phyint_flags &
- (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
- phyint_inactive(ill_from_v6->ill_phyint);
- }
- }
-
- if (ill_to_v4 != NULL) {
- if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) {
- ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
- }
- } else if (ill_to_v6 != NULL) {
- if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) {
- ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
- }
- }
-
- RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
- RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);
-
-no_err:
- /*
- * lets bring the interfaces up on the to_ill.
- */
- if (err == 0) {
- err = ill_up_ipifs(ill_to_v4 == NULL ? ill_to_v6:ill_to_v4,
- q, mp);
- }
-
- if (err == 0) {
- if (ill_from_v4 != NULL && ill_to_v4 != NULL)
- ilm_send_multicast_reqs(ill_from_v4, ill_to_v4);
+ lifgroupinfo_t *lifgr;
+ ipmp_grp_t *grp;
+ ip_stack_t *ipst = CONNQ_TO_IPST(q);
- if (ill_from_v6 != NULL && ill_to_v6 != NULL)
- ilm_send_multicast_reqs(ill_from_v6, ill_to_v6);
- }
-done:
+ /* ip_wput_nondata() verified mp->b_cont->b_cont */
+ lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
+ lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';
- if (ill_to_v4 != NULL) {
- ill_refrele(ill_to_v4);
- }
- if (ill_to_v6 != NULL) {
- ill_refrele(ill_to_v6);
+ rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+ if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (ENOENT);
}
-
- return (err);
+ ipmp_grp_info(grp, lifgr);
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (0);
}
static void
@@ -18167,10 +14492,9 @@ ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
* we only wait for the ACK of the DL_UNBIND_REQ.
*/
mutex_enter(&ill->ill_lock);
- if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
- (prim == DL_UNBIND_REQ)) {
+ if (!(ill->ill_state_flags & ILL_CONDEMNED) || (prim == DL_UNBIND_REQ))
ill->ill_dlpi_pending = prim;
- }
+
mutex_exit(&ill->ill_lock);
putnext(ill->ill_wq, mp);
}
@@ -18324,6 +14648,7 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
{
mblk_t *mp;
ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
+ ipxop_t *ipx = ipsq->ipsq_xop;
ASSERT(IAM_WRITER_IPSQ(ipsq));
mutex_enter(&ill->ill_lock);
@@ -18336,12 +14661,11 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
if ((mp = ill->ill_dlpi_deferred) == NULL) {
ill->ill_dlpi_pending = DL_PRIM_INVAL;
-
- mutex_enter(&ipsq->ipsq_lock);
- if (ipsq->ipsq_current_done)
- ipsq->ipsq_current_ipif = NULL;
- mutex_exit(&ipsq->ipsq_lock);
-
+ if (ipx->ipx_current_done) {
+ mutex_enter(&ipx->ipx_lock);
+ ipx->ipx_current_ipif = NULL;
+ mutex_exit(&ipx->ipx_lock);
+ }
cv_signal(&ill->ill_cv);
mutex_exit(&ill->ill_lock);
return;
@@ -18379,7 +14703,7 @@ conn_delete_ire(conn_t *connp, caddr_t arg)
}
/*
- * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number
+ * Some operations (e.g., ipif_down()) conditionally delete a number
* of IREs. Those IREs may have been previously cached in the conn structure.
* This ipcl_walk() walker function releases all references to such IREs based
* on the condemned flag.
@@ -18403,7 +14727,6 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
/*
* Take down a specific interface, but don't lose any information about it.
- * Also delete interface from its interface group (ifgrp).
* (Always called as writer.)
* This function goes through the down sequence even if the interface is
* already down. There are 2 reasons.
@@ -18501,7 +14824,7 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
* For eg. bind, and route operations (Eg. route add / delete) cannot return
* failure if the ipif is currently undergoing an exclusive operation, and
* hence pass the flag. The mblk is then enqueued in the ipsq and the operation
- * is restarted by ipsq_exit() when the currently exclusive ioctl completes.
+ * is restarted by ipsq_exit() when the current exclusive operation completes.
* The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The
* lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
* change while the ill_lock is held. Before dropping the ill_lock we acquire
@@ -18522,7 +14845,6 @@ int
ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
{
ill_t *ill = ipif->ipif_ill;
- phyint_t *phyi;
conn_t *connp;
boolean_t success;
boolean_t ipif_was_up = B_FALSE;
@@ -18569,20 +14891,7 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
}
/*
- * Before we delete the ill from the group (if any), we need
- * to make sure that we delete all the routes dependent on
- * this and also any ipifs dependent on this ipif for
-	 * source address. We need to do this before we delete from
- * the group because
- *
- * 1) ipif_down_delete_ire de-references ill->ill_group.
- *
- * 2) ipif_update_other_ipifs needs to walk the whole group
- * for re-doing source address selection. Note that
- * ipif_select_source[_v6] called from
- * ipif_update_other_ipifs[_v6] will not pick this ipif
- * because we have already marked down here i.e cleared
- * IPIF_UP.
+ * Delete all IRE's pointing at this ipif or its source address.
*/
if (ipif->ipif_isv6) {
ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES,
@@ -18592,6 +14901,17 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
ipst);
}
+ if (ipif_was_up && ill->ill_ipif_up_count == 0) {
+ /*
+ * Since the interface is now down, it may have just become
+ * inactive. Note that this needs to be done even for a
+		 * ipif_logical_down(), or ARP entries will not get correctly
+ * restored when the interface comes back up.
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_ill_refresh_active(ill);
+ }
+
/*
* Cleaning up the conn_ire_cache or conns must be done only after the
* ires have been deleted above. Otherwise a thread could end up
@@ -18609,53 +14929,9 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
* entries for such ipifs.
*/
if (ipif->ipif_isv6)
- ipif_update_other_ipifs_v6(ipif, ill->ill_group);
+ ipif_update_other_ipifs_v6(ipif);
else
- ipif_update_other_ipifs(ipif, ill->ill_group);
-
- if (ipif_was_up) {
- /*
- * Check whether it is last ipif to leave this group.
- * If this is the last ipif to leave, we should remove
- * this ill from the group as ipif_select_source will not
- * be able to find any useful ipifs if this ill is selected
- * for load balancing.
- *
- * For nameless groups, we should call ifgrp_delete if this
- * belongs to some group. As this ipif is going down, we may
- * need to reconstruct groups.
- */
- phyi = ill->ill_phyint;
- /*
- * If the phyint_groupname_len is 0, it may or may not
- * be in the nameless group. If the phyint_groupname_len is
- * not 0, then this ill should be part of some group.
- * As we always insert this ill in the group if
- * phyint_groupname_len is not zero when the first ipif
- * comes up (in ipif_up_done), it should be in a group
- * when the namelen is not 0.
- *
- * NOTE : When we delete the ill from the group,it will
- * blow away all the IRE_CACHES pointing either at this ipif or
- * ill_wq (illgrp_cache_delete does this). Thus, no IRES
- * should be pointing at this ill.
- */
- ASSERT(phyi->phyint_groupname_len == 0 ||
- (phyi->phyint_groupname != NULL && ill->ill_group != NULL));
-
- if (phyi->phyint_groupname_len != 0) {
- if (ill->ill_ipif_up_count == 0)
- illgrp_delete(ill);
- }
-
- /*
- * If we have deleted some of the broadcast ires associated
- * with this ipif, we need to re-nominate somebody else if
- * the ires that we deleted were the nominated ones.
- */
- if (ill->ill_group != NULL && !ill->ill_isv6)
- ipif_renominate_bcast(ipif);
- }
+ ipif_update_other_ipifs(ipif);
/*
* neighbor-discovery or arp entries for this interface.
@@ -18734,17 +15010,12 @@ ipif_down_tail(ipif_t *ipif)
ill->ill_logical_down = 0;
/*
- * Have to be after removing the routes in ipif_down_delete_ire.
+ * Has to be after removing the routes in ipif_down_delete_ire.
*/
- if (ipif->ipif_isv6) {
- if (ill->ill_flags & ILLF_XRESOLV)
- ipif_arp_down(ipif);
- } else {
- ipif_arp_down(ipif);
- }
+ ipif_resolver_down(ipif);
- ip_rts_ifmsg(ipif);
- ip_rts_newaddrmsg(RTM_DELETE, 0, ipif);
+ ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
+ ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT);
}
/*
@@ -18804,39 +15075,11 @@ static void
ipif_down_delete_ire(ire_t *ire, char *ipif_arg)
{
ipif_t *ipif = (ipif_t *)ipif_arg;
- ill_t *ire_ill;
- ill_t *ipif_ill;
ASSERT(IAM_WRITER_IPIF(ipif));
if (ire->ire_ipif == NULL)
return;
- /*
- * For IPv4, we derive source addresses for an IRE from ipif's
- * belonging to the same IPMP group as the IRE's outgoing
- * interface. If an IRE's outgoing interface isn't in the
- * same IPMP group as a particular ipif, then that ipif
- * couldn't have been used as a source address for this IRE.
- *
- * For IPv6, source addresses are only restricted to the IPMP group
- * if the IRE is for a link-local address or a multicast address.
- * Otherwise, source addresses for an IRE can be chosen from
-	 * interfaces other than the outgoing interface for that IRE.
- *
- * For source address selection details, see ipif_select_source()
- * and ipif_select_source_v6().
- */
- if (ire->ire_ipversion == IPV4_VERSION ||
- IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) ||
- IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) {
- ire_ill = ire->ire_ipif->ipif_ill;
- ipif_ill = ipif->ipif_ill;
-
- if (ire_ill->ill_group != ipif_ill->ill_group) {
- return;
- }
- }
-
if (ire->ire_ipif != ipif) {
/*
* Look for a matching source address.
@@ -18875,83 +15118,53 @@ void
ill_ipif_cache_delete(ire_t *ire, char *ill_arg)
{
ill_t *ill = (ill_t *)ill_arg;
- ill_t *ipif_ill;
ASSERT(IAM_WRITER_ILL(ill));
- /*
- * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
- * Hence this should be IRE_CACHE.
- */
ASSERT(ire->ire_type == IRE_CACHE);
/*
- * We are called for IRE_CACHES whose ire_ipif matches ill.
- * We are only interested in IRE_CACHES that has borrowed
- * the source address from ill_arg e.g. ipif_up_done[_v6]
- * for which we need to look at ire_ipif->ipif_ill match
- * with ill.
+ * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches
+ * ill, but we only want to delete the IRE if ire_ipif matches.
*/
ASSERT(ire->ire_ipif != NULL);
- ipif_ill = ire->ire_ipif->ipif_ill;
- if (ipif_ill == ill || (ill->ill_group != NULL &&
- ipif_ill->ill_group == ill->ill_group)) {
+ if (ill == ire->ire_ipif->ipif_ill)
ire_delete(ire);
- }
}
/*
- * Delete all the ire whose stq references ill_arg.
+ * Delete all the IREs whose ire_stq's reference `ill_arg'. IPMP uses this
+ * instead of ill_ipif_cache_delete() because ire_ipif->ipif_ill references
+ * the IPMP ill.
*/
-static void
+void
ill_stq_cache_delete(ire_t *ire, char *ill_arg)
{
ill_t *ill = (ill_t *)ill_arg;
- ill_t *ire_ill;
ASSERT(IAM_WRITER_ILL(ill));
- /*
- * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
- * Hence this should be IRE_CACHE.
- */
ASSERT(ire->ire_type == IRE_CACHE);
/*
- * We are called for IRE_CACHES whose ire_stq and ire_ipif
- * matches ill. We are only interested in IRE_CACHES that
- * has ire_stq->q_ptr pointing at ill_arg. Thus we do the
- * filtering here.
+ * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches
+ * ill, but we only want to delete the IRE if ire_stq matches.
*/
- ire_ill = (ill_t *)ire->ire_stq->q_ptr;
-
- if (ire_ill == ill)
+ if (ire->ire_stq->q_ptr == ill_arg)
ire_delete(ire);
}
/*
- * This is called when an ill leaves the group. We want to delete
- * all IRE_CACHES whose stq is pointing at ill_wq or ire_ipif is
- * pointing at ill.
+ * Delete all broadcast IREs with a source address on `ill_arg'.
*/
static void
-illgrp_cache_delete(ire_t *ire, char *ill_arg)
+ill_broadcast_delete(ire_t *ire, char *ill_arg)
{
- ill_t *ill = (ill_t *)ill_arg;
+ ill_t *ill = (ill_t *)ill_arg;
ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(ill->ill_group == NULL);
- /*
- * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
- * Hence this should be IRE_CACHE.
- */
- ASSERT(ire->ire_type == IRE_CACHE);
- /*
- * We are called for IRE_CACHES whose ire_stq and ire_ipif
- * matches ill. We are interested in both.
- */
- ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) ||
- (ire->ire_ipif->ipif_ill == ill));
+ ASSERT(ire->ire_type == IRE_BROADCAST);
- ire_delete(ire);
+ if (ire->ire_ipif->ipif_ill == ill)
+ ire_delete(ire);
}
/*
@@ -18997,13 +15210,12 @@ ipif_free(ipif_t *ipif)
rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
/* Remove pointers to this ill in the multicast routing tables */
reset_mrt_vif_ipif(ipif);
+ /* If necessary, clear the cached source ipif rotor. */
+ if (ipif->ipif_ill->ill_src_ipif == ipif)
+ ipif->ipif_ill->ill_src_ipif = NULL;
rw_exit(&ipst->ips_ill_g_lock);
}
-/*
- * Warning: this is not the only function that calls mi_free on an ipif_t. See
- * also ill_move().
- */
static void
ipif_free_tail(ipif_t *ipif)
{
@@ -19036,7 +15248,7 @@ ipif_free_tail(ipif_t *ipif)
sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);
/* Get it out of the ILL interface list. */
- ipif_remove(ipif, B_TRUE);
+ ipif_remove(ipif);
rw_exit(&ipst->ips_ill_g_lock);
mutex_destroy(&ipif->ipif_saved_ire_lock);
@@ -19208,8 +15420,10 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
} else if (IPIF_CAN_WAIT(ipif, q)) {
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ill->ill_lock);
ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
RELEASE_CONN_LOCK(q);
ill_refrele(ill);
@@ -19244,7 +15458,7 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
ire_type = IRE_LOOPBACK;
else
ire_type = IRE_LOCAL;
- ipif = ipif_allocate(ill, id, ire_type, B_TRUE);
+ ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE);
if (ipif != NULL)
ipif_refhold_locked(ipif);
else if (error != NULL)
@@ -19342,65 +15556,62 @@ ill_mtu_change(ire_t *ire, char *ill_arg)
void
ipif_multicast_up(ipif_t *ipif)
{
- int err, index;
+ int err;
ill_t *ill;
ASSERT(IAM_WRITER_IPIF(ipif));
ill = ipif->ipif_ill;
- index = ill->ill_phyint->phyint_ifindex;
ip1dbg(("ipif_multicast_up\n"));
if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up)
return;
if (ipif->ipif_isv6) {
+ in6_addr_t v6allmc = ipv6_all_hosts_mcast;
+ in6_addr_t v6solmc = ipv6_solicited_node_mcast;
+
+ v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];
+
if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr))
return;
- /* Join the all hosts multicast address */
ip1dbg(("ipif_multicast_up - addmulti\n"));
+
/*
- * Passing B_TRUE means we have to join the multicast
- * membership on this interface even though this is
- * FAILED. If we join on a different one in the group,
- * we will not be able to delete the membership later
- * as we currently don't track where we join when we
- * join within the kernel unlike applications where
- * we have ilg/ilg_orig_index. See ip_addmulti_v6
- * for more on this.
+ * Join the all hosts multicast address. We skip this for
+ * underlying IPMP interfaces since they should be invisible.
*/
- err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index,
- ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
- if (err != 0) {
- ip0dbg(("ipif_multicast_up: "
- "all_hosts_mcast failed %d\n",
- err));
- return;
+ if (!IS_UNDER_IPMP(ill)) {
+ err = ip_addmulti_v6(&v6allmc, ill, ipif->ipif_zoneid,
+ ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
+ if (err != 0) {
+ ip0dbg(("ipif_multicast_up: "
+ "all_hosts_mcast failed %d\n", err));
+ return;
+ }
+ ipif->ipif_joined_allhosts = 1;
}
+
/*
* Enable multicast for the solicited node multicast address
*/
if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
- in6_addr_t ipv6_multi = ipv6_solicited_node_mcast;
-
- ipv6_multi.s6_addr32[3] |=
- ipif->ipif_v6lcl_addr.s6_addr32[3];
-
- err = ip_addmulti_v6(&ipv6_multi, ill, index,
- ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE,
- NULL);
+ err = ip_addmulti_v6(&v6solmc, ill, ipif->ipif_zoneid,
+ ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
if (err != 0) {
ip0dbg(("ipif_multicast_up: solicited MC"
" failed %d\n", err));
- (void) ip_delmulti_v6(&ipv6_all_hosts_mcast,
- ill, ill->ill_phyint->phyint_ifindex,
- ipif->ipif_zoneid, B_TRUE, B_TRUE);
+ if (ipif->ipif_joined_allhosts) {
+ (void) ip_delmulti_v6(&v6allmc, ill,
+ ipif->ipif_zoneid, B_TRUE, B_TRUE);
+ ipif->ipif_joined_allhosts = 0;
+ }
return;
}
}
} else {
- if (ipif->ipif_lcl_addr == INADDR_ANY)
+ if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill))
return;
/* Join the all hosts multicast address */
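
A small standalone sketch of the solicited-node derivation used by ipif_multicast_up() above: ff02::1:ff00:0/104 with the low 24 bits of the unicast address folded in. (ORing the entire low 32-bit word, as the kernel code does, gives the same result because the byte it would touch is already 0xff.) The addresses are placeholders.

#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int
main(void)
{
	struct in6_addr lcl, snm;
	char buf[INET6_ADDRSTRLEN];

	(void) inet_pton(AF_INET6, "fe80::203:baff:fe12:3456", &lcl);
	(void) inet_pton(AF_INET6, "ff02::1:ff00:0", &snm);

	/* Fold in the low 24 bits of the unicast address. */
	snm.s6_addr[13] |= lcl.s6_addr[13];
	snm.s6_addr[14] |= lcl.s6_addr[14];
	snm.s6_addr[15] |= lcl.s6_addr[15];

	/* Prints ff02::1:ff12:3456 */
	(void) printf("%s\n", inet_ntop(AF_INET6, &snm, buf, sizeof (buf)));
	return (0);
}
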
@@ -19420,7 +15631,7 @@ ipif_multicast_up(ipif_t *ipif)
* (Explicit memberships are blown away in ill_leave_multicast() when the
* ill is brought down.)
*/
-static void
+void
ipif_multicast_down(ipif_t *ipif)
{
int err;
@@ -19444,19 +15655,18 @@ ipif_multicast_down(ipif_t *ipif)
}
/*
- * Leave the all hosts multicast address. Similar to ip_addmulti_v6,
- * we should look for ilms on this ill rather than the ones that have
- * been failed over here. They are here temporarily. As
- * ipif_multicast_up has joined on this ill, we should delete only
- * from this ill.
+ * Leave the all-hosts multicast address.
*/
- err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill,
- ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid,
- B_TRUE, B_TRUE);
- if (err != 0) {
- ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n",
- err));
+ if (ipif->ipif_joined_allhosts) {
+ err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill,
+ ipif->ipif_zoneid, B_TRUE, B_TRUE);
+ if (err != 0) {
+ ip0dbg(("ipif_multicast_down: all_hosts_mcast "
+ "failed %d\n", err));
+ }
+ ipif->ipif_joined_allhosts = 0;
}
+
/*
* Disable multicast for the solicited node multicast address
*/
@@ -19467,9 +15677,7 @@ ipif_multicast_down(ipif_t *ipif)
ipif->ipif_v6lcl_addr.s6_addr32[3];
err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill,
- ipif->ipif_ill->ill_phyint->phyint_ifindex,
ipif->ipif_zoneid, B_TRUE, B_TRUE);
-
if (err != 0) {
ip0dbg(("ipif_multicast_down: sol MC failed %d\n",
err));
@@ -19683,9 +15891,8 @@ ipif_set_default(ipif_t *ipif)
* Return 0 if this address can be used as local address without causing
* duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address
* is already up on a different ill, and EADDRINUSE if it's up on the same ill.
- * Special checks are needed to allow the same IPv6 link-local address
- * on different ills.
- * TODO: allowing the same site-local address on different ill's.
+ * Note that the same IPv6 link-local address is allowed as long as the ills
+ * are not on the same link.
*/
int
ip_addr_availability_check(ipif_t *new_ipif)
@@ -19717,30 +15924,26 @@ ip_addr_availability_check(ipif_t *new_ipif)
ipif = ipif->ipif_next) {
if ((ipif == new_ipif) ||
!(ipif->ipif_flags & IPIF_UP) ||
- (ipif->ipif_flags & IPIF_UNNUMBERED))
+ (ipif->ipif_flags & IPIF_UNNUMBERED) ||
+ !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
+ &our_v6addr))
continue;
- if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
- &our_v6addr)) {
- if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
- new_ipif->ipif_flags |= IPIF_UNNUMBERED;
- else if (ipif->ipif_flags & IPIF_POINTOPOINT)
- ipif->ipif_flags |= IPIF_UNNUMBERED;
- else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) &&
- new_ipif->ipif_ill != ill)
- continue;
- else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) &&
- new_ipif->ipif_ill != ill)
- continue;
- else if (new_ipif->ipif_zoneid !=
- ipif->ipif_zoneid &&
- ipif->ipif_zoneid != ALL_ZONES &&
- IS_LOOPBACK(ill))
- continue;
- else if (new_ipif->ipif_ill == ill)
- return (EADDRINUSE);
- else
- return (EADDRNOTAVAIL);
- }
+
+ if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
+ new_ipif->ipif_flags |= IPIF_UNNUMBERED;
+ else if (ipif->ipif_flags & IPIF_POINTOPOINT)
+ ipif->ipif_flags |= IPIF_UNNUMBERED;
+ else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) ||
+ IN6_IS_ADDR_SITELOCAL(&our_v6addr)) &&
+ !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill))
+ continue;
+ else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid &&
+ ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill))
+ continue;
+ else if (new_ipif->ipif_ill == ill)
+ return (EADDRINUSE);
+ else
+ return (EADDRNOTAVAIL);
}
}
@@ -19753,13 +15956,15 @@ ip_addr_availability_check(ipif_t *new_ipif)
* When the routine returns EINPROGRESS then mp has been consumed and
* the ioctl will be acked from ip_rput_dlpi.
*/
-static int
+int
ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
{
- ill_t *ill = ipif->ipif_ill;
- boolean_t isv6 = ipif->ipif_isv6;
- int err = 0;
- boolean_t success;
+ ill_t *ill = ipif->ipif_ill;
+ boolean_t isv6 = ipif->ipif_isv6;
+ int err = 0;
+ boolean_t success;
+ uint_t ipif_orig_id;
+ ip_stack_t *ipst = ill->ill_ipst;
ASSERT(IAM_WRITER_IPIF(ipif));
@@ -19769,6 +15974,123 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
if (ipif->ipif_flags & IPIF_UP)
return (EALREADY);
+ /*
+ * If this is a request to bring up a data address on an interface
+ * under IPMP, then move the address to its IPMP meta-interface and
+ * try to bring it up. One complication is that the zeroth ipif for
+ * an ill is special, in that every ill always has one, and that code
+ * throughout IP dereferences ill->ill_ipif without holding any locks.
+ */
+ if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) &&
+ (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) {
+ ipif_t *stubipif = NULL, *moveipif = NULL;
+ ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);
+
+ /*
+ * The ipif being brought up should be quiesced. If it's not,
+ * something has gone amiss and we need to bail out. (If it's
+ * quiesced, we know it will remain so via IPIF_CHANGING.)
+ */
+ mutex_enter(&ill->ill_lock);
+ if (!ipif_is_quiescent(ipif)) {
+ mutex_exit(&ill->ill_lock);
+ return (EINVAL);
+ }
+ mutex_exit(&ill->ill_lock);
+
+ /*
+ * If we're going to need to allocate ipifs, do it prior
+ * to starting the move (and grabbing locks).
+ */
+ if (ipif->ipif_id == 0) {
+ moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
+ B_FALSE);
+ stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
+ B_FALSE);
+ if (moveipif == NULL || stubipif == NULL) {
+ mi_free(moveipif);
+ mi_free(stubipif);
+ return (ENOMEM);
+ }
+ }
+
+ /*
+ * Grab or transfer the ipif to move. During the move, keep
+ * ill_g_lock held to prevent any ill walker threads from
+ * seeing things in an inconsistent state.
+ */
+ rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
+ if (ipif->ipif_id != 0) {
+ ipif_remove(ipif);
+ } else {
+ ipif_transfer(ipif, moveipif, stubipif);
+ ipif = moveipif;
+ }
+
+ /*
+ * Place the ipif on the IPMP ill. If the zeroth ipif on
+ * the IPMP ill is a stub (0.0.0.0 down address) then we
+ * replace that one. Otherwise, pick the next available slot.
+ */
+ ipif->ipif_ill = ipmp_ill;
+ ipif_orig_id = ipif->ipif_id;
+
+ if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) {
+ ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL);
+ ipif = ipmp_ill->ill_ipif;
+ } else {
+ ipif->ipif_id = -1;
+ if (ipif_insert(ipif, B_FALSE) != 0) {
+ /*
+ * No more available ipif_id's -- put it back
+ * on the original ill and fail the operation.
+ * Since we're writer on the ill, we can be
+ * sure our old slot is still available.
+ */
+ ipif->ipif_id = ipif_orig_id;
+ ipif->ipif_ill = ill;
+ if (ipif_orig_id == 0) {
+ ipif_transfer(ipif, ill->ill_ipif,
+ NULL);
+ } else {
+ VERIFY(ipif_insert(ipif, B_FALSE) == 0);
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+ return (ENOMEM);
+ }
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+
+ /*
+ * Tell SCTP that the ipif has moved. Note that even if we
+ * had to allocate a new ipif, the original sequence id was
+ * preserved and therefore SCTP won't know.
+ */
+ sctp_move_ipif(ipif, ill, ipmp_ill);
+
+ /*
+ * If the ipif being brought up was on slot zero, then we
+ * first need to bring up the placeholder we stuck there. In
+ * ip_rput_dlpi_writer(), ip_arp_done(), or the recursive call
+ * to ipif_up() itself, if we successfully bring up the
+ * placeholder, we'll check ill_move_ipif and bring it up too.
+ */
+ if (ipif_orig_id == 0) {
+ ASSERT(ill->ill_move_ipif == NULL);
+ ill->ill_move_ipif = ipif;
+ if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0)
+ ASSERT(ill->ill_move_ipif == NULL);
+ if (err != EINPROGRESS)
+ ill->ill_move_ipif = NULL;
+ return (err);
+ }
+
+ /*
+ * Bring it up on the IPMP ill.
+ */
+ return (ipif_up(ipif, q, mp));
+ }
+
/* Skip arp/ndp for any loopback interface. */
if (ill->ill_wq != NULL) {
conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
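/*
 * [Editorial sketch -- illustrative aside, not part of this patch.]
 * The migration code above follows a "preallocate, then commit under the
 * lock" pattern: moveipif/stubipif are allocated before ill_g_lock is
 * taken, so the only failure possible while the lock is held is running
 * out of ipif IDs, and that is undone by putting the ipif back into its
 * still-reserved original slot.  A self-contained miniature of the same
 * idea (slot_t and example_move() are made up for illustration):
 */
#include <errno.h>
#include <stdlib.h>
#include <pthread.h>

typedef struct slot { void *s_obj; } slot_t;
static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

static int
example_move(slot_t *src, slot_t *dst)
{
	void *stub;

	/* Allocate anything that can fail *before* taking the lock. */
	if ((stub = calloc(1, 1)) == NULL)
		return (ENOMEM);

	(void) pthread_mutex_lock(&slot_lock);
	if (dst->s_obj != NULL) {
		/* Nothing has moved yet, so failing here needs no rollback. */
		(void) pthread_mutex_unlock(&slot_lock);
		free(stub);
		return (EEXIST);
	}
	dst->s_obj = src->s_obj;	/* move the real object */
	src->s_obj = stub;		/* keep the source slot occupied */
	(void) pthread_mutex_unlock(&slot_lock);
	return (0);
}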
@@ -19798,7 +16120,6 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
*/
ASSERT(connp != NULL || !CONN_Q(q));
- ASSERT(ipsq->ipsq_pending_mp == NULL);
if (connp != NULL)
mutex_enter(&connp->conn_lock);
mutex_enter(&ill->ill_lock);
@@ -19810,27 +16131,25 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
return (EINTR);
/*
- * Crank up IPv6 neighbor discovery
- * Unlike ARP, this should complete when
- * ipif_ndp_up returns. However, for
- * ILLF_XRESOLV interfaces we also send a
- * AR_INTERFACE_UP to the external resolver.
- * That ioctl will complete in ip_rput.
+ * Crank up the resolver. For IPv6, this cranks up the
+ * external resolver if one is configured, but even if an
+ * external resolver isn't configured, it must be called to
+ * reset DAD state. For IPv6, if an external resolver is not
+ * being used, ipif_resolver_up() will never return
+ * EINPROGRESS, so we can always call ipif_ndp_up() here.
+ * Note that if an external resolver is being used, there's no
+ * need to call ipif_ndp_up() since it will do nothing.
*/
- if (isv6) {
- err = ipif_ndp_up(ipif);
- if (err != 0) {
- if (err != EINPROGRESS)
- mp = ipsq_pending_mp_get(ipsq, &connp);
- return (err);
- }
- }
- /* Now, ARP */
err = ipif_resolver_up(ipif, Res_act_initial);
if (err == EINPROGRESS) {
- /* We will complete it in ip_arp_done */
+ /* We will complete it in ip_arp_done() */
return (err);
}
+
+ if (isv6 && err == 0)
+ err = ipif_ndp_up(ipif, B_TRUE);
+
+ ASSERT(err != EINPROGRESS);
mp = ipsq_pending_mp_get(ipsq, &connp);
ASSERT(mp != NULL);
if (err != 0)
@@ -19843,7 +16162,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
ipif->ipif_addr_ready = 1;
}
- return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
+
+ err = isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif);
+ if (err == 0 && ill->ill_move_ipif != NULL) {
+ ipif = ill->ill_move_ipif;
+ ill->ill_move_ipif = NULL;
+ return (ipif_up(ipif, q, mp));
+ }
+ return (err);
}
/*
@@ -19939,13 +16265,6 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
return (EINPROGRESS);
bad:
ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));
- /*
- * We don't have to check for possible removal from illgrp
- * as we have not yet inserted in illgrp. For groups
- * without names, this ipif is still not UP and hence
- * this could not have possibly had any influence in forming
- * groups.
- */
freemsg(bind_mp);
freemsg(unbind_mp);
@@ -19974,12 +16293,10 @@ ipif_up_done(ipif_t *ipif)
ipif_t *tmp_ipif;
boolean_t flush_ire_cache = B_TRUE;
int err = 0;
- phyint_t *phyi;
ire_t **ipif_saved_irep = NULL;
int ipif_saved_ire_cnt;
int cnt;
boolean_t src_ipif_held = B_FALSE;
- boolean_t ire_added = B_FALSE;
boolean_t loopback = B_FALSE;
ip_stack_t *ipst = ill->ill_ipst;
@@ -20010,7 +16327,7 @@ ipif_up_done(ipif_t *ipif)
break;
}
if (flush_ire_cache)
- ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE,
+ ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill);
/*
@@ -20044,7 +16361,9 @@ ipif_up_done(ipif_t *ipif)
ipif->ipif_ire_type = IRE_LOCAL;
}
- if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) {
+ if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) ||
+ ((ipif->ipif_flags & IPIF_DEPRECATED) &&
+ !(ipif->ipif_flags & IPIF_NOFAILOVER))) {
/*
* Can't use our source address. Select a different
* source address for the IRE_INTERFACE and IRE_LOCAL
@@ -20189,11 +16508,9 @@ ipif_up_done(ipif_t *ipif)
}
/*
- * Need to atomically check for ip_addr_availablity_check
- * under ip_addr_avail_lock, and if it fails got bad, and remove
- * from group also.The ill_g_lock is grabbed as reader
- * just to make sure no new ills or new ipifs are being added
- * to the system while we are checking the uniqueness of addresses.
+ * Need to atomically check for IP address availability under
+ * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new
+ * ills or new ipifs can be added while we are checking availability.
*/
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
mutex_enter(&ipst->ips_ip_addr_avail_lock);
@@ -20227,13 +16544,6 @@ ipif_up_done(ipif_t *ipif)
/*
* Add in all newly created IREs. ire_create_bcast() has
* already checked for duplicates of the IRE_BROADCAST type.
- * We want to add before we call ifgrp_insert which wants
- * to know whether IRE_IF_RESOLVER exists or not.
- *
- * NOTE : We refrele the ire though we may branch to "bad"
- * later on where we do ire_delete. This is okay
- * because nobody can delete it as we are running
- * exclusively.
*/
for (irep1 = irep; irep1 > ire_array; ) {
irep1--;
@@ -20243,44 +16553,6 @@ ipif_up_done(ipif_t *ipif)
*/
(void) ire_add(irep1, NULL, NULL, NULL, B_FALSE);
}
- ire_added = B_TRUE;
- /*
- * Form groups if possible.
- *
- * If we are supposed to be in a ill_group with a name, insert it
- * now as we know that at least one ipif is UP. Otherwise form
- * nameless groups.
- *
- * If ip_enable_group_ifs is set and ipif address is not 0, insert
- * this ipif into the appropriate interface group, or create a
- * new one. If this is already in a nameless group, we try to form
- * a bigger group looking at other ills potentially sharing this
- * ipif's prefix.
- */
- phyi = ill->ill_phyint;
- if (phyi->phyint_groupname_len != 0) {
- ASSERT(phyi->phyint_groupname != NULL);
- if (ill->ill_ipif_up_count == 1) {
- ASSERT(ill->ill_group == NULL);
- err = illgrp_insert(&ipst->ips_illgrp_head_v4, ill,
- phyi->phyint_groupname, NULL, B_TRUE);
- if (err != 0) {
- ip1dbg(("ipif_up_done: illgrp allocation "
- "failed, error %d\n", err));
- goto bad;
- }
- }
- ASSERT(ill->ill_group != NULL);
- }
-
- /*
- * When this is part of group, we need to make sure that
- * any broadcast ires created because of this ipif coming
- * UP gets marked/cleared with IRE_MARK_NORECV appropriately
- * so that we don't receive duplicate broadcast packets.
- */
- if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0)
- ipif_renominate_bcast(ipif);
/* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */
ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt;
@@ -20331,19 +16603,30 @@ ipif_up_done(ipif_t *ipif)
*/
ill_recover_multicast(ill);
}
- /* Join the allhosts multicast address */
- ipif_multicast_up(ipif);
- if (!loopback) {
+ if (ill->ill_ipif_up_count == 1) {
+ /*
+ * Since the interface is now up, it may now be active.
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_ill_refresh_active(ill);
+
/*
- * See whether anybody else would benefit from the
- * new ipif that we added. We call this always rather
- * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST
- * ipif is for the benefit of illgrp_insert (done above)
- * which does not do source address selection as it does
- * not want to re-create interface routes that we are
- * having reference to it here.
+ * If this is an IPMP interface, we may now be able to
+ * establish ARP entries.
*/
+ if (IS_IPMP(ill))
+ ipmp_illgrp_refresh_arpent(ill->ill_grp);
+ }
+
+ /* Join the allhosts multicast address */
+ ipif_multicast_up(ipif);
+
+ /*
+ * See if anybody else would benefit from our new ipif.
+ */
+ if (!loopback &&
+ !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) {
ill_update_source_selection(ill);
}
@@ -20386,27 +16669,11 @@ ipif_up_done(ipif_t *ipif)
bad:
ip1dbg(("ipif_up_done: FAILED \n"));
- /*
- * We don't have to bother removing from ill groups because
- *
- * 1) For groups with names, we insert only when the first ipif
- * comes up. In that case if it fails, it will not be in any
- * group. So, we need not try to remove for that case.
- *
- * 2) For groups without names, either we tried to insert ipif_ill
- * in a group as singleton or found some other group to become
- * a bigger group. For the former, if it fails we don't have
- * anything to do as ipif_ill is not in the group and for the
- * latter, there are no failures in illgrp_insert/illgrp_delete
- * (ENOMEM can't occur for this. Check ifgrp_insert).
- */
+
while (irep > ire_array) {
irep--;
- if (*irep != NULL) {
+ if (*irep != NULL)
ire_delete(*irep);
- if (ire_added)
- ire_refrele(*irep);
- }
}
(void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
@@ -20417,7 +16684,7 @@ bad:
if (src_ipif_held)
ipif_refrele(src_ipif);
- ipif_arp_down(ipif);
+ ipif_resolver_down(ipif);
return (err);
}
@@ -20493,119 +16760,6 @@ ill_arp_on(ill_t *ill)
}
/*
- * Called after either deleting ill from the group or when setting
- * FAILED or STANDBY on the interface.
- */
-static void
-illgrp_reset_schednext(ill_t *ill)
-{
- ill_group_t *illgrp;
- ill_t *save_ill;
-
- ASSERT(IAM_WRITER_ILL(ill));
- /*
- * When called from illgrp_delete, ill_group will be non-NULL.
- * But when called from ip_sioctl_flags, it could be NULL if
- * somebody is setting FAILED/INACTIVE on some interface which
- * is not part of a group.
- */
- illgrp = ill->ill_group;
- if (illgrp == NULL)
- return;
- if (illgrp->illgrp_ill_schednext != ill)
- return;
-
- illgrp->illgrp_ill_schednext = NULL;
- save_ill = ill;
- /*
- * Choose a good ill to be the next one for
- * outbound traffic. As the flags FAILED/STANDBY is
- * not yet marked when called from ip_sioctl_flags,
- * we check for ill separately.
- */
- for (ill = illgrp->illgrp_ill; ill != NULL;
- ill = ill->ill_group_next) {
- if ((ill != save_ill) &&
- !(ill->ill_phyint->phyint_flags &
- (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) {
- illgrp->illgrp_ill_schednext = ill;
- return;
- }
- }
-}
-
-/*
- * Given an ill, find the next ill in the group to be scheduled.
- * (This should be called by ip_newroute() before ire_create().)
- * The passed in ill may be pulled out of the group, after we have picked
- * up a different outgoing ill from the same group. However ire add will
- * atomically check this.
- */
-ill_t *
-illgrp_scheduler(ill_t *ill)
-{
- ill_t *retill;
- ill_group_t *illgrp;
- int illcnt;
- int i;
- uint64_t flags;
- ip_stack_t *ipst = ill->ill_ipst;
-
- /*
- * We don't use a lock to check for the ill_group. If this ill
- * is currently being inserted we may end up just returning this
- * ill itself. That is ok.
- */
- if (ill->ill_group == NULL) {
- ill_refhold(ill);
- return (ill);
- }
-
- /*
- * Grab the ill_g_lock as reader to make sure we are dealing with
- * a set of stable ills. No ill can be added or deleted or change
- * group while we hold the reader lock.
- */
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- if ((illgrp = ill->ill_group) == NULL) {
- rw_exit(&ipst->ips_ill_g_lock);
- ill_refhold(ill);
- return (ill);
- }
-
- illcnt = illgrp->illgrp_ill_count;
- mutex_enter(&illgrp->illgrp_lock);
- retill = illgrp->illgrp_ill_schednext;
-
- if (retill == NULL)
- retill = illgrp->illgrp_ill;
-
- /*
- * We do a circular search beginning at illgrp_ill_schednext
- * or illgrp_ill. We don't check the flags against the ill lock
- * since it can change anytime. The ire creation will be atomic
- * and will fail if the ill is FAILED or OFFLINE.
- */
- for (i = 0; i < illcnt; i++) {
- flags = retill->ill_phyint->phyint_flags;
-
- if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
- ILL_CAN_LOOKUP(retill)) {
- illgrp->illgrp_ill_schednext = retill->ill_group_next;
- ill_refhold(retill);
- break;
- }
- retill = retill->ill_group_next;
- if (retill == NULL)
- retill = illgrp->illgrp_ill;
- }
- mutex_exit(&illgrp->illgrp_lock);
- rw_exit(&ipst->ips_ill_g_lock);
-
- return (i == illcnt ? NULL : retill);
-}
-
-/*
 * Checks for availability of a usable source address (if there is one) when the
* destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note
* this selection is done regardless of the destination.
@@ -20654,11 +16808,26 @@ ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid)
}
/*
- * Determine the best source address given a destination address and an ill.
- * Prefers non-deprecated over deprecated but will return a deprecated
- * address if there is no other choice. If there is a usable source address
- * on the interface pointed to by ill_usesrc_ifindex then that is given
- * first preference.
+ * IP source address type, sorted from worst to best. For a given type,
+ * always prefer IP addresses on the same subnet. All-zones addresses are
+ * suboptimal because they pose problems with unlabeled destinations.
+ */
+typedef enum {
+ IPIF_NONE,
+ IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */
+ IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */
+ IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */
+ IPIF_SAMENET_ALLZONES, /* allzones and same subnet */
+ IPIF_DIFFNET, /* normal and different subnet */
+ IPIF_SAMENET /* normal and same subnet */
+} ipif_type_t;
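/*
 * [Editorial sketch -- illustrative aside, not part of this patch.]
 * Because the enum above encodes the whole preference policy as a total
 * order, the selection loop in ipif_select_source() below only needs a
 * single ">" comparison per candidate.  A self-contained miniature of the
 * same idea (candidate_t and the two example_* helpers are made up):
 */
typedef struct candidate {
	int	c_deprecated;
	int	c_allzones;
	int	c_samenet;
} candidate_t;

static ipif_type_t
example_rate(const candidate_t *c)
{
	if (c->c_deprecated)
		return (c->c_samenet ? IPIF_SAMENET_DEPRECATED :
		    IPIF_DIFFNET_DEPRECATED);
	if (c->c_allzones)
		return (c->c_samenet ? IPIF_SAMENET_ALLZONES :
		    IPIF_DIFFNET_ALLZONES);
	return (c->c_samenet ? IPIF_SAMENET : IPIF_DIFFNET);
}

static const candidate_t *
example_pick_best(const candidate_t *cand, int ncand)
{
	const candidate_t *best = NULL;
	ipif_type_t type, best_type = IPIF_NONE;
	int i;

	for (i = 0; i < ncand; i++) {
		if ((type = example_rate(&cand[i])) > best_type) {
			best_type = type;
			best = &cand[i];
			if (best_type == IPIF_SAMENET)
				break;		/* can't get better */
		}
	}
	return (best);
}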
+
+/*
+ * Pick the optimal ipif on `ill' for sending to destination `dst' from zone
+ * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t
+ * enumeration, and return the highest-rated ipif. If there's a tie, we pick
+ * the first one, unless IPMP is used in which case we round-robin among them;
+ * see below for more.
*
* Returns NULL if there is no suitable source address for the ill.
* This only occurs when there is no valid source address for the ill.
@@ -20666,17 +16835,13 @@ ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid)
ipif_t *
ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
{
- ipif_t *ipif;
- ipif_t *ipif_dep = NULL; /* Fallback to deprecated */
- ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE];
- int index = 0;
- boolean_t wrapped = B_FALSE;
- boolean_t same_subnet_only = B_FALSE;
- boolean_t ipif_same_found, ipif_other_found;
- boolean_t specific_found;
- ill_t *till, *usill = NULL;
+ ill_t *usill = NULL;
+ ill_t *ipmp_ill = NULL;
+ ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif;
+ ipif_type_t type, best_type;
tsol_tpc_t *src_rhtp, *dst_rhtp;
- ip_stack_t *ipst = ill->ill_ipst;
+ ip_stack_t *ipst = ill->ill_ipst;
+ boolean_t samenet;
if (ill->ill_usesrc_ifindex != 0) {
usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex,
@@ -20688,6 +16853,17 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
}
/*
+ * Test addresses should never be used for source address selection,
+ * so if we were passed an underlying ill, switch to its IPMP meta-interface.
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL)
+ ill = ipmp_ill; /* Select source from IPMP ill */
+ else
+ return (NULL);
+ }
+
+ /*
* If we're dealing with an unlabeled destination on a labeled system,
* make sure that we ignore source addresses that are incompatible with
* the destination's default label. That destination's default label
@@ -20705,7 +16881,7 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
}
/*
- * Holds the ill_g_lock as reader. This makes sure that no ipif/ill
+ * Hold the ill_g_lock as reader. This makes sure that no ipif/ill
* can be deleted. But an ipif/ill can get CONDEMNED any time.
* After selecting the right ipif, under ill_lock make sure ipif is
* not condemned, and increment refcnt. If ipif is CONDEMNED,
@@ -20713,190 +16889,117 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
* but not under a lock.
*/
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
-
retry:
- till = ill;
- ipif_arr[0] = NULL;
+ /*
+ * For source address selection, we treat the ipif list as circular
+ * and continue until we get back to where we started. This allows
+ * IPMP to vary source address selection (which improves inbound load
+ * spreading) by caching its last ending point and starting from
+ * there. NOTE: we don't have to worry about ill_src_ipif changing
+ * ills since that can't happen on the IPMP ill.
+ */
+ start_ipif = ill->ill_ipif;
+ if (IS_IPMP(ill) && ill->ill_src_ipif != NULL)
+ start_ipif = ill->ill_src_ipif;
- if (till->ill_group != NULL)
- till = till->ill_group->illgrp_ill;
+ ipif = start_ipif;
+ best_ipif = NULL;
+ best_type = IPIF_NONE;
+ do {
+ if ((next_ipif = ipif->ipif_next) == NULL)
+ next_ipif = ill->ill_ipif;
- /*
- * Choose one good source address from each ill across the group.
- * If possible choose a source address in the same subnet as
- * the destination address.
- *
- * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE
- * This is okay because of the following.
- *
- * If PHYI_FAILED is set and we still have non-deprecated
- * addresses, it means the addresses have not yet been
- * failed over to a different interface. We potentially
- * select them to create IRE_CACHES, which will be later
- * flushed when the addresses move over.
- *
- * If PHYI_INACTIVE is set and we still have non-deprecated
- * addresses, it means either the user has configured them
- * or PHYI_INACTIVE has not been cleared after the addresses
- * been moved over. For the former, in.mpathd does a failover
- * when the interface becomes INACTIVE and hence we should
- * not find them. Once INACTIVE is set, we don't allow them
- * to create logical interfaces anymore. For the latter, a
- * flush will happen when INACTIVE is cleared which will
- * flush the IRE_CACHES.
- *
- * If PHYI_OFFLINE is set, all the addresses will be failed
- * over soon. We potentially select them to create IRE_CACHEs,
- * which will be later flushed when the addresses move over.
- *
- * NOTE : As ipif_select_source is called to borrow source address
- * for an ipif that is part of a group, source address selection
- * will be re-done whenever the group changes i.e either an
- * insertion/deletion in the group.
- *
- * Fill ipif_arr[] with source addresses, using these rules:
- *
- * 1. At most one source address from a given ill ends up
- * in ipif_arr[] -- that is, at most one of the ipif's
- * associated with a given ill ends up in ipif_arr[].
- *
- * 2. If there is at least one non-deprecated ipif in the
- * IPMP group with a source address on the same subnet as
- * our destination, then fill ipif_arr[] only with
- * source addresses on the same subnet as our destination.
- * Note that because of (1), only the first
- * non-deprecated ipif found with a source address
- * matching the destination ends up in ipif_arr[].
- *
- * 3. Otherwise, fill ipif_arr[] with non-deprecated source
- * addresses not in the same subnet as our destination.
- * Again, because of (1), only the first off-subnet source
- * address will be chosen.
- *
- * 4. If there are no non-deprecated ipifs, then just use
- * the source address associated with the last deprecated
- * one we find that happens to be on the same subnet,
- * otherwise the first one not in the same subnet.
- */
- specific_found = B_FALSE;
- for (; till != NULL; till = till->ill_group_next) {
- ipif_same_found = B_FALSE;
- ipif_other_found = B_FALSE;
- for (ipif = till->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (!IPIF_CAN_LOOKUP(ipif))
- continue;
- /* Always skip NOLOCAL and ANYCAST interfaces */
- if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
- continue;
- if (!(ipif->ipif_flags & IPIF_UP) ||
- !ipif->ipif_addr_ready)
- continue;
- if (ipif->ipif_zoneid != zoneid &&
- ipif->ipif_zoneid != ALL_ZONES)
- continue;
- /*
- * Interfaces with 0.0.0.0 address are allowed to be UP,
- * but are not valid as source addresses.
- */
- if (ipif->ipif_lcl_addr == INADDR_ANY)
- continue;
+ if (!IPIF_CAN_LOOKUP(ipif))
+ continue;
+ /* Always skip NOLOCAL and ANYCAST interfaces */
+ if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
+ continue;
+ if (!(ipif->ipif_flags & IPIF_UP) || !ipif->ipif_addr_ready)
+ continue;
+ if (ipif->ipif_zoneid != zoneid &&
+ ipif->ipif_zoneid != ALL_ZONES)
+ continue;
- /*
- * Check compatibility of local address for
- * destination's default label if we're on a labeled
- * system. Incompatible addresses can't be used at
- * all.
- */
- if (dst_rhtp != NULL) {
- boolean_t incompat;
+ /*
+ * Interfaces with 0.0.0.0 address are allowed to be UP, but
+ * are not valid as source addresses.
+ */
+ if (ipif->ipif_lcl_addr == INADDR_ANY)
+ continue;
- src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
- IPV4_VERSION, B_FALSE);
- if (src_rhtp == NULL)
- continue;
- incompat =
- src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
- src_rhtp->tpc_tp.tp_doi !=
- dst_rhtp->tpc_tp.tp_doi ||
- (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
- &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
- !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
- src_rhtp->tpc_tp.tp_sl_set_cipso));
- TPC_RELE(src_rhtp);
- if (incompat)
- continue;
- }
+ /*
+ * Check compatibility of local address for destination's
+ * default label if we're on a labeled system. Incompatible
+ * addresses can't be used at all.
+ */
+ if (dst_rhtp != NULL) {
+ boolean_t incompat;
- /*
- * We prefer not to use all all-zones addresses, if we
- * can avoid it, as they pose problems with unlabeled
- * destinations.
- */
- if (ipif->ipif_zoneid != ALL_ZONES) {
- if (!specific_found &&
- (!same_subnet_only ||
- (ipif->ipif_net_mask & dst) ==
- ipif->ipif_subnet)) {
- index = 0;
- specific_found = B_TRUE;
- ipif_other_found = B_FALSE;
- }
- } else {
- if (specific_found)
- continue;
- }
- if (ipif->ipif_flags & IPIF_DEPRECATED) {
- if (ipif_dep == NULL ||
- (ipif->ipif_net_mask & dst) ==
- ipif->ipif_subnet)
- ipif_dep = ipif;
+ src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
+ IPV4_VERSION, B_FALSE);
+ if (src_rhtp == NULL)
+ continue;
+ incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
+ src_rhtp->tpc_tp.tp_doi !=
+ dst_rhtp->tpc_tp.tp_doi ||
+ (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
+ &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
+ !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
+ src_rhtp->tpc_tp.tp_sl_set_cipso));
+ TPC_RELE(src_rhtp);
+ if (incompat)
continue;
- }
- if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) {
- /* found a source address in the same subnet */
- if (!same_subnet_only) {
- same_subnet_only = B_TRUE;
- index = 0;
- }
- ipif_same_found = B_TRUE;
- } else {
- if (same_subnet_only || ipif_other_found)
- continue;
- ipif_other_found = B_TRUE;
- }
- ipif_arr[index++] = ipif;
- if (index == MAX_IPIF_SELECT_SOURCE) {
- wrapped = B_TRUE;
- index = 0;
- }
- if (ipif_same_found)
- break;
}
- }
- if (ipif_arr[0] == NULL) {
- ipif = ipif_dep;
- } else {
- if (wrapped)
- index = MAX_IPIF_SELECT_SOURCE;
- ipif = ipif_arr[ipif_rand(ipst) % index];
- ASSERT(ipif != NULL);
- }
+ samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet);
- if (ipif != NULL) {
+ if (ipif->ipif_flags & IPIF_DEPRECATED) {
+ type = samenet ? IPIF_SAMENET_DEPRECATED :
+ IPIF_DIFFNET_DEPRECATED;
+ } else if (ipif->ipif_zoneid == ALL_ZONES) {
+ type = samenet ? IPIF_SAMENET_ALLZONES :
+ IPIF_DIFFNET_ALLZONES;
+ } else {
+ type = samenet ? IPIF_SAMENET : IPIF_DIFFNET;
+ }
+
+ if (type > best_type) {
+ best_type = type;
+ best_ipif = ipif;
+ if (best_type == IPIF_SAMENET)
+ break; /* can't get better */
+ }
+ } while ((ipif = next_ipif) != start_ipif);
+
+ if ((ipif = best_ipif) != NULL) {
mutex_enter(&ipif->ipif_ill->ill_lock);
if (!IPIF_CAN_LOOKUP(ipif)) {
mutex_exit(&ipif->ipif_ill->ill_lock);
goto retry;
}
ipif_refhold_locked(ipif);
+
+ /*
+ * For IPMP, update the source ipif rotor to the next ipif,
+ * provided we can look it up. (We must not use it if it's
+ * IPIF_CONDEMNED since we may have grabbed ill_g_lock after
+ * ipif_free() checked ill_src_ipif.)
+ */
+ if (IS_IPMP(ill) && ipif != NULL) {
+ next_ipif = ipif->ipif_next;
+ if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif))
+ ill->ill_src_ipif = next_ipif;
+ else
+ ill->ill_src_ipif = NULL;
+ }
mutex_exit(&ipif->ipif_ill->ill_lock);
}
rw_exit(&ipst->ips_ill_g_lock);
if (usill != NULL)
ill_refrele(usill);
+ if (ipmp_ill != NULL)
+ ill_refrele(ipmp_ill);
if (dst_rhtp != NULL)
TPC_RELE(dst_rhtp);
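/*
 * [Editorial sketch -- illustrative aside, not part of this patch.]
 * The ill_src_ipif "rotor" updated above makes each successful selection
 * start the next circular walk one entry further along, so equally-rated
 * IPMP data addresses are handed out in turn (spreading inbound load).
 * A self-contained miniature of the rotor idea (names are made up):
 */
static unsigned int example_rotor;	/* persists across calls */

static const char *
example_pick_round_robin(const char *addrs[], unsigned int naddrs)
{
	unsigned int i, slot;

	for (i = 0; i < naddrs; i++) {
		slot = (example_rotor + i) % naddrs;
		if (addrs[slot] != NULL) {		/* usable candidate */
			example_rotor = (slot + 1) % naddrs;
			return (addrs[slot]);
		}
	}
	return (NULL);
}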
@@ -20929,8 +17032,7 @@ retry:
* ipif_update_other_ipifs calls us.
*
* If old_ipif is NULL, just redo the source address selection
- * if needed. This happens when illgrp_insert or ipif_up_done
- * calls us.
+ * if needed. This happens when ipif_up_done calls us.
*/
static void
ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif)
@@ -21064,49 +17166,31 @@ ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif)
/*
* This old_ipif is going away.
*
- * Determine if any other ipif's is using our address as
+ * Determine if any other ipifs are using our address as
* ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or
* IPIF_DEPRECATED).
* Find the IRE_INTERFACE for such ipifs and recreate them
 * to use a different source address following the rules in
* ipif_up_done.
- *
- * This function takes an illgrp as an argument so that illgrp_delete
- * can call this to update source address even after deleting the
- * old_ipif->ipif_ill from the ill group.
*/
static void
-ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp)
+ipif_update_other_ipifs(ipif_t *old_ipif)
{
- ipif_t *ipif;
- ill_t *ill;
+ ipif_t *ipif;
+ ill_t *ill;
char buf[INET6_ADDRSTRLEN];
ASSERT(IAM_WRITER_IPIF(old_ipif));
- ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif));
ill = old_ipif->ipif_ill;
- ip1dbg(("ipif_update_other_ipifs(%s, %s)\n",
- ill->ill_name,
- inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr,
- buf, sizeof (buf))));
- /*
- * If this part of a group, look at all ills as ipif_select_source
- * borrows source address across all the ills in the group.
- */
- if (illgrp != NULL)
- ill = illgrp->illgrp_ill;
-
- for (; ill != NULL; ill = ill->ill_group_next) {
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
-
- if (ipif == old_ipif)
- continue;
+ ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", ill->ill_name,
+ inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, buf, sizeof (buf))));
- ipif_recreate_interface_routes(old_ipif, ipif);
- }
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
+ if (ipif == old_ipif)
+ continue;
+ ipif_recreate_interface_routes(old_ipif, ipif);
}
}
@@ -21117,8 +17201,7 @@ if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
{
/*
* ill_phyint_reinit merged the v4 and v6 into a single
- * ipsq. Could also have become part of a ipmp group in the
- * process, and we might not have been able to complete the
+ * ipsq. We might not have been able to complete the
* operation in ipif_set_values, if we could not become
* exclusive. If so restart it here.
*/
@@ -21171,6 +17254,48 @@ ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
}
/*
+ * Refresh all IRE_BROADCAST entries associated with `ill' to ensure the
+ * minimum (but complete) set exists. This is necessary when adding or
+ * removing an interface to/from an IPMP group, since interfaces in an
+ * IPMP group use the IRE_BROADCAST entries for the IPMP group (whenever
+ * its test address subnets overlap with IPMP data addresses). It's also
+ * used to refresh the IRE_BROADCAST entries associated with the IPMP
+ * interface when the nominated broadcast interface changes.
+ */
+void
+ill_refresh_bcast(ill_t *ill)
+{
+ ire_t *ire_array[12]; /* max ipif_create_bcast_ires() can create */
+ ire_t **irep;
+ ipif_t *ipif;
+
+ ASSERT(!ill->ill_isv6);
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ /*
+ * Remove any old broadcast IREs.
+ */
+ ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_BROADCAST,
+ ill_broadcast_delete, ill, ill);
+
+ /*
+ * Create new ones for any ipifs that are up and broadcast-capable.
+ */
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
+ if ((ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST)) !=
+ (IPIF_UP|IPIF_BROADCAST))
+ continue;
+
+ irep = ipif_create_bcast_ires(ipif, ire_array);
+ while (irep-- > ire_array) {
+ (void) ire_add(irep, NULL, NULL, NULL, B_FALSE);
+ if (*irep != NULL)
+ ire_refrele(*irep);
+ }
+ }
+}
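/*
 * [Editorial sketch -- illustrative aside, not part of this patch.]
 * ill_refresh_bcast() expects its caller to be exclusive on the ill and to
 * pass an IPv4 ill; a hypothetical caller reacting to a change in the
 * nominated broadcast interface might look like:
 */
static void
example_on_nomination_change(ill_t *ipmp_ill)
{
	ASSERT(IAM_WRITER_ILL(ipmp_ill));
	ASSERT(!ipmp_ill->ill_isv6);

	/* Rebuild the minimal-but-complete set of IRE_BROADCASTs. */
	ill_refresh_bcast(ipmp_ill);
}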
+
+/*
* Create any IRE_BROADCAST entries for `ipif', and store those entries in
* `irep'. Returns a pointer to the next free `irep' entry (just like
* ire_check_and_create_bcast()).
@@ -21433,10 +17558,33 @@ ipif_check_bcast_ires(ipif_t *test_ipif)
/*
* Walk through all the ipifs that will be affected by the dying IREs,
- * and recreate the IREs as necessary.
+ * and recreate the IREs as necessary. Note that all interfaces in an
+ * IPMP illgrp share the same broadcast IREs, and thus the entire
+ * illgrp must be walked, starting with the IPMP meta-interface (so
+ * that broadcast IREs end up on it whenever possible).
*/
+ if (IS_UNDER_IPMP(ill))
+ ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);
+
irep = ill_create_bcast(ill, test_ipif, bireinfo, irep);
+ if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
+ ipmp_illgrp_t *illg = ill->ill_grp;
+
+ ill = list_head(&illg->ig_if);
+ for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
+ for (i = 0; i < BCAST_COUNT; i++) {
+ if (bireinfo[i].bi_willdie &&
+ !bireinfo[i].bi_haverep)
+ break;
+ }
+ if (i == BCAST_COUNT)
+ break;
+
+ irep = ill_create_bcast(ill, test_ipif, bireinfo, irep);
+ }
+ }
+
/*
* Scan through the set of broadcast IREs and see if there are any
* that we need to replace that have not yet been replaced. If so,
@@ -21528,7 +17676,7 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
/*
* If there's another ill already with the requested name, ensure
- * that it's of the same type. Otherwise, ill_phyint_reinit() will
+ * that it's of the same type. Otherwise, ill_phyint_reinit() will
* fuse together two unrelated ills, which will cause chaos.
*/
ipst = ill->ill_ipst;
@@ -21620,8 +17768,7 @@ ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
{
/*
* ill_phyint_reinit merged the v4 and v6 into a single
- * ipsq. Could also have become part of a ipmp group in the
- * process, and we might not have been able to complete the
+ * ipsq. We might not have been able to complete the
* slifname in ipif_set_values, if we could not become
* exclusive. If so restart it here
*/
@@ -21665,85 +17812,6 @@ ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
return (ipif);
}
-typedef struct conn_change_s {
- uint_t cc_old_ifindex;
- uint_t cc_new_ifindex;
-} conn_change_t;
-
-/*
- * ipcl_walk function for changing interface index.
- */
-static void
-conn_change_ifindex(conn_t *connp, caddr_t arg)
-{
- conn_change_t *connc;
- uint_t old_ifindex;
- uint_t new_ifindex;
- int i;
- ilg_t *ilg;
-
- connc = (conn_change_t *)arg;
- old_ifindex = connc->cc_old_ifindex;
- new_ifindex = connc->cc_new_ifindex;
-
- if (connp->conn_orig_bound_ifindex == old_ifindex)
- connp->conn_orig_bound_ifindex = new_ifindex;
-
- if (connp->conn_orig_multicast_ifindex == old_ifindex)
- connp->conn_orig_multicast_ifindex = new_ifindex;
-
- for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
- ilg = &connp->conn_ilg[i];
- if (ilg->ilg_orig_ifindex == old_ifindex)
- ilg->ilg_orig_ifindex = new_ifindex;
- }
-}
-
-/*
- * Walk all the ipifs and ilms on this ill and change the orig_ifindex
- * to new_index if it matches the old_index.
- *
- * Failovers typically happen within a group of ills. But somebody
- * can remove an ill from the group after a failover happened. If
- * we are setting the ifindex after this, we potentially need to
- * look at all the ills rather than just the ones in the group.
- * We cut down the work by looking at matching ill_net_types
- * and ill_types as we could not possibly grouped them together.
- */
-static void
-ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc)
-{
- ill_t *ill;
- ipif_t *ipif;
- uint_t old_ifindex;
- uint_t new_ifindex;
- ilm_t *ilm;
- ill_walk_context_t ctx;
- ip_stack_t *ipst = ill_orig->ill_ipst;
-
- old_ifindex = connc->cc_old_ifindex;
- new_ifindex = connc->cc_new_ifindex;
-
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- ill = ILL_START_WALK_ALL(&ctx, ipst);
- for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if ((ill_orig->ill_net_type != ill->ill_net_type) ||
- (ill_orig->ill_type != ill->ill_type)) {
- continue;
- }
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (ipif->ipif_orig_ifindex == old_ifindex)
- ipif->ipif_orig_ifindex = new_ifindex;
- }
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
- if (ilm->ilm_orig_ifindex == old_ifindex)
- ilm->ilm_orig_ifindex = new_ifindex;
- }
- }
- rw_exit(&ipst->ips_ill_g_lock);
-}
-
/*
* We first need to ensure that the new index is unique, and
* then carry the change across both v4 and v6 ill representation
@@ -21755,13 +17823,10 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *ifreq)
{
ill_t *ill;
- ill_t *ill_other;
phyint_t *phyi;
- int old_index;
- conn_change_t connc;
struct ifreq *ifr = (struct ifreq *)ifreq;
struct lifreq *lifr = (struct lifreq *)ifreq;
- uint_t index;
+ uint_t old_index, index;
ill_t *ill_v4;
ill_t *ill_v6;
ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
@@ -21773,31 +17838,15 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
/*
* Only allow on physical interface. Also, index zero is illegal.
- *
- * Need to check for PHYI_FAILED and PHYI_INACTIVE
- *
- * 1) If PHYI_FAILED is set, a failover could have happened which
- * implies a possible failback might have to happen. As failback
- * depends on the old index, we should fail setting the index.
- *
- * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that
- * any addresses or multicast memberships are failed over to
- * a non-STANDBY interface. As failback depends on the old
- * index, we should fail setting the index for this case also.
- *
- * 3) If PHYI_OFFLINE is set, a possible failover has happened.
- * Be consistent with PHYI_FAILED and fail the ioctl.
*/
ill = ipif->ipif_ill;
phyi = ill->ill_phyint;
- if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) ||
- ipif->ipif_id != 0 || index == 0) {
+ if (ipif->ipif_id != 0 || index == 0) {
return (EINVAL);
}
- old_index = phyi->phyint_ifindex;
/* If the index is not changing, no work to do */
- if (old_index == index)
+ if (phyi->phyint_ifindex == index)
return (0);
/*
@@ -21816,31 +17865,17 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
return (EBUSY);
}
- /*
- * The new index is unused. Set it in the phyint.
- * Locate the other ill so that we can send a routing
- * sockets message.
- */
- if (ill->ill_isv6) {
- ill_other = phyi->phyint_illv4;
- } else {
- ill_other = phyi->phyint_illv6;
- }
-
+ /* The new index is unused. Set it in the phyint. */
+ old_index = phyi->phyint_ifindex;
phyi->phyint_ifindex = index;
/* Update SCTP's ILL list */
sctp_ill_reindex(ill, old_index);
- connc.cc_old_ifindex = old_index;
- connc.cc_new_ifindex = index;
- ip_change_ifindex(ill, &connc);
- ipcl_walk(conn_change_ifindex, (caddr_t)&connc, ipst);
-
/* Send the routing sockets message */
- ip_rts_ifmsg(ipif);
- if (ill_other != NULL)
- ip_rts_ifmsg(ill_other->ill_ipif);
+ ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
+ if (ILL_OTHER(ill))
+ ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT);
return (0);
}
@@ -22038,6 +18073,45 @@ ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
B_TRUE));
}
+/*
+ * Return the number of addresses on `ill' with one or more of the values
+ * in `set' set and all of the values in `clear' clear.
+ */
+static uint_t
+ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear)
+{
+ ipif_t *ipif;
+ uint_t cnt = 0;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
+ if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear))
+ cnt++;
+
+ return (cnt);
+}
+
+/*
+ * Return the number of migratable addresses on `ill' that are under
+ * application control.
+ */
+uint_t
+ill_appaddr_cnt(const ill_t *ill)
+{
+ return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF,
+ IPIF_NOFAILOVER));
+}
+
+/*
+ * Return the number of point-to-point addresses on `ill'.
+ */
+uint_t
+ill_ptpaddr_cnt(const ill_t *ill)
+{
+ return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0));
+}
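/*
 * [Editorial sketch -- illustrative aside, not part of this patch.]
 * These counters let a caller tell whether `ill' still has addresses that
 * cannot be migrated transparently (application-controlled or
 * point-to-point); a hypothetical check that refuses to proceed while any
 * remain might read:
 */
static int
example_can_migrate_addrs(const ill_t *ill)
{
	if (ill_appaddr_cnt(ill) != 0 || ill_ptpaddr_cnt(ill) != 0)
		return (EBUSY);		/* warn or bail, as appropriate */
	return (0);
}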
+
/* ARGSUSED */
int
ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
@@ -22158,7 +18232,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
int err = 0, ret;
uint_t ifindex;
- phyint_t *us_phyint, *us_cli_phyint;
ipsq_t *ipsq = NULL;
ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
@@ -22167,19 +18240,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ASSERT(CONN_Q(q));
isv6 = (Q_TO_CONN(q))->conn_af_isv6;
- us_cli_phyint = usesrc_cli_ill->ill_phyint;
-
- ASSERT(us_cli_phyint != NULL);
-
- /*
- * If the client ILL is being used for IPMP, abort.
- * Note, this can be done before ipsq_try_enter since we are already
- * exclusive on this ILL
- */
- if ((us_cli_phyint->phyint_groupname != NULL) ||
- (us_cli_phyint->phyint_flags & PHYI_STANDBY)) {
- return (EINVAL);
- }
ifindex = lifr->lifr_index;
if (ifindex == 0) {
@@ -22198,15 +18258,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
return (err);
}
- /*
- * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP
- * group nor can either of the interfaces be used for standy. So
- * to guarantee mutual exclusion with ip_sioctl_flags (which sets
- * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname)
- * we need to be exclusive on the ipsq belonging to the usesrc_ill.
- * We are already exlusive on this ipsq i.e ipsq corresponding to
- * the usesrc_cli_ill
- */
ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
NEW_OP, B_TRUE);
if (ipsq == NULL) {
@@ -22215,11 +18266,19 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
goto done;
}
- /* Check if the usesrc_ill is used for IPMP */
- us_phyint = usesrc_ill->ill_phyint;
- if ((us_phyint->phyint_groupname != NULL) ||
- (us_phyint->phyint_flags & PHYI_STANDBY)) {
- err = EINVAL;
+ /* USESRC isn't currently supported with IPMP */
+ if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) {
+ err = ENOTSUP;
+ goto done;
+ }
+
+ /*
+ * USESRC isn't compatible with the STANDBY flag. (STANDBY is only
+ * used by IPMP underlying interfaces, but someone might think it's
+ * more general and try to use it independently with VNI.)
+ */
+ if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
+ err = ENOTSUP;
goto done;
}
@@ -22372,79 +18431,45 @@ ill_phyint_compare_name(const void *name_ptr, const void *phyip)
return (-1);
return (0);
}
+
/*
- * This function is called from ill_delete when the ill is being
- * unplumbed. We remove the reference from the phyint and we also
- * free the phyint when there are no more references to it.
+ * This function is called on the unplumb path via ill_glist_delete() when
+ * there are no ills left on the phyint and thus the phyint can be freed.
*/
static void
-ill_phyint_free(ill_t *ill)
+phyint_free(phyint_t *phyi)
{
- phyint_t *phyi;
- phyint_t *next_phyint;
- ipsq_t *cur_ipsq;
- ip_stack_t *ipst = ill->ill_ipst;
+ ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
- ASSERT(ill->ill_phyint != NULL);
+ ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL);
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
- phyi = ill->ill_phyint;
- ill->ill_phyint = NULL;
/*
- * ill_init allocates a phyint always to store the copy
- * of flags relevant to phyint. At that point in time, we could
- * not assign the name and hence phyint_illv4/v6 could not be
- * initialized. Later in ipif_set_values, we assign the name to
- * the ill, at which point in time we assign phyint_illv4/v6.
- * Thus we don't rely on phyint_illv6 to be initialized always.
+ * If this phyint was an IPMP meta-interface, blow away the group.
+ * This is safe to do because all of the illgrps have already been
+ * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us.
+ * If we're cleaning up as a result of failed initialization,
+ * phyint_grp may be NULL.
*/
- if (ill->ill_flags & ILLF_IPV6) {
- phyi->phyint_illv6 = NULL;
- } else {
- phyi->phyint_illv4 = NULL;
+ if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) {
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ ipmp_grp_destroy(phyi->phyint_grp);
+ phyi->phyint_grp = NULL;
+ rw_exit(&ipst->ips_ipmp_lock);
}
- /*
- * ipif_down removes it from the group when the last ipif goes
- * down.
- */
- ASSERT(ill->ill_group == NULL);
-
- if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL)
- return;
/*
- * Make sure this phyint was put in the list.
+ * If this interface was under IPMP, take it out of the group.
*/
- if (phyi->phyint_ifindex > 0) {
- avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
- phyi);
- avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
- phyi);
- }
+ if (phyi->phyint_grp != NULL)
+ ipmp_phyint_leave_grp(phyi);
+
/*
- * remove phyint from the ipsq list.
+ * Delete the phyint and disassociate its ipsq. The ipsq itself
+ * will be freed in ipsq_exit().
*/
- cur_ipsq = phyi->phyint_ipsq;
- if (phyi == cur_ipsq->ipsq_phyint_list) {
- cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next;
- } else {
- next_phyint = cur_ipsq->ipsq_phyint_list;
- while (next_phyint != NULL) {
- if (next_phyint->phyint_ipsq_next == phyi) {
- next_phyint->phyint_ipsq_next =
- phyi->phyint_ipsq_next;
- break;
- }
- next_phyint = next_phyint->phyint_ipsq_next;
- }
- ASSERT(next_phyint != NULL);
- }
- IPSQ_DEC_REF(cur_ipsq, ipst);
+ phyi->phyint_ipsq->ipsq_phyint = NULL;
+ phyi->phyint_name[0] = '\0';
- if (phyi->phyint_groupname_len != 0) {
- ASSERT(phyi->phyint_groupname != NULL);
- mi_free(phyi->phyint_groupname);
- }
mi_free(phyi);
}
@@ -22464,7 +18489,6 @@ ill_phyint_reinit(ill_t *ill)
phyint_t *phyi;
avl_index_t where = 0;
ill_t *ill_other = NULL;
- ipsq_t *ipsq;
ip_stack_t *ipst = ill->ill_ipst;
ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
@@ -22476,6 +18500,11 @@ ill_phyint_reinit(ill_t *ill)
phyi_old->phyint_illv4 == NULL));
ASSERT(phyi_old->phyint_ifindex == 0);
+ /*
+ * Now that our ill has a name, set it in the phyint.
+ */
+ (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ);
+
phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
ill->ill_name, &where);
@@ -22497,8 +18526,7 @@ ill_phyint_reinit(ill_t *ill)
* we are initializing IPv4.
*/
if (phyi != NULL) {
- ill_other = (isv6) ? phyi->phyint_illv4 :
- phyi->phyint_illv6;
+ ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6;
ASSERT(ill_other->ill_phyint != NULL);
ASSERT((isv6 && !ill_other->ill_isv6) ||
(!isv6 && ill_other->ill_isv6));
@@ -22517,26 +18545,15 @@ ill_phyint_reinit(ill_t *ill)
ASSERT(phyi->phyint_illv4 == NULL);
phyi->phyint_illv4 = ill;
}
- /*
- * This is a new ill, currently undergoing SLIFNAME
- * So we could not have joined an IPMP group until now.
- */
- ASSERT(phyi_old->phyint_ipsq_next == NULL &&
- phyi_old->phyint_groupname == NULL);
/*
- * This phyi_old is going away. Decref ipsq_refs and
- * assert it is zero. The ipsq itself will be freed in
- * ipsq_exit
+ * Delete the old phyint and make its ipsq eligible
+ * to be freed in ipsq_exit().
*/
- ipsq = phyi_old->phyint_ipsq;
- IPSQ_DEC_REF(ipsq, ipst);
- ASSERT(ipsq->ipsq_refs == 0);
- /* Get the singleton phyint out of the ipsq list */
- ASSERT(phyi_old->phyint_ipsq_next == NULL);
- ipsq->ipsq_phyint_list = NULL;
phyi_old->phyint_illv4 = NULL;
phyi_old->phyint_illv6 = NULL;
+ phyi_old->phyint_ipsq->ipsq_phyint = NULL;
+ phyi_old->phyint_name[0] = '\0';
mi_free(phyi_old);
} else {
mutex_enter(&ill->ill_lock);
@@ -22551,9 +18568,6 @@ ill_phyint_reinit(ill_t *ill)
if (!phyint_assign_ifindex(phyi, ipst))
cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");
- /* No IPMP group yet, thus the hook uses the ifindex */
- phyi->phyint_hook_ifindex = phyi->phyint_ifindex;
-
avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
(void *)phyi, where);
@@ -22571,13 +18585,6 @@ ill_phyint_reinit(ill_t *ill)
ill->ill_phyint = phyi;
/*
- * Keep the index on ipif_orig_index to be used by FAILOVER.
- * We do this here as when the first ipif was allocated,
- * ipif_allocate does not know the right interface index.
- */
-
- ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex;
- /*
* Now that the phyint's ifindex has been assigned, complete the
* remaining
*/
@@ -22606,45 +18613,14 @@ ill_phyint_reinit(ill_t *ill)
*/
if (ill->ill_name_length <= 2 ||
ill->ill_name[0] != 'l' || ill->ill_name[1] != 'o') {
- /*
- * Generate nic plumb event for ill_name even if
- * ipmp_hook_emulation is set. That avoids generating events
- * for the ill_names should ipmp_hook_emulation be turned on
- * later.
- */
- ill_nic_event_plumb(ill, B_FALSE);
+ ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name,
+ ill->ill_name_length);
}
RELEASE_ILL_LOCKS(ill, ill_other);
mutex_exit(&phyi->phyint_lock);
}
/*
- * Allocate a NE_PLUMB nic info event and store in the ill.
- * If 'group' is set we do it for the group name, otherwise the ill name.
- * It will be sent when we leave the ipsq.
- */
-void
-ill_nic_event_plumb(ill_t *ill, boolean_t group)
-{
- phyint_t *phyi = ill->ill_phyint;
- char *name;
- int namelen;
-
- ASSERT(MUTEX_HELD(&ill->ill_lock));
-
- if (group) {
- ASSERT(phyi->phyint_groupname_len != 0);
- namelen = phyi->phyint_groupname_len;
- name = phyi->phyint_groupname;
- } else {
- namelen = ill->ill_name_length;
- name = ill->ill_name;
- }
-
- ill_nic_event_dispatch(ill, 0, NE_PLUMB, name, namelen);
-}
-
-/*
* Notify any downstream modules of the name of this interface.
* An M_IOCTL is used even though we don't expect a successful reply.
* Any reply message from the driver (presumably an M_IOCNAK) will
@@ -22686,8 +18662,9 @@ ip_ifname_notify(ill_t *ill, queue_t *q)
static int
ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
{
- int err;
+ int err;
ip_stack_t *ipst = ill->ill_ipst;
+ phyint_t *phyi = ill->ill_phyint;
/* Set the obsolete NDD per-interface forwarding name. */
err = ill_set_ndd_name(ill);
@@ -22696,6 +18673,34 @@ ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
err);
}
+ /*
+ * Now that ill_name is set, the configuration for the IPMP
+ * meta-interface can be performed.
+ */
+ if (IS_IPMP(ill)) {
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ /*
+ * If phyi->phyint_grp is NULL, then this is the first IPMP
+ * meta-interface and we need to create the IPMP group.
+ */
+ if (phyi->phyint_grp == NULL) {
+ /*
+ * If someone has renamed another IPMP group to have
+ * the same name as our interface, bail.
+ */
+ if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) {
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (EEXIST);
+ }
+ phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi);
+ if (phyi->phyint_grp == NULL) {
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (ENOMEM);
+ }
+ }
+ rw_exit(&ipst->ips_ipmp_lock);
+ }
+
/* Tell downstream modules where they are. */
ip_ifname_notify(ill, q);
@@ -22966,10 +18971,10 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
/*
* If ill_phyint_reinit() changed our ipsq, then start on the new ipsq.
*/
- if (ipsq->ipsq_current_ipif == NULL)
+ if (ipsq->ipsq_xop->ipx_current_ipif == NULL)
ipsq_current_start(ipsq, ipif, SIOCSLIFNAME);
else
- ASSERT(ipsq->ipsq_current_ipif == ipif);
+ ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif);
error = ipif_set_values_tail(ill, ipif, mp, q);
ipsq_exit(ipsq);
@@ -22986,18 +18991,8 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
void
ipif_init(ip_stack_t *ipst)
{
- hrtime_t hrt;
int i;
- /*
- * Can't call drv_getparm here as it is too early in the boot.
- * As we use ipif_src_random just for picking a different
- * source address everytime, this need not be really random.
- */
- hrt = gethrtime();
- ipst->ips_ipif_src_random =
- ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff);
-
for (i = 0; i < MAX_G_HEADS; i++) {
ipst->ips_ill_g_heads[i].ill_g_list_head =
(ill_if_t *)&ipst->ips_ill_g_heads[i];
@@ -23023,7 +19018,11 @@ ipif_init(ip_stack_t *ipst)
* match is found to take care of such rare network configurations like -
* le0: 129.146.1.1/16
* le1: 129.146.2.2/24
- * It is used only by SO_DONTROUTE at the moment.
+ *
+ * This is used by SO_DONTROUTE and IP_NEXTHOP. Since neither of those are
+ * supported on underlying interfaces in an IPMP group, underlying interfaces
+ * are ignored when looking up a match. (If we didn't ignore them, we'd
+ * risk using a test address as a source for outgoing traffic.)
*/
ipif_t *
ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
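/*
 * [Editorial note -- illustrative aside, not part of this patch.]
 * Worked example of the longest-mask rule described above: for
 * addr = 129.146.2.5, both interfaces pass the subnet test, but le1's /24
 * mask is longer, so le1 wins:
 *
 *	le0: (129.146.2.5 & 255.255.0.0)   == 129.146.0.0	(its /16 subnet)
 *	le1: (129.146.2.5 & 255.255.255.0) == 129.146.2.0	(its /24 subnet)
 *
 * The subnet test itself is the usual mask-and-compare, e.g.:
 */
#include <stdint.h>

static int
example_on_subnet(uint32_t addr, uint32_t netmask, uint32_t subnet)
{
	return ((addr & netmask) == subnet);	/* all in host byte order */
}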
@@ -23038,6 +19037,8 @@ ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
+ if (IS_UNDER_IPMP(ill))
+ continue;
mutex_enter(&ill->ill_lock);
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
@@ -23660,30 +19661,76 @@ ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa,
* Knows about IEEE 802 and IEEE EUI-64 mappings.
*/
static boolean_t
-ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
+ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
char *addr;
- if (phys_length != ETHERADDRL)
+ if (ill->ill_phys_addr_length != ETHERADDRL)
return (B_FALSE);
/* Form EUI-64 like address */
addr = (char *)&v6addr->s6_addr32[2];
- bcopy((char *)phys_addr, addr, 3);
+ bcopy(ill->ill_phys_addr, addr, 3);
addr[0] ^= 0x2; /* Toggle Universal/Local bit */
addr[3] = (char)0xff;
addr[4] = (char)0xfe;
- bcopy((char *)phys_addr + 3, addr + 5, 3);
+ bcopy(ill->ill_phys_addr + 3, addr + 5, 3);
return (B_TRUE);
}
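/*
 * [Editorial note -- illustrative aside, not part of this patch.]
 * Worked example of the mapping above: a (hypothetical) MAC address
 * 00:11:22:33:44:55 produces the interface ID 02:11:22:ff:fe:33:44:55 --
 * ff:fe is inserted in the middle and the universal/local bit (0x02 in the
 * first octet) is toggled -- giving the link-local address
 * fe80::211:22ff:fe33:4455.  The same mapping in self-contained C:
 */
#include <stdint.h>
#include <string.h>

static void
example_mac_to_eui64(const uint8_t mac[6], uint8_t eui[8])
{
	(void) memcpy(eui, mac, 3);
	eui[0] ^= 0x02;			/* toggle universal/local bit */
	eui[3] = 0xff;
	eui[4] = 0xfe;
	(void) memcpy(eui + 5, mac + 3, 3);
}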
/* ARGSUSED */
static boolean_t
-ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
+ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
return (B_FALSE);
}
+typedef struct ipmp_ifcookie {
+ uint32_t ic_hostid;
+ char ic_ifname[LIFNAMSIZ];
+ char ic_zonename[ZONENAME_MAX];
+} ipmp_ifcookie_t;
+
+/*
+ * Construct a pseudo-random interface ID for the IPMP interface that's both
+ * predictable and (almost) guaranteed to be unique.
+ */
+static boolean_t
+ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr)
+{
+ zone_t *zp;
+ uint8_t *addr;
+ uchar_t hash[16];
+ ulong_t hostid;
+ MD5_CTX ctx;
+ ipmp_ifcookie_t ic = { 0 };
+
+ ASSERT(IS_IPMP(ill));
+
+ (void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
+ ic.ic_hostid = htonl((uint32_t)hostid);
+
+ (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ);
+
+ if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) {
+ (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX);
+ zone_rele(zp);
+ }
+
+ MD5Init(&ctx);
+ MD5Update(&ctx, &ic, sizeof (ic));
+ MD5Final(hash, &ctx);
+
+ /*
+ * Map the hash to an interface ID per the basic approach in RFC3041.
+ */
+ addr = &v6addr->s6_addr8[8];
+ bcopy(hash + 8, addr, sizeof (uint64_t));
+ addr[0] &= ~0x2; /* mark as local: clear the u/l bit */
+
+ return (B_TRUE);
+}
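/*
 * [Editorial note -- illustrative aside, not part of this patch.]
 * The routine above fills in only the low 64 bits (the interface ID); the
 * caller supplies the prefix, so combined with fe80::/64 the IPMP
 * meta-interface ends up with a link-local address that is stable for a
 * given host ID, interface name, and zone name.  Combining a prefix with
 * an interface ID is just two copies (names below are made up):
 */
#include <stdint.h>
#include <string.h>

static void
example_make_v6addr(const uint8_t prefix[8], const uint8_t iid[8],
    uint8_t addr[16])
{
	(void) memcpy(addr, prefix, 8);		/* e.g. fe80::/64 */
	(void) memcpy(addr + 8, iid, 8);	/* interface ID from the hash */
}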
+
/* ARGSUSED */
static boolean_t
ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
@@ -23739,14 +19786,14 @@ ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
* Derive IPoIB interface id from the link layer address.
*/
static boolean_t
-ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
+ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
char *addr;
- if (phys_length != 20)
+ if (ill->ill_phys_addr_length != 20)
return (B_FALSE);
addr = (char *)&v6addr->s6_addr32[2];
- bcopy(phys_addr + 12, addr, 8);
+ bcopy(ill->ill_phys_addr + 12, addr, 8);
/*
* In IBA 1.1 timeframe, some vendors erroneously set the u/l bit
* in the globally assigned EUI-64 GUID to 1, in violation of IEEE
@@ -23863,6 +19910,7 @@ ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
*ipifp = NULL;
return (B_FALSE);
}
+
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
if (!IPIF_CAN_LOOKUP(ipif))
continue;
@@ -23897,71 +19945,9 @@ ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
}
/*
- * Same as ipif_lookup_zoneid() but looks at all the ills in the same group.
- */
-boolean_t
-ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
-{
- ill_t *illg;
- ip_stack_t *ipst = ill->ill_ipst;
-
- /*
- * We look at the passed-in ill first without grabbing ill_g_lock.
- */
- if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) {
- return (B_TRUE);
- }
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- if (ill->ill_group == NULL) {
- /* ill not in a group */
- rw_exit(&ipst->ips_ill_g_lock);
- return (B_FALSE);
- }
-
- /*
- * There's no ipif in the zone on ill, however ill is part of an IPMP
- * group. We need to look for an ipif in the zone on all the ills in the
- * group.
- */
- illg = ill->ill_group->illgrp_ill;
- do {
- /*
- * We don't call ipif_lookup_zoneid() on ill as we already know
- * that it's not there.
- */
- if (illg != ill &&
- ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) {
- break;
- }
- } while ((illg = illg->ill_group_next) != NULL);
- rw_exit(&ipst->ips_ill_g_lock);
- return (illg != NULL);
-}
-
-/*
- * Check if this ill is only being used to send ICMP probes for IPMP
- */
-boolean_t
-ill_is_probeonly(ill_t *ill)
-{
- /*
- * Check if the interface is FAILED, or INACTIVE
- */
- if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE))
- return (B_TRUE);
-
- return (B_FALSE);
-}
-
-/*
* Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id)
* If a pointer to an ipif_t is returned then the caller will need to do
* an ill_refrele().
- *
- * If there is no real interface which matches the ifindex, then it looks
- * for a group that has a matching index. In the case of a group match the
- * lifidx must be zero. We don't need emulate the logical interfaces
- * since IP Filter's use of netinfo doesn't use that.
*/
ipif_t *
ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
@@ -23972,18 +19958,8 @@ ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL,
ipst);
-
- if (ill == NULL) {
- /* Fallback to group names only if hook_emulation set */
- if (!ipst->ips_ipmp_hook_emulation)
- return (NULL);
-
- if (lifidx != 0)
- return (NULL);
- ill = ill_group_lookup_on_ifindex(ifindex, isv6, ipst);
- if (ill == NULL)
- return (NULL);
- }
+ if (ill == NULL)
+ return (NULL);
mutex_enter(&ill->ill_lock);
if (ill->ill_state_flags & ILL_CONDEMNED) {
@@ -24059,7 +20035,7 @@ ill_set_phys_addr(ill_t *ill, mblk_t *mp)
* If we can quiesce the ill, then set the address. If not, then
* ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail().
*/
- ill_down_ipifs(ill, NULL, 0, B_FALSE);
+ ill_down_ipifs(ill);
mutex_enter(&ill->ill_lock);
if (!ill_is_quiescent(ill)) {
/* call cannot fail since `conn_t *' argument is NULL */
@@ -24283,10 +20259,7 @@ ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event,
if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL)
goto fail;
- if (event == NE_UNPLUMB)
- info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
- else
- info->hnei_event.hne_nic = ill->ill_phyint->phyint_hook_ifindex;
+ info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
info->hnei_event.hne_lif = lif;
info->hnei_event.hne_event = event;
info->hnei_event.hne_protocol = ill->ill_isv6 ?
@@ -24323,8 +20296,8 @@ fail:
void
ipif_up_notify(ipif_t *ipif)
{
- ip_rts_ifmsg(ipif);
- ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
+ ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
+ ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
sctp_update_ipif(ipif, SCTP_IPIF_UP);
ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
NE_LIF_UP, NULL, 0);
diff --git a/usr/src/uts/common/inet/ip/ip_ire.c b/usr/src/uts/common/inet/ip/ip_ire.c
index 405cb653d5..52a7e74806 100644
--- a/usr/src/uts/common/inet/ip/ip_ire.c
+++ b/usr/src/uts/common/inet/ip/ip_ire.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -31,6 +31,7 @@
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
+#include <sys/strsun.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/policy.h>
@@ -61,7 +62,6 @@
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
-#include <sys/kmem.h>
#include <inet/tcp.h>
#include <inet/ipclassifier.h>
#include <sys/zone.h>
@@ -220,11 +220,6 @@ struct kmem_cache *rt_entry_cache;
* IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is
* to be ignored when walking the ires using ire_next.
*
- * IRE_MARK_HIDDEN signifies that the ire is a special ire typically for the
- * benefit of in.mpathd which needs to probe interfaces for failures. Normal
- * applications should not be seeing this ire and hence this ire is ignored
- * in most cases in the search using ire_next.
- *
* Zones note:
* Walking IREs within a given zone also walks certain ires in other
* zones. This is done intentionally. IRE walks with a specified
@@ -1235,10 +1230,9 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
{
irb_t *irb;
boolean_t drop = B_FALSE;
- /* LINTED : set but not used in function */
boolean_t mctl_present;
mblk_t *first_mp = NULL;
- mblk_t *save_mp = NULL;
+ mblk_t *data_mp = NULL;
ire_t *dst_ire;
ipha_t *ipha;
ip6_t *ip6h;
@@ -1258,27 +1252,16 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
* we resolve an IPv6 address with an IPv4 ire
* or vice versa.
*/
+ EXTRACT_PKT_MP(mp, first_mp, mctl_present);
+ data_mp = mp;
+ mp = first_mp;
if (ire->ire_ipversion == IPV4_VERSION) {
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
- ipha = (ipha_t *)mp->b_rptr;
- save_mp = mp;
- mp = first_mp;
-
+ ipha = (ipha_t *)data_mp->b_rptr;
dst_ire = ire_cache_lookup(ipha->ipha_dst,
ire->ire_zoneid, MBLK_GETLABEL(mp), ipst);
} else {
ASSERT(ire->ire_ipversion == IPV6_VERSION);
- /*
- * Get a pointer to the beginning of the IPv6 header.
- * Ignore leading IPsec control mblks.
- */
- first_mp = mp;
- if (mp->b_datap->db_type == M_CTL) {
- mp = mp->b_cont;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- save_mp = mp;
- mp = first_mp;
+ ip6h = (ip6_t *)data_mp->b_rptr;
dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst,
ire->ire_zoneid, MBLK_GETLABEL(mp), ipst);
}
@@ -1330,10 +1313,8 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
* is over: we just drop the packet.
*/
if (ire->ire_flags & RTF_MULTIRT) {
- if (save_mp) {
- save_mp->b_prev = NULL;
- save_mp->b_next = NULL;
- }
+ data_mp->b_prev = NULL;
+ data_mp->b_next = NULL;
MULTIRT_DEBUG_UNTAG(mp);
freemsg(mp);
} else {
@@ -1355,9 +1336,31 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
(CONN_Q(q) ? Q_TO_CONN(q) : NULL),
ire->ire_zoneid, ipst);
} else {
+ int minlen = sizeof (ip6i_t) + IPV6_HDR_LEN;
+
ASSERT(ire->ire_ipversion == IPV6_VERSION);
- ip_newroute_v6(q, mp, &ip6h->ip6_dst, NULL,
- NULL, ire->ire_zoneid, ipst);
+
+ /*
+ * If necessary, skip over the ip6i_t to find
+ * the header with the actual source address.
+ */
+ if (ip6h->ip6_nxt == IPPROTO_RAW) {
+ if (MBLKL(data_mp) < minlen &&
+ pullupmsg(data_mp, -1) == 0) {
+ ip1dbg(("ire_add_then_send: "
+ "cannot pullupmsg ip6i\n"));
+ if (mctl_present)
+ freeb(first_mp);
+ ire_refrele(ire);
+ return;
+ }
+ ASSERT(MBLKL(data_mp) >= IPV6_HDR_LEN);
+ ip6h = (ip6_t *)(data_mp->b_rptr +
+ sizeof (ip6i_t));
+ }
+ ip_newroute_v6(q, mp, &ip6h->ip6_dst,
+ &ip6h->ip6_src, NULL, ire->ire_zoneid,
+ ipst);
}
}
@@ -1680,7 +1683,9 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep,
{
ire_t *ire;
uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ boolean_t prefer;
+ ill_t *ill = ipif->ipif_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
/*
* No broadcast IREs for the LOOPBACK interface
@@ -1690,21 +1695,26 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep,
(ipif->ipif_flags & IPIF_NOXMIT))
return (irep);
- /* If this would be a duplicate, don't bother. */
+ /*
+ * If this new IRE would be a duplicate, only prefer it if one of
+ * the following is true:
+ *
+	 * 1. The existing one has IPIF_DEPRECATED|IPIF_NOLOCAL|IPIF_ANYCAST
+ * set and the new one has all of those clear.
+ *
+ * 2. The existing one corresponds to an underlying ILL in an IPMP
+ * group and the new one corresponds to an IPMP group interface.
+ */
if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif,
ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) {
- /*
- * We look for non-deprecated (and non-anycast, non-nolocal)
- * ipifs as the best choice. ipifs with check_flags matching
- * (deprecated, etc) are used only if non-deprecated ipifs
- * are not available. if the existing ire's ipif is deprecated
- * and the new ipif is non-deprecated, switch to the new ipif
- */
- if ((!(ire->ire_ipif->ipif_flags & check_flags)) ||
- (ipif->ipif_flags & check_flags)) {
+ prefer = ((ire->ire_ipif->ipif_flags & check_flags) &&
+ !(ipif->ipif_flags & check_flags)) ||
+ (IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && IS_IPMP(ill));
+ if (!prefer) {
ire_refrele(ire);
return (irep);
}
+
/*
 	 * Bcast ires exist in pairs. Both have to be deleted;
 	 * since we are exclusive, we can make the above assertion.
@@ -1716,10 +1726,7 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep,
ire_delete(ire);
ire_refrele(ire);
}
-
- irep = ire_create_bcast(ipif, addr, irep);
-
- return (irep);
+ return (ire_create_bcast(ipif, addr, irep));
}
uint_t ip_loopback_mtu = IP_LOOPBACK_MTU;
@@ -1733,6 +1740,22 @@ ire_t **
ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep)
{
ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ ill_t *ill = ipif->ipif_ill;
+
+ ASSERT(IAM_WRITER_IPIF(ipif));
+
+ if (IS_IPMP(ill)) {
+ /*
+ * Broadcast IREs for the IPMP meta-interface use the
+ * nominated broadcast interface to send and receive packets.
+ * If there's no nominated interface, send the packets down to
+ * the IPMP stub driver, which will discard them. If the
+ * nominated broadcast interface changes, ill_refresh_bcast()
+ * will refresh the broadcast IREs.
+ */
+ if ((ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
+ ill = ipif->ipif_ill;
+ }
*irep++ = ire_create(
(uchar_t *)&addr, /* dest addr */
@@ -1741,8 +1764,8 @@ ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep)
NULL, /* no gateway */
&ipif->ipif_mtu, /* max frag */
NULL, /* no src nce */
- ipif->ipif_rq, /* recv-from queue */
- ipif->ipif_wq, /* send-to queue */
+ ill->ill_rq, /* recv-from queue */
+ ill->ill_wq, /* send-to queue */
IRE_BROADCAST,
ipif,
0,
@@ -1761,7 +1784,7 @@ ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep)
NULL, /* no gateway */
&ip_loopback_mtu, /* max frag size */
NULL, /* no src_nce */
- ipif->ipif_rq, /* recv-from queue */
+ ill->ill_rq, /* recv-from queue */
NULL, /* no send-to queue */
IRE_BROADCAST, /* Needed for fanout in wput */
ipif,
@@ -2049,32 +2072,23 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
{
ill_t *ire_stq_ill = NULL;
ill_t *ire_ipif_ill = NULL;
- ill_group_t *ire_ill_group = NULL;
ASSERT(match_flags != 0 || zoneid != ALL_ZONES);
/*
- * MATCH_IRE_ILL/MATCH_IRE_ILL_GROUP : We match both on ill
- * pointed by ire_stq and ire_ipif. Only in the case of
- * IRE_CACHEs can ire_stq and ire_ipif be pointing to
- * different ills. But we want to keep this function generic
- * enough for future use. So, we always try to match on both.
- * The only caller of this function ire_walk_ill_tables, will
- * call "func" after we return from this function. We expect
- * "func" to do the right filtering of ires in this case.
- *
- * NOTE : In the case of MATCH_IRE_ILL_GROUP, groups
- * pointed by ire_stq and ire_ipif should always be the same.
- * So, we just match on only one of them.
+ * MATCH_IRE_ILL: We match both on ill pointed by ire_stq and
+ * ire_ipif. Only in the case of IRE_CACHEs can ire_stq and
+ * ire_ipif be pointing to different ills. But we want to keep
+ * this function generic enough for future use. So, we always
+ * try to match on both. The only caller of this function
+ * ire_walk_ill_tables, will call "func" after we return from
+ * this function. We expect "func" to do the right filtering
+ * of ires in this case.
*/
- if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) {
+ if (match_flags & MATCH_IRE_ILL) {
if (ire->ire_stq != NULL)
- ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr;
+ ire_stq_ill = ire->ire_stq->q_ptr;
if (ire->ire_ipif != NULL)
ire_ipif_ill = ire->ire_ipif->ipif_ill;
- if (ire_stq_ill != NULL)
- ire_ill_group = ire_stq_ill->ill_group;
- if ((ire_ill_group == NULL) && (ire_ipif_ill != NULL))
- ire_ill_group = ire_ipif_ill->ill_group;
}
if (zoneid != ALL_ZONES) {
@@ -2115,7 +2129,7 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
ipif_t *src_ipif;
src_ipif =
ipif_select_source_v6(ire_stq_ill,
- &ire->ire_addr_v6, RESTRICT_TO_NONE,
+ &ire->ire_addr_v6, B_FALSE,
IPV6_PREFER_SRC_DEFAULT,
zoneid);
if (src_ipif != NULL) {
@@ -2143,9 +2157,9 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
ire_t *rire;
ire_match_flags |= MATCH_IRE_TYPE;
- if (ire->ire_ipif != NULL) {
- ire_match_flags |= MATCH_IRE_ILL_GROUP;
- }
+ if (ire->ire_ipif != NULL)
+ ire_match_flags |= MATCH_IRE_ILL;
+
if (ire->ire_ipversion == IPV4_VERSION) {
rire = ire_route_lookup(ire->ire_gateway_addr,
0, 0, IRE_INTERFACE, ire->ire_ipif, NULL,
@@ -2169,11 +2183,8 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
if (((!(match_flags & MATCH_IRE_TYPE)) ||
(ire->ire_type & ire_type)) &&
((!(match_flags & MATCH_IRE_ILL)) ||
- (ire_stq_ill == ill || ire_ipif_ill == ill)) &&
- ((!(match_flags & MATCH_IRE_ILL_GROUP)) ||
- (ire_stq_ill == ill) || (ire_ipif_ill == ill) ||
- (ire_ill_group != NULL &&
- ire_ill_group == ill->ill_group))) {
+ (ire_stq_ill == ill || ire_ipif_ill == ill ||
+ ire_ipif_ill != NULL && IS_IN_SAME_ILLGRP(ire_ipif_ill, ill)))) {
return (B_TRUE);
}
return (B_FALSE);
@@ -2221,8 +2232,7 @@ ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func,
boolean_t ret;
struct rtfuncarg rtfarg;
- ASSERT((!(match_flags & (MATCH_IRE_ILL |
- MATCH_IRE_ILL_GROUP))) || (ill != NULL));
+ ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL));
ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0));
/*
* Optimize by not looking at the forwarding table if there
@@ -2399,32 +2409,26 @@ ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp,
}
/*
- * IPMP flag settings happen without taking the exclusive route
- * in ip_sioctl_flags. So we need to make an atomic check here
- * for FAILED/OFFLINE/INACTIVE flags or if it has hit the
- * FAILBACK=no case.
+	 * Don't allow IREs to be created on changing ills. Also, since
+ * IPMP flags can be set on an ill without quiescing it, if we're not
+ * a writer on stq_ill, check that the flags still allow IRE creation.
*/
if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) {
if (stq_ill->ill_state_flags & ILL_CHANGING) {
ill = stq_ill;
error = EAGAIN;
- } else if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) ||
- (ill_is_probeonly(stq_ill) &&
- !(ire->ire_marks & IRE_MARK_HIDDEN))) {
- error = EINVAL;
+ } else if (IS_UNDER_IPMP(stq_ill)) {
+ mutex_enter(&stq_ill->ill_phyint->phyint_lock);
+ if (!ipmp_ill_is_active(stq_ill) &&
+ !(ire->ire_marks & IRE_MARK_TESTHIDDEN)) {
+ error = EINVAL;
+ }
+ mutex_exit(&stq_ill->ill_phyint->phyint_lock);
}
- goto done;
+ if (error != 0)
+ goto done;
}
- /*
- * We don't check for OFFLINE/FAILED in this case because
- * the source address selection logic (ipif_select_source)
- * may still select a source address from such an ill. The
- * assumption is that these addresses will be moved by in.mpathd
- * soon. (i.e. this is a race). However link local addresses
- * will not move and hence ipif_select_source_v6 tries to avoid
- * FAILED ills. Please see ipif_select_source_v6 for more info
- */
if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) &&
(ipif_ill->ill_state_flags & ILL_CHANGING)) {
ill = ipif_ill;
@@ -2444,8 +2448,10 @@ done:
if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) {
ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
ire_atomic_end(irb_ptr, ire);
ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
error = EINPROGRESS;
} else if (error != 0) {
@@ -2502,39 +2508,7 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func,
ire = ire1;
}
if (ire->ire_stq != NULL)
- stq_ill = (ill_t *)ire->ire_stq->q_ptr;
-
- if (ire->ire_type == IRE_CACHE) {
- /*
- * If this interface is FAILED, or INACTIVE or has hit
- * the FAILBACK=no case, we create IRE_CACHES marked
- * HIDDEN for some special cases e.g. bind to
- * IPIF_NOFAILOVER address etc. So, if this interface
- * is FAILED/INACTIVE/hit FAILBACK=no case, and we are
- * not creating hidden ires, we should not allow that.
- * This happens because the state of the interface
- * changed while we were waiting in ARP. If this is the
- * daemon sending probes, the next probe will create
- * HIDDEN ires and we will create an ire then. This
- * cannot happen with NDP currently because IRE is
- * never queued in NDP. But it can happen in the
- * future when we have external resolvers with IPv6.
- * If the interface gets marked with OFFLINE while we
- * are waiting in ARP, don't add the ire.
- */
- if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) ||
- (ill_is_probeonly(stq_ill) &&
- !(ire->ire_marks & IRE_MARK_HIDDEN))) {
- /*
- * We don't know whether it is a valid ipif or not.
- * unless we do the check below. So, set it to NULL.
- */
- ire->ire_ipif = NULL;
- ire_delete(ire);
- *irep = NULL;
- return (EINVAL);
- }
- }
+ stq_ill = ire->ire_stq->q_ptr;
if (stq_ill != NULL && ire->ire_type == IRE_CACHE &&
stq_ill->ill_net_type == IRE_IF_RESOLVER) {
@@ -2573,12 +2547,12 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func,
rw_exit(&ipst->ips_ill_g_lock);
if (ipif == NULL ||
(ipif->ipif_isv6 &&
+ !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) &&
!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
&ipif->ipif_v6src_addr)) ||
(!ipif->ipif_isv6 &&
ire->ire_src_addr != ipif->ipif_src_addr) ||
ire->ire_zoneid != ipif->ipif_zoneid) {
-
if (ipif != NULL)
ipif_refrele(ipif);
ire->ire_ipif = NULL;
@@ -2587,20 +2561,7 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func,
return (EINVAL);
}
-
ASSERT(ill != NULL);
- /*
- * If this group was dismantled while this packets was
- * queued in ARP, don't add it here.
- */
- if (ire->ire_ipif->ipif_ill->ill_group != ill->ill_group) {
- /* We don't want ire_inactive bump stats for this */
- ipif_refrele(ipif);
- ire->ire_ipif = NULL;
- ire_delete(ire);
- *irep = NULL;
- return (EINVAL);
- }
/*
* Since we didn't attach label security attributes to the
@@ -2677,6 +2638,16 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
boolean_t need_refrele = B_FALSE;
nce_t *nce;
ip_stack_t *ipst = ire->ire_ipst;
+ uint_t marks = 0;
+
+ /*
+ * IREs with source addresses hosted on interfaces that are under IPMP
+ * should be hidden so that applications don't accidentally end up
+ * sending packets with test addresses as their source addresses, or
+	 * sending out of interfaces that are e.g. IFF_INACTIVE. Hide them here.
+ */
+ if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill))
+ marks |= IRE_MARK_TESTHIDDEN;
if (ire->ire_ipif != NULL)
ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
@@ -2691,10 +2662,15 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
case IRE_HOST:
ire->ire_mask = IP_HOST_MASK;
ire->ire_masklen = IP_ABITS;
+ ire->ire_marks |= marks;
if ((ire->ire_flags & RTF_SETSRC) == 0)
ire->ire_src_addr = 0;
break;
case IRE_CACHE:
+ ire->ire_mask = IP_HOST_MASK;
+ ire->ire_masklen = IP_ABITS;
+ ire->ire_marks |= marks;
+ break;
case IRE_BROADCAST:
case IRE_LOCAL:
case IRE_LOOPBACK:
@@ -2702,15 +2678,14 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
ire->ire_masklen = IP_ABITS;
break;
case IRE_PREFIX:
- if ((ire->ire_flags & RTF_SETSRC) == 0)
- ire->ire_src_addr = 0;
- break;
case IRE_DEFAULT:
+ ire->ire_marks |= marks;
if ((ire->ire_flags & RTF_SETSRC) == 0)
ire->ire_src_addr = 0;
break;
case IRE_IF_RESOLVER:
case IRE_IF_NORESOLVER:
+ ire->ire_marks |= marks;
break;
default:
ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n",
@@ -2796,19 +2771,13 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
*/
flags |= MATCH_IRE_IPIF;
/*
- * If we are creating hidden ires, make sure we search on
- * this ill (MATCH_IRE_ILL) and a hidden ire,
- * while we are searching for duplicates below. Otherwise we
- * could potentially find an IRE on some other interface
- * and it may not be a IRE marked with IRE_MARK_HIDDEN. We
- * shouldn't do this as this will lead to an infinite loop
- * (if we get to ip_wput again) eventually we need an hidden
- * ire for this packet to go out. MATCH_IRE_ILL is explicitly
- * done below.
+ * If we are creating a hidden IRE, make sure we search for
+ * hidden IREs when searching for duplicates below.
+ * Otherwise, we might find an IRE on some other interface
+ * that's not marked hidden.
*/
- if (ire->ire_type == IRE_CACHE &&
- (ire->ire_marks & IRE_MARK_HIDDEN))
- flags |= (MATCH_IRE_MARK_HIDDEN);
+ if (ire->ire_marks & IRE_MARK_TESTHIDDEN)
+ flags |= MATCH_IRE_MARK_TESTHIDDEN;
}
if ((ire->ire_type & IRE_CACHETABLE) == 0) {
irb_ptr = ire_get_bucket(ire);
@@ -2927,7 +2896,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
* avoid a lookup in the caller again. If the callers
* don't want to use it, they need to do a REFRELE.
*/
- ip1dbg(("found dup ire existing %p new %p",
+ ip1dbg(("found dup ire existing %p new %p\n",
(void *)ire1, (void *)ire));
IRE_REFHOLD(ire1);
ire_atomic_end(irb_ptr, ire);
@@ -2948,6 +2917,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
return (0);
}
}
+
if (ire->ire_type & IRE_CACHE) {
ASSERT(ire->ire_stq != NULL);
nce = ndp_lookup_v4(ire_to_ill(ire),
@@ -2999,17 +2969,9 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
}
/*
* Make it easy for ip_wput_ire() to hit multiple broadcast ires by
- * grouping identical addresses together on the hash chain. We also
- * don't want to send multiple copies out if there are two ills part
- * of the same group. Thus we group the ires with same addr and same
- * ill group together so that ip_wput_ire can easily skip all the
- * ires with same addr and same group after sending the first copy.
- * We do this only for IRE_BROADCASTs as ip_wput_ire is currently
- * interested in such groupings only for broadcasts.
- *
- * NOTE : If the interfaces are brought up first and then grouped,
- * illgrp_insert will handle it. We come here when the interfaces
- * are already in group and we are bringing them UP.
+ * grouping identical addresses together on the hash chain. We do
+ * this only for IRE_BROADCASTs as ip_wput_ire is currently interested
+ * in such groupings only for broadcasts.
*
* Find the first entry that matches ire_addr. *irep will be null
* if no match.
@@ -3023,29 +2985,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
if (ire->ire_type == IRE_BROADCAST && *irep != NULL) {
/*
* We found some ire (i.e *irep) with a matching addr. We
- * want to group ires with same addr and same ill group
- * together.
- *
- * First get to the entry that matches our address and
- * ill group i.e stop as soon as we find the first ire
- * matching the ill group and address. If there is only
- * an address match, we should walk and look for some
- * group match. These are some of the possible scenarios :
- *
- * 1) There are no groups at all i.e all ire's ill_group
- * are NULL. In that case we will essentially group
- * all the ires with the same addr together. Same as
- * the "else" block of this "if".
- *
- * 2) There are some groups and this ire's ill_group is
- * NULL. In this case, we will first find the group
- * that matches the address and a NULL group. Then
- * we will insert the ire at the end of that group.
- *
- * 3) There are some groups and this ires's ill_group is
- * non-NULL. In this case we will first find the group
- * that matches the address and the ill_group. Then
- * we will insert the ire at the end of that group.
+ * want to group ires with same addr.
*/
for (;;) {
ire1 = *irep;
@@ -3053,8 +2993,8 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
(ire1->ire_next->ire_addr != ire->ire_addr) ||
(ire1->ire_type != IRE_BROADCAST) ||
(ire1->ire_flags & RTF_MULTIRT) ||
- (ire1->ire_ipif->ipif_ill->ill_group ==
- ire->ire_ipif->ipif_ill->ill_group))
+ (ire1->ire_ipif->ipif_ill->ill_grp ==
+ ire->ire_ipif->ipif_ill->ill_grp))
break;
irep = &ire1->ire_next;
}
@@ -3071,18 +3011,14 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
/*
* Either we have hit the end of the list or the address
- * did not match or the group *matched*. If we found
- * a match on the group, skip to the end of the group.
+ * did not match.
*/
while (*irep != NULL) {
ire1 = *irep;
if ((ire1->ire_addr != ire->ire_addr) ||
- (ire1->ire_type != IRE_BROADCAST) ||
- (ire1->ire_ipif->ipif_ill->ill_group !=
- ire->ire_ipif->ipif_ill->ill_group))
+ (ire1->ire_type != IRE_BROADCAST))
break;
- if (ire1->ire_ipif->ipif_ill->ill_group == NULL &&
- ire1->ire_ipif == ire->ire_ipif) {
+ if (ire1->ire_ipif == ire->ire_ipif) {
irep = &ire1->ire_next;
break;
}
@@ -3611,15 +3547,14 @@ ire_inactive(ire_t *ire)
* The ipif that is associated with an ire is ire->ire_ipif and
* hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call
* ipif_ill_refrele_tail. Usually stq_ill is null or the same as
- * ire->ire_ipif->ipif_ill. So nothing more needs to be done. Only
- * in the case of IRE_CACHES when IPMP is used, stq_ill can be
- * different. If this is different from ire->ire_ipif->ipif_ill and
- * if the ill_ire_cnt on the stq_ill also has dropped to zero, we call
+ * ire->ire_ipif->ipif_ill. So nothing more needs to be done.
+ * However, for VNI or IPMP IRE entries, stq_ill can be different.
+ * If this is different from ire->ire_ipif->ipif_ill and if the
+ * ill_ire_cnt on the stq_ill also has dropped to zero, we call
* ipif_ill_refrele_tail on the stq_ill.
*/
-
if (ire->ire_stq != NULL)
- stq_ill = (ill_t *)ire->ire_stq->q_ptr;
+ stq_ill = ire->ire_stq->q_ptr;
if (stq_ill == NULL || stq_ill == ill) {
/* Optimize the most common case */
@@ -3881,26 +3816,27 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
{
ill_t *ire_ill = NULL, *dst_ill;
ill_t *ipif_ill = NULL;
- ill_group_t *ire_ill_group = NULL;
- ill_group_t *ipif_ill_group = NULL;
ASSERT(ire->ire_ipversion == IPV4_VERSION);
ASSERT((ire->ire_addr & ~ire->ire_mask) == 0);
- ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) ||
+ ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
(ipif != NULL && !ipif->ipif_isv6));
ASSERT(!(match_flags & MATCH_IRE_WQ) || wq != NULL);
/*
- * HIDDEN cache entries have to be looked up specifically with
- * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set
- * when the interface is FAILED or INACTIVE. In that case,
- * any IRE_CACHES that exists should be marked with
- * IRE_MARK_HIDDEN. So, we don't really need to match below
- * for IRE_MARK_HIDDEN. But we do so for consistency.
+ * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it
+ * is in fact hidden, to ensure the caller gets the right one. One
+ * exception: if the caller passed MATCH_IRE_IHANDLE, then they
+ * already know the identity of the given IRE_INTERFACE entry and
+ * there's no point trying to hide it from them.
*/
- if (!(match_flags & MATCH_IRE_MARK_HIDDEN) &&
- (ire->ire_marks & IRE_MARK_HIDDEN))
- return (B_FALSE);
+ if (ire->ire_marks & IRE_MARK_TESTHIDDEN) {
+ if (match_flags & MATCH_IRE_IHANDLE)
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
+
+ if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN))
+ return (B_FALSE);
+ }
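(Illustrative aside -- a standalone model of the early-return gate above.  The
flag values below are made up for the example and are not the kernel's; only
the visibility logic is being shown: a TESTHIDDEN IRE is returned only when
the caller asked for hidden entries or identified the entry via its ihandle.)

#include <stdio.h>

#define	IRE_MARK_TESTHIDDEN		0x1	/* made-up value */
#define	MATCH_IRE_IHANDLE		0x2	/* made-up value */
#define	MATCH_IRE_MARK_TESTHIDDEN	0x4	/* made-up value */

static int
testhidden_visible(int ire_marks, int match_flags)
{
	if (ire_marks & IRE_MARK_TESTHIDDEN) {
		if (match_flags & MATCH_IRE_IHANDLE)
			match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
		if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN))
			return (0);
	}
	return (1);
}

int
main(void)
{
	(void) printf("%d %d %d\n",
	    testhidden_visible(IRE_MARK_TESTHIDDEN, 0),		/* 0 */
	    testhidden_visible(IRE_MARK_TESTHIDDEN, MATCH_IRE_IHANDLE), /* 1 */
	    testhidden_visible(0, 0));				/* 1 */
	return (0);
}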
/*
* MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option
@@ -3994,19 +3930,18 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
}
/*
- * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that
- * somebody wants to send out on a particular interface which
- * is given by ire_stq and hence use ire_stq to derive the ill
- * value. ire_ipif for IRE_CACHES is just the means of getting
- * a source address i.e ire_src_addr = ire->ire_ipif->ipif_src_addr.
- * ire_to_ill does the right thing for this.
+ * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to
+ * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means
+ * of getting a source address -- i.e., ire_src_addr ==
+ * ire->ire_ipif->ipif_src_addr). ire_to_ill() handles this.
+ *
+ * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group.
+ * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for
+ * IPMP test traffic), then the ill must match exactly.
*/
- if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) {
+ if (match_flags & MATCH_IRE_ILL) {
ire_ill = ire_to_ill(ire);
- if (ire_ill != NULL)
- ire_ill_group = ire_ill->ill_group;
ipif_ill = ipif->ipif_ill;
- ipif_ill_group = ipif_ill->ill_group;
}
if ((ire->ire_addr == (addr & mask)) &&
@@ -4018,24 +3953,21 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
(ire->ire_src_addr == ipif->ipif_src_addr)) &&
((!(match_flags & MATCH_IRE_IPIF)) ||
(ire->ire_ipif == ipif)) &&
- ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) ||
- (ire->ire_type != IRE_CACHE ||
- ire->ire_marks & IRE_MARK_HIDDEN)) &&
+ ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) ||
+ (ire->ire_marks & IRE_MARK_TESTHIDDEN)) &&
((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) ||
(ire->ire_type != IRE_CACHE ||
ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) &&
- ((!(match_flags & MATCH_IRE_ILL)) ||
- (ire_ill == ipif_ill)) &&
((!(match_flags & MATCH_IRE_WQ)) ||
(ire->ire_stq == wq)) &&
+ ((!(match_flags & MATCH_IRE_ILL)) ||
+ (ire_ill == ipif_ill ||
+ (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) &&
+ ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) &&
((!(match_flags & MATCH_IRE_IHANDLE)) ||
(ire->ire_ihandle == ihandle)) &&
((!(match_flags & MATCH_IRE_MASK)) ||
(ire->ire_mask == mask)) &&
- ((!(match_flags & MATCH_IRE_ILL_GROUP)) ||
- (ire_ill == ipif_ill) ||
- (ire_ill_group != NULL &&
- ire_ill_group == ipif_ill_group)) &&
((!(match_flags & MATCH_IRE_SECATTR)) ||
(!is_system_labeled()) ||
(tsol_ire_match_gwattr(ire, tsl) == 0))) {
@@ -4060,8 +3992,7 @@ ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
 	 * ire_match_args() will dereference ipif if MATCH_IRE_SRC or
* MATCH_IRE_ILL is set.
*/
- if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
- (ipif == NULL))
+ if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
return (NULL);
/*
@@ -4142,14 +4073,15 @@ ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif,
/*
* Check whether the IRE_LOCAL and the IRE potentially used to transmit
- * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are part of
- * the same ill group.
+ * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are identical
+ * or part of the same illgrp. (In the IPMP case, usually the two IREs
+ * will both belong to the IPMP ill, but exceptions are possible -- e.g.
+ * if IPMP test addresses are on their own subnet.)
*/
boolean_t
-ire_local_same_ill_group(ire_t *ire_local, ire_t *xmit_ire)
+ire_local_same_lan(ire_t *ire_local, ire_t *xmit_ire)
{
- ill_t *recv_ill, *xmit_ill;
- ill_group_t *recv_group, *xmit_group;
+ ill_t *recv_ill, *xmit_ill;
ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK));
ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE));
@@ -4160,20 +4092,11 @@ ire_local_same_ill_group(ire_t *ire_local, ire_t *xmit_ire)
ASSERT(recv_ill != NULL);
ASSERT(xmit_ill != NULL);
- if (recv_ill == xmit_ill)
- return (B_TRUE);
-
- recv_group = recv_ill->ill_group;
- xmit_group = xmit_ill->ill_group;
-
- if (recv_group != NULL && recv_group == xmit_group)
- return (B_TRUE);
-
- return (B_FALSE);
+ return (IS_ON_SAME_LAN(recv_ill, xmit_ill));
}
/*
- * Check if the IRE_LOCAL uses the same ill (group) as another route would use.
+ * Check if the IRE_LOCAL uses the same ill as another route would use.
* If there is no alternate route, or the alternate is a REJECT or BLACKHOLE,
* then we don't allow this IRE_LOCAL to be used.
*/
@@ -4183,17 +4106,16 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr,
{
ire_t *alt_ire;
boolean_t rval;
+ int flags;
+
+ flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE;
if (ire_local->ire_ipversion == IPV4_VERSION) {
alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL,
- NULL, zoneid, 0, tsl,
- MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_RJ_BHOLE, ipst);
+ NULL, zoneid, 0, tsl, flags, ipst);
} else {
- alt_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL,
- 0, NULL, NULL, zoneid, 0, tsl,
- MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_RJ_BHOLE, ipst);
+ alt_ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL,
+ NULL, zoneid, 0, tsl, flags, ipst);
}
if (alt_ire == NULL)
@@ -4203,16 +4125,14 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr,
ire_refrele(alt_ire);
return (B_FALSE);
}
- rval = ire_local_same_ill_group(ire_local, alt_ire);
+ rval = ire_local_same_lan(ire_local, alt_ire);
ire_refrele(alt_ire);
return (rval);
}
/*
- * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers
- * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get
- * to the hidden ones.
+ * Lookup cache
*
* In general the zoneid has to match (where ALL_ZONES match all of them).
* But for IRE_LOCAL we also need to handle the case where L2 should
@@ -4220,8 +4140,7 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr,
* Ethernet drivers nor Ethernet hardware loops back packets sent to their
* own MAC address. This loopback is needed when the normal
* routes (ignoring IREs with different zoneids) would send out the packet on
- * the same ill (or ill group) as the ill with which this IRE_LOCAL is
- * associated.
+ * the same ill as the ill with which this IRE_LOCAL is associated.
*
* Earlier versions of this code always matched an IRE_LOCAL independently of
* the zoneid. We preserve that earlier behavior when
@@ -4239,7 +4158,7 @@ ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl,
rw_enter(&irb_ptr->irb_lock, RW_READER);
for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
if (ire->ire_marks & (IRE_MARK_CONDEMNED |
- IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) {
+ IRE_MARK_TESTHIDDEN | IRE_MARK_PRIVATE_ADDR)) {
continue;
}
if (ire->ire_addr == addr) {
@@ -4284,7 +4203,7 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst)
ire_t *ire;
/*
- * Lets look for an ire in the cachetable whose
+ * Look for an ire in the cachetable whose
* ire_addr matches the destination.
* Since we are being called by forwarding fastpath
* no need to check for Trusted Solaris label.
@@ -4293,8 +4212,8 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst)
dst, ipst->ips_ip_cache_table_size)];
rw_enter(&irb_ptr->irb_lock, RW_READER);
for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_marks & (IRE_MARK_CONDEMNED |
- IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) {
+ if (ire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN |
+ IRE_MARK_PRIVATE_ADDR)) {
continue;
}
if (ire->ire_addr == dst) {
@@ -4307,7 +4226,6 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst)
return (NULL);
}
-
/*
* Locate the interface ire that is tied to the cache ire 'cire' via
* cire->ire_ihandle.
@@ -4333,13 +4251,8 @@ ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire)
* because the ihandle refers to an ipif which can be in only one zone.
*/
match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
- /*
- * ip_newroute calls ire_ftable_lookup with MATCH_IRE_ILL only
- * for on-link hosts. We should never be here for onlink.
- * Thus, use MATCH_IRE_ILL_GROUP.
- */
if (pire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
/*
* We know that the mask of the interface ire equals cire->ire_cmask.
* (When ip_newroute() created 'cire' for the gateway it set its
@@ -4376,7 +4289,7 @@ ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire)
*/
match_flags = MATCH_IRE_TYPE;
if (pire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET,
pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
if (ire == NULL)
@@ -4411,7 +4324,16 @@ ire_t *
ipif_to_ire(const ipif_t *ipif)
{
ire_t *ire;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK;
+
+ /*
+ * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN
+ * so that they aren't accidentally returned. However, if the
+ * caller's ipif is on an ill under IPMP, there's no need to hide 'em.
+ */
+ if (IS_UNDER_IPMP(ipif->ipif_ill))
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
ASSERT(!ipif->ipif_isv6);
if (ipif->ipif_ire_type == IRE_LOOPBACK) {
@@ -4421,13 +4343,12 @@ ipif_to_ire(const ipif_t *ipif)
} else if (ipif->ipif_flags & IPIF_POINTOPOINT) {
/* In this case we need to lookup destination address. */
ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0,
- IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL,
- (MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK), ipst);
+ IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, match_flags,
+ ipst);
} else {
ire = ire_ftable_lookup(ipif->ipif_subnet,
ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL,
- ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
- MATCH_IRE_MASK), ipst);
+ ALL_ZONES, 0, NULL, match_flags, ipst);
}
return (ire);
}
@@ -4811,7 +4732,7 @@ ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst)
continue;
if (cire->ire_addr != dst)
continue;
- if (cire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))
+ if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN))
continue;
unres_cnt--;
}
@@ -4983,7 +4904,7 @@ ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
continue;
if (cire->ire_marks &
(IRE_MARK_CONDEMNED |
- IRE_MARK_HIDDEN))
+ IRE_MARK_TESTHIDDEN))
continue;
if (cire->ire_gw_secattr != NULL &&
@@ -5186,7 +5107,7 @@ ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
continue;
if (cire->ire_marks &
(IRE_MARK_CONDEMNED |
- IRE_MARK_HIDDEN))
+ IRE_MARK_TESTHIDDEN))
continue;
if (cire->ire_gw_secattr != NULL &&
@@ -5401,7 +5322,7 @@ ire_trace_cleanup(const ire_t *ire)
* invoked when the mblk containing fake_ire is freed.
*/
void
-ire_arpresolve(ire_t *in_ire, ill_t *dst_ill)
+ire_arpresolve(ire_t *in_ire)
{
areq_t *areq;
ipaddr_t *addrp;
@@ -5409,8 +5330,13 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill)
ire_t *ire, *buf;
size_t bufsize;
frtn_t *frtnp;
- ill_t *ill;
- ip_stack_t *ipst = dst_ill->ill_ipst;
+ ill_t *dst_ill;
+ ip_stack_t *ipst;
+
+ ASSERT(in_ire->ire_nce != NULL);
+
+ dst_ill = ire_to_ill(in_ire);
+ ipst = dst_ill->ill_ipst;
/*
* Construct message chain for the resolver
@@ -5431,16 +5357,16 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill)
*/
/*
- * We use esballoc to allocate the second part(the ire_t size mblk)
- * of the message chain depicted above. THis mblk will be freed
- * by arp when there is a timeout, and otherwise passed to IP
- * and IP will * free it after processing the ARP response.
+ * We use esballoc to allocate the second part (IRE_MBLK)
+ * of the message chain depicted above. This mblk will be freed
+ * by arp when there is a timeout, and otherwise passed to IP
+ * and IP will free it after processing the ARP response.
*/
bufsize = sizeof (ire_t) + sizeof (frtn_t);
buf = kmem_alloc(bufsize, KM_NOSLEEP);
if (buf == NULL) {
- ip1dbg(("ire_arpresolver:alloc buffer failed\n "));
+ ip1dbg(("ire_arpresolve: alloc buffer failed\n"));
return;
}
frtnp = (frtn_t *)(buf + 1);
@@ -5448,16 +5374,15 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill)
frtnp->free_func = ire_freemblk;
ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp);
-
if (ire_mp == NULL) {
ip1dbg(("ire_arpresolve: esballoc failed\n"));
kmem_free(buf, bufsize);
return;
}
- ASSERT(in_ire->ire_nce != NULL);
+
areq_mp = copyb(dst_ill->ill_resolver_mp);
if (areq_mp == NULL) {
- kmem_free(buf, bufsize);
+ freemsg(ire_mp);
return;
}
@@ -5473,9 +5398,8 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill)
ire->ire_ipif_seqid = in_ire->ire_ipif_seqid;
ire->ire_ipif_ifindex = in_ire->ire_ipif_ifindex;
ire->ire_ipif = in_ire->ire_ipif;
- ire->ire_stq = in_ire->ire_stq;
- ill = ire_to_ill(ire);
- ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex;
+ ire->ire_stq = dst_ill->ill_wq;
+ ire->ire_stq_ifindex = dst_ill->ill_phyint->phyint_ifindex;
ire->ire_zoneid = in_ire->ire_zoneid;
ire->ire_stackid = ipst->ips_netstack->netstack_stackid;
ire->ire_ipst = ipst;
@@ -5528,7 +5452,6 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill)
 * Note that the ARP/IP merge should replace the functionality by providing
* direct function calls to clean up unresolved entries in ire/nce lists.
*/
-
void
ire_freemblk(ire_t *ire_mp)
{
@@ -5738,9 +5661,8 @@ retry_nce:
* is marked as ND_REACHABLE at this point.
* This nce does not undergo any further state changes,
* and exists as long as the interface is plumbed.
- * Note: we do the ire_nce assignment here for IRE_BROADCAST
- * because some functions like ill_mark_bcast() inline the
- * ire_add functionality.
+ * Note: the assignment of ire_nce here is a historical
+ * artifact of old code that used to inline ire_add().
*/
ire->ire_nce = nce;
/*
@@ -5772,8 +5694,7 @@ ip4_ctable_lookup_impl(ire_ctable_args_t *margs)
ire_t *ire;
ip_stack_t *ipst = margs->ict_ipst;
- if ((margs->ict_flags &
- (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
+ if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) &&
(margs->ict_ipif == NULL)) {
return (NULL);
}
@@ -5802,10 +5723,7 @@ ip4_ctable_lookup_impl(ire_ctable_args_t *margs)
/*
* This function locates IRE_CACHE entries which were added by the
* ire_forward() path. We can fully specify the IRE we are looking for by
- * providing the ipif_t AND the ire_stq. This is different to MATCH_IRE_ILL
- * which uses the ipif_ill. This is inadequate with IPMP groups where
- * illgrp_scheduler() may have been used to select an ill from the group for
- * the outgoing interface.
+ * providing the ipif (MATCH_IRE_IPIF) *and* the stq (MATCH_IRE_WQ).
*/
ire_t *
ire_arpresolve_lookup(ipaddr_t addr, ipaddr_t gw, ipif_t *ipif,
diff --git a/usr/src/uts/common/inet/ip/ip_mroute.c b/usr/src/uts/common/inet/ip/ip_mroute.c
index ac14adf00d..1a3df02418 100644
--- a/usr/src/uts/common/inet/ip/ip_mroute.c
+++ b/usr/src/uts/common/inet/ip/ip_mroute.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -2037,6 +2037,7 @@ static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
struct mfc *rt)
{
+ ill_t *vill;
vifi_t vifi;
struct vif *vifp;
ipaddr_t dst = ipha->ipha_dst;
@@ -2102,25 +2103,21 @@ ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
}
/*
* Don't forward if it didn't arrive from the parent vif for its
- * origin. But do match on the groups as we nominate only one
- * ill in the group for receiving allmulti packets.
+ * origin.
*/
- if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill &&
- (ill->ill_group == NULL ||
- ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_group !=
- ill->ill_group)) ||
+ vill = ipst->ips_vifs[vifi].v_ipif->ipif_ill;
+ if ((vill != ill && !IS_IN_SAME_ILLGRP(vill, ill)) ||
(ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
/* Came in the wrong interface */
ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
"numvifs %d ill %s viftable ill %s\n",
(int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
- ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
+ vill->ill_name));
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
"ip_mdq: arrived wrong if, vifi %d ill "
"%s viftable ill %s\n",
- (int)vifi, ill->ill_name,
- ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
+ (int)vifi, ill->ill_name, vill->ill_name);
}
ipst->ips_mrtstat->mrts_wrong_if++;
rt->mfc_wrong_if++;
@@ -3047,7 +3044,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp)
dst = ipha->ipha_dst;
ipif = vifp->v_ipif;
- mutex_enter(&ipif->ipif_ill->ill_lock);
if (ilm_lookup_ipif(ipif, dst) != NULL) {
/*
* The packet is not yet reassembled, thus we need to
@@ -3057,7 +3053,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp)
mblk_t *mp_loop;
ire_t *ire;
- mutex_exit(&ipif->ipif_ill->ill_lock);
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1,
SL_TRACE,
@@ -3082,8 +3077,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp)
}
if (ire != NULL)
ire_refrele(ire);
- } else {
- mutex_exit(&ipif->ipif_ill->ill_lock);
}
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
diff --git a/usr/src/uts/common/inet/ip/ip_multi.c b/usr/src/uts/common/inet/ip/ip_multi.c
index f3c95ae362..cbea9be165 100644
--- a/usr/src/uts/common/inet/ip/ip_multi.c
+++ b/usr/src/uts/common/inet/ip/ip_multi.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -68,12 +68,10 @@ static void ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode,
static ilm_t *ilm_add_v6(ipif_t *ipif, const in6_addr_t *group,
ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist,
- int orig_ifindex, zoneid_t zoneid);
+ zoneid_t zoneid);
static void ilm_delete(ilm_t *ilm);
static int ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *group);
static int ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *group);
-static ilg_t *ilg_lookup_ill_index_v6(conn_t *connp,
- const in6_addr_t *v6group, int index);
static ilg_t *ilg_lookup_ipif(conn_t *connp, ipaddr_t group,
ipif_t *ipif);
static int ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif,
@@ -91,25 +89,21 @@ static int ip_opt_delete_group_excl(conn_t *connp, ipaddr_t group,
static int ip_opt_delete_group_excl_v6(conn_t *connp,
const in6_addr_t *v6group, ill_t *ill, mcast_record_t fmode,
const in6_addr_t *v6src);
+static void ill_ilm_walker_hold(ill_t *ill);
+static void ill_ilm_walker_rele(ill_t *ill);
/*
* MT notes:
*
* Multicast joins operate on both the ilg and ilm structures. Multiple
* threads operating on an conn (socket) trying to do multicast joins
- * need to synchronize when operating on the ilg. Multiple threads
+ * need to synchronize when operating on the ilg. Multiple threads
* potentially operating on different conn (socket endpoints) trying to
* do multicast joins could eventually end up trying to manipulate the
- * ilm simulatenously and need to synchronize on the access to the ilm.
- * Both are amenable to standard Solaris MT techniques, but it would be
- * complex to handle a failover or failback which needs to manipulate
- * ilg/ilms if an applications can also simultaenously join/leave
- * multicast groups. Hence multicast join/leave also go through the ipsq_t
+ * ilm simultaneously and need to synchronize access to the ilm. Currently,
+ * this is done by synchronizing join/leave via per-phyint ipsq_t
* serialization.
*
- * Multicast joins and leaves are single-threaded per phyint/IPMP group
- * using the ipsq serialization mechanism.
- *
* An ilm is an IP data structure used to track multicast join/leave.
* An ilm is associated with a <multicast group, ipif> tuple in IPv4 and
* with just <multicast group> in IPv6. ilm_refcnt is the number of ilg's
@@ -211,12 +205,13 @@ conn_ilg_reap(conn_t *connp)
* Returns a pointer to the next available ilg in conn_ilg. Allocs more
* buffers in size of ILG_ALLOC_CHUNK ilgs when needed, and updates conn's
* ilg tracking fields appropriately (conn_ilg_inuse reflects usage of the
- * returned ilg). Returns NULL on failure (ENOMEM).
+ * returned ilg). Returns NULL on failure, in which case `*errp' will be
+ * filled in with the reason.
*
* Assumes connp->conn_lock is held.
*/
static ilg_t *
-conn_ilg_alloc(conn_t *connp)
+conn_ilg_alloc(conn_t *connp, int *errp)
{
ilg_t *new, *ret;
int curcnt;
@@ -224,10 +219,21 @@ conn_ilg_alloc(conn_t *connp)
ASSERT(MUTEX_HELD(&connp->conn_lock));
ASSERT(connp->conn_ilg_inuse <= connp->conn_ilg_allocated);
+ /*
+ * If CONN_CLOSING is set, conn_ilg cleanup has begun and we must not
+ * create any ilgs.
+ */
+ if (connp->conn_state_flags & CONN_CLOSING) {
+ *errp = EINVAL;
+ return (NULL);
+ }
+
if (connp->conn_ilg == NULL) {
connp->conn_ilg = GETSTRUCT(ilg_t, ILG_ALLOC_CHUNK);
- if (connp->conn_ilg == NULL)
+ if (connp->conn_ilg == NULL) {
+ *errp = ENOMEM;
return (NULL);
+ }
connp->conn_ilg_allocated = ILG_ALLOC_CHUNK;
connp->conn_ilg_inuse = 0;
}
@@ -241,12 +247,15 @@ conn_ilg_alloc(conn_t *connp)
* ilg_delete_all() will have to be changed when
* this logic is changed.
*/
+ *errp = EBUSY;
return (NULL);
}
curcnt = connp->conn_ilg_allocated;
new = GETSTRUCT(ilg_t, curcnt + ILG_ALLOC_CHUNK);
- if (new == NULL)
+ if (new == NULL) {
+ *errp = ENOMEM;
return (NULL);
+ }
bcopy(connp->conn_ilg, new, sizeof (ilg_t) * curcnt);
mi_free((char *)connp->conn_ilg);
connp->conn_ilg = new;
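(Illustrative aside -- a user-level analogue of the conn_ilg_alloc() contract
above: grow the array a chunk at a time, return NULL on failure, and report
the reason via *errp.  This is a standalone sketch, not kernel code; the names
and sizes are made up.)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#define	CHUNK	4

typedef struct {
	int	*slots;
	int	inuse;
	int	allocated;
} tbl_t;

static int *
slot_alloc(tbl_t *t, int *errp)
{
	int *new;

	if (t->inuse == t->allocated) {
		new = realloc(t->slots, (t->allocated + CHUNK) * sizeof (int));
		if (new == NULL) {
			*errp = ENOMEM;	/* caller learns why we failed */
			return (NULL);
		}
		t->slots = new;
		t->allocated += CHUNK;
	}
	return (&t->slots[t->inuse++]);
}

int
main(void)
{
	tbl_t t = { NULL, 0, 0 };
	int err, i, *sp;

	for (i = 0; i < 10; i++) {
		if ((sp = slot_alloc(&t, &err)) == NULL) {
			(void) fprintf(stderr, "slot_alloc: %s\n",
			    strerror(err));
			return (1);
		}
		*sp = i;
	}
	(void) printf("allocated %d, inuse %d\n", t.allocated, t.inuse);
	free(t.slots);
	return (0);
}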
@@ -378,42 +387,6 @@ ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, slist_t *flist)
}
}
-/*
- * If the given interface has failed, choose a new one to join on so
- * that we continue to receive packets. ilg_orig_ifindex remembers
- * what the application used to join on so that we know the ilg to
- * delete even though we change the ill here. Callers will store the
- * ilg returned from this function in ilg_ill. Thus when we receive
- * a packet on ilg_ill, conn_wantpacket_v6 will deliver the packets.
- *
- * This function must be called as writer so we can walk the group
- * list and examine flags without holding a lock.
- */
-ill_t *
-ip_choose_multi_ill(ill_t *ill, const in6_addr_t *grp)
-{
- ill_t *till;
- ill_group_t *illgrp = ill->ill_group;
-
- ASSERT(IAM_WRITER_ILL(ill));
-
- if (IN6_IS_ADDR_UNSPECIFIED(grp) || illgrp == NULL)
- return (ill);
-
- if ((ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE)) == 0)
- return (ill);
-
- till = illgrp->illgrp_ill;
- while (till != NULL &&
- (till->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE))) {
- till = till->ill_group_next;
- }
- if (till != NULL)
- return (till);
-
- return (ill);
-}
-
static int
ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist,
boolean_t isv6)
@@ -560,8 +533,7 @@ ilm_update_del(ilm_t *ilm, boolean_t isv6)
}
/*
- * INADDR_ANY means all multicast addresses. This is only used
- * by the multicast router.
+ * INADDR_ANY means all multicast addresses.
* INADDR_ANY is stored as IPv6 unspecified addr.
*/
int
@@ -578,40 +550,31 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat,
if (!CLASSD(group) && group != INADDR_ANY)
return (EINVAL);
+ if (IS_UNDER_IPMP(ill))
+ return (EINVAL);
+
/*
- * INADDR_ANY is represented as the IPv6 unspecifed addr.
+ * INADDR_ANY is represented as the IPv6 unspecified addr.
*/
if (group == INADDR_ANY)
v6group = ipv6_all_zeros;
else
IN6_IPADDR_TO_V4MAPPED(group, &v6group);
- mutex_enter(&ill->ill_lock);
ilm = ilm_lookup_ipif(ipif, group);
- mutex_exit(&ill->ill_lock);
/*
* Since we are writer, we know the ilm_flags itself cannot
* change at this point, and ilm_lookup_ipif would not have
* returned a DELETED ilm. However, the data path can free
- * ilm->next via ilm_walker_cleanup() so we can safely
+ * ilm->ilm_next via ilm_walker_cleanup() so we can safely
* access anything in ilm except ilm_next (for safe access to
- * ilm_next we'd have to take the ill_lock).
+ * ilm_next we'd have to take the ill_lock).
*/
if (ilm != NULL)
return (ilm_update_add(ilm, ilgstat, ilg_flist, B_FALSE));
- /*
- * ilms are associated with ipifs in IPv4. It moves with the
- * ipif if the ipif moves to a new ill when the interface
- * fails. Thus we really don't check whether the ipif_ill
- * has failed like in IPv6. If it has FAILED the ipif
- * will move (daemon will move it) and hence the ilm, if the
- * ipif is not IPIF_NOFAILOVER. For the IPIF_NOFAILOVER ipifs,
- * we continue to receive in the same place even if the
- * interface fails.
- */
ilm = ilm_add_v6(ipif, &v6group, ilgstat, ilg_fmode, ilg_flist,
- ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid);
+ ipif->ipif_zoneid);
if (ilm == NULL)
return (ENOMEM);
@@ -623,10 +586,7 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat,
*/
if (ilm_numentries_v6(ill, &v6group) > 1)
return (0);
- if (ill->ill_group == NULL)
- ret = ill_join_allmulti(ill);
- else
- ret = ill_nominate_mcast_rcv(ill->ill_group);
+ ret = ill_join_allmulti(ill);
if (ret != 0)
ilm_delete(ilm);
return (ret);
@@ -646,12 +606,8 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat,
/*
* The unspecified address means all multicast addresses.
- * This is only used by the multicast router.
*
- * ill identifies the interface to join on; it may not match the
- * interface requested by the application of a failover has taken
- * place. orig_ifindex always identifies the interface requested
- * by the app.
+ * ill identifies the interface to join on.
*
* ilgstat tells us if there's an ilg associated with this join,
* and if so, if it's a new ilg or a change to an existing one.
@@ -659,9 +615,8 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat,
* the ilg (and will be EXCLUDE {NULL} in the case of no ilg).
*/
int
-ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex,
- zoneid_t zoneid, ilg_stat_t ilgstat, mcast_record_t ilg_fmode,
- slist_t *ilg_flist)
+ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid,
+ ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist)
{
ilm_t *ilm;
int ret;
@@ -673,37 +628,20 @@ ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex,
return (EINVAL);
}
+ if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_MC_SOLICITEDNODE(v6group))
+ return (EINVAL);
+
/*
- * An ilm is uniquely identified by the tuple of (group, ill,
- * orig_ill). group is the multicast group address, ill is
- * the interface on which it is currently joined, and orig_ill
- * is the interface on which the application requested the
- * join. orig_ill and ill are the same unless orig_ill has
- * failed over.
- *
- * Both orig_ill and ill are required, which means we may have
- * 2 ilms on an ill for the same group, but with different
- * orig_ills. These must be kept separate, so that when failback
- * occurs, the appropriate ilms are moved back to their orig_ill
- * without disrupting memberships on the ill to which they had
- * been moved.
- *
- * In order to track orig_ill, we store orig_ifindex in the
- * ilm and ilg.
+ * An ilm is uniquely identified by the tuple of (group, ill) where
+ * `group' is the multicast group address, and `ill' is the interface
+ * on which it is currently joined.
*/
- mutex_enter(&ill->ill_lock);
- ilm = ilm_lookup_ill_index_v6(ill, v6group, orig_ifindex, zoneid);
- mutex_exit(&ill->ill_lock);
+ ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid);
if (ilm != NULL)
return (ilm_update_add(ilm, ilgstat, ilg_flist, B_TRUE));
- /*
- * We need to remember where the application really wanted
- * to join. This will be used later if we want to failback
- * to the original interface.
- */
ilm = ilm_add_v6(ill->ill_ipif, v6group, ilgstat, ilg_fmode,
- ilg_flist, orig_ifindex, zoneid);
+ ilg_flist, zoneid);
if (ilm == NULL)
return (ENOMEM);
@@ -715,11 +653,7 @@ ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex,
*/
if (ilm_numentries_v6(ill, v6group) > 1)
return (0);
- if (ill->ill_group == NULL)
- ret = ill_join_allmulti(ill);
- else
- ret = ill_nominate_mcast_rcv(ill->ill_group);
-
+ ret = ill_join_allmulti(ill);
if (ret != 0)
ilm_delete(ilm);
return (ret);
@@ -756,6 +690,14 @@ ip_ll_send_enabmulti_req(ill_t *ill, const in6_addr_t *v6groupp)
ASSERT(IAM_WRITER_ILL(ill));
/*
+ * If we're on the IPMP ill, use the nominated multicast interface to
+ * send and receive DLPI messages, if one exists. (If none exists,
+ * there are no usable interfaces and thus nothing to do.)
+ */
+ if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
+ return (0);
+
+ /*
* Create a AR_ENTRY_SQUERY message with a dl_enabmulti_req tacked
* on.
*/
@@ -842,9 +784,8 @@ ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *v6groupp)
}
/*
- * INADDR_ANY means all multicast addresses. This is only used
- * by the multicast router.
- * INADDR_ANY is stored as the IPv6 unspecifed addr.
+ * INADDR_ANY means all multicast addresses.
+ * INADDR_ANY is stored as the IPv6 unspecified addr.
*/
int
ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving)
@@ -859,7 +800,7 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving)
return (EINVAL);
/*
- * INADDR_ANY is represented as the IPv6 unspecifed addr.
+ * INADDR_ANY is represented as the IPv6 unspecified addr.
*/
if (group == INADDR_ANY)
v6group = ipv6_all_zeros;
@@ -870,9 +811,7 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving)
* Look for a match on the ipif.
* (IP_DROP_MEMBERSHIP specifies an ipif using an IP address).
*/
- mutex_enter(&ill->ill_lock);
ilm = ilm_lookup_ipif(ipif, group);
- mutex_exit(&ill->ill_lock);
if (ilm == NULL)
return (ENOENT);
@@ -897,11 +836,9 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving)
return (0);
/* If we never joined, then don't leave. */
- if (ill->ill_join_allmulti) {
+ if (ill->ill_join_allmulti)
ill_leave_allmulti(ill);
- if (ill->ill_group != NULL)
- (void) ill_nominate_mcast_rcv(ill->ill_group);
- }
+
return (0);
}
@@ -921,11 +858,10 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving)
/*
* The unspecified address means all multicast addresses.
- * This is only used by the multicast router.
*/
int
-ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex,
- zoneid_t zoneid, boolean_t no_ilg, boolean_t leaving)
+ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid,
+ boolean_t no_ilg, boolean_t leaving)
{
ipif_t *ipif;
ilm_t *ilm;
@@ -938,25 +874,8 @@ ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex,
/*
* Look for a match on the ill.
- * (IPV6_LEAVE_GROUP specifies an ill using an ifindex).
- *
- * Similar to ip_addmulti_v6, we should always look using
- * the orig_ifindex.
- *
- * 1) If orig_ifindex is different from ill's ifindex
- * we should have an ilm with orig_ifindex created in
- * ip_addmulti_v6. We should delete that here.
- *
- * 2) If orig_ifindex is same as ill's ifindex, we should
- * not delete the ilm that is temporarily here because of
- * a FAILOVER. Those ilms will have a ilm_orig_ifindex
- * different from ill's ifindex.
- *
- * Thus, always lookup using orig_ifindex.
*/
- mutex_enter(&ill->ill_lock);
- ilm = ilm_lookup_ill_index_v6(ill, v6group, orig_ifindex, zoneid);
- mutex_exit(&ill->ill_lock);
+ ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid);
if (ilm == NULL)
return (ENOENT);
@@ -985,11 +904,9 @@ ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex,
return (0);
/* If we never joined, then don't leave. */
- if (ill->ill_join_allmulti) {
+ if (ill->ill_join_allmulti)
ill_leave_allmulti(ill);
- if (ill->ill_group != NULL)
- (void) ill_nominate_mcast_rcv(ill->ill_group);
- }
+
return (0);
}
@@ -1020,6 +937,13 @@ ip_ll_send_disabmulti_req(ill_t *ill, const in6_addr_t *v6groupp)
uint32_t addrlen, addroff;
ASSERT(IAM_WRITER_ILL(ill));
+
+ /*
+ * See comment in ip_ll_send_enabmulti_req().
+ */
+ if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
+ return (0);
+
/*
* Create a AR_ENTRY_SQUERY message with a dl_disabmulti_req tacked
* on.
@@ -1099,16 +1023,16 @@ ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *v6group)
}
/*
- * Make the driver pass up all multicast packets
- *
- * With ill groups, the caller makes sure that there is only
- * one ill joining the allmulti group.
+ * Make the driver pass up all multicast packets. NOTE: to keep callers
+ * IPMP-unaware, if an IPMP ill is passed in, the ill_join_allmulti flag is
+ * set on it (rather than the cast ill).
*/
int
ill_join_allmulti(ill_t *ill)
{
mblk_t *promiscon_mp, *promiscoff_mp;
uint32_t addrlen, addroff;
+ ill_t *join_ill = ill;
ASSERT(IAM_WRITER_ILL(ill));
@@ -1120,7 +1044,13 @@ ill_join_allmulti(ill_t *ill)
return (0);
}
- ASSERT(!ill->ill_join_allmulti);
+ /*
+ * See comment in ip_ll_send_enabmulti_req().
+ */
+ if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
+ return (0);
+
+ ASSERT(!join_ill->ill_join_allmulti);
/*
* Create a DL_PROMISCON_REQ message and send it directly to the DLPI
@@ -1144,20 +1074,18 @@ ill_join_allmulti(ill_t *ill)
ill_dlpi_send(ill, promiscon_mp);
}
- ill->ill_join_allmulti = B_TRUE;
+ join_ill->ill_join_allmulti = B_TRUE;
return (0);
}
/*
* Make the driver stop passing up all multicast packets
- *
- * With ill groups, we need to nominate some other ill as
- * this ipif->ipif_ill is leaving the group.
*/
void
ill_leave_allmulti(ill_t *ill)
{
- mblk_t *promiscoff_mp = ill->ill_promiscoff_mp;
+ mblk_t *promiscoff_mp;
+ ill_t *leave_ill = ill;
ASSERT(IAM_WRITER_ILL(ill));
@@ -1169,7 +1097,13 @@ ill_leave_allmulti(ill_t *ill)
return;
}
- ASSERT(ill->ill_join_allmulti);
+ /*
+ * See comment in ip_ll_send_enabmulti_req().
+ */
+ if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
+ return;
+
+ ASSERT(leave_ill->ill_join_allmulti);
/*
* Create a DL_PROMISCOFF_REQ message and send it directly to
@@ -1179,12 +1113,13 @@ ill_leave_allmulti(ill_t *ill)
*/
if ((ill->ill_net_type == IRE_IF_RESOLVER) &&
!(ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST)) {
+ promiscoff_mp = ill->ill_promiscoff_mp;
ASSERT(promiscoff_mp != NULL);
ill->ill_promiscoff_mp = NULL;
ill_dlpi_send(ill, promiscoff_mp);
}
- ill->ill_join_allmulti = B_FALSE;
+ leave_ill->ill_join_allmulti = B_FALSE;
}
static ill_t *
@@ -1213,22 +1148,35 @@ int
ip_join_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst)
{
ill_t *ill;
- int ret;
+ int ret = 0;
if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL)
return (ENODEV);
+
+ /*
+ * The ip_addmulti*() functions won't allow IPMP underlying interfaces
+ * to join allmulti since only the nominated underlying interface in
+ * the group should receive multicast. We silently succeed to avoid
+ * having to teach IPobs (currently the only caller of this routine)
+ * to ignore failures in this case.
+ */
+ if (IS_UNDER_IPMP(ill))
+ goto out;
+
if (isv6) {
- ret = ip_addmulti_v6(&ipv6_all_zeros, ill, ifindex,
- ill->ill_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
+ ret = ip_addmulti_v6(&ipv6_all_zeros, ill, ill->ill_zoneid,
+ ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
} else {
ret = ip_addmulti(INADDR_ANY, ill->ill_ipif, ILGSTAT_NONE,
MODE_IS_EXCLUDE, NULL);
}
ill->ill_ipallmulti_cnt++;
+out:
ipsq_exit(ill->ill_phyint->phyint_ipsq);
return (ret);
}
+
int
ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst)
{
@@ -1236,14 +1184,17 @@ ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst)
if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL)
return (ENODEV);
- ASSERT(ill->ill_ipallmulti_cnt != 0);
- if (isv6) {
- (void) ip_delmulti_v6(&ipv6_all_zeros, ill, ifindex,
- ill->ill_zoneid, B_TRUE, B_TRUE);
- } else {
- (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, B_TRUE);
+
+ if (ill->ill_ipallmulti_cnt > 0) {
+ if (isv6) {
+ (void) ip_delmulti_v6(&ipv6_all_zeros, ill,
+ ill->ill_zoneid, B_TRUE, B_TRUE);
+ } else {
+ (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE,
+ B_TRUE);
+ }
+ ill->ill_ipallmulti_cnt--;
}
- ill->ill_ipallmulti_cnt--;
ipsq_exit(ill->ill_phyint->phyint_ipsq);
return (0);
}
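
A minimal sketch of how the two entry points above are meant to be paired by an observability consumer (IPobs is currently the only caller, per the comment in ip_join_allmulti()). The function name example_observe_link() is hypothetical and not part of the patch; the point is that, because a join on an IPMP underlying interface silently succeeds, such a caller needs no IPMP awareness of its own.

/* Hypothetical caller -- illustrative sketch only, not part of the patch. */
static int
example_observe_link(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst)
{
	int err;

	/* Ask the driver to pass up all multicast on this interface. */
	if ((err = ip_join_allmulti(ifindex, isv6, ipst)) != 0)
		return (err);

	/* ... observe traffic here ... */

	/* Undo the join; does nothing if the join was silently elided. */
	(void) ip_leave_allmulti(ifindex, isv6, ipst);
	return (0);
}
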
@@ -1260,8 +1211,7 @@ ip_purge_allmulti(ill_t *ill)
for (; ill->ill_ipallmulti_cnt > 0; ill->ill_ipallmulti_cnt--) {
if (ill->ill_isv6) {
(void) ip_delmulti_v6(&ipv6_all_zeros, ill,
- ill->ill_phyint->phyint_ifindex, ill->ill_zoneid,
- B_TRUE, B_TRUE);
+ ill->ill_zoneid, B_TRUE, B_TRUE);
} else {
(void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE,
B_TRUE);
@@ -1539,13 +1489,14 @@ void
ill_recover_multicast(ill_t *ill)
{
ilm_t *ilm;
+ ipif_t *ipif = ill->ill_ipif;
char addrbuf[INET6_ADDRSTRLEN];
ASSERT(IAM_WRITER_ILL(ill));
ill->ill_need_recover_multicast = 0;
- ILM_WALKER_HOLD(ill);
+ ill_ilm_walker_hold(ill);
for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
/*
* Check how many ipif's that have members in this group -
@@ -1553,47 +1504,45 @@ ill_recover_multicast(ill_t *ill)
* in the list.
*/
if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 &&
- ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm)
+ ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE,
+ ALL_ZONES) != ilm) {
continue;
- ip1dbg(("ill_recover_multicast: %s\n",
- inet_ntop(AF_INET6, &ilm->ilm_v6addr, addrbuf,
- sizeof (addrbuf))));
+ }
+
+ ip1dbg(("ill_recover_multicast: %s\n", inet_ntop(AF_INET6,
+ &ilm->ilm_v6addr, addrbuf, sizeof (addrbuf))));
+
if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
- if (ill->ill_group == NULL) {
- (void) ill_join_allmulti(ill);
- } else {
- /*
- * We don't want to join on this ill,
- * if somebody else in the group has
- * already been nominated.
- */
- (void) ill_nominate_mcast_rcv(ill->ill_group);
- }
+ (void) ill_join_allmulti(ill);
} else {
- (void) ip_ll_addmulti_v6(ill->ill_ipif,
- &ilm->ilm_v6addr);
+ if (ill->ill_isv6)
+ mld_joingroup(ilm);
+ else
+ igmp_joingroup(ilm);
+
+ (void) ip_ll_addmulti_v6(ipif, &ilm->ilm_v6addr);
}
}
- ILM_WALKER_RELE(ill);
+ ill_ilm_walker_rele(ill);
+
}
/*
* The opposite of ill_recover_multicast() -- leaves all multicast groups
- * that were explicitly joined. Note that both these functions could be
- * disposed of if we enhanced ARP to allow us to handle DL_DISABMULTI_REQ
- * and DL_ENABMULTI_REQ messages when an interface is down.
+ * that were explicitly joined.
*/
void
ill_leave_multicast(ill_t *ill)
{
ilm_t *ilm;
+ ipif_t *ipif = ill->ill_ipif;
char addrbuf[INET6_ADDRSTRLEN];
ASSERT(IAM_WRITER_ILL(ill));
ill->ill_need_recover_multicast = 1;
- ILM_WALKER_HOLD(ill);
+ ill_ilm_walker_hold(ill);
for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
/*
* Check how many ipif's that have members in this group -
@@ -1601,25 +1550,26 @@ ill_leave_multicast(ill_t *ill)
* in the list.
*/
if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 &&
- ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm)
+ ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE,
+ ALL_ZONES) != ilm) {
continue;
- ip1dbg(("ill_leave_multicast: %s\n",
- inet_ntop(AF_INET6, &ilm->ilm_v6addr, addrbuf,
- sizeof (addrbuf))));
+ }
+
+ ip1dbg(("ill_leave_multicast: %s\n", inet_ntop(AF_INET6,
+ &ilm->ilm_v6addr, addrbuf, sizeof (addrbuf))));
+
if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
ill_leave_allmulti(ill);
- /*
- * If we were part of an IPMP group, then
- * ill_handoff_responsibility() has already
- * nominated a new member (so we don't).
- */
- ASSERT(ill->ill_group == NULL);
} else {
- (void) ip_ll_delmulti_v6(ill->ill_ipif,
- &ilm->ilm_v6addr);
+ if (ill->ill_isv6)
+ mld_leavegroup(ilm);
+ else
+ igmp_leavegroup(ilm);
+
+ (void) ip_ll_delmulti_v6(ipif, &ilm->ilm_v6addr);
}
}
- ILM_WALKER_RELE(ill);
+ ill_ilm_walker_rele(ill);
}
/* Find an ilm for matching the ill */
@@ -1628,91 +1578,79 @@ ilm_lookup_ill(ill_t *ill, ipaddr_t group, zoneid_t zoneid)
{
in6_addr_t v6group;
- ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock));
/*
- * INADDR_ANY is represented as the IPv6 unspecifed addr.
+ * INADDR_ANY is represented as the IPv6 unspecified addr.
*/
if (group == INADDR_ANY)
v6group = ipv6_all_zeros;
else
IN6_IPADDR_TO_V4MAPPED(group, &v6group);
- return (ilm_lookup_ill_v6(ill, &v6group, zoneid));
+ return (ilm_lookup_ill_v6(ill, &v6group, B_TRUE, zoneid));
}
/*
- * Find an ilm for matching the ill. All the ilm lookup functions
- * ignore ILM_DELETED ilms. These have been logically deleted, and
- * igmp and linklayer disable multicast have been done. Only mi_free
- * yet to be done. Still there in the list due to ilm_walkers. The
- * last walker will release it.
+ * Find an ilm for address `v6group' on `ill' and zone `zoneid' (which may be
+ * ALL_ZONES). In general, if `ill' is in an IPMP group, we will match
+ * against any ill in the group. However, if `restrict_solicited' is set,
+ * then specifically for IPv6 solicited-node multicast, the match will be
+ * restricted to the specified `ill'.
*/
ilm_t *
-ilm_lookup_ill_v6(ill_t *ill, const in6_addr_t *v6group, zoneid_t zoneid)
+ilm_lookup_ill_v6(ill_t *ill, const in6_addr_t *v6group,
+ boolean_t restrict_solicited, zoneid_t zoneid)
{
ilm_t *ilm;
+ ilm_walker_t ilw;
+ boolean_t restrict_ill = B_FALSE;
- ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock));
+ /*
+ * In general, underlying interfaces cannot have multicast memberships
+ * and thus lookups always match across the illgrp. However, we must
+ * allow IPv6 solicited-node multicast memberships on underlying
+ * interfaces, and thus an IPMP meta-interface and one of its
+ * underlying ills may have the same solicited-node multicast address.
+ * In that case, we need to restrict the lookup to the requested ill.
+ * However, we may receive packets on an underlying interface that
+ * are for the corresponding IPMP interface's solicited-node multicast
+ * address, and thus in that case we need to match across the group --
+ * hence the unfortunate `restrict_solicited' argument.
+ */
+ if (IN6_IS_ADDR_MC_SOLICITEDNODE(v6group) && restrict_solicited)
+ restrict_ill = (IS_IPMP(ill) || IS_UNDER_IPMP(ill));
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
- if (ilm->ilm_flags & ILM_DELETED)
+ ilm = ilm_walker_start(&ilw, ill);
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
+ if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group))
continue;
- if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) &&
- (zoneid == ALL_ZONES || zoneid == ilm->ilm_zoneid))
- return (ilm);
- }
- return (NULL);
-}
-
-ilm_t *
-ilm_lookup_ill_index_v6(ill_t *ill, const in6_addr_t *v6group, int index,
- zoneid_t zoneid)
-{
- ilm_t *ilm;
-
- ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock));
-
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
- if (ilm->ilm_flags & ILM_DELETED)
+ if (zoneid != ALL_ZONES && zoneid != ilm->ilm_zoneid)
continue;
- if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) &&
- (zoneid == ALL_ZONES || zoneid == ilm->ilm_zoneid) &&
- ilm->ilm_orig_ifindex == index) {
- return (ilm);
+ if (!restrict_ill || ill == (ill->ill_isv6 ?
+ ilm->ilm_ill : ilm->ilm_ipif->ipif_ill)) {
+ break;
}
}
- return (NULL);
+ ilm_walker_finish(&ilw);
+ return (ilm);
}
-
/*
- * Found an ilm for the ipif. Only needed for IPv4 which does
+ * Find an ilm for the ipif. Only needed for IPv4 which does
* ipif specific socket options.
*/
ilm_t *
ilm_lookup_ipif(ipif_t *ipif, ipaddr_t group)
{
- ill_t *ill = ipif->ipif_ill;
- ilm_t *ilm;
- in6_addr_t v6group;
-
- ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock));
- /*
- * INADDR_ANY is represented as the IPv6 unspecifed addr.
- */
- if (group == INADDR_ANY)
- v6group = ipv6_all_zeros;
- else
- IN6_IPADDR_TO_V4MAPPED(group, &v6group);
+ ilm_t *ilm;
+ ilm_walker_t ilw;
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
- if (ilm->ilm_flags & ILM_DELETED)
- continue;
- if (ilm->ilm_ipif == ipif &&
- IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, &v6group))
- return (ilm);
+ ilm = ilm_walker_start(&ilw, ipif->ipif_ill);
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
+ if (ilm->ilm_ipif == ipif && ilm->ilm_addr == group)
+ break;
}
- return (NULL);
+ ilm_walker_finish(&ilw);
+ return (ilm);
}
/*
@@ -1739,8 +1677,7 @@ ilm_numentries_v6(ill_t *ill, const in6_addr_t *v6group)
/* Caller guarantees that the group is not already on the list */
static ilm_t *
ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat,
- mcast_record_t ilg_fmode, slist_t *ilg_flist, int orig_ifindex,
- zoneid_t zoneid)
+ mcast_record_t ilg_fmode, slist_t *ilg_flist, zoneid_t zoneid)
{
ill_t *ill = ipif->ipif_ill;
ilm_t *ilm;
@@ -1783,19 +1720,10 @@ ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat,
(char *), "ilm", (void *), ilm);
ipif->ipif_ilm_cnt++;
}
+
ASSERT(ill->ill_ipst);
ilm->ilm_ipst = ill->ill_ipst; /* No netstack_hold */
- /*
- * After this if ilm moves to a new ill, we don't change
- * the ilm_orig_ifindex. Thus, if ill_index != ilm_orig_ifindex,
- * it has been moved. Indexes don't match even when the application
- * wants to join on a FAILED/INACTIVE interface because we choose
- * a new interface to join in. This is considered as an implicit
- * move.
- */
- ilm->ilm_orig_ifindex = orig_ifindex;
-
ASSERT(!(ipif->ipif_state_flags & IPIF_CONDEMNED));
ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED));
@@ -1969,6 +1897,108 @@ ilm_delete(ilm_t *ilm)
}
}
+/* Increment the ILM walker count for `ill' */
+static void
+ill_ilm_walker_hold(ill_t *ill)
+{
+ mutex_enter(&ill->ill_lock);
+ ill->ill_ilm_walker_cnt++;
+ mutex_exit(&ill->ill_lock);
+}
+
+/* Decrement the ILM walker count for `ill' */
+static void
+ill_ilm_walker_rele(ill_t *ill)
+{
+ mutex_enter(&ill->ill_lock);
+ ill->ill_ilm_walker_cnt--;
+ if (ill->ill_ilm_walker_cnt == 0 && ill->ill_ilm_cleanup_reqd)
+ ilm_walker_cleanup(ill); /* drops ill_lock */
+ else
+ mutex_exit(&ill->ill_lock);
+}
+
+/*
+ * Start walking the ILMs associated with `ill'; the first ILM in the walk
+ * (if any) is returned. State associated with the walk is stored in `ilw'.
+ * Note that walks associated with interfaces under IPMP also walk the ILMs
+ * on the associated IPMP interface; this is handled transparently to callers
+ * via ilm_walker_step(). (Usually with IPMP all ILMs will be on the IPMP
+ * interface; the only exception is to support IPv6 test addresses, which
+ * require ILMs for their associated solicited-node multicast addresses.)
+ */
+ilm_t *
+ilm_walker_start(ilm_walker_t *ilw, ill_t *ill)
+{
+ ilw->ilw_ill = ill;
+ if (IS_UNDER_IPMP(ill))
+ ilw->ilw_ipmp_ill = ipmp_ill_hold_ipmp_ill(ill);
+ else
+ ilw->ilw_ipmp_ill = NULL;
+
+ ill_ilm_walker_hold(ill);
+ if (ilw->ilw_ipmp_ill != NULL)
+ ill_ilm_walker_hold(ilw->ilw_ipmp_ill);
+
+ if (ilw->ilw_ipmp_ill != NULL && ilw->ilw_ipmp_ill->ill_ilm != NULL)
+ ilw->ilw_walk_ill = ilw->ilw_ipmp_ill;
+ else
+ ilw->ilw_walk_ill = ilw->ilw_ill;
+
+ return (ilm_walker_step(ilw, NULL));
+}
+
+/*
+ * Helper function for ilm_walker_step() that returns the next ILM
+ * associated with `ilw', regardless of whether it's deleted.
+ */
+static ilm_t *
+ilm_walker_step_all(ilm_walker_t *ilw, ilm_t *ilm)
+{
+ if (ilm == NULL)
+ return (ilw->ilw_walk_ill->ill_ilm);
+
+ if (ilm->ilm_next != NULL)
+ return (ilm->ilm_next);
+
+ if (ilw->ilw_ipmp_ill != NULL && IS_IPMP(ilw->ilw_walk_ill)) {
+ ilw->ilw_walk_ill = ilw->ilw_ill;
+ /*
+ * It's possible that ilw_ill left the group during our walk,
+ * so we can't ASSERT() that it's under IPMP. Callers that
+ * care will be writer on the IPSQ anyway.
+ */
+ return (ilw->ilw_walk_ill->ill_ilm);
+ }
+ return (NULL);
+}
+
+/*
+ * Step to the next ILM associated with `ilw'.
+ */
+ilm_t *
+ilm_walker_step(ilm_walker_t *ilw, ilm_t *ilm)
+{
+ while ((ilm = ilm_walker_step_all(ilw, ilm)) != NULL) {
+ if (!(ilm->ilm_flags & ILM_DELETED))
+ break;
+ }
+ return (ilm);
+}
+
+/*
+ * Finish the ILM walk associated with `ilw'.
+ */
+void
+ilm_walker_finish(ilm_walker_t *ilw)
+{
+ ill_ilm_walker_rele(ilw->ilw_ill);
+ if (ilw->ilw_ipmp_ill != NULL) {
+ ill_ilm_walker_rele(ilw->ilw_ipmp_ill);
+ ill_refrele(ilw->ilw_ipmp_ill);
+ }
+ bzero(ilw, sizeof (*ilw));
+}
/*
* Looks up the appropriate ipif given a v4 multicast group and interface
@@ -2256,16 +2286,15 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf,
* didn't find an ilg, there's nothing to do.
*/
if (!leave_grp)
- ilg = conn_ilg_alloc(connp);
+ ilg = conn_ilg_alloc(connp, &err);
if (leave_grp || ilg == NULL) {
mutex_exit(&connp->conn_lock);
- return (leave_grp ? 0 : ENOMEM);
+ return (leave_grp ? 0 : err);
}
ilgstat = ILGSTAT_NEW;
IN6_IPADDR_TO_V4MAPPED(grp, &ilg->ilg_v6group);
ilg->ilg_ipif = ipif;
ilg->ilg_ill = NULL;
- ilg->ilg_orig_ifindex = 0;
} else if (leave_grp) {
ilg_delete(connp, ilg, NULL);
mutex_exit(&connp->conn_lock);
@@ -2389,7 +2418,7 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf,
const struct in6_addr *grp, ill_t *ill)
{
ilg_t *ilg;
- int i, orig_ifindex, orig_fmode, new_fmode, err;
+ int i, orig_fmode, new_fmode, err;
slist_t *orig_filter = NULL;
slist_t *new_filter = NULL;
struct sockaddr_storage *sl;
@@ -2409,65 +2438,31 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf,
ASSERT(IAM_WRITER_ILL(ill));
- /*
- * Use the ifindex to do the lookup. We can't use the ill
- * directly because ilg_ill could point to a different ill
- * if things have moved.
- */
- orig_ifindex = ill->ill_phyint->phyint_ifindex;
-
mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ill_index_v6(connp, grp, orig_ifindex);
+ ilg = ilg_lookup_ill_v6(connp, grp, ill);
if (ilg == NULL) {
/*
* if the request was actually to leave, and we
* didn't find an ilg, there's nothing to do.
*/
if (!leave_grp)
- ilg = conn_ilg_alloc(connp);
+ ilg = conn_ilg_alloc(connp, &err);
if (leave_grp || ilg == NULL) {
mutex_exit(&connp->conn_lock);
- return (leave_grp ? 0 : ENOMEM);
+ return (leave_grp ? 0 : err);
}
ilgstat = ILGSTAT_NEW;
ilg->ilg_v6group = *grp;
ilg->ilg_ipif = NULL;
- /*
- * Choose our target ill to join on. This might be
- * different from the ill we've been given if it's
- * currently down and part of a group.
- *
- * new ill is not refheld; we are writer.
- */
- ill = ip_choose_multi_ill(ill, grp);
- ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED));
ilg->ilg_ill = ill;
- /*
- * Remember the index that we joined on, so that we can
- * successfully delete them later on and also search for
- * duplicates if the application wants to join again.
- */
- ilg->ilg_orig_ifindex = orig_ifindex;
} else if (leave_grp) {
- /*
- * Use the ilg's current ill for the deletion,
- * we might have failed over.
- */
- ill = ilg->ilg_ill;
ilg_delete(connp, ilg, NULL);
mutex_exit(&connp->conn_lock);
- (void) ip_delmulti_v6(grp, ill, orig_ifindex,
- connp->conn_zoneid, B_FALSE, B_TRUE);
+ (void) ip_delmulti_v6(grp, ill, connp->conn_zoneid, B_FALSE,
+ B_TRUE);
return (0);
} else {
ilgstat = ILGSTAT_CHANGE;
- /*
- * The current ill might be different from the one we were
- * asked to join on (if failover has occurred); we should
- * join on the ill stored in the ilg. The original ill
- * is noted in ilg_orig_ifindex, which matched our request.
- */
- ill = ilg->ilg_ill;
/* preserve existing state in case ip_addmulti() fails */
orig_fmode = ilg->ilg_fmode;
if (ilg->ilg_filter == NULL) {
@@ -2531,8 +2526,8 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf,
mutex_exit(&connp->conn_lock);
- err = ip_addmulti_v6(grp, ill, orig_ifindex, connp->conn_zoneid,
- ilgstat, new_fmode, new_filter);
+ err = ip_addmulti_v6(grp, ill, connp->conn_zoneid, ilgstat, new_fmode,
+ new_filter);
if (err != 0) {
/*
* Restore the original filter state, or delete the
@@ -2541,7 +2536,7 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf,
* conn_lock.
*/
mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ill_index_v6(connp, grp, orig_ifindex);
+ ilg = ilg_lookup_ill_v6(connp, grp, ill);
ASSERT(ilg != NULL);
if (ilgstat == ILGSTAT_NEW) {
ilg_delete(connp, ilg, NULL);
@@ -3043,20 +3038,12 @@ ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group,
ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src)
{
ilg_t *ilg;
- ill_t *ilg_ill;
- uint_t ilg_orig_ifindex;
boolean_t leaving = B_TRUE;
ASSERT(IAM_WRITER_ILL(ill));
- /*
- * Use the index that we originally used to join. We can't
- * use the ill directly because ilg_ill could point to
- * a new ill if things have moved.
- */
mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ill_index_v6(connp, v6group,
- ill->ill_phyint->phyint_ifindex);
+ ilg = ilg_lookup_ill_v6(connp, v6group, ill);
if ((ilg == NULL) || (ilg->ilg_flags & ILG_DELETED)) {
mutex_exit(&connp->conn_lock);
return (EADDRNOTAVAIL);
@@ -3087,12 +3074,10 @@ ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group,
leaving = B_FALSE;
}
- ilg_ill = ilg->ilg_ill;
- ilg_orig_ifindex = ilg->ilg_orig_ifindex;
ilg_delete(connp, ilg, v6src);
mutex_exit(&connp->conn_lock);
- (void) ip_delmulti_v6(v6group, ilg_ill, ilg_orig_ifindex,
- connp->conn_zoneid, B_FALSE, leaving);
+ (void) ip_delmulti_v6(v6group, ill, connp->conn_zoneid, B_FALSE,
+ leaving);
return (0);
}
@@ -3345,10 +3330,10 @@ ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, mcast_record_t fmode,
if (ilg == NULL) {
ilgstat = ILGSTAT_NEW;
- if ((ilg = conn_ilg_alloc(connp)) == NULL) {
+ if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) {
mutex_exit(&connp->conn_lock);
l_free(new_filter);
- return (ENOMEM);
+ return (error);
}
if (src != INADDR_ANY) {
ilg->ilg_filter = l_alloc();
@@ -3369,7 +3354,6 @@ ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, mcast_record_t fmode,
}
ilg->ilg_ipif = ipif;
ilg->ilg_ill = NULL;
- ilg->ilg_orig_ifindex = 0;
ilg->ilg_fmode = fmode;
} else {
int index;
@@ -3437,7 +3421,6 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
mcast_record_t fmode, const in6_addr_t *v6src)
{
int error = 0;
- int orig_ifindex;
ilg_t *ilg;
ilg_stat_t ilgstat;
slist_t *new_filter = NULL;
@@ -3456,13 +3439,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
*/
mutex_enter(&connp->conn_lock);
- /*
- * Use the ifindex to do the lookup. We can't use the ill
- * directly because ilg_ill could point to a different ill if
- * things have moved.
- */
- orig_ifindex = ill->ill_phyint->phyint_ifindex;
- ilg = ilg_lookup_ill_index_v6(connp, v6group, orig_ifindex);
+ ilg = ilg_lookup_ill_v6(connp, v6group, ill);
/*
* Depending on the option we're handling, may or may not be okay
@@ -3501,10 +3478,10 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
}
if (ilg == NULL) {
- if ((ilg = conn_ilg_alloc(connp)) == NULL) {
+ if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) {
mutex_exit(&connp->conn_lock);
l_free(new_filter);
- return (ENOMEM);
+ return (error);
}
if (!IN6_IS_ADDR_UNSPECIFIED(v6src)) {
ilg->ilg_filter = l_alloc();
@@ -3521,22 +3498,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
ilg->ilg_v6group = *v6group;
ilg->ilg_fmode = fmode;
ilg->ilg_ipif = NULL;
- /*
- * Choose our target ill to join on. This might be different
- * from the ill we've been given if it's currently down and
- * part of a group.
- *
- * new ill is not refheld; we are writer.
- */
- ill = ip_choose_multi_ill(ill, v6group);
- ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED));
ilg->ilg_ill = ill;
- /*
- * Remember the orig_ifindex that we joined on, so that we
- * can successfully delete them later on and also search
- * for duplicates if the application wants to join again.
- */
- ilg->ilg_orig_ifindex = orig_ifindex;
} else {
int index;
if (ilg->ilg_fmode != fmode || IN6_IS_ADDR_UNSPECIFIED(v6src)) {
@@ -3560,13 +3522,6 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
ilgstat = ILGSTAT_CHANGE;
index = ilg->ilg_filter->sl_numsrc++;
ilg->ilg_filter->sl_addr[index] = *v6src;
- /*
- * The current ill might be different from the one we were
- * asked to join on (if failover has occurred); we should
- * join on the ill stored in the ilg. The original ill
- * is noted in ilg_orig_ifindex, which matched our request.
- */
- ill = ilg->ilg_ill;
}
/*
@@ -3584,8 +3539,8 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
* info for the ill, which involves looking at the status of
* all the ilgs associated with this group/interface pair.
*/
- error = ip_addmulti_v6(v6group, ill, orig_ifindex, connp->conn_zoneid,
- ilgstat, new_fmode, new_filter);
+ error = ip_addmulti_v6(v6group, ill, connp->conn_zoneid, ilgstat,
+ new_fmode, new_filter);
if (error != 0) {
/*
* But because we waited, we have to undo the ilg update
@@ -3595,7 +3550,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
in6_addr_t delsrc =
(ilgstat == ILGSTAT_NEW) ? ipv6_all_zeros : *v6src;
mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ill_index_v6(connp, v6group, orig_ifindex);
+ ilg = ilg_lookup_ill_v6(connp, v6group, ill);
ASSERT(ilg != NULL);
ilg_delete(connp, ilg, &delsrc);
mutex_exit(&connp->conn_lock);
@@ -3639,7 +3594,7 @@ ilg_lookup_ill_withsrc(conn_t *connp, ipaddr_t group, ipaddr_t src, ill_t *ill)
ASSERT(ilg->ilg_ill == NULL);
ilg_ill = ipif->ipif_ill;
ASSERT(!ilg_ill->ill_isv6);
- if (ilg_ill == ill &&
+ if (IS_ON_SAME_LAN(ilg_ill, ill) &&
IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, &v6group)) {
if (SLIST_IS_EMPTY(ilg->ilg_filter)) {
/* no source filter, so this is a match */
@@ -3692,7 +3647,7 @@ ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group,
continue;
ASSERT(ilg->ilg_ipif == NULL);
ASSERT(ilg_ill->ill_isv6);
- if (ilg_ill == ill &&
+ if (IS_ON_SAME_LAN(ilg_ill, ill) &&
IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) {
if (SLIST_IS_EMPTY(ilg->ilg_filter)) {
/* no source filter, so this is a match */
@@ -3724,35 +3679,6 @@ ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group,
}
/*
- * Get the ilg whose ilg_orig_ifindex is associated with ifindex.
- * This is useful when the interface fails and we have moved
- * to a new ill, but still would like to locate using the index
- * that we originally used to join. Used only for IPv6 currently.
- */
-static ilg_t *
-ilg_lookup_ill_index_v6(conn_t *connp, const in6_addr_t *v6group, int ifindex)
-{
- ilg_t *ilg;
- int i;
-
- ASSERT(MUTEX_HELD(&connp->conn_lock));
- for (i = 0; i < connp->conn_ilg_inuse; i++) {
- ilg = &connp->conn_ilg[i];
- if (ilg->ilg_ill == NULL ||
- (ilg->ilg_flags & ILG_DELETED) != 0)
- continue;
- /* ilg_ipif is NULL for V6 */
- ASSERT(ilg->ilg_ipif == NULL);
- ASSERT(ilg->ilg_orig_ifindex != 0);
- if (IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group) &&
- ilg->ilg_orig_ifindex == ifindex) {
- return (ilg);
- }
- }
- return (NULL);
-}
-
-/*
* Find an IPv6 ilg matching group and ill
*/
ilg_t *
@@ -3863,32 +3789,28 @@ ilg_delete_all(conn_t *connp)
in6_addr_t v6group;
boolean_t success;
ipsq_t *ipsq;
- int orig_ifindex;
mutex_enter(&connp->conn_lock);
retry:
ILG_WALKER_HOLD(connp);
- for (i = connp->conn_ilg_inuse - 1; i >= 0; ) {
+ for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
ilg = &connp->conn_ilg[i];
/*
* Since this walk is not atomic (we drop the
* conn_lock and wait in ipsq_enter) we need
* to check for the ILG_DELETED flag.
*/
- if (ilg->ilg_flags & ILG_DELETED) {
- /* Go to the next ilg */
- i--;
+ if (ilg->ilg_flags & ILG_DELETED)
continue;
- }
- v6group = ilg->ilg_v6group;
- if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
+ if (IN6_IS_ADDR_V4MAPPED(&ilg->ilg_v6group)) {
ipif = ilg->ilg_ipif;
ill = ipif->ipif_ill;
} else {
ipif = NULL;
ill = ilg->ilg_ill;
}
+
/*
* We may not be able to refhold the ill if the ill/ipif
* is changing. But we need to make sure that the ill will
@@ -3897,11 +3819,9 @@ retry:
* in which case the unplumb thread will handle the cleanup,
* and we move on to the next ilg.
*/
- if (!ill_waiter_inc(ill)) {
- /* Go to the next ilg */
- i--;
+ if (!ill_waiter_inc(ill))
continue;
- }
+
mutex_exit(&connp->conn_lock);
/*
* To prevent deadlock between ill close which waits inside
@@ -3916,51 +3836,31 @@ retry:
ipsq = ill->ill_phyint->phyint_ipsq;
ill_waiter_dcr(ill);
mutex_enter(&connp->conn_lock);
- if (!success) {
- /* Go to the next ilg */
- i--;
+ if (!success)
continue;
- }
/*
- * Make sure that nothing has changed under. For eg.
- * a failover/failback can change ilg_ill while we were
- * waiting to become exclusive above
+ * Move on if the ilg was deleted while conn_lock was dropped.
*/
- if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
- ipif = ilg->ilg_ipif;
- ill = ipif->ipif_ill;
- } else {
- ipif = NULL;
- ill = ilg->ilg_ill;
- }
- if (!IAM_WRITER_ILL(ill) || (ilg->ilg_flags & ILG_DELETED)) {
- /*
- * The ilg has changed under us probably due
- * to a failover or unplumb. Retry on the same ilg.
- */
+ if (ilg->ilg_flags & ILG_DELETED) {
mutex_exit(&connp->conn_lock);
ipsq_exit(ipsq);
mutex_enter(&connp->conn_lock);
continue;
}
v6group = ilg->ilg_v6group;
- orig_ifindex = ilg->ilg_orig_ifindex;
ilg_delete(connp, ilg, NULL);
mutex_exit(&connp->conn_lock);
- if (ipif != NULL)
+ if (ipif != NULL) {
(void) ip_delmulti(V4_PART_OF_V6(v6group), ipif,
B_FALSE, B_TRUE);
-
- else
- (void) ip_delmulti_v6(&v6group, ill, orig_ifindex,
+ } else {
+ (void) ip_delmulti_v6(&v6group, ill,
connp->conn_zoneid, B_FALSE, B_TRUE);
-
+ }
ipsq_exit(ipsq);
mutex_enter(&connp->conn_lock);
- /* Go to the next ilg */
- i--;
}
ILG_WALKER_RELE(connp);
@@ -4063,7 +3963,6 @@ conn_delete_ill(conn_t *connp, caddr_t arg)
int i;
char group_buf[INET6_ADDRSTRLEN];
in6_addr_t v6group;
- int orig_ifindex;
ilg_t *ilg;
/*
@@ -4097,11 +3996,10 @@ conn_delete_ill(conn_t *connp, caddr_t arg)
ill->ill_name));
v6group = ilg->ilg_v6group;
- orig_ifindex = ilg->ilg_orig_ifindex;
ilg_delete(connp, ilg, NULL);
mutex_exit(&connp->conn_lock);
- (void) ip_delmulti_v6(&v6group, ill, orig_ifindex,
+ (void) ip_delmulti_v6(&v6group, ill,
connp->conn_zoneid, B_FALSE, B_TRUE);
mutex_enter(&connp->conn_lock);
}
@@ -4115,7 +4013,6 @@ conn_delete_ill(conn_t *connp, caddr_t arg)
if (connp->conn_multicast_ill == ill) {
/* Revert to late binding */
connp->conn_multicast_ill = NULL;
- connp->conn_orig_multicast_ifindex = 0;
}
mutex_exit(&connp->conn_lock);
}
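
A minimal sketch of the caller pattern for the ILM walker interface added above (ilm_walker_start() / ilm_walker_step() / ilm_walker_finish()), which replaces the old ILM_WALKER_HOLD/RELE macros and ilm_lookup_ill_index_v6(). The lookup function below is hypothetical; ilm_lookup_ipif() and ilm_lookup_ill_v6() in the patch use the same shape.

/* Hypothetical lookup -- illustrative sketch only, not part of the patch. */
static ilm_t *
example_ilm_find(ill_t *ill, const in6_addr_t *v6group)
{
	ilm_t		*ilm;
	ilm_walker_t	ilw;

	/*
	 * The walker takes a walker hold on `ill' (and on the IPMP
	 * meta-interface, if `ill' is an underlying interface) and
	 * skips ILM_DELETED entries.
	 */
	ilm = ilm_walker_start(&ilw, ill);
	for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
		if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group))
			break;
	}
	ilm_walker_finish(&ilw);
	return (ilm);
}

The walker transparently covers the IPMP meta-interface's ILMs, so callers no longer need the per-orig_ifindex bookkeeping that the deleted code carried around.
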
diff --git a/usr/src/uts/common/inet/ip/ip_ndp.c b/usr/src/uts/common/inet/ip/ip_ndp.c
index b53897cefe..895cc74bd2 100644
--- a/usr/src/uts/common/inet/ip/ip_ndp.c
+++ b/usr/src/uts/common/inet/ip/ip_ndp.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -83,8 +83,9 @@ static boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
static void nce_ire_delete(nce_t *nce);
static void nce_ire_delete1(ire_t *ire, char *nce_arg);
static void nce_set_ll(nce_t *nce, uchar_t *ll_addr);
-static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
-static nce_t *nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
+static nce_t *nce_lookup_addr(ill_t *, boolean_t, const in6_addr_t *,
+ nce_t *);
+static nce_t *nce_lookup_mapping(ill_t *, const in6_addr_t *);
static void nce_make_mapping(nce_t *nce, uchar_t *addrpos,
uchar_t *addr);
static int nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
@@ -93,11 +94,16 @@ static mblk_t *nce_udreq_alloc(ill_t *ill);
static void nce_update(nce_t *nce, uint16_t new_state,
uchar_t *new_ll_addr);
static uint32_t nce_solicit(nce_t *nce, mblk_t *mp);
-static boolean_t nce_xmit(ill_t *ill, uint32_t operation,
- ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
+static boolean_t nce_xmit(ill_t *ill, uint8_t type,
+ boolean_t use_lla_addr, const in6_addr_t *sender,
const in6_addr_t *target, int flag);
+static boolean_t nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla,
+ const in6_addr_t *target, uint_t flags);
+static boolean_t nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla,
+ const in6_addr_t *src, uint_t flags);
static int ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
nce_t **, nce_t *);
+static ipif_t *ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill);
#ifdef DEBUG
static void nce_trace_cleanup(const nce_t *);
@@ -110,22 +116,6 @@ static void nce_trace_cleanup(const nce_t *);
(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
NCE_TABLE_SIZE)]))
-/*
- * Compute default flags to use for an advertisement of this nce's address.
- */
-static int
-nce_advert_flags(const nce_t *nce)
-{
- int flag = 0;
-
- if (nce->nce_flags & NCE_F_ISROUTER)
- flag |= NDP_ISROUTER;
- if (!(nce->nce_flags & NCE_F_ANYCAST))
- flag |= NDP_ORIDE;
-
- return (flag);
-}
-
/* Non-tunable probe interval, based on link capabilities */
#define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500)
@@ -262,8 +252,7 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
mutex_exit(&nce->nce_lock);
- dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
- &ipv6_all_zeros, addr, NDP_PROBE);
+ dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
if (dropped) {
mutex_enter(&nce->nce_lock);
nce->nce_pcnt++;
@@ -282,23 +271,20 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
mutex_exit(&nce->nce_lock);
- dropped = nce_xmit(ill,
- ND_NEIGHBOR_ADVERT,
- ill, /* ill to be used for extracting ill_nd_lla */
- B_TRUE, /* use ill_nd_lla */
- addr, /* Source and target of the advertisement pkt */
- &ipv6_all_hosts_mcast, /* Destination of the packet */
- nce_advert_flags(nce));
+ dropped = nce_xmit_advert(nce, B_TRUE, &ipv6_all_hosts_mcast,
+ 0);
mutex_enter(&nce->nce_lock);
if (dropped)
nce->nce_unsolicit_count++;
if (nce->nce_unsolicit_count != 0) {
+ ASSERT(nce->nce_timeout_id == 0);
nce->nce_timeout_id = timeout(ndp_timer, nce,
MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
}
mutex_exit(&nce->nce_lock);
mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
}
+
/*
* If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
* we call nce_fastpath as soon as the nce is resolved in ndp_process.
@@ -311,10 +297,10 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
}
int
-ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
- const in6_addr_t *mask, const in6_addr_t *extract_mask,
- uint32_t hw_extract_start, uint16_t flags, uint16_t state,
- nce_t **newnce)
+ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr,
+ const in6_addr_t *addr, const in6_addr_t *mask,
+ const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags,
+ uint16_t state, nce_t **newnce)
{
int err = 0;
nce_t *nce;
@@ -325,7 +311,7 @@ ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
/* Get head of v6 hash table */
nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
- nce = nce_lookup_addr(ill, addr, nce);
+ nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
if (nce == NULL) {
err = ndp_add_v6(ill,
hw_addr,
@@ -562,13 +548,11 @@ nce_ire_delete_list(nce_t *nce)
if (nce->nce_ipversion == IPV4_VERSION) {
ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
- IRE_CACHE, nce_ire_delete1,
- (char *)nce, nce->nce_ill);
+ IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
} else {
ASSERT(nce->nce_ipversion == IPV6_VERSION);
ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
- IRE_CACHE, nce_ire_delete1,
- (char *)nce, nce->nce_ill);
+ IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
}
NCE_REFRELE_NOTR(nce);
nce = nce_next;
@@ -628,8 +612,7 @@ ndp_restart_dad(nce_t *nce)
nce->nce_state = ND_PROBE;
nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
mutex_exit(&nce->nce_lock);
- dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
- B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
+ dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
if (dropped) {
mutex_enter(&nce->nce_lock);
nce->nce_pcnt++;
@@ -649,22 +632,19 @@ ndp_restart_dad(nce_t *nce)
* If one is found, the refcnt on the nce will be incremented.
*/
nce_t *
-ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
+ndp_lookup_v6(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
+ boolean_t caller_holds_lock)
{
nce_t *nce;
- ip_stack_t *ipst;
-
- ASSERT(ill != NULL);
- ipst = ill->ill_ipst;
+ ip_stack_t *ipst = ill->ill_ipst;
- ASSERT(ill != NULL && ill->ill_isv6);
- if (!caller_holds_lock) {
+ ASSERT(ill->ill_isv6);
+ if (!caller_holds_lock)
mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
- }
/* Get head of v6 hash table */
nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
- nce = nce_lookup_addr(ill, addr, nce);
+ nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
if (nce == NULL)
nce = nce_lookup_mapping(ill, addr);
if (!caller_holds_lock)
@@ -685,14 +665,17 @@ ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
in6_addr_t addr6;
ip_stack_t *ipst = ill->ill_ipst;
- if (!caller_holds_lock) {
+ if (!caller_holds_lock)
mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
- }
/* Get head of v4 hash table */
nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
- nce = nce_lookup_addr(ill, &addr6, nce);
+ /*
+ * NOTE: IPv4 never matches across the illgrp since the NCEs we're
+ * looking up have fastpath headers that are inherently per-ill.
+ */
+ nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
if (!caller_holds_lock)
mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
return (nce);
@@ -706,7 +689,8 @@ ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
* lock (ndp_g_lock).
*/
static nce_t *
-nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
+nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
+ nce_t *nce)
{
ndp_g_t *ndp;
ip_stack_t *ipst = ill->ill_ipst;
@@ -716,12 +700,12 @@ nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
else
ndp = ipst->ips_ndp4;
- ASSERT(ill != NULL);
ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
if (IN6_IS_ADDR_UNSPECIFIED(addr))
return (NULL);
for (; nce != NULL; nce = nce->nce_next) {
- if (nce->nce_ill == ill) {
+ if (nce->nce_ill == ill ||
+ match_illgrp && IS_IN_SAME_ILLGRP(ill, nce->nce_ill)) {
if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
&ipv6_all_ones)) {
@@ -771,8 +755,8 @@ nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
* Process passed in parameters either from an incoming packet or via
* user ioctl.
*/
-void
-ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
+static void
+nce_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
{
ill_t *ill = nce->nce_ill;
uint32_t hw_addr_len = ill->ill_nd_lla_len;
@@ -852,7 +836,7 @@ ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
} else {
/*
* Send locally originated packets back
- * into * ip_wput_v6.
+ * into ip_wput_v6.
*/
put(ill->ill_wq, mp);
}
@@ -918,6 +902,65 @@ ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
}
/*
+ * Walker state structure used by ndp_process() / ndp_process_entry().
+ */
+typedef struct ndp_process_data {
+ ill_t *np_ill; /* ill/illgrp to match against */
+ const in6_addr_t *np_addr; /* IPv6 address to match */
+ uchar_t *np_hw_addr; /* passed to nce_process() */
+ uint32_t np_flag; /* passed to nce_process() */
+ boolean_t np_is_adv; /* passed to nce_process() */
+} ndp_process_data_t;
+
+/*
+ * Walker callback used by ndp_process() for IPMP groups: calls nce_process()
+ * for each NCE with a matching address that's in the same IPMP group.
+ */
+static void
+ndp_process_entry(nce_t *nce, void *arg)
+{
+ ndp_process_data_t *npp = arg;
+
+ if (IS_IN_SAME_ILLGRP(nce->nce_ill, npp->np_ill) &&
+ IN6_ARE_ADDR_EQUAL(&nce->nce_addr, npp->np_addr) &&
+ IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
+ nce_process(nce, npp->np_hw_addr, npp->np_flag, npp->np_is_adv);
+ }
+}
+
+/*
+ * Wrapper around nce_process() that handles IPMP. In particular, for IPMP,
+ * NCEs are per-underlying-ill (because of nce_fp_mp) and thus we may have
+ * more than one NCE for a given IPv6 address to tend to. In that case, we
+ * need to walk all NCEs and callback nce_process() for each one. Since this
+ * is expensive, in the non-IPMP case we just directly call nce_process().
+ * Ultimately, nce_fp_mp needs to be moved out of the nce_t so that all IP
+ * interfaces in an IPMP group share the same NCEs -- at which point this
+ * function can be removed entirely.
+ */
+void
+ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
+{
+ ill_t *ill = nce->nce_ill;
+ struct ndp_g_s *ndp = ill->ill_ipst->ips_ndp6;
+ ndp_process_data_t np;
+
+ if (ill->ill_grp == NULL) {
+ nce_process(nce, hw_addr, flag, is_adv);
+ return;
+ }
+
+ /* IPMP case: walk all NCEs */
+ np.np_ill = ill;
+ np.np_addr = &nce->nce_addr;
+ np.np_flag = flag;
+ np.np_is_adv = is_adv;
+ np.np_hw_addr = hw_addr;
+
+ ndp_walk_common(ndp, NULL, (pfi_t)ndp_process_entry, &np, ALL_ZONES);
+}
+
+/*
* Pass arg1 to the pfi supplied, along with each nce in existence.
* ndp_walk() places a REFHOLD on the nce and drops the lock when
* walking the hash list.
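
A minimal sketch of a receive-side caller for the reworked interfaces above, using a hypothetical example_nce_input() function: the new second argument to ndp_lookup_v6() controls whether the lookup may match NCEs anywhere in the ill's IPMP group (ndp_query() below passes IS_IPMP(ill) for this), and ndp_process() then fans the update out to every matching NCE in the group.

/* Hypothetical caller -- illustrative sketch only, not part of the patch. */
static void
example_nce_input(ill_t *ill, const in6_addr_t *targ, uchar_t *hw_addr,
    uint32_t flag, boolean_t is_adv)
{
	nce_t *nce;

	/* Match across the illgrp only when `ill' is the IPMP meta-interface. */
	nce = ndp_lookup_v6(ill, IS_IPMP(ill), targ, B_FALSE);
	if (nce == NULL)
		return;

	/* For IPMP, ndp_process() walks and updates every NCE for the address. */
	ndp_process(nce, hw_addr, flag, is_adv);
	NCE_REFRELE(nce);
}
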
@@ -926,7 +969,6 @@ void
ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
boolean_t trace)
{
-
nce_t *nce;
nce_t *nce1;
nce_t **ncep;
@@ -1021,27 +1063,58 @@ ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
int
ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
{
- nce_t *nce;
- int err = 0;
+ nce_t *nce, *hw_nce = NULL;
+ int err;
+ ill_t *ipmp_ill;
+ uint16_t nce_flags;
uint32_t ms;
mblk_t *mp_nce = NULL;
ip_stack_t *ipst = ill->ill_ipst;
+ uchar_t *hwaddr = NULL;
ASSERT(ill->ill_isv6);
- if (IN6_IS_ADDR_MULTICAST(dst)) {
- err = nce_set_multicast(ill, dst);
- return (err);
+
+ if (IN6_IS_ADDR_MULTICAST(dst))
+ return (nce_set_multicast(ill, dst));
+
+ nce_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0;
+
+ /*
+ * If `ill' is under IPMP, then first check to see if there's an NCE
+ * for `dst' on the IPMP meta-interface (e.g., because an application
+ * explicitly did an SIOCLIFSETND to tie a hardware address to `dst').
+ * If so, we use that hardware address when creating the NCE below.
+ * Note that we don't yet have a mechanism to remove these NCEs if the
+ * NCE for `dst' on the IPMP meta-interface is subsequently removed --
+ * but rather than build such a beast, we should fix NCEs so that they
+ * can be properly shared across an IPMP group.
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
+ hw_nce = ndp_lookup_v6(ipmp_ill, B_FALSE, dst, B_FALSE);
+ if (hw_nce != NULL && hw_nce->nce_res_mp != NULL) {
+ hwaddr = hw_nce->nce_res_mp->b_rptr +
+ NCE_LL_ADDR_OFFSET(ipmp_ill);
+ nce_flags |= hw_nce->nce_flags;
+ }
+ ill_refrele(ipmp_ill);
+ }
}
+
err = ndp_lookup_then_add_v6(ill,
- NULL, /* No hardware address */
+ B_FALSE, /* NCE fastpath is per ill; don't match across group */
+ hwaddr,
dst,
&ipv6_all_ones,
&ipv6_all_zeros,
0,
- (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
- ND_INCOMPLETE,
+ nce_flags,
+ hwaddr != NULL ? ND_REACHABLE : ND_INCOMPLETE,
&nce);
+ if (hw_nce != NULL)
+ NCE_REFRELE(hw_nce);
+
switch (err) {
case 0:
/*
@@ -1057,11 +1130,10 @@ ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
NCE_REFRELE(nce);
return (0);
}
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+
mutex_enter(&nce->nce_lock);
if (nce->nce_state != ND_INCOMPLETE) {
mutex_exit(&nce->nce_lock);
- rw_exit(&ipst->ips_ill_g_lock);
NCE_REFRELE(nce);
return (0);
}
@@ -1069,14 +1141,11 @@ ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
if (mp_nce == NULL) {
/* The caller will free mp */
mutex_exit(&nce->nce_lock);
- rw_exit(&ipst->ips_ill_g_lock);
ndp_delete(nce);
NCE_REFRELE(nce);
return (ENOMEM);
}
- ms = nce_solicit(nce, mp_nce);
- rw_exit(&ipst->ips_ill_g_lock);
- if (ms == 0) {
+ if ((ms = nce_solicit(nce, mp_nce)) == 0) {
/* The caller will free mp */
if (mp_nce != mp)
freeb(mp_nce);
@@ -1143,6 +1212,7 @@ ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
}
err = ndp_lookup_then_add_v6(ill,
+ B_FALSE, /* NCE fastpath is per ill; don't match across group */
NULL, /* hardware address */
dst,
&ipv6_all_ones,
@@ -1191,7 +1261,7 @@ nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
- nce = nce_lookup_addr(ill, dst, nce);
+ nce = nce_lookup_addr(ill, B_FALSE, dst, nce);
if (nce != NULL) {
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
NCE_REFRELE(nce);
@@ -1259,7 +1329,13 @@ ndp_query(ill_t *ill, struct lif_nd_req *lnr)
sin6 = (sin6_t *)&lnr->lnr_addr;
addr = &sin6->sin6_addr;
- nce = ndp_lookup_v6(ill, addr, B_FALSE);
+ /*
+ * NOTE: if the ill is an IPMP interface, then match against the whole
+ * illgrp. This e.g. allows in.ndpd to retrieve the link layer
+ * addresses for the data addresses on an IPMP interface even though
+ * ipif_ndp_up() created them with an nce_ill of ipif_bound_ill.
+ */
+ nce = ndp_lookup_v6(ill, IS_IPMP(ill), addr, B_FALSE);
if (nce == NULL)
return (ESRCH);
/* If in INCOMPLETE state, no link layer address is available yet */
@@ -1347,24 +1423,14 @@ ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
uint32_t
nce_solicit(nce_t *nce, mblk_t *mp)
{
- ill_t *ill;
- ill_t *src_ill;
ip6_t *ip6h;
- in6_addr_t src;
- in6_addr_t dst;
- ipif_t *ipif;
- ip6i_t *ip6i;
- boolean_t dropped = B_FALSE;
- ip_stack_t *ipst = nce->nce_ill->ill_ipst;
+ in6_addr_t sender;
+ boolean_t dropped;
- ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
ASSERT(MUTEX_HELD(&nce->nce_lock));
- ill = nce->nce_ill;
- ASSERT(ill != NULL);
- if (nce->nce_rcnt == 0) {
+ if (nce->nce_rcnt == 0)
return (0);
- }
if (mp == NULL) {
ASSERT(nce->nce_qd_mp != NULL);
@@ -1385,60 +1451,22 @@ nce_solicit(nce_t *nce, mblk_t *mp)
* could be from the nce_qd_mp which could have b_next/b_prev
* non-NULL.
*/
- ip6i = (ip6i_t *)ip6h;
- ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
- sizeof (ip6i_t) + IPV6_HDR_LEN);
+ ASSERT(MBLKL(mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
}
- src = ip6h->ip6_src;
- /*
- * If the src of outgoing packet is one of the assigned interface
- * addresses use it, otherwise we will pick the source address below.
- */
- src_ill = ill;
- if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
- if (ill->ill_group != NULL)
- src_ill = ill->ill_group->illgrp_ill;
- for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
- for (ipif = src_ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (IN6_ARE_ADDR_EQUAL(&src,
- &ipif->ipif_v6lcl_addr)) {
- break;
- }
- }
- if (ipif != NULL)
- break;
- }
- /*
- * If no relevant ipif can be found, then it's not one of our
- * addresses. Reset to :: and let nce_xmit. If an ipif can be
- * found, but it's not yet done with DAD verification, then
- * just postpone this transmission until later.
- */
- if (src_ill == NULL)
- src = ipv6_all_zeros;
- else if (!ipif->ipif_addr_ready)
- return (ill->ill_reachable_retrans_time);
- }
- dst = nce->nce_addr;
+
/*
- * If source address is unspecified, nce_xmit will choose
- * one for us and initialize the hardware address also
- * appropriately.
+ * Need to copy the sender address into a local since `mp' can
+ * go away once we drop nce_lock.
*/
- if (IN6_IS_ADDR_UNSPECIFIED(&src))
- src_ill = NULL;
+ sender = ip6h->ip6_src;
nce->nce_rcnt--;
mutex_exit(&nce->nce_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
- &dst, 0);
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ dropped = nce_xmit_solicit(nce, B_TRUE, &sender, 0);
mutex_enter(&nce->nce_lock);
if (dropped)
nce->nce_rcnt++;
- return (ill->ill_reachable_retrans_time);
+ return (nce->nce_ill->ill_reachable_retrans_time);
}
/*
@@ -1475,7 +1503,7 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
*/
mutex_enter(&ill->ill_lock);
if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
- (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) {
+ (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
mutex_exit(&ill->ill_lock);
continue;
}
@@ -1485,8 +1513,8 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
mutex_exit(&ill->ill_lock);
ipif->ipif_was_dup = B_TRUE;
- if (ipif_ndp_up(ipif) != EINPROGRESS)
- (void) ipif_up_done_v6(ipif);
+ VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
+ (void) ipif_up_done_v6(ipif);
}
freeb(mp);
}
@@ -1515,7 +1543,7 @@ ipif6_dup_recovery(void *arg)
/*
* No lock, because this is just an optimization.
*/
- if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))
+ if (ipif->ipif_state_flags & IPIF_CONDEMNED)
return;
/* If the link is down, we'll retry this later */
@@ -1542,13 +1570,20 @@ ndp_do_recovery(ipif_t *ipif)
if (mp == NULL) {
mutex_enter(&ill->ill_lock);
if (ipif->ipif_recovery_id == 0 &&
- !(ipif->ipif_state_flags & (IPIF_MOVING |
- IPIF_CONDEMNED))) {
+ !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
}
mutex_exit(&ill->ill_lock);
} else {
+ /*
+ * A recovery timer may still be running if we got here from
+ * ill_restart_dad(); cancel that timer.
+ */
+ if (ipif->ipif_recovery_id != 0)
+ (void) untimeout(ipif->ipif_recovery_id);
+ ipif->ipif_recovery_id = 0;
+
bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
sizeof (ipif->ipif_v6lcl_addr));
ill_refhold(ill);
@@ -1558,41 +1593,51 @@ ndp_do_recovery(ipif_t *ipif)
}
/*
- * Find the solicitation in the given message, and extract printable details
- * (MAC and IP addresses) from it.
+ * Find the MAC and IP addresses in an NA/NS message.
*/
-static nd_neighbor_solicit_t *
-ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
- size_t hlen, char *sbuf, size_t slen, uchar_t **haddr)
+static void
+ip_ndp_find_addresses(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, in6_addr_t *targp,
+ uchar_t **haddr, uint_t *haddrlenp)
{
- nd_neighbor_solicit_t *ns;
- ip6_t *ip6h;
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+ icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
+ nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
+ nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
uchar_t *addr;
- int alen;
+ int alen = 0;
- alen = 0;
- ip6h = (ip6_t *)mp->b_rptr;
if (dl_mp == NULL) {
nd_opt_hdr_t *opt;
- int nslen;
+ int len;
/*
* If it's from the fast-path, then it can't be a probe
- * message, and thus must include the source linkaddr option.
+ * message, and thus must include a linkaddr option.
* Extract that here.
*/
- ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
- nslen = mp->b_wptr - (uchar_t *)ns;
- if ((nslen -= sizeof (*ns)) > 0) {
- opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen,
- ND_OPT_SOURCE_LINKADDR);
- if (opt != NULL &&
- opt->nd_opt_len * 8 - sizeof (*opt) >=
- ill->ill_nd_lla_len) {
- addr = (uchar_t *)(opt + 1);
- alen = ill->ill_nd_lla_len;
+ switch (icmp6->icmp6_type) {
+ case ND_NEIGHBOR_SOLICIT:
+ len = mp->b_wptr - (uchar_t *)ns;
+ if ((len -= sizeof (*ns)) > 0) {
+ opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1),
+ len, ND_OPT_SOURCE_LINKADDR);
}
+ break;
+ case ND_NEIGHBOR_ADVERT:
+ len = mp->b_wptr - (uchar_t *)na;
+ if ((len -= sizeof (*na)) > 0) {
+ opt = ndp_get_option((nd_opt_hdr_t *)(na + 1),
+ len, ND_OPT_TARGET_LINKADDR);
+ }
+ break;
+ }
+
+ if (opt != NULL && opt->nd_opt_len * 8 - sizeof (*opt) >=
+ ill->ill_nd_lla_len) {
+ addr = (uchar_t *)(opt + 1);
+ alen = ill->ill_nd_lla_len;
}
+
/*
* We cheat a bit here for the sake of printing usable log
* messages in the rare case where the reply we got was unicast
@@ -1624,16 +1669,17 @@ ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
}
}
}
+
if (alen > 0) {
*haddr = addr;
- (void) mac_colon_addr(addr, alen, hbuf, hlen);
+ *haddrlenp = alen;
} else {
*haddr = NULL;
- (void) strcpy(hbuf, "?");
+ *haddrlenp = 0;
}
- ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
- (void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen);
- return (ns);
+
+ /* nd_ns_target and nd_na_target are at the same offset, so we cheat */
+ *targp = ns->nd_ns_target;
}
/*
@@ -1646,68 +1692,80 @@ ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
ill_t *ill = rq->q_ptr;
ipif_t *ipif;
- char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */
- char hbuf[MAC_STR_LEN];
- char sbuf[INET6_ADDRSTRLEN];
- nd_neighbor_solicit_t *ns;
- mblk_t *dl_mp = NULL;
- uchar_t *haddr;
+ mblk_t *dl_mp = NULL;
+ uchar_t *haddr;
+ uint_t haddrlen;
ip_stack_t *ipst = ill->ill_ipst;
+ in6_addr_t targ;
if (DB_TYPE(mp) != M_DATA) {
dl_mp = mp;
mp = mp->b_cont;
}
- ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf,
- sizeof (sbuf), &haddr);
- if (haddr != NULL &&
- bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
+
+ ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
+ if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
/*
- * Ignore conflicts generated by misbehaving switches that just
- * reflect our own messages back to us.
+ * Ignore conflicts generated by misbehaving switches that
+ * just reflect our own messages back to us. For IPMP, we may
+ * see reflections across any ill in the illgrp.
*/
- goto ignore_conflict;
+ if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
+ IS_UNDER_IPMP(ill) &&
+ ipmp_illgrp_find_ill(ill->ill_grp, haddr, haddrlen) != NULL)
+ goto ignore_conflict;
}
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
+ /*
+ * Look up the appropriate ipif.
+ */
+ ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, NULL, NULL, NULL,
+ NULL, ipst);
+ if (ipif == NULL)
+ goto ignore_conflict;
- if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
- !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
- &ns->nd_ns_target)) {
- continue;
- }
+ /* Reload the ill to match the ipif */
+ ill = ipif->ipif_ill;
- /* If it's already marked, then don't do anything. */
- if (ipif->ipif_flags & IPIF_DUPLICATE)
- continue;
+ /* If it's already duplicate or ineligible, then don't do anything. */
+ if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
+ ipif_refrele(ipif);
+ goto ignore_conflict;
+ }
- /*
- * If this is a failure during duplicate recovery, then don't
- * complain. It may take a long time to recover.
- */
- if (!ipif->ipif_was_dup) {
- ipif_get_name(ipif, ibuf, sizeof (ibuf));
- cmn_err(CE_WARN, "%s has duplicate address %s (in "
- "use by %s); disabled", ibuf, sbuf, hbuf);
- }
- mutex_enter(&ill->ill_lock);
- ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
- ipif->ipif_flags |= IPIF_DUPLICATE;
- ill->ill_ipif_dup_count++;
- mutex_exit(&ill->ill_lock);
- (void) ipif_down(ipif, NULL, NULL);
- ipif_down_tail(ipif);
- mutex_enter(&ill->ill_lock);
- if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
- ill->ill_net_type == IRE_IF_RESOLVER &&
- !(ipif->ipif_state_flags & (IPIF_MOVING |
- IPIF_CONDEMNED)) &&
- ipst->ips_ip_dup_recovery > 0) {
- ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
- ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
- }
- mutex_exit(&ill->ill_lock);
+ /*
+ * If this is a failure during duplicate recovery, then don't
+ * complain. It may take a long time to recover.
+ */
+ if (!ipif->ipif_was_dup) {
+ char ibuf[LIFNAMSIZ];
+ char hbuf[MAC_STR_LEN];
+ char sbuf[INET6_ADDRSTRLEN];
+
+ ipif_get_name(ipif, ibuf, sizeof (ibuf));
+ cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
+ " disabled", ibuf,
+ inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
+ mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
}
+ mutex_enter(&ill->ill_lock);
+ ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
+ ipif->ipif_flags |= IPIF_DUPLICATE;
+ ill->ill_ipif_dup_count++;
+ mutex_exit(&ill->ill_lock);
+ (void) ipif_down(ipif, NULL, NULL);
+ ipif_down_tail(ipif);
+ mutex_enter(&ill->ill_lock);
+ if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
+ ill->ill_net_type == IRE_IF_RESOLVER &&
+ !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
+ ipst->ips_ip_dup_recovery > 0) {
+ ASSERT(ipif->ipif_recovery_id == 0);
+ ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
+ ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
+ }
+ mutex_exit(&ill->ill_lock);
+ ipif_refrele(ipif);
ignore_conflict:
if (dl_mp != NULL)
freeb(dl_mp);
@@ -1721,7 +1779,7 @@ ignore_conflict:
* we start a timer on the ipif.
*/
static void
-ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
+ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
{
if ((mp = copymsg(mp)) != NULL) {
if (dl_mp == NULL)
@@ -1736,7 +1794,6 @@ ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
B_FALSE);
}
}
- ndp_delete(nce);
}
/*
@@ -1757,6 +1814,7 @@ ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
NULL, NULL, ipst);
if (ipif == NULL)
return;
+
/*
* First, figure out if this address is disposable.
*/
@@ -1786,19 +1844,21 @@ ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
* sending out an unsolicited Neighbor Advertisement.
*/
if (defs >= maxdefense) {
- ip_ndp_failure(ill, mp, dl_mp, nce);
+ ip_ndp_failure(ill, mp, dl_mp);
} else {
char hbuf[MAC_STR_LEN];
char sbuf[INET6_ADDRSTRLEN];
uchar_t *haddr;
+ uint_t haddrlen;
+ in6_addr_t targ;
- (void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf,
- sizeof (hbuf), sbuf, sizeof (sbuf), &haddr);
+ ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
- hbuf, sbuf, ill->ill_name);
- (void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE,
- &nce->nce_addr, &ipv6_all_hosts_mcast,
- nce_advert_flags(nce));
+ mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)),
+ inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
+ ill->ill_name);
+
+ (void) nce_xmit_advert(nce, B_FALSE, &ipv6_all_hosts_mcast, 0);
}
}
@@ -1843,6 +1903,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
bad_solicit = B_TRUE;
goto done;
}
+
}
if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
/* Check to see if this is a valid DAD solicitation */
@@ -1859,7 +1920,13 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
}
}
- our_nce = ndp_lookup_v6(ill, &target, B_FALSE);
+ /*
+ * NOTE: with IPMP, it's possible the nominated multicast ill (which
+ * received this packet if it's multicast) is not the ill tied to
+ * e.g. the IPMP ill's data link-local. So we match across the illgrp
+ * to ensure we find the associated NCE.
+ */
+ our_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE);
/*
* If this is a valid Solicitation, a permanent
* entry should exist in the cache
@@ -1883,7 +1950,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
haddr = (uchar_t *)&opt[1];
if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
hlen == 0) {
- ip1dbg(("ndp_input_advert: bad SLLA\n"));
+ ip1dbg(("ndp_input_solicit: bad SLLA\n"));
bad_solicit = B_TRUE;
goto done;
}
@@ -1934,6 +2001,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
goto no_source;
err = ndp_lookup_then_add_v6(ill,
+ B_FALSE,
haddr,
&src, /* Soliciting nodes address */
&ipv6_all_ones,
@@ -1949,8 +2017,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
break;
case EEXIST:
/*
- * B_FALSE indicates this is not an
- * an advertisement.
+ * B_FALSE indicates this is not an advertisement.
*/
ndp_process(nnce, haddr, 0, B_FALSE);
NCE_REFRELE(nnce);
@@ -1985,7 +2052,7 @@ no_source:
* If someone else is probing our address, then
* we've crossed wires. Declare failure.
*/
- ip_ndp_failure(ill, mp, dl_mp, our_nce);
+ ip_ndp_failure(ill, mp, dl_mp);
}
goto done;
}
@@ -1995,15 +2062,8 @@ no_source:
*/
src = ipv6_all_hosts_mcast;
}
- flag |= nce_advert_flags(our_nce);
/* Response to a solicitation */
- (void) nce_xmit(ill,
- ND_NEIGHBOR_ADVERT,
- ill, /* ill to be used for extracting ill_nd_lla */
- B_TRUE, /* use ill_nd_lla */
- &target, /* Source and target of the advertisement pkt */
- &src, /* IP Destination (source of original pkt) */
- flag);
+ (void) nce_xmit_advert(our_nce, B_TRUE, &src, flag);
done:
if (bad_solicit)
BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
@@ -2023,8 +2083,8 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
in6_addr_t target;
nd_opt_hdr_t *opt = NULL;
int len;
- mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
ip_stack_t *ipst = ill->ill_ipst;
+ mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
ip6h = (ip6_t *)mp->b_rptr;
icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
@@ -2067,66 +2127,62 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
}
/*
- * If this interface is part of the group look at all the
+ * NOTE: we match across the illgrp since we need to do DAD for all of
+ * our local addresses, and those are spread across all the active
* ills in the group.
*/
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- if (ill->ill_group != NULL)
- ill = ill->ill_group->illgrp_ill;
+ if ((dst_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE)) == NULL)
+ return;
- for (; ill != NULL; ill = ill->ill_group_next) {
- mutex_enter(&ill->ill_lock);
- if (!ILL_CAN_LOOKUP(ill)) {
- mutex_exit(&ill->ill_lock);
- continue;
- }
- ill_refhold_locked(ill);
- mutex_exit(&ill->ill_lock);
- dst_nce = ndp_lookup_v6(ill, &target, B_FALSE);
- /* We have to drop the lock since ndp_process calls put* */
- rw_exit(&ipst->ips_ill_g_lock);
- if (dst_nce != NULL) {
- if ((dst_nce->nce_flags & NCE_F_PERMANENT) &&
- dst_nce->nce_state == ND_PROBE) {
- /*
- * Someone else sent an advertisement for an
- * address that we're trying to configure.
- * Tear it down. Note that dl_mp might be NULL
- * if we're getting a unicast reply. This
- * isn't typically done (multicast is the norm
- * in response to a probe), but ip_ndp_failure
- * will handle the dl_mp == NULL case as well.
- */
- ip_ndp_failure(ill, mp, dl_mp, dst_nce);
- } else if (dst_nce->nce_flags & NCE_F_PERMANENT) {
- /*
- * Someone just announced one of our local
- * addresses. If it wasn't us, then this is a
- * conflict. Defend the address or shut it
- * down.
- */
- if (dl_mp != NULL &&
- (haddr == NULL ||
- nce_cmp_ll_addr(dst_nce, haddr,
- ill->ill_nd_lla_len))) {
- ip_ndp_conflict(ill, mp, dl_mp,
- dst_nce);
- }
- } else {
- if (na->nd_na_flags_reserved &
- ND_NA_FLAG_ROUTER) {
- dst_nce->nce_flags |= NCE_F_ISROUTER;
+ if (dst_nce->nce_flags & NCE_F_PERMANENT) {
+ /*
+ * Someone just advertised one of our local addresses. First,
+ * check if it was us -- if so, we can safely ignore it.
+ */
+ if (haddr != NULL) {
+ if (!nce_cmp_ll_addr(dst_nce, haddr, hlen))
+ goto out; /* from us -- no conflict */
+
+ /*
+ * If we're in an IPMP group, check if this is an echo
+ * from another ill in the group. Use the double-
+ * checked locking pattern to avoid grabbing
+ * ill_g_lock in the non-IPMP case.
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
+ ill->ill_grp, haddr, hlen) != NULL) {
+ rw_exit(&ipst->ips_ill_g_lock);
+ goto out;
}
- /* B_TRUE indicates this an advertisement */
- ndp_process(dst_nce, haddr,
- na->nd_na_flags_reserved, B_TRUE);
+ rw_exit(&ipst->ips_ill_g_lock);
}
- NCE_REFRELE(dst_nce);
}
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- ill_refrele(ill);
+
+ /*
+ * This appears to be a real conflict. If we're trying to
+ * configure this NCE (ND_PROBE), then shut it down.
+ * Otherwise, handle the discovered conflict.
+ *
+ * Note that dl_mp might be NULL if we're getting a unicast
+ * reply. This isn't typically done (multicast is the norm in
+ * response to a probe), but we can handle the dl_mp == NULL
+ * case as well.
+ */
+ if (dst_nce->nce_state == ND_PROBE)
+ ip_ndp_failure(ill, mp, dl_mp);
+ else
+ ip_ndp_conflict(ill, mp, dl_mp, dst_nce);
+ } else {
+ if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
+ dst_nce->nce_flags |= NCE_F_ISROUTER;
+
+ /* B_TRUE indicates this is an advertisement */
+ ndp_process(dst_nce, haddr, na->nd_na_flags_reserved, B_TRUE);
}
- rw_exit(&ipst->ips_ill_g_lock);
+out:
+ NCE_REFRELE(dst_nce);
}
/*
@@ -2194,6 +2250,40 @@ done:
}
/*
+ * Utility routine to send an advertisement. Assumes that the NCE cannot
+ * go away (e.g., because it's refheld).
+ */
+static boolean_t
+nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *target,
+ uint_t flags)
+{
+ ASSERT((flags & NDP_PROBE) == 0);
+
+ if (nce->nce_flags & NCE_F_ISROUTER)
+ flags |= NDP_ISROUTER;
+ if (!(nce->nce_flags & NCE_F_ANYCAST))
+ flags |= NDP_ORIDE;
+
+ return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_ADVERT, use_nd_lla,
+ &nce->nce_addr, target, flags));
+}
+
+/*
+ * Utility routine to send a solicitation. Assumes that the NCE cannot
+ * go away (e.g., because it's refheld).
+ */
+static boolean_t
+nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender,
+ uint_t flags)
+{
+ if (flags & NDP_PROBE)
+ sender = &ipv6_all_zeros;
+
+ return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, use_nd_lla,
+ sender, &nce->nce_addr, flags));
+}
+
+/*
* nce_xmit is called to form and transmit a ND solicitation or
* advertisement ICMP packet.
*
@@ -2207,88 +2297,79 @@ done:
* corresponding ill's ill_wq otherwise returns B_TRUE.
*/
static boolean_t
-nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
- boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
- int flag)
+nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla,
+ const in6_addr_t *sender, const in6_addr_t *target, int flag)
{
+ ill_t *hwaddr_ill;
uint32_t len;
icmp6_t *icmp6;
mblk_t *mp;
ip6_t *ip6h;
nd_opt_hdr_t *opt;
- uint_t plen;
+ uint_t plen, maxplen;
ip6i_t *ip6i;
ipif_t *src_ipif = NULL;
uint8_t *hw_addr;
zoneid_t zoneid = GLOBAL_ZONEID;
+ char buf[INET6_ADDRSTRLEN];
+
+ ASSERT(!IS_IPMP(ill));
/*
- * If we have a unspecified source(sender) address, select a
- * proper source address for the solicitation here itself so
- * that we can initialize the h/w address correctly. This is
- * needed for interface groups as source address can come from
- * the whole group and the h/w address initialized from ill will
- * be wrong if the source address comes from a different ill.
- *
- * If the sender is specified then we use this address in order
- * to lookup the zoneid before calling ip_output_v6(). This is to
- * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
- * by IP (we cannot guarantee that the global zone has an interface
- * route to the destination).
- *
- * Note that the NA never comes here with the unspecified source
- * address. The following asserts that whenever the source
- * address is specified, the haddr also should be specified.
+ * Check that the sender is actually a usable address on `ill', and if
+ * so, track that as the src_ipif. If not, for solicitations, set the
+ * sender to :: so that a new one will be picked below; for adverts,
+ * drop the packet since we expect nce_xmit_advert() to always provide
+ * a valid sender.
*/
- ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
+ if (!IN6_IS_ADDR_UNSPECIFIED(sender)) {
+ if ((src_ipif = ip_ndp_lookup_addr_v6(sender, ill)) == NULL ||
+ !src_ipif->ipif_addr_ready) {
+ if (src_ipif != NULL) {
+ ipif_refrele(src_ipif);
+ src_ipif = NULL;
+ }
+ if (type == ND_NEIGHBOR_ADVERT) {
+ ip1dbg(("nce_xmit: No source ipif for src %s\n",
+ inet_ntop(AF_INET6, sender, buf,
+ sizeof (buf))));
+ return (B_TRUE);
+ }
+ sender = &ipv6_all_zeros;
+ }
+ }
+ /*
+ * If we still have an unspecified source (sender) address and this
+ * isn't a probe, select a source address from `ill'.
+ */
if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
- ASSERT(operation != ND_NEIGHBOR_ADVERT);
+ ASSERT(type != ND_NEIGHBOR_ADVERT);
/*
- * Pick a source address for this solicitation, but
- * restrict the selection to addresses assigned to the
- * output interface (or interface group). We do this
- * because the destination will create a neighbor cache
- * entry for the source address of this packet, so the
- * source address had better be a valid neighbor.
+ * Pick a source address for this solicitation, but restrict
+ * the selection to addresses assigned to the output
+ * interface. We do this because the destination will create
+ * a neighbor cache entry for the source address of this
+ * packet, so the source address needs to be a valid neighbor.
*/
- src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL,
+ src_ipif = ipif_select_source_v6(ill, target, B_TRUE,
IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
if (src_ipif == NULL) {
- char buf[INET6_ADDRSTRLEN];
-
ip1dbg(("nce_xmit: No source ipif for dst %s\n",
- inet_ntop(AF_INET6, (char *)target, buf,
- sizeof (buf))));
+ inet_ntop(AF_INET6, target, buf, sizeof (buf))));
return (B_TRUE);
}
sender = &src_ipif->ipif_v6src_addr;
- hwaddr_ill = src_ipif->ipif_ill;
- } else if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
- zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ill->ill_ipst);
- /*
- * It's possible for ipif_lookup_addr_zoneid_v6() to return
- * ALL_ZONES if it cannot find a matching ipif for the address
- * we are trying to use. In this case we err on the side of
- * trying to send the packet by defaulting to the GLOBAL_ZONEID.
- */
- if (zoneid == ALL_ZONES)
- zoneid = GLOBAL_ZONEID;
}
/*
- * Always make sure that the NS/NA packets don't get load
- * spread. This is needed so that the probe packets sent
- * by the in.mpathd daemon can really go out on the desired
- * interface. Probe packets are made to go out on a desired
- * interface by including a ip6i with ATTACH_IF flag. As these
- * packets indirectly end up sending/receiving NS/NA packets
- * (neighbor doing NUD), we have to make sure that NA
- * also go out on the same interface.
+ * We're either sending a probe or we have a source address.
*/
- plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8;
+ ASSERT((flag & NDP_PROBE) || src_ipif != NULL);
+
+ maxplen = roundup(sizeof (nd_opt_hdr_t) + ND_MAX_HDW_LEN, 8);
len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
- plen * 8;
+ maxplen;
mp = allocb(len, BPRI_LO);
if (mp == NULL) {
if (src_ipif != NULL)
@@ -2301,28 +2382,27 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
ip6i = (ip6i_t *)mp->b_rptr;
ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
ip6i->ip6i_nxt = IPPROTO_RAW;
- ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
+ ip6i->ip6i_flags = IP6I_HOPLIMIT;
if (flag & NDP_PROBE)
ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
- ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
ip6h->ip6_nxt = IPPROTO_ICMPV6;
ip6h->ip6_hops = IPV6_MAX_HOPS;
+ ip6h->ip6_src = *sender;
ip6h->ip6_dst = *target;
icmp6 = (icmp6_t *)&ip6h[1];
opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
sizeof (nd_neighbor_advert_t));
- if (operation == ND_NEIGHBOR_SOLICIT) {
+ if (type == ND_NEIGHBOR_SOLICIT) {
nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
if (!(flag & NDP_PROBE))
opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
- ip6h->ip6_src = *sender;
ns->nd_ns_target = *target;
if (!(flag & NDP_UNICAST)) {
/* Form multicast address of the target */
@@ -2335,7 +2415,6 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
ASSERT(!(flag & NDP_PROBE));
opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
- ip6h->ip6_src = *sender;
na->nd_na_target = *sender;
if (flag & NDP_ISROUTER)
na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
@@ -2347,22 +2426,48 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
hw_addr = NULL;
if (!(flag & NDP_PROBE)) {
+ /*
+ * Use our source address to find the hardware address to put
+ * in the packet, so that the hardware address and IP address
+ * will match up -- even if that hardware address doesn't
+ * match the ill we actually transmit the packet through.
+ */
+ if (IS_IPMP(src_ipif->ipif_ill)) {
+ hwaddr_ill = ipmp_ipif_hold_bound_ill(src_ipif);
+ if (hwaddr_ill == NULL) {
+ ip1dbg(("nce_xmit: no bound ill!\n"));
+ ipif_refrele(src_ipif);
+ freemsg(mp);
+ return (B_TRUE);
+ }
+ } else {
+ hwaddr_ill = src_ipif->ipif_ill;
+ ill_refhold(hwaddr_ill); /* for symmetry */
+ }
+
+ plen = roundup(sizeof (nd_opt_hdr_t) +
+ hwaddr_ill->ill_nd_lla_len, 8);
+
hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
hwaddr_ill->ill_phys_addr;
if (hw_addr != NULL) {
/* Fill in link layer address and option len */
- opt->nd_opt_len = (uint8_t)plen;
+ opt->nd_opt_len = (uint8_t)(plen / 8);
bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
}
+
+ ill_refrele(hwaddr_ill);
}
- if (hw_addr == NULL) {
- /* If there's no link layer address option, then strip it. */
- len -= plen * 8;
- mp->b_wptr = mp->b_rptr + len;
- ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
- }
- icmp6->icmp6_type = (uint8_t)operation;
+ if (hw_addr == NULL)
+ plen = 0;
+
+ /* Fix up the length of the packet now that plen is known */
+ len -= (maxplen - plen);
+ mp->b_wptr = mp->b_rptr + len;
+ ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
+
+ icmp6->icmp6_type = type;
icmp6->icmp6_code = 0;
/*
* Prepare for checksum by putting icmp length in the icmp
@@ -2370,8 +2475,17 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
*/
icmp6->icmp6_cksum = ip6h->ip6_plen;
- if (src_ipif != NULL)
+ /*
+ * Before we toss the src_ipif, look up the zoneid to pass to
+ * ip_output_v6(). This is to ensure that unicast ND_NEIGHBOR_ADVERT
+ * packets are routed correctly by IP (we cannot guarantee that the
+ * global zone has an interface route to the destination).
+ */
+ if (src_ipif != NULL) {
+ if ((zoneid = src_ipif->ipif_zoneid) == ALL_ZONES)
+ zoneid = GLOBAL_ZONEID;
ipif_refrele(src_ipif);
+ }
ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
return (B_FALSE);
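
As a worked example of the option sizing above: with an Ethernet underlying interface, ill_nd_lla_len is 6 and sizeof (nd_opt_hdr_t) is 2, so plen becomes roundup(8, 8) == 8 bytes and nd_opt_len is 1 (RFC 4861 counts option lengths in 8-octet units), while maxplen bounds the initial allocation at the largest supported hardware address. A small sketch of that arithmetic, where ND_ROUNDUP mirrors the kernel's roundup() macro and the 6-byte address length is an illustrative assumption:

#include <stdio.h>

#define	ND_ROUNDUP(x, y)	((((x) + ((y) - 1)) / (y)) * (y))

int
main(void)
{
	unsigned int opt_hdr = 2;	/* sizeof (nd_opt_hdr_t) */
	unsigned int lla_len = 6;	/* Ethernet ill_nd_lla_len */
	unsigned int plen = ND_ROUNDUP(opt_hdr + lla_len, 8);

	/* Prints: plen = 8 bytes, nd_opt_len = 1 */
	(void) printf("plen = %u bytes, nd_opt_len = %u\n", plen, plen / 8);
	return (0);
}
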
@@ -2448,7 +2562,6 @@ ndp_timer(void *arg)
ill_t *ill = nce->nce_ill;
uint32_t ms;
char addrbuf[INET6_ADDRSTRLEN];
- mblk_t *mp;
boolean_t dropped = B_FALSE;
ip_stack_t *ipst = ill->ill_ipst;
@@ -2460,11 +2573,6 @@ ndp_timer(void *arg)
*/
ASSERT(nce != NULL);
- /*
- * Grab the ill_g_lock now itself to avoid lock order problems.
- * nce_solicit needs ill_g_lock to be able to traverse ills
- */
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
mutex_enter(&nce->nce_lock);
NCE_REFHOLD_LOCKED(nce);
nce->nce_timeout_id = 0;
@@ -2474,11 +2582,10 @@ ndp_timer(void *arg)
*/
switch (nce->nce_state) {
case ND_DELAY:
- rw_exit(&ipst->ips_ill_g_lock);
nce->nce_state = ND_PROBE;
mutex_exit(&nce->nce_lock);
- (void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
- &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
+ (void) nce_xmit_solicit(nce, B_FALSE, &ipv6_all_zeros,
+ NDP_UNICAST);
if (ip_debug > 3) {
/* ip2dbg */
pr_addr_dbg("ndp_timer: state for %s changed "
@@ -2489,7 +2596,6 @@ ndp_timer(void *arg)
return;
case ND_PROBE:
/* must be retransmit timer */
- rw_exit(&ipst->ips_ill_g_lock);
nce->nce_pcnt--;
ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
nce->nce_pcnt >= -1);
@@ -2504,8 +2610,8 @@ ndp_timer(void *arg)
nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
addrbuf, sizeof (addrbuf))));
mutex_exit(&nce->nce_lock);
- dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL,
- B_FALSE, &ipv6_all_zeros, &nce->nce_addr,
+ dropped = nce_xmit_solicit(nce, B_FALSE,
+ &ipv6_all_zeros,
(nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
NDP_UNICAST);
if (dropped) {
@@ -2542,8 +2648,8 @@ ndp_timer(void *arg)
*/
nce->nce_state = ND_REACHABLE;
mutex_exit(&nce->nce_lock);
- ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill,
- ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
+ ipif = ip_ndp_lookup_addr_v6(&nce->nce_addr,
+ nce->nce_ill);
if (ipif != NULL) {
if (ipif->ipif_was_dup) {
char ibuf[LIFNAMSIZ + 10];
@@ -2566,9 +2672,8 @@ ndp_timer(void *arg)
}
/* Begin defending our new address */
nce->nce_unsolicit_count = 0;
- dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill,
- B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast,
- nce_advert_flags(nce));
+ dropped = nce_xmit_advert(nce, B_FALSE,
+ &ipv6_all_hosts_mcast, 0);
if (dropped) {
nce->nce_unsolicit_count = 1;
NDP_RESTART_TIMER(nce,
@@ -2589,51 +2694,40 @@ ndp_timer(void *arg)
}
NCE_REFRELE(nce);
return;
- case ND_INCOMPLETE:
+ case ND_INCOMPLETE: {
+ ip6_t *ip6h;
+ ip6i_t *ip6i;
+ mblk_t *mp, *datamp, *nextmp, **prevmpp;
+
/*
- * Must be resolvers retransmit timer.
+ * Per case (2) in the nce_queue_mp() comments, scan nce_qd_mp
+ * for any IPMP probe packets, and toss 'em. IPMP probe
+ * packets will always be at the head of nce_qd_mp and always
+ * have an ip6i_t header, so we can stop at the first queued
+ * ND packet without an ip6i_t.
*/
- for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
- ip6i_t *ip6i;
- ip6_t *ip6h;
- mblk_t *data_mp;
-
- /*
- * Walk the list of packets queued, and see if there
- * are any multipathing probe packets. Such packets
- * are always queued at the head. Since this is a
- * retransmit timer firing, mark such packets as
- * delayed in ND resolution. This info will be used
- * in ip_wput_v6(). Multipathing probe packets will
- * always have an ip6i_t. Once we hit a packet without
- * it, we can break out of this loop.
- */
- if (mp->b_datap->db_type == M_CTL)
- data_mp = mp->b_cont;
- else
- data_mp = mp;
-
- ip6h = (ip6_t *)data_mp->b_rptr;
+ prevmpp = &nce->nce_qd_mp;
+ for (mp = nce->nce_qd_mp; mp != NULL; mp = nextmp) {
+ nextmp = mp->b_next;
+ datamp = (DB_TYPE(mp) == M_CTL) ? mp->b_cont : mp;
+ ip6h = (ip6_t *)datamp->b_rptr;
if (ip6h->ip6_nxt != IPPROTO_RAW)
break;
- /*
- * This message should have been pulled up already in
- * ip_wput_v6. We can't do pullups here because the
- * b_next/b_prev is non-NULL.
- */
ip6i = (ip6i_t *)ip6h;
- ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
- sizeof (ip6i_t) + IPV6_HDR_LEN);
-
- /* Mark this packet as delayed due to ND resolution */
- if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
- ip6i->ip6i_flags |= IP6I_ND_DELAYED;
+ if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) {
+ inet_freemsg(mp);
+ *prevmpp = nextmp;
+ } else {
+ prevmpp = &mp->b_next;
+ }
}
+
+ /*
+ * Must be resolver's retransmit timer.
+ */
if (nce->nce_qd_mp != NULL) {
- ms = nce_solicit(nce, NULL);
- rw_exit(&ipst->ips_ill_g_lock);
- if (ms == 0) {
+ if ((ms = nce_solicit(nce, NULL)) == 0) {
if (nce->nce_state != ND_REACHABLE) {
mutex_exit(&nce->nce_lock);
nce_resolv_failed(nce);
@@ -2649,11 +2743,10 @@ ndp_timer(void *arg)
return;
}
mutex_exit(&nce->nce_lock);
- rw_exit(&ipst->ips_ill_g_lock);
NCE_REFRELE(nce);
break;
- case ND_REACHABLE :
- rw_exit(&ipst->ips_ill_g_lock);
+ }
+ case ND_REACHABLE:
if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
nce->nce_unsolicit_count != 0) ||
((nce->nce_flags & NCE_F_PERMANENT) &&
@@ -2661,13 +2754,8 @@ ndp_timer(void *arg)
if (nce->nce_unsolicit_count > 0)
nce->nce_unsolicit_count--;
mutex_exit(&nce->nce_lock);
- dropped = nce_xmit(ill,
- ND_NEIGHBOR_ADVERT,
- ill, /* ill to be used for hw addr */
- B_FALSE, /* use ill_phys_addr */
- &nce->nce_addr,
- &ipv6_all_hosts_mcast,
- nce_advert_flags(nce));
+ dropped = nce_xmit_advert(nce, B_FALSE,
+ &ipv6_all_hosts_mcast, 0);
if (dropped) {
mutex_enter(&nce->nce_lock);
nce->nce_unsolicit_count++;
@@ -2686,7 +2774,6 @@ ndp_timer(void *arg)
NCE_REFRELE(nce);
break;
default:
- rw_exit(&ipst->ips_ill_g_lock);
mutex_exit(&nce->nce_lock);
NCE_REFRELE(nce);
break;
@@ -2819,23 +2906,20 @@ void
nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
{
uint_t count = 0;
- mblk_t **mpp;
+ mblk_t **mpp, *tmp;
ASSERT(MUTEX_HELD(&nce->nce_lock));
- for (mpp = &nce->nce_qd_mp; *mpp != NULL;
- mpp = &(*mpp)->b_next) {
- if (++count >
- nce->nce_ill->ill_max_buf) {
- mblk_t *tmp = nce->nce_qd_mp->b_next;
-
+ for (mpp = &nce->nce_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
+ if (++count > nce->nce_ill->ill_max_buf) {
+ tmp = nce->nce_qd_mp->b_next;
nce->nce_qd_mp->b_next = NULL;
nce->nce_qd_mp->b_prev = NULL;
freemsg(nce->nce_qd_mp);
nce->nce_qd_mp = tmp;
}
}
- /* put this on the list */
+
if (head_insert) {
mp->b_next = nce->nce_qd_mp;
nce->nce_qd_mp = mp;
@@ -2849,8 +2933,8 @@ nce_queue_mp(nce_t *nce, mblk_t *mp)
{
boolean_t head_insert = B_FALSE;
ip6_t *ip6h;
- ip6i_t *ip6i;
- mblk_t *data_mp;
+ ip6i_t *ip6i;
+ mblk_t *data_mp;
ASSERT(MUTEX_HELD(&nce->nce_lock));
@@ -2867,43 +2951,28 @@ nce_queue_mp(nce_t *nce, mblk_t *mp)
* non-NULL.
*/
ip6i = (ip6i_t *)ip6h;
- ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
- sizeof (ip6i_t) + IPV6_HDR_LEN);
+ ASSERT(MBLKL(data_mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
+
/*
- * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
- * This has 2 aspects mentioned below.
- * 1. Perform head insertion in the nce_qd_mp for these packets.
- * This ensures that next retransmit of ND solicitation
- * will use the interface specified by the probe packet,
- * for both NS and NA. This corresponds to the src address
- * in the IPv6 packet. If we insert at tail, we will be
- * depending on the packet at the head for successful
- * ND resolution. This is not reliable, because the interface
- * on which the NA arrives could be different from the interface
- * on which the NS was sent, and if the receiving interface is
- * failed, it will appear that the sending interface is also
- * failed, causing in.mpathd to misdiagnose this as link
- * failure.
- * 2. Drop the original packet, if the ND resolution did not
- * succeed in the first attempt. However we will create the
- * nce and the ire, as soon as the ND resolution succeeds.
- * We don't gain anything by queueing multiple probe packets
- * and sending them back-to-back once resolution succeeds.
- * It is sufficient to send just 1 packet after ND resolution
- * succeeds. Since mpathd is sending down probe packets at a
- * constant rate, we don't need to send the queued packet. We
- * need to queue it only for NDP resolution. The benefit of
- * dropping the probe packets that were delayed in ND
- * resolution, is that in.mpathd will not see inflated
- * RTT. If the ND resolution does not succeed within
- * in.mpathd's failure detection time, mpathd may detect
- * a failure, and it does not matter whether the packet
- * was queued or dropped.
+ * If this packet is marked IP6I_IPMP_PROBE, then we need to:
+ *
+ * 1. Insert it at the head of the nce_qd_mp list. Consider
+ * the normal (non-probe) load-speading case where the
+ * source address of the ND packet is not tied to nce_ill.
+ * If the ill bound to the source address cannot receive,
+ * the response to the ND packet will not be received.
+ * However, if ND packets for nce_ill's probes are queued
+ * behind that ND packet, those probes will also fail to
+ * be sent, and thus in.mpathd will erroneously conclude
+ * that nce_ill has also failed.
+ *
+ * 2. Drop the probe packet in ndp_timer() if the ND did
+ * not succeed on the first attempt. This ensures that
+ * ND problems do not manifest as probe RTT spikes.
*/
- if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
+ if (ip6i->ip6i_flags & IP6I_IPMP_PROBE)
head_insert = B_TRUE;
}
-
nce_queue_mp_common(nce, mp, head_insert);
}
@@ -2988,13 +3057,17 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
(lnr->lnr_state_create != ND_STALE))
return (EINVAL);
+ if (lnr->lnr_hdw_len > ND_MAX_HDW_LEN)
+ return (EINVAL);
+
sin6 = (sin6_t *)&lnr->lnr_addr;
addr = &sin6->sin6_addr;
mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
/* We know it can not be mapping so just look in the hash table */
nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
- nce = nce_lookup_addr(ill, addr, nce);
+ /* See comment in ndp_query() regarding IS_IPMP(ill) usage */
+ nce = nce_lookup_addr(ill, IS_IPMP(ill), addr, nce);
if (nce != NULL)
new_flags = nce->nce_flags;
@@ -3065,7 +3138,7 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
* the link layer address passed in to determine the state
* much like incoming packets.
*/
ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
NCE_REFRELE(nce);
return (0);
}
@@ -3463,7 +3536,11 @@ ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
- nce = nce_lookup_addr(ill, &addr6, nce);
+ /*
+ * NOTE: IPv4 never matches across the illgrp since the NCE's we're
+ * looking up have fastpath headers that are inherently per-ill.
+ */
+ nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
if (nce == NULL) {
err = ndp_add_v4(ill, addr, flags, newnce, src_nce);
} else {
@@ -3718,3 +3795,26 @@ ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
return (nce != NULL);
}
+
+/*
+ * Wrapper around ipif_lookup_addr_exact_v6() that allows ND to work properly
+ * with IPMP. Specifically, since neighbor discovery is always done on
+ * underlying interfaces (even for addresses owned by an IPMP interface), we
+ * need to check for `v6addrp' on both `ill' and on the IPMP meta-interface
+ * associated with `ill' (if it exists).
+ */
+static ipif_t *
+ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill)
+{
+ ipif_t *ipif;
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
+ if (ipif == NULL && IS_UNDER_IPMP(ill)) {
+ if ((ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
+ ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
+ ill_refrele(ill);
+ }
+ }
+ return (ipif);
+}
diff --git a/usr/src/uts/common/inet/ip/ip_netinfo.c b/usr/src/uts/common/inet/ip/ip_netinfo.c
index 53665593be..e81c7a0e1f 100644
--- a/usr/src/uts/common/inet/ip/ip_netinfo.c
+++ b/usr/src/uts/common/inet/ip/ip_netinfo.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -568,33 +568,17 @@ ip_getifname_impl(phy_if_t phy_ifdata,
char *buffer, const size_t buflen, boolean_t isv6, ip_stack_t *ipst)
{
ill_t *ill;
- char *name;
ASSERT(buffer != NULL);
ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, NULL, NULL,
NULL, NULL, ipst);
- if (ill != NULL) {
- name = ill->ill_name;
- } else {
- /* Fallback to group names only if hook_emulation is set */
- if (ipst->ips_ipmp_hook_emulation) {
- ill = ill_group_lookup_on_ifindex((uint_t)phy_ifdata,
- isv6, ipst);
- }
- if (ill == NULL)
- return (1);
- name = ill->ill_phyint->phyint_groupname;
- }
- if (name != NULL) {
- (void) strlcpy(buffer, name, buflen);
- ill_refrele(ill);
- return (0);
- } else {
- ill_refrele(ill);
+ if (ill == NULL)
return (1);
- }
+ (void) strlcpy(buffer, ill->ill_name, buflen);
+ ill_refrele(ill);
+ return (0);
}
/*
@@ -625,9 +609,6 @@ ipv6_getmtu(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata)
/*
* Shared implementation to determine the MTU of a network interface
- *
- * Note: this does not handle a non-zero ifdata when ipmp_hook_emulation is set.
- * But IP Filter only uses a zero ifdata.
*/
/* ARGSUSED */
static int
@@ -653,16 +634,7 @@ ip_getmtu_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6,
if ((ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6,
NULL, NULL, NULL, NULL, ipst)) == NULL) {
- /*
- * Fallback to group names only if hook_emulation
- * is set
- */
- if (ipst->ips_ipmp_hook_emulation) {
- ill = ill_group_lookup_on_ifindex(
- (uint_t)phy_ifdata, isv6, ipst);
- }
- if (ill == NULL)
- return (0);
+ return (0);
}
mtu = ill->ill_max_frag;
ill_refrele(ill);
@@ -686,9 +658,6 @@ ip_getpmtuenabled(net_handle_t neti)
/*
* Get next interface from the current list of IPv4 physical network interfaces
- *
- * Note: this does not handle the case when ipmp_hook_emulation is set.
- * But IP Filter does not use this function.
*/
static phy_if_t
ip_phygetnext(net_handle_t neti, phy_if_t phy_ifdata)
@@ -752,15 +721,10 @@ ip_phylookup_impl(const char *name, boolean_t isv6, ip_stack_t *ipst)
ill = ill_lookup_on_name((char *)name, B_FALSE, isv6, NULL, NULL,
NULL, NULL, NULL, ipst);
-
- /* Fallback to group names only if hook_emulation is set */
- if (ill == NULL && ipst->ips_ipmp_hook_emulation) {
- ill = ill_group_lookup_on_name((char *)name, isv6, ipst);
- }
if (ill == NULL)
return (0);
- phy = ill->ill_phyint->phyint_hook_ifindex;
+ phy = ill->ill_phyint->phyint_ifindex;
ill_refrele(ill);
@@ -798,9 +762,6 @@ ipv6_lifgetnext(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata)
/*
* Shared implementation to get next interface from the current list of
* logical network interfaces
- *
- * Note: this does not handle the case when ipmp_hook_emulation is set.
- * But IP Filter does not use this function.
*/
static lif_if_t
ip_lifgetnext_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6,
@@ -834,7 +795,7 @@ ip_lifgetnext_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6,
/*
* It's safe to iterate the ill_ipif list when holding an ill_lock.
* And it's also safe to access ipif_id without ipif refhold.
- * See ipif_get_id().
+ * See the field access rules in ip.h.
*/
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
if (!IPIF_CAN_LOOKUP(ipif))
@@ -1013,8 +974,8 @@ ip_inject_impl(inject_t style, net_inject_t *packet, boolean_t isv6,
if (ire->ire_nce == NULL ||
ire->ire_nce->nce_fp_mp == NULL &&
ire->ire_nce->nce_res_mp == NULL) {
- ip_newroute_v6(ire->ire_stq, mp,
- &sin6->sin6_addr, NULL, NULL, ALL_ZONES, ipst);
+ ip_newroute_v6(ire->ire_stq, mp, &sin6->sin6_addr,
+ &ip6h->ip6_src, NULL, ALL_ZONES, ipst);
ire_refrele(ire);
return (0);
@@ -1170,7 +1131,7 @@ ip_routeto_impl(struct sockaddr *address, struct sockaddr *nexthop,
}
ASSERT(ill != NULL);
- phy_if = (phy_if_t)ill->ill_phyint->phyint_hook_ifindex;
+ phy_if = (phy_if_t)ill->ill_phyint->phyint_ifindex;
if (sire != NULL)
ire_refrele(sire);
ire_refrele(ire);
@@ -1305,9 +1266,6 @@ ipv6_getlifaddr(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata,
/*
* Shared implementation to determine the network addresses for an interface
- *
- * Note: this does not handle a non-zero ifdata when ipmp_hook_emulation is set.
- * But IP Filter only uses a zero ifdata.
*/
/* ARGSUSED */
static int
@@ -1531,12 +1489,6 @@ ip_ni_queue_func_impl(injection_t *inject, boolean_t out)
ill = ill_lookup_on_ifindex((uint_t)packet->ni_physical,
B_FALSE, NULL, NULL, NULL, NULL, ipst);
-
- /* Fallback to group names only if hook_emulation is set */
- if (ill == NULL && ipst->ips_ipmp_hook_emulation) {
- ill = ill_group_lookup_on_ifindex((uint_t)packet->ni_physical,
- B_FALSE, ipst);
- }
if (ill == NULL) {
kmem_free(inject, sizeof (*inject));
return;
@@ -1613,65 +1565,3 @@ done:
kmem_free(info->hnei_event.hne_data, info->hnei_event.hne_datalen);
kmem_free(arg, sizeof (hook_nic_event_int_t));
}
-
-/*
- * Temporary function to support IPMP emulation for IP Filter.
- * Lookup an ill based on the ifindex assigned to the group.
- * Skips unusable ones i.e. where any of these flags are set:
- * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE)
- */
-ill_t *
-ill_group_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
-{
- ill_t *ill;
- phyint_t *phyi;
-
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- phyi = phyint_lookup_group_ifindex(index, ipst);
- if (phyi != NULL) {
- ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4;
- if (ill != NULL) {
- mutex_enter(&ill->ill_lock);
- if (ILL_CAN_LOOKUP(ill)) {
- ill_refhold_locked(ill);
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- return (ill);
- }
- mutex_exit(&ill->ill_lock);
- }
- }
- rw_exit(&ipst->ips_ill_g_lock);
- return (NULL);
-}
-
-/*
- * Temporary function to support IPMP emulation for IP Filter.
- * Lookup an ill based on the group name.
- * Skips unusable ones i.e. where any of these flags are set:
- * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE)
- */
-ill_t *
-ill_group_lookup_on_name(char *name, boolean_t isv6, ip_stack_t *ipst)
-{
- ill_t *ill;
- phyint_t *phyi;
-
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- phyi = phyint_lookup_group(name, B_TRUE, ipst);
- if (phyi != NULL) {
- ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4;
- if (ill != NULL) {
- mutex_enter(&ill->ill_lock);
- if (ILL_CAN_LOOKUP(ill)) {
- ill_refhold_locked(ill);
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- return (ill);
- }
- mutex_exit(&ill->ill_lock);
- }
- }
- rw_exit(&ipst->ips_ill_g_lock);
- return (NULL);
-}
diff --git a/usr/src/uts/common/inet/ip/ip_opt_data.c b/usr/src/uts/common/inet/ip/ip_opt_data.c
index bb6e98a99e..1c91ea667f 100644
--- a/usr/src/uts/common/inet/ip/ip_opt_data.c
+++ b/usr/src/uts/common/inet/ip/ip_opt_data.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -119,9 +119,6 @@ opdes_t ip_opt_arr[] = {
{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 /* no ifindex */ },
-{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
- sizeof (struct in_addr), 0 /* not initialized */ },
-
{ IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
sizeof (int), 0 },
@@ -199,12 +196,6 @@ opdes_t ip_opt_arr[] = {
{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 /* no ifindex */ },
-{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 /* no ifindex */ },
-
-{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 /* no ifindex */ },
-
{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
sizeof (int), 0 },
diff --git a/usr/src/uts/common/inet/ip/ip_rts.c b/usr/src/uts/common/inet/ip/ip_rts.c
index 3324d1d833..77ab2cc220 100644
--- a/usr/src/uts/common/inet/ip/ip_rts.c
+++ b/usr/src/uts/common/inet/ip/ip_rts.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -93,34 +93,52 @@ static void rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics);
static void ip_rts_request_retry(ipsq_t *, queue_t *q, mblk_t *mp, void *);
/*
- * Send the ack to all the routing queues. In case of the originating queue,
- * send it only if the loopback is set.
- *
- * Messages are sent upstream only on routing sockets that did not specify an
- * address family when they were created or when the address family matches the
- * one specified by the caller.
+ * Send `mp' to all eligible routing queues. A queue is ineligible if:
*
+ * 1. SO_USELOOPBACK is off and it is the originating queue.
+ * 2. RTAW_UNDER_IPMP is on and RTSQ_UNDER_IPMP is clear in `flags'.
+ * 3. RTAW_UNDER_IPMP is off and RTSQ_NORMAL is clear in `flags'.
+ * 4. It is not the same address family as `af', and `af' isn't AF_UNSPEC.
*/
void
-rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst)
+rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags,
+ ip_stack_t *ipst)
{
mblk_t *mp1;
conn_t *connp, *next_connp;
+ /*
+ * Since we don't have an ill_t here, RTSQ_DEFAULT must already be
+ * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP by now.
+ */
+ ASSERT(!(flags & RTSQ_DEFAULT));
+
mutex_enter(&ipst->ips_rts_clients->connf_lock);
connp = ipst->ips_rts_clients->connf_head;
- while (connp != NULL) {
+ for (; connp != NULL; connp = next_connp) {
+ next_connp = connp->conn_next;
+
/*
* If there was a family specified when this routing socket was
* created and it doesn't match the family of the message to
* copy, then continue.
*/
if ((connp->conn_proto != AF_UNSPEC) &&
- (connp->conn_proto != af)) {
- connp = connp->conn_next;
+ (connp->conn_proto != af))
continue;
+
+ /*
+ * Queue the message only if the conn_t and flags match.
+ */
+ if (connp->conn_rtaware & RTAW_UNDER_IPMP) {
+ if (!(flags & RTSQ_UNDER_IPMP))
+ continue;
+ } else {
+ if (!(flags & RTSQ_NORMAL))
+ continue;
}
+
/*
* For the originating queue, we only copy the message upstream
* if loopback is set. For others reading on the routing
@@ -128,8 +146,8 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst)
* message.
*/
if ((o_connp == connp) && connp->conn_loopback == 0) {
- connp = connp->conn_next;
- continue;
+ continue;
}
CONN_INC_REF(connp);
mutex_exit(&ipst->ips_rts_clients->connf_lock);
@@ -145,10 +163,9 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst)
}
mutex_enter(&ipst->ips_rts_clients->connf_lock);
- /* Follow the next pointer before releasing the conn. */
+ /* reload next_connp since conn_next may have changed */
next_connp = connp->conn_next;
CONN_DEC_REF(connp);
- connp = next_connp;
}
mutex_exit(&ipst->ips_rts_clients->connf_lock);
freemsg(mp);
@@ -209,7 +226,7 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst)
rtm->rtm_errno = error;
else
rtm->rtm_flags |= RTF_DONE;
- rts_queue_input(mp, NULL, af, ipst);
+ rts_queue_input(mp, NULL, af, RTSQ_ALL, ipst);
}
/* ARGSUSED */
@@ -430,7 +447,7 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr)
if (index != 0) {
ill_t *ill;
-
+lookup:
/*
* IPC must be refheld somewhere in ip_wput_nondata or
* ip_wput_ioctl etc... and cleaned up if ioctl is killed.
@@ -445,16 +462,33 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr)
goto done;
}
- ipif = ipif_get_next_ipif(NULL, ill);
- ill_refrele(ill);
/*
- * If this is replacement ipif, prevent a route from
- * being added.
+ * Since all interfaces in an IPMP group must be equivalent,
+ * we prevent changes to a specific underlying interface's
+ * routing configuration. However, for backward compatibility,
+ * we interpret a request to add a route on an underlying
+ * interface as a request to add a route on its IPMP interface.
*/
- if (ipif != NULL && ipif->ipif_replace_zero) {
- error = ENETDOWN;
- goto done;
+ if (IS_UNDER_IPMP(ill)) {
+ switch (rtm->rtm_type) {
+ case RTM_CHANGE:
+ case RTM_DELETE:
+ ill_refrele(ill);
+ error = EINVAL;
+ goto done;
+ case RTM_ADD:
+ index = ipmp_ill_get_ipmp_ifindex(ill);
+ ill_refrele(ill);
+ if (index == 0) {
+ error = EINVAL;
+ goto done;
+ }
+ goto lookup;
+ }
}
+
+ ipif = ipif_get_next_ipif(NULL, ill);
+ ill_refrele(ill);
match_flags |= MATCH_IRE_ILL;
}
@@ -1037,7 +1071,7 @@ done:
/* OK ACK already set up by caller except this */
ip2dbg(("ip_rts_request: OK ACK\n"));
}
- rts_queue_input(mp, connp, af, ipst);
+ rts_queue_input(mp, connp, af, RTSQ_ALL, ipst);
}
iocp->ioc_error = error;
@@ -1724,7 +1758,7 @@ ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask,
rtm->rtm_errno = error;
rtm->rtm_flags |= RTF_DONE;
rtm->rtm_addrs = rtm_addrs;
- rts_queue_input(mp, NULL, AF_INET, ipst);
+ rts_queue_input(mp, NULL, AF_INET, RTSQ_ALL, ipst);
}
/*
@@ -1733,7 +1767,13 @@ ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask,
* Message type generated RTM_IFINFO.
*/
void
-ip_rts_ifmsg(const ipif_t *ipif)
+ip_rts_ifmsg(const ipif_t *ipif, uint_t flags)
+{
+ ip_rts_xifmsg(ipif, 0, 0, flags);
+}
+
+void
+ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags)
{
if_msghdr_t *ifm;
mblk_t *mp;
@@ -1741,12 +1781,12 @@ ip_rts_ifmsg(const ipif_t *ipif)
ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
/*
- * This message should be generated only
- * when the physical device is changing
- * state.
+ * This message should be generated only when the physical interface
+ * is changing state.
*/
if (ipif->ipif_id != 0)
return;
+
if (ipif->ipif_isv6) {
af = AF_INET6;
mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0);
@@ -1765,11 +1805,22 @@ ip_rts_ifmsg(const ipif_t *ipif)
}
ifm = (if_msghdr_t *)mp->b_rptr;
ifm->ifm_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
- ifm->ifm_flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags |
- ipif->ipif_ill->ill_phyint->phyint_flags;
+ ifm->ifm_flags = (ipif->ipif_flags | ipif->ipif_ill->ill_flags |
+ ipif->ipif_ill->ill_phyint->phyint_flags | set) & ~clear;
rts_getifdata(&ifm->ifm_data, ipif);
ifm->ifm_addrs = RTA_IFP;
- rts_queue_input(mp, NULL, af, ipst);
+
+ if (flags & RTSQ_DEFAULT) {
+ flags = RTSQ_ALL;
+ /*
+ * If this message is for an underlying interface, prevent
+ * "normal" (IPMP-unaware) routing sockets from seeing it.
+ */
+ if (IS_UNDER_IPMP(ipif->ipif_ill))
+ flags &= ~RTSQ_NORMAL;
+ }
+
+ rts_queue_input(mp, NULL, af, flags, ipst);
}
/*
@@ -1778,7 +1829,7 @@ ip_rts_ifmsg(const ipif_t *ipif)
* The structure of the code is based on the 4.4BSD-Lite2 <net/rtsock.c>.
*/
void
-ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif)
+ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags)
{
int pass;
int ncmd;
@@ -1793,6 +1844,17 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif)
af = AF_INET6;
else
af = AF_INET;
+
+ if (flags & RTSQ_DEFAULT) {
+ flags = RTSQ_ALL;
+ /*
+ * If this message is for an underlying interface, prevent
+ * "normal" (IPMP-unaware) routing sockets from seeing it.
+ */
+ if (IS_UNDER_IPMP(ipif->ipif_ill))
+ flags &= ~RTSQ_NORMAL;
+ }
+
/*
* If the request is DELETE, send RTM_DELETE and RTM_DELADDR.
* if the request is ADD, send RTM_NEWADDR and RTM_ADD.
@@ -1827,7 +1889,7 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif)
ifam->ifam_metric = ipif->ipif_metric;
ifam->ifam_flags = ((cmd == RTM_ADD) ? RTF_UP : 0);
ifam->ifam_addrs = rtm_addrs;
- rts_queue_input(mp, NULL, af, ipst);
+ rts_queue_input(mp, NULL, af, flags, ipst);
}
if ((cmd == RTM_ADD && pass == 2) ||
(cmd == RTM_DELETE && pass == 1)) {
@@ -1857,7 +1919,7 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif)
if (error == 0)
rtm->rtm_flags |= RTF_DONE;
rtm->rtm_addrs = rtm_addrs;
- rts_queue_input(mp, NULL, af, ipst);
+ rts_queue_input(mp, NULL, af, flags, ipst);
}
}
}
diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c
index 59ddb7461f..5afa70160d 100644
--- a/usr/src/uts/common/inet/ip/ipclassifier.c
+++ b/usr/src/uts/common/inet/ip/ipclassifier.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -2322,11 +2322,8 @@ ipcl_conn_cleanup(conn_t *connp)
* We should replace these pointers with ifindex/ipaddr_t to
* make the code less complex.
*/
- ASSERT(connp->conn_xmit_if_ill == NULL);
- ASSERT(connp->conn_nofailover_ill == NULL);
ASSERT(connp->conn_outgoing_ill == NULL);
ASSERT(connp->conn_incoming_ill == NULL);
- ASSERT(connp->conn_outgoing_pill == NULL);
ASSERT(connp->conn_multicast_ipif == NULL);
ASSERT(connp->conn_multicast_ill == NULL);
#endif
diff --git a/usr/src/uts/common/inet/ip/ipmp.c b/usr/src/uts/common/inet/ip/ipmp.c
new file mode 100644
index 0000000000..b8f3768834
--- /dev/null
+++ b/usr/src/uts/common/inet/ip/ipmp.c
@@ -0,0 +1,2201 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <inet/arp.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_multi.h>
+#include <inet/ip_rts.h>
+#include <inet/mi.h>
+#include <net/if_types.h>
+#include <sys/dlpi.h>
+#include <sys/kmem.h>
+#include <sys/modhash.h>
+#include <sys/sdt.h>
+#include <sys/strsun.h>
+#include <sys/sunddi.h>
+#include <sys/types.h>
+
+/*
+ * Convenience macros for getting the ip_stack_t associated with an
+ * ipmp_illgrp_t or ipmp_grp_t.
+ */
+#define IPMP_GRP_TO_IPST(grp) PHYINT_TO_IPST((grp)->gr_phyint)
+#define IPMP_ILLGRP_TO_IPST(illg) ((illg)->ig_ipmp_ill->ill_ipst)
+
+/*
+ * Assorted constants that aren't important enough to be tunable.
+ */
+#define IPMP_GRP_HASH_SIZE 64
+#define IPMP_ILL_REFRESH_TIMEOUT 120 /* seconds */
+
+/*
+ * Templates for IPMP ARP messages.
+ */
+static const arie_t ipmp_aract_template = {
+ AR_IPMP_ACTIVATE,
+ sizeof (arie_t), /* Name offset */
+ sizeof (arie_t) /* Name length (set by ill_arp_alloc) */
+};
+
+static const arie_t ipmp_ardeact_template = {
+ AR_IPMP_DEACTIVATE,
+ sizeof (arie_t), /* Name offset */
+ sizeof (arie_t) /* Name length (set by ill_arp_alloc) */
+};
+
+/*
+ * IPMP meta-interface kstats (based on those in PSARC/1997/198).
+ */
+static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = {
+ { "obytes", KSTAT_DATA_UINT32 },
+ { "obytes64", KSTAT_DATA_UINT64 },
+ { "rbytes", KSTAT_DATA_UINT32 },
+ { "rbytes64", KSTAT_DATA_UINT64 },
+ { "opackets", KSTAT_DATA_UINT32 },
+ { "opackets64", KSTAT_DATA_UINT64 },
+ { "oerrors", KSTAT_DATA_UINT32 },
+ { "ipackets", KSTAT_DATA_UINT32 },
+ { "ipackets64", KSTAT_DATA_UINT64 },
+ { "ierrors", KSTAT_DATA_UINT32 },
+ { "multircv", KSTAT_DATA_UINT32 },
+ { "multixmt", KSTAT_DATA_UINT32 },
+ { "brdcstrcv", KSTAT_DATA_UINT32 },
+ { "brdcstxmt", KSTAT_DATA_UINT32 },
+ { "link_up", KSTAT_DATA_UINT32 }
+};
+
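
These counters are exported per group under the "ipmp" kstat module (instance 0, named after gr_ifname by ipmp_grp_create_kstats() below), so the usual libkstat consumers can observe group-wide traffic. A minimal reader, assuming a hypothetical group interface named ipmp0:

#include <kstat.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;

	if ((kc = kstat_open()) == NULL)
		return (1);

	/* "ipmp0" is an assumed group interface name. */
	if ((ksp = kstat_lookup(kc, "ipmp", 0, "ipmp0")) == NULL ||
	    kstat_read(kc, ksp, NULL) == -1) {
		(void) kstat_close(kc);
		return (1);
	}

	if ((kn = kstat_data_lookup(ksp, "ipackets64")) != NULL)
		(void) printf("ipackets64 = %llu\n",
		    (unsigned long long)kn->value.ui64);

	(void) kstat_close(kc);
	return (0);
}
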
+static void ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t);
+static int ipmp_grp_create_kstats(ipmp_grp_t *);
+static int ipmp_grp_update_kstats(kstat_t *, int);
+static void ipmp_grp_destroy_kstats(ipmp_grp_t *);
+static ill_t *ipmp_illgrp_min_ill(ipmp_illgrp_t *);
+static ill_t *ipmp_illgrp_max_ill(ipmp_illgrp_t *);
+static void ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
+static void ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t);
+static boolean_t ipmp_ill_activate(ill_t *);
+static void ipmp_ill_deactivate(ill_t *);
+static void ipmp_ill_ire_mark_testhidden(ire_t *, char *);
+static void ipmp_ill_ire_clear_testhidden(ire_t *, char *);
+static void ipmp_ill_refresh_active_timer_start(ill_t *);
+static void ipmp_ill_rtsaddrmsg(ill_t *, int);
+static void ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action);
+static ipif_t *ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t);
+static void ipmp_phyint_get_kstats(phyint_t *, uint64_t *);
+static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *);
+
+/*
+ * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init().
+ */
+void
+ipmp_init(ip_stack_t *ipst)
+{
+ ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash",
+ IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
+ mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
+ rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0);
+}
+
+/*
+ * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini().
+ */
+void
+ipmp_destroy(ip_stack_t *ipst)
+{
+ mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash);
+ rw_destroy(&ipst->ips_ipmp_lock);
+}
+
+/*
+ * Create an IPMP group named `grname', associate it with IPMP phyint `phyi',
+ * and add it to the hash. On success, return a pointer to the created group.
+ * Caller must ensure `grname' is not yet in the hash. Assumes that the IPMP
+ * meta-interface associated with the group also has the same name (but they
+ * may differ later via ipmp_grp_rename()).
+ */
+ipmp_grp_t *
+ipmp_grp_create(const char *grname, phyint_t *phyi)
+{
+ ipmp_grp_t *grp;
+ ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
+ mod_hash_hndl_t mh;
+
+ ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+ if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL)
+ return (NULL);
+
+ (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
+ (void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname));
+
+ /*
+ * Cache the group's phyint. This is safe since a phyint_t will
+ * outlive its ipmp_grp_t.
+ */
+ grp->gr_phyint = phyi;
+
+ /*
+ * Create IPMP group kstats.
+ */
+ if (ipmp_grp_create_kstats(grp) != 0) {
+ kmem_free(grp, sizeof (ipmp_grp_t));
+ return (NULL);
+ }
+
+ /*
+ * Insert the group into the hash.
+ */
+ if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) {
+ ipmp_grp_destroy_kstats(grp);
+ kmem_free(grp, sizeof (ipmp_grp_t));
+ return (NULL);
+ }
+ ipmp_grp_insert(grp, mh);
+
+ return (grp);
+}
+
+/*
+ * Create IPMP kstat structures for `grp'. Return an errno upon failure.
+ */
+static int
+ipmp_grp_create_kstats(ipmp_grp_t *grp)
+{
+ kstat_t *ksp;
+ netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
+
+ ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net",
+ KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id);
+ if (ksp == NULL)
+ return (ENOMEM);
+
+ ksp->ks_update = ipmp_grp_update_kstats;
+ ksp->ks_private = grp;
+ bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats));
+
+ kstat_install(ksp);
+ grp->gr_ksp = ksp;
+ return (0);
+}
+
+/*
+ * Update the IPMP kstats tracked by `ksp'; called by the kstats framework.
+ */
+static int
+ipmp_grp_update_kstats(kstat_t *ksp, int rw)
+{
+ uint_t i;
+ kstat_named_t *kn = KSTAT_NAMED_PTR(ksp);
+ ipmp_grp_t *grp = ksp->ks_private;
+ ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+ ipsq_t *ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq;
+ phyint_t *phyi;
+ uint64_t phyi_kstats[IPMP_KSTAT_MAX];
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ /*
+ * Start with the group's baseline values.
+ */
+ for (i = 0; i < IPMP_KSTAT_MAX; i++) {
+ if (kn[i].data_type == KSTAT_DATA_UINT32) {
+ kn[i].value.ui32 = grp->gr_kstats0[i];
+ } else {
+ ASSERT(kn[i].data_type == KSTAT_DATA_UINT64);
+ kn[i].value.ui64 = grp->gr_kstats0[i];
+ }
+ }
+
+ /*
+ * Add in the stats of each phyint currently in the group. Since we
+ * don't directly track the phyints in a group, we cheat by walking
+ * the IPSQ set under ill_g_lock. (The IPSQ list cannot change while
+ * ill_g_lock is held.)
+ */
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ ipsq = grp_ipsq->ipsq_next;
+ for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) {
+ phyi = ipsq->ipsq_phyint;
+
+ /*
+ * If a phyint in a group is being unplumbed, it's possible
+ * that ill_glist_delete() -> phyint_free() already freed the
+ * phyint (and set ipsq_phyint to NULL), but the unplumb
+ * operation has yet to complete (and thus ipsq_dq() has yet
+ * to remove the phyint's IPSQ from the group IPSQ's phyint
+ * list). We skip those phyints here (note that their kstats
+ * have already been added to gr_kstats0[]).
+ */
+ if (phyi == NULL)
+ continue;
+
+ ipmp_phyint_get_kstats(phyi, phyi_kstats);
+
+ for (i = 0; i < IPMP_KSTAT_MAX; i++) {
+ phyi_kstats[i] -= phyi->phyint_kstats0[i];
+ if (kn[i].data_type == KSTAT_DATA_UINT32)
+ kn[i].value.ui32 += phyi_kstats[i];
+ else
+ kn[i].value.ui64 += phyi_kstats[i];
+ }
+ }
+
+ kn[IPMP_KSTAT_LINK_UP].value.ui32 =
+ (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0;
+
+ rw_exit(&ipst->ips_ill_g_lock);
+ return (0);
+}
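The aggregation above reduces to: group counter = group baseline (counts absorbed from members that already left) plus, for each current member, its live counter minus the baseline recorded when it joined. A minimal standalone sketch of that arithmetic; member_t and the values are hypothetical, and this is not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical snapshot of one counter (e.g., "ipackets") for a member. */
    typedef struct member {
        uint64_t now;       /* current interface counter */
        uint64_t baseline;  /* value recorded when the member joined */
    } member_t;

    int
    main(void)
    {
        uint64_t group_baseline = 1000;   /* from members that left earlier */
        member_t members[2] = { { 5000, 4000 }, { 300, 100 } };
        uint64_t total = group_baseline;

        for (int i = 0; i < 2; i++)
            total += members[i].now - members[i].baseline;

        /* 1000 + (5000 - 4000) + (300 - 100) = 2200 */
        (void) printf("group counter = %llu\n", (unsigned long long)total);
        return (0);
    }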
+
+/*
+ * Destroy IPMP kstat structures for `grp'.
+ */
+static void
+ipmp_grp_destroy_kstats(ipmp_grp_t *grp)
+{
+ netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
+
+ kstat_delete_netstack(grp->gr_ksp, id);
+ bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0));
+ grp->gr_ksp = NULL;
+}
+
+/*
+ * Look up an IPMP group named `grname' on IP stack `ipst'. Return NULL if it
+ * does not exist.
+ */
+ipmp_grp_t *
+ipmp_grp_lookup(const char *grname, ip_stack_t *ipst)
+{
+ ipmp_grp_t *grp;
+
+ ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
+
+ if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
+ (mod_hash_val_t *)&grp) == 0)
+ return (grp);
+
+ return (NULL);
+}
+
+/*
+ * Place information about group `grp' into `lifgr'.
+ */
+void
+ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr)
+{
+ ill_t *ill;
+ ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+
+ ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
+
+ lifgr->gi_v4 = (grp->gr_v4 != NULL);
+ lifgr->gi_v6 = (grp->gr_v6 != NULL);
+ lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4;
+ lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6;
+ lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP;
+ (void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ);
+ lifgr->gi_m4ifname[0] = '\0';
+ lifgr->gi_m6ifname[0] = '\0';
+ lifgr->gi_bcifname[0] = '\0';
+
+ if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) {
+ (void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ);
+ (void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ);
+ }
+
+ if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL)
+ (void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ);
+}
+
+/*
+ * Insert `grp' into the hash using the reserved hash entry `mh'.
+ * Caller must ensure `grp' is not yet in the hash.
+ */
+static void
+ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh)
+{
+ int err;
+ ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+
+ ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+ /*
+ * Since grp->gr_name will exist at least as long as `grp' is in the
+ * hash, we use it directly as the key.
+ */
+ err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash,
+ (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh);
+ if (err != 0) {
+ /*
+ * This should never happen since `mh' was preallocated.
+ */
+ panic("cannot insert IPMP group \"%s\" (err %d)",
+ grp->gr_name, err);
+ }
+}
+
+/*
+ * Remove `grp' from the hash. Caller must ensure `grp' is in it.
+ */
+static void
+ipmp_grp_remove(ipmp_grp_t *grp)
+{
+ int err;
+ mod_hash_val_t val;
+ mod_hash_key_t key = (mod_hash_key_t)grp->gr_name;
+ ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+
+ ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+ err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val);
+ if (err != 0 || val != grp) {
+ panic("cannot remove IPMP group \"%s\" (err %d)",
+ grp->gr_name, err);
+ }
+}
+
+/*
+ * Attempt to rename `grp' to new name `grname'. Return an errno if the new
+ * group name already exists or is invalid, or if there isn't enough memory.
+ */
+int
+ipmp_grp_rename(ipmp_grp_t *grp, const char *grname)
+{
+ mod_hash_hndl_t mh;
+ ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+
+ ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+ if (grname[0] == '\0')
+ return (EINVAL);
+
+ if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
+ (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND)
+ return (EEXIST);
+
+ /*
+ * Before we remove the group from the hash, ensure we'll be able to
+ * re-insert it by reserving space.
+ */
+ if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0)
+ return (ENOMEM);
+
+ ipmp_grp_remove(grp);
+ (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
+ ipmp_grp_insert(grp, mh);
+
+ return (0);
+}
+
+/*
+ * Destroy `grp' and remove it from the hash. Caller must ensure `grp' is in
+ * the hash, and that there are no interfaces on it.
+ */
+void
+ipmp_grp_destroy(ipmp_grp_t *grp)
+{
+ ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+
+ ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+ /*
+ * If there are still interfaces using this group, panic before things
+ * go really off the rails.
+ */
+ if (grp->gr_nif != 0)
+ panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name);
+
+ ipmp_grp_remove(grp);
+ ipmp_grp_destroy_kstats(grp);
+
+ ASSERT(grp->gr_v4 == NULL);
+ ASSERT(grp->gr_v6 == NULL);
+ ASSERT(grp->gr_nv4 == 0);
+ ASSERT(grp->gr_nv6 == 0);
+ ASSERT(grp->gr_nactif == 0);
+ ASSERT(grp->gr_linkdownmp == NULL);
+ grp->gr_phyint = NULL;
+
+ kmem_free(grp, sizeof (ipmp_grp_t));
+}
+
+/*
+ * Check whether `ill' is suitable for inclusion into `grp', and return an
+ * errno describing the problem (if any). NOTE: many of these errno values
+ * are interpreted by ifconfig, which will take corrective action and retry
+ * the SIOCSLIFGROUPNAME, so please exercise care when changing them.
+ */
+static int
+ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill)
+{
+ ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
+
+ /*
+ * To sidestep complicated address migration logic in the kernel and
+ * to force the kernel's all-hosts multicast memberships to be blown
+ * away, all addresses that had been brought up must be brought back
+ * down prior to adding an interface to a group. (This includes
+ * addresses currently down due to DAD.) Once the interface has been
+ * added to the group, its addresses can then be brought back up, at
+ * which point they will be moved to the IPMP meta-interface.
+ * NOTE: we do this before ill_appaddr_cnt() since bringing down the
+ * link-local causes in.ndpd to remove its ADDRCONF'd addresses.
+ */
+ if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
+ return (EADDRINUSE);
+
+ /*
+ * To avoid confusing applications by changing addresses that are
+ * under their control, all such control must be removed prior to
+ * adding an interface into a group.
+ */
+ if (ill_appaddr_cnt(ill) != 0)
+ return (EADDRNOTAVAIL);
+
+ /*
+ * Since PTP addresses do not share the same broadcast domain, they
+ * are not allowed to be in an IPMP group.
+ */
+ if (ill_ptpaddr_cnt(ill) != 0)
+ return (EINVAL);
+
+ /*
+ * An ill must support multicast to be allowed into a group.
+ */
+ if (!(ill->ill_flags & ILLF_MULTICAST))
+ return (ENOTSUP);
+
+ /*
+ * An ill must strictly be using ARP and/or ND for address
+ * resolution for it to be allowed into a group.
+ */
+ if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP | ILLF_XRESOLV))
+ return (ENOTSUP);
+
+ /*
+ * An ill cannot also be using usesrc groups. (Although usesrc uses
+ * ill_g_usesrc_lock, we don't need to grab it since usesrc also does
+ * all its modifications as writer.)
+ */
+ if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill))
+ return (ENOTSUP);
+
+ /*
+ * All ills in a group must be the same mactype.
+ */
+ if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype)
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * Check whether `phyi' is suitable for inclusion into `grp', and return an
+ * errno describing the problem (if any). See comment above ipmp_grp_vet_ill()
+ * regarding errno values.
+ */
+int
+ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi)
+{
+ int err = 0;
+ ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+
+ ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq));
+ ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
+
+ /*
+ * An interface cannot have address families plumbed that are not
+ * configured in the group.
+ */
+ if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL ||
+ phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL)
+ return (EAFNOSUPPORT);
+
+ if (phyi->phyint_illv4 != NULL)
+ err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4);
+ if (err == 0 && phyi->phyint_illv6 != NULL)
+ err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6);
+
+ return (err);
+}
+
+/*
+ * Create a new illgrp on IPMP meta-interface `ill'.
+ */
+ipmp_illgrp_t *
+ipmp_illgrp_create(ill_t *ill)
+{
+ uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
+ ipmp_illgrp_t *illg;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(IS_IPMP(ill));
+ ASSERT(ill->ill_grp == NULL);
+
+ if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL)
+ return (NULL);
+
+ list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode));
+ list_create(&illg->ig_actif, sizeof (ill_t),
+ offsetof(ill_t, ill_actnode));
+ list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t),
+ offsetof(ipmp_arpent_t, ia_node));
+
+ illg->ig_ipmp_ill = ill;
+ ill->ill_grp = illg;
+ ipmp_illgrp_set_mtu(illg, mtu);
+
+ return (illg);
+}
+
+/*
+ * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface.
+ */
+void
+ipmp_illgrp_destroy(ipmp_illgrp_t *illg)
+{
+ ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+ ASSERT(IS_IPMP(illg->ig_ipmp_ill));
+
+ /*
+ * Verify `illg' is empty.
+ */
+ ASSERT(illg->ig_next_ill == NULL);
+ ASSERT(illg->ig_cast_ill == NULL);
+ ASSERT(list_is_empty(&illg->ig_arpent));
+ ASSERT(list_is_empty(&illg->ig_if));
+ ASSERT(list_is_empty(&illg->ig_actif));
+ ASSERT(illg->ig_nactif == 0);
+
+ /*
+ * Destroy `illg'.
+ */
+ illg->ig_ipmp_ill->ill_grp = NULL;
+ illg->ig_ipmp_ill = NULL;
+ list_destroy(&illg->ig_if);
+ list_destroy(&illg->ig_actif);
+ list_destroy(&illg->ig_arpent);
+ kmem_free(illg, sizeof (ipmp_illgrp_t));
+}
+
+/*
+ * Add `ipif' to the pool of usable data addresses on `illg' and attempt to
+ * bind it to an underlying ill, while keeping an even address distribution.
+ * If the bind is successful, return a pointer to the bound ill.
+ */
+ill_t *
+ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
+{
+ ill_t *minill;
+ ipmp_arpent_t *entp;
+
+ ASSERT(IAM_WRITER_IPIF(ipif));
+ ASSERT(ipmp_ipif_is_dataaddr(ipif));
+
+ /*
+ * IPMP data address mappings are internally managed by IP itself, so
+ * delete any existing ARP entries associated with the address.
+ */
+ if (!ipif->ipif_isv6) {
+ entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr);
+ if (entp != NULL)
+ ipmp_illgrp_destroy_arpent(illg, entp);
+ }
+
+ if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
+ ipmp_ill_bind_ipif(minill, ipif, Res_act_none);
+
+ return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL);
+}
+
+/*
+ * Delete `ipif' from the pool of usable data addresses on `illg'. If it's
+ * bound, unbind it from the underlying ill while keeping an even address
+ * distribution.
+ */
+void
+ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
+{
+ ill_t *maxill, *boundill = ipif->ipif_bound_ill;
+
+ ASSERT(IAM_WRITER_IPIF(ipif));
+
+ if (boundill != NULL) {
+ (void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE);
+
+ maxill = ipmp_illgrp_max_ill(illg);
+ if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) {
+ ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
+ ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind);
+ }
+ }
+}
+
+/*
+ * Return the active ill with the greatest number of data addresses in `illg'.
+ */
+static ill_t *
+ipmp_illgrp_max_ill(ipmp_illgrp_t *illg)
+{
+ ill_t *ill, *bestill = NULL;
+
+ ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+ ill = list_head(&illg->ig_actif);
+ for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
+ if (bestill == NULL ||
+ ill->ill_bound_cnt > bestill->ill_bound_cnt) {
+ bestill = ill;
+ }
+ }
+ return (bestill);
+}
+
+/*
+ * Return the active ill with the fewest data addresses in `illg'.
+ */
+static ill_t *
+ipmp_illgrp_min_ill(ipmp_illgrp_t *illg)
+{
+ ill_t *ill, *bestill = NULL;
+
+ ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+ ill = list_head(&illg->ig_actif);
+ for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
+ if (bestill == NULL ||
+ ill->ill_bound_cnt < bestill->ill_bound_cnt) {
+ if (ill->ill_bound_cnt == 0)
+ return (ill); /* can't get better */
+ bestill = ill;
+ }
+ }
+ return (bestill);
+}
+
+/*
+ * Return a pointer to the IPMP meta-interface for `illg' (which must exist).
+ * Since ig_ipmp_ill never changes for a given illg, no locks are needed.
+ */
+ill_t *
+ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg)
+{
+ return (illg->ig_ipmp_ill);
+}
+
+/*
+ * Return a pointer to the next available underlying ill in `illg', or NULL if
+ * one doesn't exist. Caller must be inside the IPSQ.
+ */
+ill_t *
+ipmp_illgrp_next_ill(ipmp_illgrp_t *illg)
+{
+ ill_t *ill;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ if ((ill = illg->ig_next_ill) != NULL) {
+ illg->ig_next_ill = list_next(&illg->ig_actif, ill);
+ if (illg->ig_next_ill == NULL)
+ illg->ig_next_ill = list_head(&illg->ig_actif);
+ }
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ return (ill);
+}
+
+/*
+ * Return a held pointer to the next available underlying ill in `illg', or
+ * NULL if one doesn't exist. Caller need not be inside the IPSQ.
+ */
+ill_t *
+ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
+{
+ ill_t *ill;
+ uint_t i;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ for (i = 0; i < illg->ig_nactif; i++) {
+ ill = illg->ig_next_ill;
+ illg->ig_next_ill = list_next(&illg->ig_actif, ill);
+ if (illg->ig_next_ill == NULL)
+ illg->ig_next_ill = list_head(&illg->ig_actif);
+
+ if (ILL_CAN_LOOKUP(ill)) {
+ ill_refhold(ill);
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (ill);
+ }
+ }
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ return (NULL);
+}
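For illustration, the two functions above amount to a rotating cursor over the active list: hand out the current candidate, advance with wrap-around, and give up after one full pass if nothing is usable. A simplified sketch; node_t, rr_t, and rr_next are hypothetical names, and locking and refholds are deliberately omitted:

    #include <stddef.h>

    typedef struct node {
        struct node *next;
        int usable;             /* e.g., not currently being torn down */
    } node_t;

    typedef struct rr {
        node_t *head;           /* list of active members */
        node_t *cursor;         /* next member to hand out */
        unsigned nactive;       /* number of members on the list */
    } rr_t;

    /* Return the next usable member round-robin, or NULL if none. */
    static node_t *
    rr_next(rr_t *rr)
    {
        if (rr->cursor == NULL)
            return (NULL);

        for (unsigned i = 0; i < rr->nactive; i++) {
            node_t *n = rr->cursor;

            rr->cursor = (n->next != NULL) ? n->next : rr->head;
            if (n->usable)
                return (n);
        }
        return (NULL);
    }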
+
+/*
+ * Return a pointer to the nominated multicast ill in `illg', or NULL if one
+ * doesn't exist. Caller must be inside the IPSQ.
+ */
+ill_t *
+ipmp_illgrp_cast_ill(ipmp_illgrp_t *illg)
+{
+ /*
+ * Since an IPMP ill's ill_grp gets cleared during I_PUNLINK but
+ * this function can get called after that point, handle NULL.
+ */
+ if (illg == NULL)
+ return (NULL);
+
+ ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+ return (illg->ig_cast_ill);
+}
+
+/*
+ * Return a held pointer to the nominated multicast ill in `illg', or NULL if
+ * one doesn't exist. Caller need not be inside the IPSQ.
+ */
+ill_t *
+ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
+{
+ ill_t *castill;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+ castill = illg->ig_cast_ill;
+ if (castill != NULL && ILL_CAN_LOOKUP(castill)) {
+ ill_refhold(castill);
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (castill);
+ }
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (NULL);
+}
+
+/*
+ * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL,
+ * any existing nomination is removed. Caller must be inside the IPSQ.
+ */
+static void
+ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
+{
+ ill_t *ocastill = illg->ig_cast_ill;
+ ill_t *ipmp_ill = illg->ig_ipmp_ill;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(IAM_WRITER_ILL(ipmp_ill));
+
+ /*
+ * Disable old nominated ill (if any).
+ */
+ if (ocastill != NULL) {
+ DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *,
+ illg, ill_t *, ocastill);
+ ASSERT(ocastill->ill_nom_cast);
+ ocastill->ill_nom_cast = B_FALSE;
+ /*
+ * If the IPMP meta-interface is down, we never did the join,
+ * so we must not try to leave.
+ */
+ if (ipmp_ill->ill_dl_up)
+ ill_leave_multicast(ipmp_ill);
+ }
+
+ /*
+ * Set new nomination.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ illg->ig_cast_ill = castill;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ if (ocastill != NULL) {
+ /*
+ * Delete any IREs tied to the old nomination. We must do
+ * this after the new castill is set and has reached global
+ * visibility since the datapath has not been quiesced.
+ */
+ ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
+ ill_stq_cache_delete, ocastill, ocastill);
+ }
+
+ /*
+ * Enable new nominated ill (if any).
+ */
+ if (castill != NULL) {
+ DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *,
+ illg, ill_t *, castill);
+ ASSERT(!castill->ill_nom_cast);
+ castill->ill_nom_cast = B_TRUE;
+ /*
+ * If the IPMP meta-interface is down, the attempt to recover
+ * will silently fail but ill_need_recover_multicast will be
+ * erroneously cleared -- so check first.
+ */
+ if (ipmp_ill->ill_dl_up)
+ ill_recover_multicast(ipmp_ill);
+ }
+
+ /*
+ * For IPv4, refresh our broadcast IREs. This needs to be done even
+ * if there's no new nomination since ill_refresh_bcast() still must
+ * update the IPMP meta-interface's broadcast IREs to point back at
+ * the IPMP meta-interface itself.
+ */
+ if (!ipmp_ill->ill_isv6)
+ ill_refresh_bcast(ipmp_ill);
+}
+
+/*
+ * Create an IPMP ARP entry and add it to the set tracked on `illg'. If an
+ * entry for the same IP address already exists, destroy it first. Return the
+ * created IPMP ARP entry, or NULL on failure.
+ */
+ipmp_arpent_t *
+ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, mblk_t *mp, boolean_t proxyarp)
+{
+ uchar_t *addrp;
+ area_t *area = (area_t *)mp->b_rptr;
+ ipmp_arpent_t *entp, *oentp;
+
+ ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+ ASSERT(area->area_proto_addr_length == sizeof (ipaddr_t));
+
+ if ((entp = kmem_zalloc(sizeof (ipmp_arpent_t), KM_NOSLEEP)) == NULL)
+ return (NULL);
+
+ if ((mp = copyb(mp)) == NULL) {
+ kmem_free(entp, sizeof (ipmp_arpent_t));
+ return (NULL);
+ }
+
+ DB_TYPE(mp) = M_PROTO;
+ entp->ia_area_mp = mp;
+ entp->ia_proxyarp = proxyarp;
+ addrp = mi_offset_paramc(mp, area->area_proto_addr_offset,
+ sizeof (ipaddr_t));
+ bcopy(addrp, &entp->ia_ipaddr, sizeof (ipaddr_t));
+
+ if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL)
+ ipmp_illgrp_destroy_arpent(illg, oentp);
+
+ list_insert_head(&illg->ig_arpent, entp);
+ return (entp);
+}
+
+/*
+ * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it.
+ */
+void
+ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
+{
+ ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+ list_remove(&illg->ig_arpent, entp);
+ freeb(entp->ia_area_mp);
+ kmem_free(entp, sizeof (ipmp_arpent_t));
+}
+
+/*
+ * Mark that ARP has been notified about the IP address on `entp'; `illg' is
+ * taken as a debugging aid for DTrace FBT probes.
+ */
+/* ARGSUSED */
+void
+ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
+{
+ entp->ia_notified = B_TRUE;
+}
+
+/*
+ * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is
+ * NULL, any IPMP ARP entry is requested. Return NULL if it does not exist.
+ */
+ipmp_arpent_t *
+ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp)
+{
+ ipmp_arpent_t *entp = list_head(&illg->ig_arpent);
+
+ ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+ if (addrp == NULL)
+ return (entp);
+
+ for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp))
+ if (entp->ia_ipaddr == *addrp)
+ break;
+ return (entp);
+}
+
+/*
+ * Refresh ARP entries on `illg' to be distributed across its active
+ * interfaces. Entries that cannot be refreshed (e.g., because there are no
+ * active interfaces) are marked so that subsequent calls can try again.
+ */
+void
+ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
+{
+ ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill;
+ uint_t paddrlen = ipmp_ill->ill_phys_addr_length;
+ area_t *area;
+ mblk_t *area_mp;
+ uchar_t *physaddr;
+ ipmp_arpent_t *entp;
+
+ ASSERT(IAM_WRITER_ILL(ipmp_ill));
+ ASSERT(!ipmp_ill->ill_isv6);
+
+ ill = list_head(&illg->ig_actif);
+ entp = list_head(&illg->ig_arpent);
+ for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) {
+ if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) {
+ entp->ia_notified = B_FALSE;
+ continue;
+ }
+
+ area = (area_t *)entp->ia_area_mp->b_rptr;
+ ASSERT(paddrlen == ill->ill_phys_addr_length);
+ ASSERT(paddrlen == area->area_hw_addr_length);
+ physaddr = mi_offset_paramc(entp->ia_area_mp,
+ area->area_hw_addr_offset, paddrlen);
+
+ /*
+ * If this is a proxy ARP entry, we can skip notifying ARP if
+ * the entry is already up-to-date. If it has changed, we
+ * update the entry's hardware address before notifying ARP.
+ */
+ if (entp->ia_proxyarp) {
+ if (bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0 &&
+ entp->ia_notified)
+ continue;
+ bcopy(ill->ill_phys_addr, physaddr, paddrlen);
+ }
+
+ if ((area_mp = copyb(entp->ia_area_mp)) == NULL) {
+ entp->ia_notified = B_FALSE;
+ continue;
+ }
+
+ putnext(ipmp_ill->ill_rq, area_mp);
+ ipmp_illgrp_mark_arpent(illg, entp);
+
+ if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
+ ill = list_head(&illg->ig_actif);
+ }
+}
+
+/*
+ * Return an interface in `illg' with the specified `physaddr', or NULL if one
+ * doesn't exist. Caller must hold ill_g_lock if it's not inside the IPSQ.
+ */
+ill_t *
+ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen)
+{
+ ill_t *ill;
+ ill_t *ipmp_ill = illg->ig_ipmp_ill;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock));
+
+ ill = list_head(&illg->ig_if);
+ for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
+ if (ill->ill_phys_addr_length == paddrlen &&
+ bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0)
+ return (ill);
+ }
+ return (NULL);
+}
+
+/*
+ * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND.
+ * Caller must be inside the IPSQ unless this is initialization.
+ */
+static void
+ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu)
+{
+ ill_t *ill = illg->ig_ipmp_ill;
+ mblk_t *mp;
+
+ ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill));
+
+ /*
+ * If allocation fails, we have bigger problems than MTU.
+ */
+ if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) {
+ illg->ig_mtu = mtu;
+ put(ill->ill_rq, mp);
+ }
+}
+
+/*
+ * Recalculate the IPMP group MTU for `illg', and update its associated IPMP
+ * ill MTU if necessary.
+ */
+void
+ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
+{
+ ill_t *ill;
+ ill_t *ipmp_ill = illg->ig_ipmp_ill;
+ uint_t mtu = 0;
+
+ ASSERT(IAM_WRITER_ILL(ipmp_ill));
+
+ /*
+ * Since ill_max_mtu can only change under ill_lock, we hold ill_lock
+ * for each ill as we iterate through the list. Any changes to the
+ * ill_max_mtu will also trigger an update, so even if we missed it
+ * this time around, the update will catch it.
+ */
+ ill = list_head(&illg->ig_if);
+ for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
+ mutex_enter(&ill->ill_lock);
+ if (mtu == 0 || ill->ill_max_mtu < mtu)
+ mtu = ill->ill_max_mtu;
+ mutex_exit(&ill->ill_lock);
+ }
+
+ /*
+ * MTU must be at least the minimum MTU.
+ */
+ mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
+
+ if (illg->ig_mtu != mtu)
+ ipmp_illgrp_set_mtu(illg, mtu);
+}
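The policy above boils down to: the group MTU is the smallest member MTU, clamped so it never drops below the protocol minimum. A hedged standalone sketch; group_mtu and SKETCH_MIN_MTU are made-up names (the kernel uses IP_MIN_MTU or IPV6_MIN_MTU for the floor):

    #define SKETCH_MIN_MTU  68  /* assumed IPv4-style floor for the example */
    #define MAX(a, b)       ((a) > (b) ? (a) : (b))

    /* Smallest member MTU, but never below the protocol minimum. */
    static unsigned
    group_mtu(const unsigned *mtus, unsigned n)
    {
        unsigned mtu = 0;

        for (unsigned i = 0; i < n; i++) {
            if (mtu == 0 || mtus[i] < mtu)
                mtu = mtus[i];
        }
        return (MAX(mtu, SKETCH_MIN_MTU));
    }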
+
+/*
+ * Link illgrp `illg' to IPMP group `grp'. To simplify the caller, silently
+ * allow the same link to be established more than once.
+ */
+void
+ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp)
+{
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+ if (illg->ig_ipmp_ill->ill_isv6) {
+ ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg);
+ grp->gr_v6 = illg;
+ } else {
+ ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg);
+ grp->gr_v4 = illg;
+ }
+}
+
+/*
+ * Unlink illgrp `illg' from its IPMP group. Return an errno if the illgrp
+ * cannot be unlinked (e.g., because there are still interfaces using it).
+ */
+int
+ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg)
+{
+ ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+ if (illg->ig_ipmp_ill->ill_isv6) {
+ if (grp->gr_nv6 + grp->gr_pendv6 != 0)
+ return (EBUSY);
+ grp->gr_v6 = NULL;
+ } else {
+ if (grp->gr_nv4 + grp->gr_pendv4 != 0)
+ return (EBUSY);
+ grp->gr_v4 = NULL;
+ }
+ return (0);
+}
+
+/*
+ * Place `ill' into `illg', and rebalance the data addresses on `illg'
+ * to be spread evenly across the ills now in it. Also, adjust the IPMP
+ * ill as necessary to account for `ill' (e.g., MTU).
+ */
+void
+ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
+{
+ ill_t *ipmp_ill;
+ ipif_t *ipif;
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ /* IS_UNDER_IPMP() requires ill_grp to be non-NULL */
+ ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL);
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(ill->ill_grp == NULL);
+
+ ipmp_ill = illg->ig_ipmp_ill;
+
+ /*
+ * Account for `ill' joining the illgrp.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ if (ill->ill_isv6)
+ ill->ill_phyint->phyint_grp->gr_nv6++;
+ else
+ ill->ill_phyint->phyint_grp->gr_nv4++;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ /*
+ * Ensure the ILLF_ROUTER flag remains consistent across the group.
+ */
+ mutex_enter(&ill->ill_lock);
+ if (ipmp_ill->ill_flags & ILLF_ROUTER)
+ ill->ill_flags |= ILLF_ROUTER;
+ else
+ ill->ill_flags &= ~ILLF_ROUTER;
+ mutex_exit(&ill->ill_lock);
+
+ /*
+ * Blow away all multicast memberships that currently exist on `ill'.
+ * This may seem odd, but it's consistent with the application view
+ * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
+ */
+ if (ill->ill_isv6) {
+ reset_conn_ill(ill);
+ reset_mrt_ill(ill);
+ } else {
+ ipif = ill->ill_ipif;
+ for (; ipif != NULL; ipif = ipif->ipif_next) {
+ reset_conn_ipif(ipif);
+ reset_mrt_vif_ipif(ipif);
+ }
+ }
+ ip_purge_allmulti(ill);
+
+ /*
+ * Borrow the first ill's ill_phys_addr_length value for the illgrp's
+ * physical address length. All other ills must have the same value,
+ * since they are required to all be the same mactype. Also update
+ * the IPMP ill's MTU and CoS marking, if necessary.
+ */
+ if (list_is_empty(&illg->ig_if)) {
+ ASSERT(ipmp_ill->ill_phys_addr_length == 0);
+ /*
+ * NOTE: we leave ill_phys_addr NULL since the IPMP group
+ * doesn't have a physical address. This means that code must
+ * not assume that ill_phys_addr is non-NULL just because
+ * ill_phys_addr_length is non-zero. Likewise for ill_nd_lla.
+ */
+ ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
+ ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
+ ipmp_ill->ill_type = ill->ill_type;
+
+ if (ill->ill_flags & ILLF_COS_ENABLED) {
+ mutex_enter(&ipmp_ill->ill_lock);
+ ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
+ mutex_exit(&ipmp_ill->ill_lock);
+ }
+ ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu);
+ } else {
+ ASSERT(ipmp_ill->ill_phys_addr_length ==
+ ill->ill_phys_addr_length);
+ ASSERT(ipmp_ill->ill_type == ill->ill_type);
+
+ if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
+ mutex_enter(&ipmp_ill->ill_lock);
+ ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
+ mutex_exit(&ipmp_ill->ill_lock);
+ }
+ if (illg->ig_mtu > ill->ill_max_mtu)
+ ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu);
+ }
+
+ rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
+ list_insert_tail(&illg->ig_if, ill);
+ ill->ill_grp = illg;
+ rw_exit(&ipst->ips_ill_g_lock);
+
+ /*
+ * Hide the IREs on `ill' so that we don't accidentally find them when
+ * sending data traffic.
+ */
+ ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);
+
+ /*
+ * Merge any broadcast IREs, if need be.
+ */
+ if (!ill->ill_isv6)
+ ill_refresh_bcast(ill);
+
+ ipmp_ill_refresh_active(ill);
+}
+
+/*
+ * Remove `ill' from its illgrp, and rebalance the data addresses in that
+ * illgrp to be spread evenly across the remaining ills. Also, adjust the
+ * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
+ */
+void
+ipmp_ill_leave_illgrp(ill_t *ill)
+{
+ ill_t *ipmp_ill;
+ ipif_t *ipif;
+ ipmp_arpent_t *entp;
+ ipmp_illgrp_t *illg = ill->ill_grp;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(IS_UNDER_IPMP(ill));
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(illg != NULL);
+
+ ipmp_ill = illg->ig_ipmp_ill;
+
+ /*
+ * Cancel IPMP-specific ill timeouts.
+ */
+ (void) untimeout(ill->ill_refresh_tid);
+
+ /*
+ * Expose any previously-hidden IREs on `ill'.
+ */
+ ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);
+
+ /*
+ * Ensure the multicast state for each ipif on `ill' is down so that
+ * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
+ * all eligible groups.
+ */
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
+ if (ipif->ipif_flags & IPIF_UP)
+ ipif_multicast_down(ipif);
+
+ /*
+ * Account for `ill' leaving the illgrp.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ if (ill->ill_isv6)
+ ill->ill_phyint->phyint_grp->gr_nv6--;
+ else
+ ill->ill_phyint->phyint_grp->gr_nv4--;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ /*
+ * Pull `ill' out of the interface lists.
+ */
+ if (list_link_active(&ill->ill_actnode))
+ ipmp_ill_deactivate(ill);
+ rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
+ list_remove(&illg->ig_if, ill);
+ ill->ill_grp = NULL;
+ rw_exit(&ipst->ips_ill_g_lock);
+
+ /*
+ * Recreate any broadcast IREs that had been shared, if need be.
+ */
+ if (!ill->ill_isv6)
+ ill_refresh_bcast(ill);
+
+ /*
+ * Re-establish multicast memberships that were previously being
+ * handled by the IPMP meta-interface.
+ */
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
+ if (ipif->ipif_flags & IPIF_UP)
+ ipif_multicast_up(ipif);
+
+ /*
+ * Refresh the group MTU based on the new interface list.
+ */
+ ipmp_illgrp_refresh_mtu(illg);
+
+ if (list_is_empty(&illg->ig_if)) {
+ /*
+ * No ills left in the illgrp; we no longer have a physical
+ * address length, nor can we support ARP, CoS, or anything
+ * else that depends on knowing the link layer type.
+ */
+ while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
+ ipmp_illgrp_destroy_arpent(illg, entp);
+
+ ipmp_ill->ill_phys_addr_length = 0;
+ ipmp_ill->ill_nd_lla_len = 0;
+ ipmp_ill->ill_type = IFT_OTHER;
+ mutex_enter(&ipmp_ill->ill_lock);
+ ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
+ mutex_exit(&ipmp_ill->ill_lock);
+ } else {
+ /*
+ * If `ill' didn't support CoS, see if it can now be enabled.
+ */
+ if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
+ ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));
+
+ ill = list_head(&illg->ig_if);
+ do {
+ if (!(ill->ill_flags & ILLF_COS_ENABLED))
+ break;
+ } while ((ill = list_next(&illg->ig_if, ill)) != NULL);
+
+ if (ill == NULL) {
+ mutex_enter(&ipmp_ill->ill_lock);
+ ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
+ mutex_exit(&ipmp_ill->ill_lock);
+ }
+ }
+ }
+}
+
+/*
+ * Check if `ill' should be active, and activate or deactivate if need be.
+ * Return B_FALSE if a refresh was necessary but could not be performed.
+ */
+static boolean_t
+ipmp_ill_try_refresh_active(ill_t *ill)
+{
+ boolean_t refreshed = B_TRUE;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(IS_UNDER_IPMP(ill));
+
+ if (ipmp_ill_is_active(ill)) {
+ if (!list_link_active(&ill->ill_actnode))
+ refreshed = ipmp_ill_activate(ill);
+ } else {
+ if (list_link_active(&ill->ill_actnode))
+ ipmp_ill_deactivate(ill);
+ }
+
+ return (refreshed);
+}
+
+/*
+ * Check if `ill' should be active, and activate or deactivate if need be.
+ * If the refresh fails, schedule a timer to try again later.
+ */
+void
+ipmp_ill_refresh_active(ill_t *ill)
+{
+ if (!ipmp_ill_try_refresh_active(ill))
+ ipmp_ill_refresh_active_timer_start(ill);
+}
+
+/*
+ * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'.
+ */
+static void
+ipmp_ill_refresh_active_timer(void *ill_arg)
+{
+ ill_t *ill = ill_arg;
+ boolean_t refreshed = B_FALSE;
+
+ /*
+ * Clear ill_refresh_tid to indicate that no timeout is pending
+ * (another thread could schedule a new timeout while we're still
+ * running, but that's harmless). If the ill is going away, bail.
+ */
+ mutex_enter(&ill->ill_lock);
+ ill->ill_refresh_tid = 0;
+ if (ill->ill_state_flags & ILL_CONDEMNED) {
+ mutex_exit(&ill->ill_lock);
+ return;
+ }
+ mutex_exit(&ill->ill_lock);
+
+ if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
+ refreshed = ipmp_ill_try_refresh_active(ill);
+ ipsq_exit(ill->ill_phyint->phyint_ipsq);
+ }
+
+ /*
+ * If the refresh failed, schedule another attempt.
+ */
+ if (!refreshed)
+ ipmp_ill_refresh_active_timer_start(ill);
+}
+
+/*
+ * Schedule a timer to retry ipmp_ill_try_refresh_active() on `ill'.
+ */
+static void
+ipmp_ill_refresh_active_timer_start(ill_t *ill)
+{
+ mutex_enter(&ill->ill_lock);
+
+ /*
+ * If the ill is going away or a refresh is already scheduled, bail.
+ */
+ if (ill->ill_refresh_tid != 0 ||
+ (ill->ill_state_flags & ILL_CONDEMNED)) {
+ mutex_exit(&ill->ill_lock);
+ return;
+ }
+
+ ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill,
+ SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));
+
+ mutex_exit(&ill->ill_lock);
+}
+
+/*
+ * Activate `ill' so it will be used to send and receive data traffic. Return
+ * B_FALSE if `ill' cannot be activated. Note that we allocate any messages
+ * needed to deactivate `ill' here as well so that deactivation cannot fail.
+ */
+static boolean_t
+ipmp_ill_activate(ill_t *ill)
+{
+ ipif_t *ipif;
+ mblk_t *actmp = NULL, *deactmp = NULL;
+ mblk_t *linkupmp = NULL, *linkdownmp = NULL;
+ ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
+ const char *grifname = grp->gr_ifname;
+ ipmp_illgrp_t *illg = ill->ill_grp;
+ ill_t *maxill;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(IS_UNDER_IPMP(ill));
+
+ /*
+ * If this will be the first active interface in the group, allocate
+ * the link-up and link-down messages.
+ */
+ if (grp->gr_nactif == 0) {
+ linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
+ linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
+ if (linkupmp == NULL || linkdownmp == NULL)
+ goto fail;
+ }
+
+ /*
+ * For IPv4, allocate the activate/deactivate messages, and tell ARP.
+ */
+ if (!ill->ill_isv6) {
+ actmp = ill_arie_alloc(ill, grifname, &ipmp_aract_template);
+ deactmp = ill_arie_alloc(ill, grifname, &ipmp_ardeact_template);
+ if (actmp == NULL || deactmp == NULL)
+ goto fail;
+
+ ASSERT(ill->ill_ardeact_mp == NULL);
+ ill->ill_ardeact_mp = deactmp;
+ putnext(illg->ig_ipmp_ill->ill_rq, actmp);
+ }
+
+ if (list_is_empty(&illg->ig_actif)) {
+ /*
+ * Now that we have an active ill, nominate it for multicast
+ * and broadcast duties. Do this before ipmp_ill_bind_ipif()
+ * since that may need to send multicast packets (e.g., IPv6
+ * neighbor discovery probes).
+ */
+ ipmp_illgrp_set_cast(illg, ill);
+
+ /*
+ * This is the first active ill in the illgrp -- add 'em all.
+ * We can access/walk ig_ipmp_ill's ipif list since we're
+ * writer on its IPSQ as well.
+ */
+ ipif = illg->ig_ipmp_ill->ill_ipif;
+ for (; ipif != NULL; ipif = ipif->ipif_next)
+ if (ipmp_ipif_is_up_dataaddr(ipif))
+ ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
+ } else {
+ /*
+ * Redistribute the addresses by moving them from the ill with
+ * the most addresses until the ill being activated is at the
+ * same level as the rest of the ills.
+ */
+ for (;;) {
+ maxill = ipmp_illgrp_max_ill(illg);
+ ASSERT(maxill != NULL);
+ if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
+ break;
+ ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
+ ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
+ }
+
+ /*
+ * TODO: explore whether it's advantageous to flush IRE_CACHE
+ * bindings to force existing connections to be redistributed
+ * to the new ill.
+ */
+ }
+
+ /*
+ * Put the interface in the active list.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ list_insert_tail(&illg->ig_actif, ill);
+ illg->ig_nactif++;
+ illg->ig_next_ill = ill;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ /*
+ * Refresh ARP entries to use `ill', if need be.
+ */
+ if (!ill->ill_isv6)
+ ipmp_illgrp_refresh_arpent(illg);
+
+ /*
+ * Finally, mark the group link up, if necessary.
+ */
+ if (grp->gr_nactif++ == 0) {
+ ASSERT(grp->gr_linkdownmp == NULL);
+ grp->gr_linkdownmp = linkdownmp;
+ put(illg->ig_ipmp_ill->ill_rq, linkupmp);
+ }
+ return (B_TRUE);
+fail:
+ freemsg(actmp);
+ freemsg(deactmp);
+ freemsg(linkupmp);
+ freemsg(linkdownmp);
+ return (B_FALSE);
+}
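The redistribution loop in the second arm above maintains the invariant that each active ill carries within one data address of every other. A rough standalone model of that loop; member_t, most_loaded, and rebalance are hypothetical, and the real code unbinds and rebinds actual ipifs rather than adjusting counters:

    #include <stddef.h>

    typedef struct member {
        struct member *next;
        unsigned bound_cnt;     /* data addresses bound to this member */
    } member_t;

    static member_t *
    most_loaded(member_t *head)
    {
        member_t *m, *best = NULL;

        for (m = head; m != NULL; m = m->next)
            if (best == NULL || m->bound_cnt > best->bound_cnt)
                best = m;
        return (best);
    }

    /* Pull addresses off the most-loaded member until `newmem' catches up. */
    static void
    rebalance(member_t *head, member_t *newmem)
    {
        for (;;) {
            member_t *max = most_loaded(head);

            if (max == NULL || newmem->bound_cnt + 1 >= max->bound_cnt)
                break;
            max->bound_cnt--;       /* unbind one address here ... */
            newmem->bound_cnt++;    /* ... and rebind it to the new member */
        }
    }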
+
+/*
+ * Deactivate `ill' so it will not be used to send or receive data traffic.
+ */
+static void
+ipmp_ill_deactivate(ill_t *ill)
+{
+ ill_t *minill;
+ ipif_t *ipif, *ubnextipif, *ubheadipif = NULL;
+ mblk_t *mp;
+ ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
+ ipmp_illgrp_t *illg = ill->ill_grp;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(IS_UNDER_IPMP(ill));
+
+ /*
+ * Delete IRE_CACHE entries tied to this ill before they become stale.
+ */
+ ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
+ ill_stq_cache_delete, ill, ill);
+
+ /*
+ * Pull the interface out of the active list.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ list_remove(&illg->ig_actif, ill);
+ illg->ig_nactif--;
+ illg->ig_next_ill = list_head(&illg->ig_actif);
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ /*
+ * If the ill that's being deactivated had been nominated for
+ * multicast/broadcast, nominate a new one.
+ */
+ if (ill == illg->ig_cast_ill)
+ ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));
+
+ /*
+ * Unbind all of the ipifs bound to this ill, and save 'em in a list;
+ * we'll rebind them after we tell the resolver the ill is no longer
+ * active. We must do things in this order or the resolver could
+ * accidentally rebind to the ill we're trying to remove if multiple
+ * ills in the group have the same hardware address (which is
+ * unsupported, but shouldn't lead to a wedged machine).
+ */
+ while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
+ ipif->ipif_bound_next = ubheadipif;
+ ubheadipif = ipif;
+ }
+
+ if (!ill->ill_isv6) {
+ /*
+ * Tell ARP `ill' is no longer active in the group.
+ */
+ mp = ill->ill_ardeact_mp;
+ ill->ill_ardeact_mp = NULL;
+ ASSERT(mp != NULL);
+ putnext(illg->ig_ipmp_ill->ill_rq, mp);
+
+ /*
+ * Refresh any ARP entries that had been using `ill'.
+ */
+ ipmp_illgrp_refresh_arpent(illg);
+ }
+
+ /*
+ * Rebind each ipif from the deactivated ill to the active ill with
+ * the fewest ipifs. If there are no active ills, the ipifs will
+ * remain unbound.
+ */
+ for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
+ ubnextipif = ipif->ipif_bound_next;
+ ipif->ipif_bound_next = NULL;
+
+ if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
+ ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
+ }
+
+ /*
+ * Finally, mark the group link down, if necessary.
+ */
+ if (--grp->gr_nactif == 0) {
+ mp = grp->gr_linkdownmp;
+ grp->gr_linkdownmp = NULL;
+ ASSERT(mp != NULL);
+ put(illg->ig_ipmp_ill->ill_rq, mp);
+ }
+}
+
+/*
+ * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD)
+ * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners.
+ */
+static void
+ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
+{
+ ipif_t *ipif;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);
+
+ /*
+ * If `ill' is truly down, there are no messages to generate since:
+ *
+ * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface
+ * and its addresses by bringing them down. But that's already
+ * true, so there's nothing to hide.
+ *
+ * 2. If cmd == RTM_ADD, then we're supposed to generate messages
+ * indicating that any previously-hidden up addresses are again
+ * back up (along with the interface). But they aren't, so
+ * there's nothing to expose.
+ */
+ if (ill->ill_ipif_up_count == 0)
+ return;
+
+ if (cmd == RTM_ADD)
+ ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);
+
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
+ if (ipif->ipif_flags & IPIF_UP)
+ ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL);
+
+ if (cmd == RTM_DELETE)
+ ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
+}
+
+/*
+ * Bind the address named by `ipif' to the underlying ill named by `ill'.
+ * If `act' is Res_act_none, don't notify the resolver. Otherwise, `act'
+ * will indicate to the resolver whether this is an initial bringup of
+ * `ipif', or just a rebind to another ill.
+ */
+static void
+ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
+{
+ int err = 0;
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
+ ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
+ ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
+ ASSERT(ipif->ipif_bound_ill == NULL);
+ ASSERT(ipif->ipif_bound_next == NULL);
+
+ ipif->ipif_bound_next = ill->ill_bound_ipif;
+ ill->ill_bound_ipif = ipif;
+ ill->ill_bound_cnt++;
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ ipif->ipif_bound_ill = ill;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ /*
+ * If necessary, tell ARP/NDP about the new mapping. Note that
+ * ipif_resolver_up() cannot fail for non-XRESOLV IPv6 ills.
+ */
+ if (act != Res_act_none) {
+ if (ill->ill_isv6) {
+ VERIFY(ipif_resolver_up(ipif, act) == 0);
+ err = ipif_ndp_up(ipif, act == Res_act_initial);
+ } else {
+ err = ipif_resolver_up(ipif, act);
+ }
+
+ /*
+ * Since ipif_ndp_up() never returns EINPROGRESS and
+ * ipif_resolver_up() only returns EINPROGRESS when the
+ * associated ill is not up, we should never be here with
+ * EINPROGRESS. We rely on this to simplify the design.
+ */
+ ASSERT(err != EINPROGRESS);
+ }
+ /* TODO: retry binding on failure? when? */
+ ipif->ipif_bound = (err == 0);
+}
+
+/*
+ * Unbind the address named by `ipif' from the underlying ill named by `ill'.
+ * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned.
+ * If no ipifs are bound to `ill', NULL is returned. If `notifyres' is
+ * B_TRUE, notify the resolver about the change.
+ */
+static ipif_t *
+ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
+{
+ ill_t *ipmp_ill;
+ ipif_t *previpif;
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(IS_UNDER_IPMP(ill));
+
+ ipmp_ill = ill->ill_grp->ig_ipmp_ill;
+
+ /*
+ * If necessary, find an ipif to unbind.
+ */
+ if (ipif == NULL) {
+ if ((ipif = ill->ill_bound_ipif) == NULL) {
+ ASSERT(ill->ill_bound_cnt == 0);
+ return (NULL);
+ }
+ }
+
+ ASSERT(IAM_WRITER_IPIF(ipif));
+ ASSERT(IS_IPMP(ipif->ipif_ill));
+ ASSERT(ipif->ipif_bound_ill == ill);
+ ASSERT(ill->ill_bound_cnt > 0);
+
+ /*
+ * Unbind it.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ ipif->ipif_bound_ill = NULL;
+ rw_exit(&ipst->ips_ipmp_lock);
+ ill->ill_bound_cnt--;
+
+ if (ill->ill_bound_ipif == ipif) {
+ ill->ill_bound_ipif = ipif->ipif_bound_next;
+ } else {
+ previpif = ill->ill_bound_ipif;
+ while (previpif->ipif_bound_next != ipif)
+ previpif = previpif->ipif_bound_next;
+
+ previpif->ipif_bound_next = ipif->ipif_bound_next;
+ }
+ ipif->ipif_bound_next = NULL;
+
+ /*
+ * If requested, notify the resolvers (provided we're bound).
+ */
+ if (notifyres && ipif->ipif_bound) {
+ if (ill->ill_isv6) {
+ ipif_ndp_down(ipif);
+ } else {
+ ASSERT(ipif->ipif_arp_del_mp != NULL);
+ putnext(ipmp_ill->ill_rq, ipif->ipif_arp_del_mp);
+ ipif->ipif_arp_del_mp = NULL;
+ }
+ }
+ ipif->ipif_bound = B_FALSE;
+
+ return (ipif);
+}
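Stripped of the resolver notifications, the unbind above is removal from a singly linked list with a special case for the head. A minimal sketch of just that step; entry_t and unlink_entry are hypothetical names:

    #include <stddef.h>

    typedef struct entry {
        struct entry *next;
    } entry_t;

    /* Unlink `e' from the list rooted at `*headp'; `e' must be on the list. */
    static void
    unlink_entry(entry_t **headp, entry_t *e)
    {
        if (*headp == e) {
            *headp = e->next;
        } else {
            entry_t *prev = *headp;

            while (prev->next != e)
                prev = prev->next;
            prev->next = e->next;
        }
        e->next = NULL;
    }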
+
+/*
+ * Check if `ill' is active. Caller must hold ill_lock and phyint_lock if
+ * it's not inside the IPSQ. Since ipmp_ill_try_refresh_active() calls this
+ * to determine whether an ill should be considered active, other consumers
+ * may race and learn about an ill that should be deactivated/activated before
+ * IPMP has performed the activation/deactivation. This should be safe though
+ * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that
+ * would've been cleaned up by ipmp_ill_deactivate().
+ */
+boolean_t
+ipmp_ill_is_active(ill_t *ill)
+{
+ phyint_t *phyi = ill->ill_phyint;
+
+ ASSERT(IS_UNDER_IPMP(ill));
+ ASSERT(IAM_WRITER_ILL(ill) ||
+ (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));
+
+ /*
+ * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to
+ * set PHYI_FAILED whenever PHYI_RUNNING is cleared. This allows the
+ * link flapping logic to be just in in.mpathd and allows us to ignore
+ * changes to PHYI_RUNNING.
+ */
+ return (!(ill->ill_ipif_up_count == 0 ||
+ (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED))));
+}
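Restated, the predicate says a member may carry data only when it has at least one address up and none of the offline, inactive, or failed states are set. A hedged sketch with made-up flag names that mirror, but do not reproduce, the kernel's phyint flags:

    #include <stdbool.h>

    #define F_OFFLINE   0x1     /* hypothetical stand-ins for PHYI_* flags */
    #define F_INACTIVE  0x2
    #define F_FAILED    0x4

    static bool
    member_is_active(unsigned up_count, unsigned flags)
    {
        return (up_count != 0 &&
            (flags & (F_OFFLINE | F_INACTIVE | F_FAILED)) == 0);
    }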
+
+/*
+ * IRE walker callback: set IRE_MARK_TESTHIDDEN on cache/interface/offsubnet
+ * IREs with a source address on `ill_arg'.
+ */
+static void
+ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
+{
+ ill_t *ill = (ill_t *)ill_arg;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(!IS_IPMP(ill));
+
+ if (ire->ire_ipif->ipif_ill != ill)
+ return;
+
+ switch (ire->ire_type) {
+ case IRE_HOST:
+ case IRE_PREFIX:
+ case IRE_DEFAULT:
+ case IRE_CACHE:
+ case IRE_IF_RESOLVER:
+ case IRE_IF_NORESOLVER:
+ DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
+ ire->ire_marks |= IRE_MARK_TESTHIDDEN;
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * IRE walker callback: clear IRE_MARK_TESTHIDDEN if the IRE has a source
+ * address on `ill_arg'.
+ */
+static void
+ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
+{
+ ill_t *ill = (ill_t *)ill_arg;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(!IS_IPMP(ill));
+
+ if (ire->ire_ipif->ipif_ill == ill) {
+ DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
+ ire->ire_marks &= ~IRE_MARK_TESTHIDDEN;
+ }
+}
+
+/*
+ * Return a held pointer to the IPMP ill for underlying interface `ill', or
+ * NULL if one doesn't exist. (Unfortunately, this function needs to take an
+ * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
+ * ill_grp pointer may become stale when not under an IPSQ and not holding
+ * ipmp_lock.) Caller need not be inside the IPSQ.
+ */
+ill_t *
+ipmp_ill_hold_ipmp_ill(ill_t *ill)
+{
+ ip_stack_t *ipst = ill->ill_ipst;
+ ipmp_illgrp_t *illg;
+
+ ASSERT(!IS_IPMP(ill));
+
+ rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+ illg = ill->ill_grp;
+ if (illg != NULL && ILL_CAN_LOOKUP(illg->ig_ipmp_ill)) {
+ ill_refhold(illg->ig_ipmp_ill);
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (illg->ig_ipmp_ill);
+ }
+ /*
+ * Assume `ill' was removed from the illgrp in the meantime.
+ */
+ rw_exit(&ill->ill_ipst->ips_ipmp_lock);
+ return (NULL);
+}
+
+/*
+ * Return the interface index for the IPMP ill tied to underlying interface
+ * `ill', or zero if one doesn't exist. Caller need not be inside the IPSQ.
+ */
+uint_t
+ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
+{
+ uint_t ifindex = 0;
+ ip_stack_t *ipst = ill->ill_ipst;
+ ipmp_grp_t *grp;
+
+ ASSERT(!IS_IPMP(ill));
+
+ rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+ if ((grp = ill->ill_phyint->phyint_grp) != NULL)
+ ifindex = grp->gr_phyint->phyint_ifindex;
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (ifindex);
+}
+
+/*
+ * Place phyint `phyi' into IPMP group `grp'.
+ */
+void
+ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
+{
+ ill_t *ill;
+ ipsq_t *ipsq = phyi->phyint_ipsq;
+ ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq;
+ ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
+
+ ASSERT(IAM_WRITER_IPSQ(ipsq));
+ ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL);
+
+ /*
+ * Send routing socket messages indicating that the phyint's ills
+ * and ipifs vanished.
+ */
+ if (phyi->phyint_illv4 != NULL) {
+ ill = phyi->phyint_illv4;
+ ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
+ }
+
+ if (phyi->phyint_illv6 != NULL) {
+ ill = phyi->phyint_illv6;
+ ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
+ }
+
+ /*
+ * Snapshot the phyint's initial kstats as a baseline.
+ */
+ ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);
+
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+
+ phyi->phyint_grp = grp;
+ if (++grp->gr_nif == 1)
+ grp->gr_mactype = ill->ill_mactype;
+ else
+ ASSERT(grp->gr_mactype == ill->ill_mactype);
+
+ /*
+ * Now that we're in the group, request a switch to the group's xop
+ * when we ipsq_exit(). All future operations will be exclusive on
+ * the group xop until ipmp_phyint_leave_grp() is called.
+ */
+ ASSERT(ipsq->ipsq_swxop == NULL);
+ ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
+ ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;
+
+ rw_exit(&ipst->ips_ipmp_lock);
+}
+
+/*
+ * Remove phyint `phyi' from its current IPMP group.
+ */
+void
+ipmp_phyint_leave_grp(phyint_t *phyi)
+{
+ uint_t i;
+ ipsq_t *ipsq = phyi->phyint_ipsq;
+ ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
+ uint64_t phyi_kstats[IPMP_KSTAT_MAX];
+
+ ASSERT(IAM_WRITER_IPSQ(ipsq));
+
+ /*
+ * If any of the phyint's ills are still in an illgrp, kick 'em out.
+ */
+ if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
+ ipmp_ill_leave_illgrp(phyi->phyint_illv4);
+ if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
+ ipmp_ill_leave_illgrp(phyi->phyint_illv6);
+
+ /*
+ * Send routing socket messages indicating that the phyint's ills
+ * and ipifs have reappeared.
+ */
+ if (phyi->phyint_illv4 != NULL)
+ ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD);
+ if (phyi->phyint_illv6 != NULL)
+ ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD);
+
+ /*
+ * Calculate the phyint's cumulative kstats while it was in the group,
+ * and add that to the group's baseline.
+ */
+ ipmp_phyint_get_kstats(phyi, phyi_kstats);
+ for (i = 0; i < IPMP_KSTAT_MAX; i++) {
+ phyi_kstats[i] -= phyi->phyint_kstats0[i];
+ atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]);
+ }
+
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+
+ phyi->phyint_grp->gr_nif--;
+ phyi->phyint_grp = NULL;
+
+ /*
+ * As our final act in leaving the group, request a switch back to our
+ * IPSQ's own xop when we ipsq_exit().
+ */
+ ASSERT(ipsq->ipsq_swxop == NULL);
+ ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;
+
+ rw_exit(&ipst->ips_ipmp_lock);
+}
+
+/*
+ * Store the IPMP-related kstats for `phyi' into the array named by `kstats'.
+ * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements.
+ */
+static void
+ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
+{
+ uint_t i, j;
+ const char *name;
+ kstat_t *ksp;
+ kstat_named_t *kn;
+
+ bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
+
+ /*
+ * NOTE: ALL_ZONES here assumes that there's at most one link
+ * with a given name on a given system (safe for now).
+ */
+ ksp = kstat_hold_byname("link", 0, phyi->phyint_name, ALL_ZONES);
+ if (ksp == NULL)
+ return;
+
+ KSTAT_ENTER(ksp);
+
+ if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
+ /*
+ * Bring kstats up-to-date before recording.
+ */
+ (void) KSTAT_UPDATE(ksp, KSTAT_READ);
+
+ kn = KSTAT_NAMED_PTR(ksp);
+ for (i = 0; i < IPMP_KSTAT_MAX; i++) {
+ name = ipmp_kstats[i].name;
+ kstats[i] = 0;
+ for (j = 0; j < ksp->ks_ndata; j++) {
+ if (strcmp(kn[j].name, name) != 0)
+ continue;
+
+ switch (kn[j].data_type) {
+ case KSTAT_DATA_INT32:
+ case KSTAT_DATA_UINT32:
+ kstats[i] = kn[j].value.ui32;
+ break;
+#ifdef _LP64
+ case KSTAT_DATA_LONG:
+ case KSTAT_DATA_ULONG:
+ kstats[i] = kn[j].value.ul;
+ break;
+#endif
+ case KSTAT_DATA_INT64:
+ case KSTAT_DATA_UINT64:
+ kstats[i] = kn[j].value.ui64;
+ break;
+ }
+ break;
+ }
+ }
+ }
+
+ KSTAT_EXIT(ksp);
+ kstat_rele(ksp);
+}
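Conceptually, the loop above looks up each wanted statistic by name and widens whatever integer width it finds to 64 bits. A simplified standalone sketch; counter_t and counter_value are hypothetical, whereas the real code walks kstat_named_t entries under the kstat framework:

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical named counter with a 32- vs 64-bit representation. */
    typedef struct counter {
        const char *name;
        int is64;
        union {
            uint32_t u32;
            uint64_t u64;
        } v;
    } counter_t;

    /* Find `name' in `tbl' and widen it to 64 bits; 0 if not present. */
    static uint64_t
    counter_value(const counter_t *tbl, unsigned n, const char *name)
    {
        for (unsigned i = 0; i < n; i++) {
            if (strcmp(tbl[i].name, name) == 0)
                return (tbl[i].is64 ? tbl[i].v.u64 : (uint64_t)tbl[i].v.u32);
        }
        return (0);
    }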
+
+/*
+ * Refresh the active state of all ills on `phyi'.
+ */
+void
+ipmp_phyint_refresh_active(phyint_t *phyi)
+{
+ if (phyi->phyint_illv4 != NULL)
+ ipmp_ill_refresh_active(phyi->phyint_illv4);
+ if (phyi->phyint_illv6 != NULL)
+ ipmp_ill_refresh_active(phyi->phyint_illv6);
+}
+
+/*
+ * Return a held pointer to the underlying ill bound to `ipif', or NULL if one
+ * doesn't exist. Caller need not be inside the IPSQ.
+ */
+ill_t *
+ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
+{
+ ill_t *boundill;
+ ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+
+ ASSERT(IS_IPMP(ipif->ipif_ill));
+
+ rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+ boundill = ipif->ipif_bound_ill;
+ if (boundill != NULL && ILL_CAN_LOOKUP(boundill)) {
+ ill_refhold(boundill);
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (boundill);
+ }
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (NULL);
+}
+
+/*
+ * Return a pointer to the underlying ill bound to `ipif', or NULL if one
+ * doesn't exist. Caller must be inside the IPSQ.
+ */
+ill_t *
+ipmp_ipif_bound_ill(const ipif_t *ipif)
+{
+ ASSERT(IAM_WRITER_ILL(ipif->ipif_ill));
+ ASSERT(IS_IPMP(ipif->ipif_ill));
+
+ return (ipif->ipif_bound_ill);
+}
+
+/*
+ * Check if `ipif' is a "stub" (placeholder address not being used).
+ */
+boolean_t
+ipmp_ipif_is_stubaddr(const ipif_t *ipif)
+{
+ if (ipif->ipif_flags & IPIF_UP)
+ return (B_FALSE);
+ if (ipif->ipif_ill->ill_isv6)
+ return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
+ else
+ return (ipif->ipif_lcl_addr == INADDR_ANY);
+}
+
+/*
+ * Check if `ipif' is an IPMP data address.
+ */
+boolean_t
+ipmp_ipif_is_dataaddr(const ipif_t *ipif)
+{
+ if (ipif->ipif_flags & IPIF_NOFAILOVER)
+ return (B_FALSE);
+ if (ipif->ipif_ill->ill_isv6)
+ return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
+ else
+ return (ipif->ipif_lcl_addr != INADDR_ANY);
+}
+
+/*
+ * Check if `ipif' is an IPIF_UP IPMP data address.
+ */
+static boolean_t
+ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
+{
+ return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP));
+}
diff --git a/usr/src/uts/common/inet/ip/rts.c b/usr/src/uts/common/inet/ip/rts.c
index 4999f28d1e..2751b19993 100644
--- a/usr/src/uts/common/inet/ip/rts.c
+++ b/usr/src/uts/common/inet/ip/rts.c
@@ -561,7 +561,6 @@ rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
case SO_TYPE:
*i1 = SOCK_RAW;
break;
-
/*
* The following three items are available here,
* but are only meaningful to IP.
@@ -597,6 +596,15 @@ rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
return (-1);
}
break;
+ case SOL_ROUTE:
+ switch (name) {
+ case RT_AWARE:
+ mutex_enter(&connp->conn_lock);
+ *i1 = connp->conn_rtaware;
+ mutex_exit(&connp->conn_lock);
+ break;
+ }
+ break;
default:
return (-1);
}
@@ -701,6 +709,20 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
return (EINVAL);
}
break;
+ case SOL_ROUTE:
+ switch (name) {
+ case RT_AWARE:
+ if (!checkonly) {
+ mutex_enter(&connp->conn_lock);
+ connp->conn_rtaware = *i1;
+ mutex_exit(&connp->conn_lock);
+ }
+ break; /* goto sizeof (int) option return */
+ default:
+ *outlenp = 0;
+ return (EINVAL);
+ }
+ break;
default:
*outlenp = 0;
return (EINVAL);
diff --git a/usr/src/uts/common/inet/ip/rts_opt_data.c b/usr/src/uts/common/inet/ip/rts_opt_data.c
index bac0eabdc4..7397b53b9e 100644
--- a/usr/src/uts/common/inet/ip/rts_opt_data.c
+++ b/usr/src/uts/common/inet/ip/rts_opt_data.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -60,6 +60,7 @@ opdes_t rts_opt_arr[] = {
{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
+{ RT_AWARE, SOL_ROUTE, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
};
/*
diff --git a/usr/src/uts/common/inet/ip/spd.c b/usr/src/uts/common/inet/ip/spd.c
index f785d8a3f6..8a3aa86d60 100644
--- a/usr/src/uts/common/inet/ip/spd.c
+++ b/usr/src/uts/common/inet/ip/spd.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -3989,7 +3989,7 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h)
ipsec_out_t *io;
boolean_t v4;
mblk_t *mp;
- boolean_t secure, attach_if;
+ boolean_t secure;
uint_t ifindex;
ipsec_selector_t sel;
ipsec_action_t *reflect_action = NULL;
@@ -4012,7 +4012,6 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h)
} else if (!ii->ipsec_in_loopback)
reflect_action = ipsec_in_to_out_action(ii);
secure = ii->ipsec_in_secure;
- attach_if = ii->ipsec_in_attach_if;
ifindex = ii->ipsec_in_ill_index;
zoneid = ii->ipsec_in_zoneid;
ASSERT(zoneid != ALL_ZONES);
@@ -4057,7 +4056,6 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h)
io->ipsec_out_proc_begin = B_FALSE;
io->ipsec_out_secure = secure;
io->ipsec_out_v4 = v4;
- io->ipsec_out_attach_if = attach_if;
io->ipsec_out_ill_index = ifindex;
io->ipsec_out_zoneid = zoneid;
io->ipsec_out_ns = ns; /* No netstack_hold */
@@ -4549,7 +4547,6 @@ ipsec_out_to_in(mblk_t *ipsec_mp)
ii->ipsec_in_secure = B_TRUE;
ii->ipsec_in_v4 = v4;
ii->ipsec_in_icmp_loopback = icmp_loopback;
- ii->ipsec_in_attach_if = B_FALSE;
}
/*
diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h
index d463c3f6ee..ad331d5706 100644
--- a/usr/src/uts/common/inet/ip6.h
+++ b/usr/src/uts/common/inet/ip6.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -133,10 +133,8 @@ typedef struct ip6_info ip6i_t;
#define IP6I_RAW_CHECKSUM 0x10
/* Compute checksum and stuff in ip6i_checksum_off */
#define IP6I_VERIFY_SRC 0x20 /* Verify ip6_src. Used when IPV6_PKTINFO */
-#define IP6I_ATTACH_IF 0x40 /* Bind to no failover address or BOUND_PIF. */
-#define IP6I_DROP_IFDELAYED 0x80
- /* Drop the packet if delayed in ndp resolver */
-#define IP6I_ND_DELAYED 0x100 /* Packet was delayed in ndp resolver */
+#define IP6I_IPMP_PROBE 0x40 /* IPMP (in.mpathd) probe packet */
+ /* 0x80 - 0x100 available */
#define IP6I_DONTFRAG 0x200 /* Don't fragment this packet */
#define IP6I_HOPLIMIT 0x400 /* hoplimit has been set by the sender */
@@ -340,7 +338,7 @@ extern void icmp_time_exceeded_v6(queue_t *, mblk_t *, uint8_t,
extern void icmp_unreachable_v6(queue_t *, mblk_t *, uint8_t,
boolean_t, boolean_t, zoneid_t, ip_stack_t *);
extern void icmp_inbound_error_fanout_v6(queue_t *, mblk_t *, ip6_t *,
- icmp6_t *, ill_t *, boolean_t, zoneid_t);
+ icmp6_t *, ill_t *, ill_t *, boolean_t, zoneid_t);
extern boolean_t conn_wantpacket_v6(conn_t *, ill_t *, ip6_t *, int, zoneid_t);
extern mblk_t *ip_add_info_v6(mblk_t *, ill_t *, const in6_addr_t *);
extern in6addr_scope_t ip_addr_scope_v6(const in6_addr_t *);
@@ -382,7 +380,7 @@ extern int ip_multirt_apply_membership_v6(int (*fn)(conn_t *, boolean_t,
ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t,
const in6_addr_t *, mblk_t *);
extern void ip_newroute_ipif_v6(queue_t *, mblk_t *, ipif_t *,
- in6_addr_t, int, zoneid_t);
+ const in6_addr_t *, const in6_addr_t *, int, zoneid_t);
extern void ip_newroute_v6(queue_t *, mblk_t *, const in6_addr_t *,
const in6_addr_t *, ill_t *, zoneid_t, ip_stack_t *);
extern void *ip6_kstat_init(netstackid_t, ip6_stat_t *);
diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h
index c5982de059..094800197e 100644
--- a/usr/src/uts/common/inet/ip_if.h
+++ b/usr/src/uts/common/inet/ip_if.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -80,7 +80,7 @@ extern "C" {
*/
#define IFF_PHYINT_FLAGS (IFF_LOOPBACK|IFF_RUNNING|IFF_PROMISC| \
IFF_ALLMULTI|IFF_INTELLIGENT|IFF_MULTI_BCAST|IFF_FAILED|IFF_STANDBY| \
- IFF_INACTIVE|IFF_OFFLINE|IFF_VIRTUAL)
+ IFF_INACTIVE|IFF_OFFLINE|IFF_VIRTUAL|IFF_IPMP)
#define IFF_PHYINTINST_FLAGS (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP| \
IFF_MULTICAST|IFF_ROUTER|IFF_NONUD|IFF_NORTEXCH|IFF_IPV4|IFF_IPV6| \
@@ -91,11 +91,6 @@ extern "C" {
IFF_DEPRECATED|IFF_ADDRCONF|IFF_ANYCAST|IFF_NOFAILOVER| \
IFF_PREFERRED|IFF_TEMPORARY|IFF_FIXEDMTU|IFF_DUPLICATE)
-#define IPIF_REPL_CHECK(to_ipif, failback_cmd) \
- (((to_ipif)->ipif_replace_zero) || ((failback_cmd) && \
- !(to_ipif)->ipif_isv6 && !((to_ipif)->ipif_flags & IPIF_UP) && \
- (to_ipif)->ipif_lcl_addr == INADDR_ANY))
-
#define PHYI_LOOPBACK IFF_LOOPBACK /* is a loopback net */
#define PHYI_RUNNING IFF_RUNNING /* resources allocated */
#define PHYI_PROMISC IFF_PROMISC /* receive all packets */
@@ -107,6 +102,7 @@ extern "C" {
#define PHYI_INACTIVE IFF_INACTIVE /* Standby active or not ? */
#define PHYI_OFFLINE IFF_OFFLINE /* NIC has been offlined */
#define PHYI_VIRTUAL IFF_VIRTUAL /* Will not send or recv pkts */
+#define PHYI_IPMP IFF_IPMP /* IPMP meta-interface */
#define ILLF_DEBUG IFF_DEBUG /* turn on debugging */
#define ILLF_NOTRAILERS IFF_NOTRAILERS /* avoid use of trailers */
@@ -137,11 +133,6 @@ extern "C" {
#define IPIF_FIXEDMTU IFF_FIXEDMTU /* set with SIOCSLIFMTU */
#define IPIF_DUPLICATE IFF_DUPLICATE /* address is in use */
-/* Source selection values for ipif_select_source_v6 */
-#define RESTRICT_TO_NONE 0x0 /* No restriction in source selection */
-#define RESTRICT_TO_GROUP 0x1 /* Restrict to IPMP group */
-#define RESTRICT_TO_ILL 0x2 /* Restrict to ILL */
-
#ifdef DEBUG
#define ILL_MAC_PERIM_HELD(ill) ill_mac_perim_held(ill)
#else
@@ -151,24 +142,23 @@ extern "C" {
/* for ipif_resolver_up */
enum ip_resolver_action {
Res_act_initial, /* initial address establishment */
- Res_act_move, /* address move (IPMP, new DL addr) */
- Res_act_defend /* address defense */
+ Res_act_rebind, /* IPMP address rebind (new hwaddr) */
+ Res_act_defend, /* address defense */
+ Res_act_none /* do nothing */
};
-extern ill_t *illgrp_scheduler(ill_t *);
-extern mblk_t *ill_arp_alloc(ill_t *, uchar_t *, caddr_t);
-extern mblk_t *ipif_area_alloc(ipif_t *);
+extern mblk_t *ill_arp_alloc(ill_t *, const uchar_t *, caddr_t);
+extern mblk_t *ipif_area_alloc(ipif_t *, uint_t);
extern mblk_t *ipif_ared_alloc(ipif_t *);
extern mblk_t *ill_ared_alloc(ill_t *, ipaddr_t);
-extern void ill_dlpi_done(ill_t *, t_uscalar_t);
+extern mblk_t *ill_arie_alloc(ill_t *, const char *, const void *);
extern boolean_t ill_dlpi_pending(ill_t *, t_uscalar_t);
+extern void ill_dlpi_done(ill_t *, t_uscalar_t);
extern void ill_dlpi_send(ill_t *, mblk_t *);
extern void ill_dlpi_send_deferred(ill_t *);
extern void ill_capability_done(ill_t *);
extern mblk_t *ill_dlur_gen(uchar_t *, uint_t, t_uscalar_t, t_scalar_t);
-extern ill_t *ill_group_lookup_on_ifindex(uint_t, boolean_t, ip_stack_t *);
-extern ill_t *ill_group_lookup_on_name(char *, boolean_t, ip_stack_t *);
/* NOTE: Keep unmodified ill_lookup_on_ifindex for ipp for now */
extern ill_t *ill_lookup_on_ifindex_global_instance(uint_t, boolean_t,
queue_t *, mblk_t *, ipsq_func_t, int *);
@@ -180,6 +170,7 @@ extern ill_t *ill_lookup_on_name(char *, boolean_t,
extern uint_t ill_get_next_ifindex(uint_t, boolean_t, ip_stack_t *);
extern uint_t ill_get_ifindex_by_name(char *, ip_stack_t *);
extern void ill_ipif_cache_delete(ire_t *, char *);
+extern void ill_stq_cache_delete(ire_t *, char *);
extern void ill_delete(ill_t *);
extern void ill_delete_tail(ill_t *);
extern int ill_dl_phys(ill_t *, ipif_t *, mblk_t *, queue_t *);
@@ -193,9 +184,9 @@ extern void ill_frag_prune(ill_t *, uint_t);
extern void ill_frag_free_pkts(ill_t *, ipfb_t *, ipf_t *, int);
extern time_t ill_frag_timeout(ill_t *, time_t);
extern int ill_init(queue_t *, ill_t *);
-extern int ill_nominate_mcast_rcv(ill_group_t *);
-extern boolean_t ill_setdefaulttoken(ill_t *);
+extern void ill_refresh_bcast(ill_t *);
extern void ill_restart_dad(ill_t *, boolean_t);
+extern boolean_t ill_setdefaulttoken(ill_t *);
extern int ill_set_phys_addr(ill_t *, mblk_t *);
extern void ill_set_ndmp(ill_t *, mblk_t *, uint_t, uint_t);
@@ -222,11 +213,9 @@ extern void ill_capability_reset(ill_t *, boolean_t);
extern void ill_taskq_dispatch(ip_stack_t *);
extern void ill_mtu_change(ire_t *, char *);
-extern void ill_group_cleanup(ill_t *);
-extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *);
-extern boolean_t ill_is_probeonly(ill_t *);
-extern boolean_t ill_hook_event_create(ill_t *, lif_if_t, nic_event_t,
- nic_event_data_t, size_t);
+extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *);
+extern uint_t ill_appaddr_cnt(const ill_t *);
+extern uint_t ill_ptpaddr_cnt(const ill_t *);
extern void ip_loopback_cleanup(ip_stack_t *);
extern void ipif_get_name(const ipif_t *, char *, int);
@@ -239,6 +228,8 @@ extern ipif_t *ipif_lookup_addr_v6(const in6_addr_t *, ill_t *, zoneid_t,
queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *);
extern boolean_t ip_addr_exists_v6(const in6_addr_t *, zoneid_t,
ip_stack_t *);
+extern ipif_t *ipif_lookup_addr_exact_v6(const in6_addr_t *, ill_t *,
+ ip_stack_t *);
extern zoneid_t ipif_lookup_addr_zoneid(ipaddr_t, ill_t *, ip_stack_t *);
extern zoneid_t ipif_lookup_addr_zoneid_v6(const in6_addr_t *, ill_t *,
ip_stack_t *);
@@ -251,31 +242,30 @@ extern ipif_t *ipif_lookup_multicast(ip_stack_t *, zoneid_t, boolean_t);
extern ipif_t *ipif_lookup_remote(ill_t *, ipaddr_t, zoneid_t);
extern ipif_t *ipif_lookup_onlink_addr(ipaddr_t, zoneid_t, ip_stack_t *);
extern ipif_t *ipif_lookup_seqid(ill_t *, uint_t);
-extern boolean_t ipif_lookup_zoneid(ill_t *, zoneid_t, int,
- ipif_t **);
-extern boolean_t ipif_lookup_zoneid_group(ill_t *, zoneid_t, int,
- ipif_t **);
+extern boolean_t ipif_lookup_zoneid(ill_t *, zoneid_t, int, ipif_t **);
extern ipif_t *ipif_select_source(ill_t *, ipaddr_t, zoneid_t);
extern boolean_t ipif_usesrc_avail(ill_t *, zoneid_t);
extern void ipif_refhold(ipif_t *);
extern void ipif_refhold_locked(ipif_t *);
-extern void ipif_refrele(ipif_t *);
+extern void ipif_refrele(ipif_t *);
extern void ipif_all_down_tail(ipsq_t *, queue_t *, mblk_t *, void *);
+extern void ipif_resolver_down(ipif_t *);
extern int ipif_resolver_up(ipif_t *, enum ip_resolver_action);
extern int ipif_arp_setup_multicast(ipif_t *, mblk_t **);
extern int ipif_down(ipif_t *, queue_t *, mblk_t *);
extern void ipif_down_tail(ipif_t *);
+extern void ipif_multicast_down(ipif_t *);
extern void ipif_multicast_up(ipif_t *);
extern void ipif_ndp_down(ipif_t *);
-extern int ipif_ndp_up(ipif_t *);
+extern int ipif_ndp_up(ipif_t *, boolean_t);
extern int ipif_ndp_setup_multicast(ipif_t *, struct nce_s **);
extern int ipif_up_done(ipif_t *);
extern int ipif_up_done_v6(ipif_t *);
extern void ipif_up_notify(ipif_t *);
-extern void ipif_update_other_ipifs_v6(ipif_t *, ill_group_t *);
+extern void ipif_update_other_ipifs_v6(ipif_t *);
extern void ipif_recreate_interface_routes_v6(ipif_t *, ipif_t *);
extern void ill_update_source_selection(ill_t *);
-extern ipif_t *ipif_select_source_v6(ill_t *, const in6_addr_t *, uint_t,
+extern ipif_t *ipif_select_source_v6(ill_t *, const in6_addr_t *, boolean_t,
uint32_t, zoneid_t);
extern boolean_t ipif_cant_setlinklocal(ipif_t *);
extern int ipif_setlinklocal(ipif_t *);
@@ -284,11 +274,8 @@ extern ipif_t *ipif_lookup_on_ifindex(uint_t, boolean_t, zoneid_t, queue_t *,
mblk_t *, ipsq_func_t, int *, ip_stack_t *);
extern ipif_t *ipif_get_next_ipif(ipif_t *curr, ill_t *ill);
extern void ipif_ill_refrele_tail(ill_t *ill);
-extern void ipif_arp_down(ipif_t *ipif);
extern void ipif_mask_reply(ipif_t *);
-
-extern int illgrp_insert(ill_group_t **, ill_t *, char *, ill_group_t *,
- boolean_t);
+extern int ipif_up(ipif_t *, queue_t *, mblk_t *);
extern void ipsq_current_start(ipsq_t *, ipif_t *, int);
extern void ipsq_current_finish(ipsq_t *);
@@ -451,13 +438,13 @@ extern int ip_sioctl_tmyaddr(ipif_t *, sin_t *, queue_t *, mblk_t *,
extern int ip_sioctl_tunparam(ipif_t *, sin_t *, queue_t *, mblk_t *,
ip_ioctl_cmd_t *, void *);
+extern int ip_sioctl_get_binding(ipif_t *, sin_t *, queue_t *,
+ mblk_t *, ip_ioctl_cmd_t *, void *);
extern int ip_sioctl_groupname(ipif_t *, sin_t *, queue_t *,
mblk_t *, ip_ioctl_cmd_t *, void *);
extern int ip_sioctl_get_groupname(ipif_t *, sin_t *, queue_t *,
mblk_t *, ip_ioctl_cmd_t *, void *);
-extern int ip_sioctl_slifoindex(ipif_t *, sin_t *, queue_t *,
- mblk_t *, ip_ioctl_cmd_t *, void *);
-extern int ip_sioctl_get_oindex(ipif_t *, sin_t *, queue_t *,
+extern int ip_sioctl_groupinfo(ipif_t *, sin_t *, queue_t *,
mblk_t *, ip_ioctl_cmd_t *, void *);
extern int ip_sioctl_get_lifzone(ipif_t *, sin_t *, queue_t *,
@@ -473,15 +460,11 @@ extern int ip_sioctl_slifusesrc(ipif_t *, sin_t *, queue_t *,
mblk_t *, ip_ioctl_cmd_t *, void *);
extern int ip_sioctl_get_lifsrcof(ipif_t *, sin_t *, queue_t *,
mblk_t *, ip_ioctl_cmd_t *, void *);
-extern int ip_sioctl_set_ipmpfailback(ipif_t *, sin_t *, queue_t *,
- mblk_t *, ip_ioctl_cmd_t *, void *);
extern void ip_sioctl_copyin_resume(ipsq_t *, queue_t *, mblk_t *, void *);
extern void ip_sioctl_copyin_setup(queue_t *, mblk_t *);
-extern void ip_sioctl_iocack(queue_t *, mblk_t *);
+extern void ip_sioctl_iocack(ipsq_t *, queue_t *, mblk_t *, void *);
extern ip_ioctl_cmd_t *ip_sioctl_lookup(int);
-extern int ip_sioctl_move(ipif_t *, sin_t *, queue_t *, mblk_t *,
- ip_ioctl_cmd_t *, void *);
extern void conn_delete_ire(conn_t *, caddr_t);
diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h
index dae62ab499..369ba60005 100644
--- a/usr/src/uts/common/inet/ip_impl.h
+++ b/usr/src/uts/common/inet/ip_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -394,11 +394,9 @@ typedef struct ip_lso_info_s {
#define CONN_IS_LSO_MD_FASTPATH(connp) \
((connp)->conn_dontroute == 0 && /* SO_DONTROUTE */ \
!((connp)->conn_nexthop_set) && /* IP_NEXTHOP */ \
- (connp)->conn_nofailover_ill == NULL && /* IPIF_NOFAILOVER */ \
- (connp)->conn_outgoing_pill == NULL && /* IP{V6}_BOUND_PIF */ \
(connp)->conn_outgoing_ill == NULL) /* IP{V6}_BOUND_IF */
-/* Definitons for fragmenting IP packets using MDT. */
+/* Definitions for fragmenting IP packets using MDT. */
/*
* Smaller and private version of pdescinfo_t used specifically for IP,
diff --git a/usr/src/uts/common/inet/ip_ire.h b/usr/src/uts/common/inet/ip_ire.h
index 7accbbcfa3..0a9f8add85 100644
--- a/usr/src/uts/common/inet/ip_ire.h
+++ b/usr/src/uts/common/inet/ip_ire.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -86,31 +86,17 @@ extern "C" {
/* return the ire. No recursive */
/* lookup should be done. */
#define MATCH_IRE_IHANDLE 0x0200 /* Match IRE on ihandle */
-#define MATCH_IRE_MARK_HIDDEN 0x0400 /* Match IRE ire_marks with */
- /* IRE_MARK_HIDDEN. */
+#define MATCH_IRE_MARK_TESTHIDDEN 0x0400 /* Match IRE_MARK_TESTHIDDEN IREs */
+
/*
- * MATCH_IRE_ILL is used whenever we want to specifically match an IRE
- * whose ire_ipif->ipif_ill or (ill_t *)ire_stq->q_ptr matches a given
- * ill. When MATCH_IRE_ILL is used to locate an IRE_CACHE, it implies
- * that the packet will not be load balanced. This is normally used
- * by in.mpathd to send out failure detection probes.
- *
- * MATCH_IRE_ILL_GROUP is used whenever we are not specific about which
- * interface (ill) the packet should be sent out. This implies that the
- * packets will be subjected to load balancing and it might go out on
- * any interface in the group. When there is only interface in the group,
- * MATCH_IRE_ILL_GROUP becomes MATCH_IRE_ILL. Most of the code uses
- * MATCH_IRE_ILL_GROUP and MATCH_IRE_ILL is used in very few cases where
- * we want to disable load balancing.
- *
* MATCH_IRE_PARENT is used whenever we unconditionally want to get the
* parent IRE (sire) while recursively searching IREs for an offsubnet
* destination. With this flag, even if no IRE_CACHETABLE or IRE_INTERFACE
* is found to help resolving IRE_OFFSUBNET in lookup routines, the
* IRE_OFFSUBNET sire, if any, is returned to the caller.
*/
-#define MATCH_IRE_ILL_GROUP 0x0800 /* Match IRE on ill or the ill_group. */
-#define MATCH_IRE_ILL 0x1000 /* Match IRE on the ill only */
+/* UNUSED 0x0800 */
+#define MATCH_IRE_ILL 0x1000 /* Match IRE on the ill */
#define MATCH_IRE_PARENT 0x2000 /* Match parent ire, if any, */
/* even if ire is not matched. */
@@ -305,7 +291,7 @@ extern ire_t *ire_ihandle_lookup_onlink(ire_t *);
extern ire_t *ire_ihandle_lookup_offlink(ire_t *, ire_t *);
extern ire_t *ire_ihandle_lookup_offlink_v6(ire_t *, ire_t *);
-extern boolean_t ire_local_same_ill_group(ire_t *, ire_t *);
+extern boolean_t ire_local_same_lan(ire_t *, ire_t *);
extern boolean_t ire_local_ok_across_zones(ire_t *, zoneid_t, void *,
const struct ts_label_s *, ip_stack_t *);
@@ -354,7 +340,7 @@ extern ire_t *ipif_lookup_multi_ire_v6(ipif_t *, const in6_addr_t *);
extern ire_t *ire_get_next_bcast_ire(ire_t *, ire_t *);
extern ire_t *ire_get_next_default_ire(ire_t *, ire_t *);
-extern void ire_arpresolve(ire_t *, ill_t *);
+extern void ire_arpresolve(ire_t *);
extern void ire_freemblk(ire_t *);
extern boolean_t ire_match_args(ire_t *, ipaddr_t, ipaddr_t, ipaddr_t,
int, const ipif_t *, zoneid_t, uint32_t, const struct ts_label_s *, int,
diff --git a/usr/src/uts/common/inet/ip_multi.h b/usr/src/uts/common/inet/ip_multi.h
index a3f4282cc7..7dee133967 100644
--- a/usr/src/uts/common/inet/ip_multi.h
+++ b/usr/src/uts/common/inet/ip_multi.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -49,6 +49,15 @@ typedef enum {
} ilg_stat_t;
/*
+ * Flags shared via ips_mrt_flags, used by mcast_restart_timers_thread().
+ */
+typedef enum {
+ IP_MRT_STOP = 0x1, /* request to stop thread */
+ IP_MRT_DONE = 0x2, /* indication that thread is stopped */
+ IP_MRT_RUN = 0x4 /* request to restart timers */
+} ip_mrt_flags_t;
+
+/*
* Extern functions
*/
extern mblk_t *igmp_input(queue_t *, mblk_t *, ill_t *);
@@ -78,9 +87,7 @@ extern int ip_get_dlpi_mbcast(ill_t *, mblk_t *);
extern void ilm_free(ipif_t *);
extern ilm_t *ilm_lookup_ill(ill_t *, ipaddr_t, zoneid_t);
extern ilm_t *ilm_lookup_ill_v6(ill_t *, const in6_addr_t *,
- zoneid_t);
-extern ilm_t *ilm_lookup_ill_index_v6(ill_t *, const in6_addr_t *,
- int, zoneid_t);
+ boolean_t, zoneid_t);
extern ilm_t *ilm_lookup_ipif(ipif_t *, ipaddr_t);
extern int ilm_numentries_v6(ill_t *, const in6_addr_t *);
@@ -92,10 +99,10 @@ extern int ip_ll_send_enabmulti_req(ill_t *, const in6_addr_t *);
extern int ip_addmulti(ipaddr_t, ipif_t *, ilg_stat_t,
mcast_record_t, slist_t *);
-extern int ip_addmulti_v6(const in6_addr_t *, ill_t *, int,
+extern int ip_addmulti_v6(const in6_addr_t *, ill_t *,
zoneid_t, ilg_stat_t, mcast_record_t, slist_t *);
extern int ip_delmulti(ipaddr_t, ipif_t *, boolean_t, boolean_t);
-extern int ip_delmulti_v6(const in6_addr_t *, ill_t *, int,
+extern int ip_delmulti_v6(const in6_addr_t *, ill_t *,
zoneid_t, boolean_t, boolean_t);
extern int ill_join_allmulti(ill_t *);
extern void ill_leave_allmulti(ill_t *);
@@ -140,9 +147,11 @@ extern void reset_conn_ipif(ipif_t *);
extern void reset_conn_ill(ill_t *);
extern void reset_mrt_ill(ill_t *);
extern void reset_mrt_vif_ipif(ipif_t *);
-extern void igmp_start_timers(unsigned, ip_stack_t *);
-extern void mld_start_timers(unsigned, ip_stack_t *);
+extern void mcast_restart_timers_thread(ip_stack_t *);
extern void ilm_inactive(ilm_t *);
+extern ilm_t *ilm_walker_start(ilm_walker_t *, ill_t *);
+extern ilm_t *ilm_walker_step(ilm_walker_t *, ilm_t *);
+extern void ilm_walker_finish(ilm_walker_t *);
#endif /* _KERNEL */
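
The ilm_walker_start()/ilm_walker_step()/ilm_walker_finish() trio added above replaces the ILM_WALKER_HOLD/ILM_WALKER_RELE pattern (compare the udp_xmit() hunk later in this patch). A typical iteration would presumably look like the sketch below; ilm_walker_t is treated here as an opaque, caller-allocated cursor.

	ilm_walker_t	ilw;
	ilm_t		*ilm;

	/* walk every ilm visible on `ill' */
	for (ilm = ilm_walker_start(&ilw, ill); ilm != NULL;
	    ilm = ilm_walker_step(&ilw, ilm)) {
		/* examine ilm here */
	}
	ilm_walker_finish(&ilw);
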
diff --git a/usr/src/uts/common/inet/ip_ndp.h b/usr/src/uts/common/inet/ip_ndp.h
index 4dbb56a884..5eda155c0e 100644
--- a/usr/src/uts/common/inet/ip_ndp.h
+++ b/usr/src/uts/common/inet/ip_ndp.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _INET_IP_NDP_H
#define _INET_IP_NDP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/mutex.h>
#include <sys/stream.h>
#include <netinet/in.h>
@@ -318,7 +316,8 @@ extern nd_opt_hdr_t *ndp_get_option(nd_opt_hdr_t *, int, int);
extern void ndp_inactive(nce_t *);
extern void ndp_input(ill_t *, mblk_t *, mblk_t *);
extern boolean_t ndp_lookup_ipaddr(in_addr_t, netstack_t *);
-extern nce_t *ndp_lookup_v6(ill_t *, const in6_addr_t *, boolean_t);
+extern nce_t *ndp_lookup_v6(ill_t *, boolean_t, const in6_addr_t *,
+ boolean_t);
extern nce_t *ndp_lookup_v4(ill_t *, const in_addr_t *, boolean_t);
extern int ndp_mcastreq(ill_t *, const in6_addr_t *, uint32_t, uint32_t,
mblk_t *);
@@ -346,7 +345,7 @@ extern void nce_fastpath(nce_t *);
extern int ndp_add_v6(ill_t *, uchar_t *, const in6_addr_t *,
const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t,
nce_t **);
-extern int ndp_lookup_then_add_v6(ill_t *, uchar_t *,
+extern int ndp_lookup_then_add_v6(ill_t *, boolean_t, uchar_t *,
const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, uint32_t,
uint16_t, uint16_t, nce_t **);
extern int ndp_lookup_then_add_v4(ill_t *,
diff --git a/usr/src/uts/common/inet/ip_rts.h b/usr/src/uts/common/inet/ip_rts.h
index 70b33e0278..61bc451995 100644
--- a/usr/src/uts/common/inet/ip_rts.h
+++ b/usr/src/uts/common/inet/ip_rts.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -37,19 +37,28 @@ extern "C" {
*/
#define TSOL_RTSA_REQUEST_MAX 1 /* one per route destination */
+/*
+ * Flags for RTS queuing operations.
+ */
+#define RTSQ_UNDER_IPMP 0x01 /* send only on RTAW_UNDER_IPMP queues */
+#define RTSQ_NORMAL 0x02 /* send only on normal queues */
+#define RTSQ_ALL (RTSQ_UNDER_IPMP|RTSQ_NORMAL) /* send on all queues */
+#define RTSQ_DEFAULT 0x04 /* use standard filtering */
+
#ifdef _KERNEL
extern void ip_rts_change(int, ipaddr_t, ipaddr_t,
- ipaddr_t, ipaddr_t, ipaddr_t, int, int,
- int, ip_stack_t *);
+ ipaddr_t, ipaddr_t, ipaddr_t, int, int, int, ip_stack_t *);
extern void ip_rts_change_v6(int, const in6_addr_t *, const in6_addr_t *,
const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, int, int, int,
ip_stack_t *);
-extern void ip_rts_ifmsg(const ipif_t *);
+extern void ip_rts_ifmsg(const ipif_t *, uint_t);
-extern void ip_rts_newaddrmsg(int, int, const ipif_t *);
+extern void ip_rts_xifmsg(const ipif_t *, uint64_t, uint64_t, uint_t);
+
+extern void ip_rts_newaddrmsg(int, int, const ipif_t *, uint_t);
extern int ip_rts_request(queue_t *, mblk_t *, cred_t *);
@@ -70,9 +79,11 @@ extern void rts_fill_msg_v6(int, int, const in6_addr_t *,
extern size_t rts_header_msg_size(int);
-extern void rts_queue_input(mblk_t *, conn_t *, sa_family_t, ip_stack_t *);
+extern void rts_queue_input(mblk_t *, conn_t *, sa_family_t, uint_t,
+ ip_stack_t *);
extern int ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *, cred_t *);
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h
index 3c53e1a3d3..750378f587 100644
--- a/usr/src/uts/common/inet/ip_stack.h
+++ b/usr/src/uts/common/inet/ip_stack.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -33,6 +33,7 @@ extern "C" {
#include <sys/netstack.h>
#include <netinet/igmp_var.h>
+#include <sys/modhash.h>
#ifdef _KERNEL
#include <sys/list.h>
@@ -172,9 +173,6 @@ struct ip_stack {
krwlock_t ips_ill_g_usesrc_lock;
- struct ill_group *ips_illgrp_head_v4; /* Head of IPv4 ill groups */
- struct ill_group *ips_illgrp_head_v6; /* Head of IPv6 ill groups */
-
/* Taskq dispatcher for capability operations */
kmutex_t ips_capab_taskq_lock;
kcondvar_t ips_capab_taskq_cv;
@@ -204,7 +202,6 @@ struct ip_stack {
int ips_igmp_timer_scheduled_last;
int ips_igmp_deferred_next;
timeout_id_t ips_igmp_timeout_id;
- kthread_t *ips_igmp_timer_thread;
boolean_t ips_igmp_timer_setter_active;
/* Following protected by mld_timer_lock */
@@ -212,7 +209,6 @@ struct ip_stack {
int ips_mld_timer_scheduled_last;
int ips_mld_deferred_next;
timeout_id_t ips_mld_timeout_id;
- kthread_t *ips_mld_timer_thread;
boolean_t ips_mld_timer_setter_active;
/* Protected by igmp_slowtimeout_lock */
@@ -269,8 +265,6 @@ struct ip_stack {
int ips_ip_g_forward;
int ips_ipv6_forward;
- int ips_ipmp_hook_emulation; /* ndd variable */
-
time_t ips_ip_g_frag_timeout;
clock_t ips_ip_g_frag_timo_ms;
@@ -280,8 +274,6 @@ struct ip_stack {
clock_t ips_icmp_pkt_err_last;
/* Number of packets sent in burst */
uint_t ips_icmp_pkt_err_sent;
- /* Used by icmp_send_redirect_v6 for picking random src. */
- uint_t ips_icmp_redirect_v6_src_index;
/* Protected by ip_mi_lock */
void *ips_ip_g_head; /* Instance Data List Head */
@@ -356,8 +348,6 @@ struct ip_stack {
kstat_t *ips_loopback_ksp;
- uint_t ips_ipif_src_random;
-
struct idl_s *ips_conn_drain_list; /* Array of conn drain lists */
uint_t ips_conn_drain_list_cnt; /* Count of conn_drain_list */
int ips_conn_drain_list_index; /* Next drain_list */
@@ -375,15 +365,6 @@ struct ip_stack {
uint64_t ips_ipif_g_seqid;
union phyint_list_u *ips_phyint_g_list; /* start of phyint list */
- /*
- * Reflects value of FAILBACK variable in IPMP config file
- * /etc/default/mpathd. Default value is B_TRUE.
- * Set to B_FALSE if user disabled failback by configuring
- * "FAILBACK=no" in.mpathd uses SIOCSIPMPFAILBACK ioctl to pass this
- * information to kernel.
- */
- boolean_t ips_ipmp_enable_failback;
-
/* ip_neti.c */
hook_family_t ips_ipv4root;
hook_family_t ips_ipv6root;
@@ -427,12 +408,25 @@ struct ip_stack {
kcondvar_t ips_ipobs_cb_cv;
struct __ldi_ident *ips_ldi_ident;
+
+/* ipmp.c */
+ krwlock_t ips_ipmp_lock;
+ mod_hash_t *ips_ipmp_grp_hash;
+
+/* igmp.c */
+ /* multicast restart timers thread logic */
+ kmutex_t ips_mrt_lock;
+ uint_t ips_mrt_flags;
+ kcondvar_t ips_mrt_cv;
+ kcondvar_t ips_mrt_done_cv;
+ kthread_t *ips_mrt_thread;
};
typedef struct ip_stack ip_stack_t;
/* Finding an ip_stack_t */
#define CONNQ_TO_IPST(_q) (Q_TO_CONN(_q)->conn_netstack->netstack_ip)
#define ILLQ_TO_IPST(_q) (((ill_t *)(_q)->q_ptr)->ill_ipst)
+#define PHYINT_TO_IPST(phyi) ((phyi)->phyint_ipsq->ipsq_ipst)
#else /* _KERNEL */
typedef int ip_stack_t;
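
Together with the ip_mrt_flags_t values added in ip_multi.h, the ips_mrt_* fields above imply a simple request/acknowledge handshake between the IGMP/MLD timer code and mcast_restart_timers_thread(). A plausible shape of the two sides, sketched only from the fields and flag comments declared in this patch:

	/* requester: ask the per-stack thread to restart the multicast timers */
	mutex_enter(&ipst->ips_mrt_lock);
	ipst->ips_mrt_flags |= IP_MRT_RUN;
	cv_signal(&ipst->ips_mrt_cv);
	mutex_exit(&ipst->ips_mrt_lock);

	/* teardown: request a stop, then wait for the thread to acknowledge */
	mutex_enter(&ipst->ips_mrt_lock);
	ipst->ips_mrt_flags |= IP_MRT_STOP;
	cv_signal(&ipst->ips_mrt_cv);
	while (!(ipst->ips_mrt_flags & IP_MRT_DONE))
		cv_wait(&ipst->ips_mrt_done_cv, &ipst->ips_mrt_lock);
	mutex_exit(&ipst->ips_mrt_lock);
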
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h
index 5fb86a5262..d80123a977 100644
--- a/usr/src/uts/common/inet/ipclassifier.h
+++ b/usr/src/uts/common/inet/ipclassifier.h
@@ -249,7 +249,6 @@ struct conn_s {
squeue_t *conn_initial_sqp; /* Squeue at open time */
squeue_t *conn_final_sqp; /* Squeue after connect */
- ill_t *conn_nofailover_ill; /* Failover ill */
ill_t *conn_dhcpinit_ill; /* IP_DHCPINIT_IF */
ipsec_latch_t *conn_latch; /* latched state */
ill_t *conn_outgoing_ill; /* IP{,V6}_BOUND_IF */
@@ -295,7 +294,6 @@ struct conn_s {
uint_t conn_proto; /* SO_PROTOTYPE state */
ill_t *conn_incoming_ill; /* IP{,V6}_BOUND_IF */
- ill_t *conn_outgoing_pill; /* IP{,V6}_BOUND_PIF */
ill_t *conn_oper_pending_ill; /* pending shared ioctl */
ilg_t *conn_ilg; /* Group memberships */
@@ -307,9 +305,6 @@ struct conn_s {
struct ipif_s *conn_multicast_ipif; /* IP_MULTICAST_IF */
ill_t *conn_multicast_ill; /* IPV6_MULTICAST_IF */
- int conn_orig_bound_ifindex; /* BOUND_IF before MOVE */
- int conn_orig_multicast_ifindex;
- /* IPv6 MC IF before MOVE */
struct conn_s *conn_drain_next; /* Next conn in drain list */
struct conn_s *conn_drain_prev; /* Prev conn in drain list */
idl_t *conn_idl; /* Ptr to the drain list head */
@@ -322,7 +317,7 @@ struct conn_s {
uchar_t conn_broadcast_ttl; /* IP_BROADCAST_TTL */
#define conn_nexthop_v4 V4_PART_OF_V6(conn_nexthop_v6)
cred_t *conn_peercred; /* Peer credentials, if any */
-
+ int conn_rtaware; /* RT_AWARE sockopt value */
kcondvar_t conn_sq_cv; /* For non-STREAMS socket IO */
kthread_t *conn_sq_caller; /* Caller of squeue sync ops */
sock_upcalls_t *conn_upcalls; /* Upcalls to sockfs */
diff --git a/usr/src/uts/common/inet/ipnet/ipnet.c b/usr/src/uts/common/inet/ipnet/ipnet.c
index 577205f25a..e94af50424 100644
--- a/usr/src/uts/common/inet/ipnet/ipnet.c
+++ b/usr/src/uts/common/inet/ipnet/ipnet.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -229,16 +229,19 @@ ipnet_if_init(void)
int
_init(void)
{
- int ret;
+ int ret;
+ boolean_t netstack_registered = B_FALSE;
if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
return (ENODEV);
ipnet_minor_space = id_space_create("ipnet_minor_space",
IPNET_MINOR_MIN, MAXMIN32);
- netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
+
/*
* We call ddi_taskq_create() with nthread == 1 to ensure in-order
- * delivery of packets to clients.
+ * delivery of packets to clients. Note that we need to create the
+ * taskqs before calling netstack_register() since ipnet_stack_init()
+ * registers callbacks that use them.
+ * registers callbacks that use them.
*/
ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
@@ -247,6 +250,10 @@ _init(void)
ret = ENOMEM;
goto done;
}
+
+ netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
+ netstack_registered = B_TRUE;
+
if ((ret = ipnet_if_init()) == 0)
ret = mod_install(&modlinkage);
done:
@@ -255,7 +262,8 @@ done:
ddi_taskq_destroy(ipnet_taskq);
if (ipnet_nicevent_taskq != NULL)
ddi_taskq_destroy(ipnet_nicevent_taskq);
- netstack_unregister(NS_IPNET);
+ if (netstack_registered)
+ netstack_unregister(NS_IPNET);
id_space_destroy(ipnet_minor_space);
}
return (ret);
@@ -268,9 +276,10 @@ _fini(void)
if ((err = mod_remove(&modlinkage)) != 0)
return (err);
+
+ netstack_unregister(NS_IPNET);
ddi_taskq_destroy(ipnet_nicevent_taskq);
ddi_taskq_destroy(ipnet_taskq);
- netstack_unregister(NS_IPNET);
id_space_destroy(ipnet_minor_space);
return (0);
}
@@ -987,6 +996,7 @@ static boolean_t
ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src,
ipnet_addrp_t *dst)
{
+ boolean_t obsif;
uint64_t ifindex = ipnet->ipnet_if->if_index;
ipnet_addrtype_t srctype, dsttype;
@@ -994,6 +1004,13 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src,
dsttype = ipnet_get_addrtype(ipnet, dst);
/*
+ * If the packet's ifindex matches ours, or the packet's group ifindex
+ * matches ours, it's on the interface we're observing. (Thus,
+ * observing on the group ifindex matches all ifindexes in the group.)
+ */
+ obsif = (ihd->ihd_ifindex == ifindex || ihd->ihd_grifindex == ifindex);
+
+ /*
* Do not allow an ipnet stream to see packets that are not from or to
* its zone. The exception is when zones are using the shared stack
* model. In this case, streams in the global zone have visibility
@@ -1025,7 +1042,7 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src,
* have our source address (this allows us to see packets we send).
*/
if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
- if (ihd->ihd_ifindex == ifindex || srctype == IPNETADDR_MYADDR)
+ if (srctype == IPNETADDR_MYADDR || obsif)
return (B_TRUE);
}
@@ -1033,7 +1050,7 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src,
* We accept multicast and broadcast packets transmitted or received
* on the interface we're observing.
*/
- if (dsttype == IPNETADDR_MBCAST && ihd->ihd_ifindex == ifindex)
+ if (dsttype == IPNETADDR_MBCAST && obsif)
return (B_TRUE);
return (B_FALSE);
diff --git a/usr/src/uts/common/inet/ipsec_info.h b/usr/src/uts/common/inet/ipsec_info.h
index b014bdade0..0348e10b91 100644
--- a/usr/src/uts/common/inet/ipsec_info.h
+++ b/usr/src/uts/common/inet/ipsec_info.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _INET_IPSEC_INFO_H
#define _INET_IPSEC_INFO_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -114,12 +112,11 @@ typedef struct ipsec_in_s {
ipsec_in_decaps : 1, /* Was this packet decapsulated from */
/* a matching inner packet? */
- ipsec_in_attach_if : 1, /* Don't load spread this packet */
ipsec_in_accelerated : 1, /* hardware accelerated packet */
ipsec_in_icmp_loopback : 1, /* Looped-back ICMP packet, */
/* all should trust this. */
- ipsec_in_pad_bits : 24;
+ ipsec_in_pad_bits : 25;
int ipsec_in_ill_index; /* interface on which ipha_dst was */
/* configured when pkt was recv'd */
@@ -197,12 +194,11 @@ typedef struct ipsec_out_s {
ipsec_out_reserved : 1,
ipsec_out_v4 : 1,
- ipsec_out_attach_if : 1,
ipsec_out_unspec_src : 1, /* IPv6 ip6i_t info */
ipsec_out_reachable : 1, /* NDP reachability info */
ipsec_out_failed: 1,
-
ipsec_out_se_done: 1,
+
ipsec_out_esp_done: 1,
ipsec_out_ah_done: 1,
ipsec_out_need_policy: 1,
@@ -225,7 +221,7 @@ typedef struct ipsec_out_s {
*/
ipsec_out_icmp_loopback: 1,
ipsec_out_ip_nexthop : 1, /* IP_NEXTHOP option is set */
- ipsec_out_pad_bits : 12;
+ ipsec_out_pad_bits : 13;
cred_t *ipsec_out_cred;
uint32_t ipsec_out_capab_ill_index;
diff --git a/usr/src/uts/common/inet/mib2.h b/usr/src/uts/common/inet/mib2.h
index 5abfc06581..a467abaee9 100644
--- a/usr/src/uts/common/inet/mib2.h
+++ b/usr/src/uts/common/inet/mib2.h
@@ -17,9 +17,8 @@
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -27,8 +26,6 @@
#ifndef _INET_MIB2_H
#define _INET_MIB2_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <netinet/in.h> /* For in6_addr_t */
#include <sys/tsol/label.h> /* For brange_t */
#include <sys/tsol/label_macro.h> /* For brange_t */
@@ -65,9 +62,14 @@ extern "C" {
* #define OPTLEN(x) ((((x) + sizeof(long) - 1) / sizeof(long)) * sizeof(long))
* #define OPTVAL(opt) ((char *)(opt + 1))
*
- * For get requests (T_NEGOTIATE), any MIB2_xxx value can be used (only
+ * For get requests (T_CURRENT), any MIB2_xxx value can be used (only
* "get all" is supported, so all modules get a copy of the request to
- * return everything it knows. Recommend: Use MIB2_IP
+ * return everything it knows. In general, we use MIB2_IP. There is
+ * one exception: in general, IP will not report information related to
+ * IRE_MARK_TESTHIDDEN routes (e.g., in the MIB2_IP_ROUTE table).
+ * However, using the special value EXPER_IP_AND_TESTHIDDEN will cause
+ * all information to be reported. This special value should only be
+ * used by IPMP-aware low-level utilities (e.g. in.mpathd).
*
* IMPORTANT: some fields are grouped in a different structure than
* suggested by MIB-II, e.g., checksum error counts. The original MIB-2
@@ -79,7 +81,6 @@ extern "C" {
#define IPPROTO_MAX 256
#endif
-
#define MIB2_SYSTEM (IPPROTO_MAX+1)
#define MIB2_INTERFACES (IPPROTO_MAX+2)
#define MIB2_AT (IPPROTO_MAX+3)
@@ -108,12 +109,13 @@ extern "C" {
#define EXPER_IGMP (EXPER+1)
#define EXPER_DVMRP (EXPER+2)
#define EXPER_RAWIP (EXPER+3)
+#define EXPER_IP_AND_TESTHIDDEN (EXPER+4)
/*
* Define range of levels for experimental use
*/
#define EXPER_RANGE_START (EXPER+1)
-#define EXPER_RANGE_END (EXPER+3)
+#define EXPER_RANGE_END (EXPER+4)
#define BUMP_MIB(s, x) { \
extern void __dtrace_probe___mib_##x(int, void *); \
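
Per the comment added above, only IPMP-aware utilities such as in.mpathd should request EXPER_IP_AND_TESTHIDDEN, so that IRE_MARK_TESTHIDDEN routes are included in the MIB dump. Using the conventional mibget-style request (stream setup and T_OPTMGMT_ACK parsing omitted; the overall flow is an assumption modeled on how netstat fetches MIB data), the request differs from the ordinary case only in the opthdr level:

	struct strbuf	ctl;
	struct {
		struct T_optmgmt_req	req;
		struct opthdr		hdr;
	} tor;

	tor.req.PRIM_type = T_SVR4_OPTMGMT_REQ;
	tor.req.OPT_offset = sizeof (tor.req);
	tor.req.OPT_length = sizeof (tor.hdr);
	tor.req.MGMT_flags = T_CURRENT;
	tor.hdr.level = EXPER_IP_AND_TESTHIDDEN;	/* MIB2_IP for ordinary consumers */
	tor.hdr.name = 0;
	tor.hdr.len = 0;

	ctl.buf = (char *)&tor;
	ctl.len = sizeof (tor);
	(void) putmsg(sd, &ctl, NULL, 0);	/* sd: MIB stream (e.g. on /dev/arp) */
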
diff --git a/usr/src/uts/common/inet/sctp/sctp_addr.c b/usr/src/uts/common/inet/sctp/sctp_addr.c
index 1761396031..94cc8e8883 100644
--- a/usr/src/uts/common/inet/sctp/sctp_addr.c
+++ b/usr/src/uts/common/inet/sctp/sctp_addr.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/stream.h>
diff --git a/usr/src/uts/common/inet/sctp_ip.h b/usr/src/uts/common/inet/sctp_ip.h
index 16ab99abab..7b20d3fd2b 100644
--- a/usr/src/uts/common/inet/sctp_ip.h
+++ b/usr/src/uts/common/inet/sctp_ip.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _INET_SCTP_IP_H
#define _INET_SCTP_IP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 488f8ee021..68e0883222 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -31,7 +31,6 @@
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
-#include <sys/strsun.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/timod.h>
@@ -4683,18 +4682,10 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
/* ifindex must be already set */
ASSERT(ifindex != 0);
- if (ltcp->tcp_bound_if != 0) {
- /*
- * Set newtcp's bound_if equal to
- * listener's value. If ifindex is
- * not the same as ltcp->tcp_bound_if,
- * it must be a packet for the ipmp group
- * of interfaces
- */
+ if (ltcp->tcp_bound_if != 0)
tcp->tcp_bound_if = ltcp->tcp_bound_if;
- } else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
+ else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
tcp->tcp_bound_if = ifindex;
- }
tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary;
tcp->tcp_recvifindex = 0;
@@ -10716,9 +10707,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
ipp->ipp_fields |= IPPF_USE_MIN_MTU;
ipp->ipp_use_min_mtu = *i1;
break;
- case IPV6_BOUND_PIF:
- /* Handled at the IP level */
- return (-EINVAL);
case IPV6_SEC_OPT:
/*
* We should not allow policy setting after
@@ -18895,7 +18883,6 @@ tcp_zcopy_check(tcp_t *tcp)
connp->conn_dontroute == 0 &&
!connp->conn_nexthop_set &&
connp->conn_outgoing_ill == NULL &&
- connp->conn_nofailover_ill == NULL &&
do_tcpzcopy == 1) {
/*
* the checks above closely resemble the fast path checks
@@ -19139,7 +19126,6 @@ tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp)
ipaddr_t dst;
ire_t *ire;
ill_t *ill;
- conn_t *connp = tcp->tcp_connp;
mblk_t *ire_fp_mp;
tcp_stack_t *tcps = tcp->tcp_tcps;
@@ -19164,14 +19150,6 @@ tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp)
}
ill = ire_to_ill(ire);
- if (connp->conn_outgoing_ill != NULL) {
- ill_t *conn_outgoing_ill = NULL;
- /*
- * Choose a good ill in the group to send the packets on.
- */
- ire = conn_set_outgoing_ill(connp, ire, &conn_outgoing_ill);
- ill = ire_to_ill(ire);
- }
ASSERT(ill != NULL);
if (!tcp->tcp_ire_ill_check_done) {
diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c
index 15b5d04d61..8c8eee3b58 100644
--- a/usr/src/uts/common/inet/tcp/tcp_fusion.c
+++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -35,6 +35,7 @@
#include <inet/common.h>
#include <inet/optcom.h>
#include <inet/ip.h>
+#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
index d977c27e53..e2314f8104 100644
--- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -151,9 +151,6 @@ opdes_t tcp_opt_arr[] = {
{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT,
sizeof (in_addr_t), -1 /* not initialized */ },
-{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
- sizeof (int), 0 /* no ifindex */ },
-
{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
sizeof (int), 0 },
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index 7c9433caa0..1178315cb5 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -80,6 +80,7 @@
#include <inet/ipp_common.h>
#include <sys/squeue_impl.h>
#include <inet/ipnet.h>
+#include <sys/ethernet.h>
/*
* The ipsec_info.h header file is here since it has the definition for the
@@ -2141,7 +2142,6 @@ udp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
case MCAST_UNBLOCK_SOURCE:
case MCAST_JOIN_SOURCE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
- case IP_DONTFAILOVER_IF:
/* cannot "get" the value for these */
return (-1);
case IP_BOUND_IF:
@@ -3152,9 +3152,7 @@ udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
ipp->ipp_use_min_mtu = *i1;
break;
- case IPV6_BOUND_PIF:
case IPV6_SEC_OPT:
- case IPV6_DONTFAILOVER_IF:
case IPV6_SRC_PREFERENCES:
case IPV6_V6ONLY:
/* Handled at the IP level */
@@ -5351,7 +5349,6 @@ udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port,
if ((connp->conn_flags & IPCL_CHECK_POLICY) != 0 ||
CONN_OUTBOUND_POLICY_PRESENT(connp, ipss) ||
connp->conn_dontroute ||
- connp->conn_nofailover_ill != NULL ||
connp->conn_outgoing_ill != NULL || optinfo.ip_opt_flags != 0 ||
optinfo.ip_opt_ill_index != 0 ||
ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION ||
@@ -5419,8 +5416,7 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha)
ill_t *stq_ill = (ill_t *)ire->ire_stq->q_ptr;
ASSERT(ipif != NULL);
- if (stq_ill != ipif->ipif_ill && (stq_ill->ill_group == NULL ||
- stq_ill->ill_group != ipif->ipif_ill->ill_group))
+ if (!IS_ON_SAME_LAN(stq_ill, ipif->ipif_ill))
retry_caching = B_TRUE;
}
@@ -5444,7 +5440,7 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha)
ASSERT(ipif != NULL);
ire = ire_ctable_lookup(dst, 0, 0, ipif,
connp->conn_zoneid, MBLK_GETLABEL(mp),
- MATCH_IRE_ILL_GROUP, ipst);
+ MATCH_IRE_ILL, ipst);
} else {
ASSERT(ipif == NULL);
ire = ire_cache_lookup(dst, connp->conn_zoneid,
@@ -5622,12 +5618,7 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid)
}
if (CLASSD(dst)) {
- boolean_t ilm_exists;
-
- ILM_WALKER_HOLD(ill);
- ilm_exists = (ilm_lookup_ill(ill, dst, ALL_ZONES) != NULL);
- ILM_WALKER_RELE(ill);
- if (ilm_exists) {
+ if (ilm_lookup_ill(ill, dst, ALL_ZONES) != NULL) {
ip_multicast_loopback(q, ill, mp,
connp->conn_multicast_loop ? 0 :
IP_FF_NO_MCAST_LOOP, zoneid);
diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c
index 0ec5a2c45e..65729b82f1 100644
--- a/usr/src/uts/common/inet/udp/udp_opt_data.c
+++ b/usr/src/uts/common/inet/udp/udp_opt_data.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -132,9 +132,6 @@ opdes_t udp_opt_arr[] = {
{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
sizeof (int), 0 /* no ifindex */ },
-{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
- sizeof (struct in_addr), 0 /* not initialized */ },
-
{ IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT,
sizeof (int), 0 },
@@ -191,12 +188,6 @@ opdes_t udp_opt_arr[] = {
{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
sizeof (int), 0 /* no ifindex */ },
-{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
- sizeof (int), 0 /* no ifindex */ },
-
-{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
- sizeof (int), 0 /* no ifindex */ },
-
{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
sizeof (int), 0 },
diff --git a/usr/src/uts/common/inet/vni/vni.c b/usr/src/uts/common/inet/vni/vni.c
deleted file mode 100644
index a370a7b4be..0000000000
--- a/usr/src/uts/common/inet/vni/vni.c
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-
-#include "vni_impl.h"
-#include <sys/conf.h>
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-#include <sys/dlpi.h>
-#include <sys/stat.h>
-#include <sys/ethernet.h>
-#include <sys/strsun.h>
-#include <sys/stropts.h>
-
-static int vniopen(queue_t *, dev_t *, int, int, cred_t *);
-static int vniclose(queue_t *, int, cred_t *);
-static int vniwput(queue_t *, mblk_t *);
-static int vniattach(dev_info_t *, ddi_attach_cmd_t);
-static int vnidetach(dev_info_t *, ddi_detach_cmd_t);
-
-static struct module_info minfo = {
- VNIIDNUM, /* mi_idnum */
- VNINAME, /* mi_idname */
- VNIMINPSZ, /* mi_minpsz */
- VNIMAXPSZ, /* mi_maxpsz */
- VNIHIWAT, /* mi_hiwat */
- VNILOWAT /* mi_lowat */
-};
-
-static struct qinit vnirinit = {
- NULL, /* qi_putp */
- NULL, /* qi_srvp */
- vniopen, /* qi_qopen */
- vniclose, /* qi_qclose */
- NULL, /* qi_qadmin */
- &minfo, /* qi_minfo */
- NULL /* qi_mstat */
-};
-
-static struct qinit vniwinit = {
- vniwput, /* qi_putp */
- NULL, /* qi_srvp */
- NULL, /* qi_qopen */
- NULL, /* qi_qclose */
- NULL, /* qi_qadmin */
- &minfo, /* qi_minfo */
- NULL /* qi_mstat */
-};
-
-static struct streamtab vni_info = {
- &vnirinit, /* st_rdinit */
- &vniwinit, /* st_wrinit */
- NULL, /* st_muxrinit */
- NULL /* st_muxwrinit */
-};
-
-DDI_DEFINE_STREAM_OPS(vni_ops, nulldev, nulldev, vniattach, \
- vnidetach, nodev, nodev, VNIFLAGS, &vni_info, ddi_quiesce_not_supported);
-
-static struct modldrv modldrv = {
- &mod_driverops,
- "Virtual network interface",
- &vni_ops,
-};
-
-static struct modlinkage modlinkage = {
- MODREV_1, &modldrv, NULL
-};
-
-static vni_str_t *vni_strlist_head;
-
-/*
- * DL_INFO_ACK template for VNI pseudo interface.
- */
-static dl_info_ack_t dlvni_infoack = {
- DL_INFO_ACK, /* dl_primitive */
- 0, /* dl_max_sdu */
- 0, /* dl_min_sdu */
- 0, /* dl_addr_length */
- SUNW_DL_VNI, /* dl_mac_type */
- 0, /* dl_reserved */
- 0, /* dl_current_state */
- 0, /* dl_sap_length */
- DL_CLDLS, /* dl_service_mode */
- 0, /* dl_qos_length */
- 0, /* dl_qos_offset */
- 0, /* dl_range_length */
- 0, /* dl_range_offset */
- DL_STYLE2, /* dl_provider_style */
- 0, /* dl_addr_offset */
- DL_VERSION_2, /* dl_version */
- 0, /* dl_brdcst_addr_length */
- 0, /* dl_brdcst_addr_offset */
- 0 /* dl_growth */
-};
-
-int
-_init(void)
-{
- return (mod_install(&modlinkage));
-}
-
-int
-_fini(void)
-{
- return (mod_remove(&modlinkage));
-}
-
-int
-_info(struct modinfo *modinfop)
-{
- return (mod_info(&modlinkage, modinfop));
-}
-
-static int
-vniattach(dev_info_t *devi, ddi_attach_cmd_t cmd)
-{
- if (cmd != DDI_ATTACH) {
- cmn_err(CE_NOTE, "vniattach failure: cmd != DDI_ATTACH\n");
- return (DDI_FAILURE);
- }
-
- if (ddi_create_minor_node(devi, VNINAME, S_IFCHR,
- ddi_get_instance(devi), DDI_PSEUDO, CLONE_DEV) ==
- DDI_FAILURE) {
- ddi_remove_minor_node(devi, NULL);
- cmn_err(CE_NOTE, "vniattach failure: ddi_create_minor_node\n");
- return (DDI_FAILURE);
- }
-
- return (DDI_SUCCESS);
-}
-
-static int
-vnidetach(dev_info_t *devi, ddi_detach_cmd_t cmd)
-{
- if (cmd != DDI_DETACH)
- return (DDI_FAILURE);
-
- ddi_remove_minor_node(devi, NULL);
- return (DDI_SUCCESS);
-}
-
-/* ARGSUSED */
-static int
-vniopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
-{
- vni_str_t *stp, *prevstp;
- minor_t minordev = 0;
-
- if (sflag != CLONEOPEN)
- return (EINVAL);
-
- prevstp = NULL;
-
- for (stp = vni_strlist_head; stp != NULL; stp = stp->st_next) {
- if (minordev < stp->st_minor)
- break;
- minordev++;
- prevstp = stp;
- }
-
- stp = kmem_zalloc(sizeof (vni_str_t), KM_SLEEP);
-
- *devp = makedevice(getmajor(*devp), minordev);
-
- stp->st_minor = minordev;
- stp->st_state = DL_UNATTACHED;
- stp->st_next = NULL;
-
- q->q_ptr = stp;
- WR(q)->q_ptr = stp;
-
- if (prevstp != NULL) {
- stp->st_next = prevstp->st_next;
- prevstp->st_next = stp;
- } else {
- stp->st_next = vni_strlist_head;
- vni_strlist_head = stp;
- }
-
- qprocson(q);
- return (0);
-}
-
-/* ARGSUSED */
-static int
-vniclose(queue_t *q, int flag, cred_t *credp)
-{
- vni_str_t *stp, **prevstpp;
-
- qprocsoff(q);
- stp = (vni_str_t *)q->q_ptr;
- stp->st_state = DL_UNATTACHED;
-
- /* Unlink the per-stream entry from the list and free it */
- stp = vni_strlist_head;
- prevstpp = &vni_strlist_head;
-
- for (; stp != NULL; stp = stp->st_next) {
- if (stp == (vni_str_t *)q->q_ptr)
- break;
- prevstpp = &stp->st_next;
- }
-
- ASSERT(stp != NULL);
-
- *prevstpp = stp->st_next;
-
- kmem_free(stp, sizeof (vni_str_t));
-
- q->q_ptr = WR(q)->q_ptr = NULL;
- return (0);
-}
-
-static int
-vniwput(queue_t *q, mblk_t *mp)
-{
- union DL_primitives *dlp;
- vni_str_t *stp;
- dl_info_ack_t *dlip;
- t_scalar_t prim;
-
- stp = q->q_ptr;
-
- switch ((mp)->b_datap->db_type) {
- case M_PROTO:
- case M_PCPROTO:
- if (MBLKL(mp) < sizeof (t_scalar_t)) {
- dlerrorack(q, mp, DL_PRIM_INVAL, DL_UNSUPPORTED, 0);
- return (0);
- }
- dlp = (void *)mp->b_rptr;
- prim = dlp->dl_primitive;
- switch (prim) {
- case DL_ATTACH_REQ:
- if (MBLKL(mp) < DL_ATTACH_REQ_SIZE) {
- dlerrorack(q, mp, DL_ATTACH_REQ, DL_BADPRIM, 0);
- return (0);
- }
- if (stp->st_state != DL_UNATTACHED) {
- dlerrorack(q, mp, DL_ATTACH_REQ, DL_OUTSTATE,
- 0);
- return (0);
- }
- stp->st_ppa = dlp->attach_req.dl_ppa;
- stp->st_state = DL_UNBOUND;
- dlokack(q, mp, DL_ATTACH_REQ);
- break;
- case DL_BIND_REQ:
- if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
- dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
- return (0);
- }
- if (stp->st_state != DL_UNBOUND) {
- dlerrorack(q, mp, DL_BIND_REQ, DL_OUTSTATE, 0);
- return (0);
- }
- stp->st_state = DL_IDLE;
- dlbindack(q, mp, dlp->bind_req.dl_sap, NULL, 0, 0, 0);
- break;
- case DL_INFO_REQ:
- if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
- dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
- return (0);
- }
- if ((mp = mexchange(q, mp, sizeof (dl_info_ack_t),
- M_PCPROTO, DL_INFO_ACK)) == NULL) {
- return (0);
- }
- dlip = (void *)mp->b_rptr;
- *dlip = dlvni_infoack;
- dlip->dl_current_state = stp->st_state;
- qreply(q, mp);
- break;
- case DL_PHYS_ADDR_REQ:
- if (MBLKL(mp) < DL_PHYS_ADDR_REQ_SIZE) {
- dlerrorack(q, mp, DL_PHYS_ADDR_REQ, DL_BADPRIM,
- 0);
- return (0);
- }
- dlphysaddrack(q, mp, NULL, 0);
- break;
- case DL_UNBIND_REQ:
- if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
- dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
- return (0);
- }
- if (stp->st_state != DL_IDLE) {
- dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE,
- 0);
- return (0);
- }
- /* Nothing to flush. But DLPI spec says to; so do it */
- flushq(q, FLUSHALL);
- flushq(RD(q), FLUSHALL);
- stp->st_state = DL_UNBOUND;
- dlokack(q, mp, DL_UNBIND_REQ);
- break;
- case DL_DETACH_REQ:
- if (MBLKL(mp) < DL_DETACH_REQ_SIZE) {
- dlerrorack(q, mp, DL_DETACH_REQ, DL_BADPRIM, 0);
- return (0);
- }
- if (stp->st_state != DL_UNBOUND) {
- dlerrorack(q, mp, DL_DETACH_REQ, DL_OUTSTATE,
- 0);
- return (0);
- }
- stp->st_state = DL_UNATTACHED;
- dlokack(q, mp, DL_DETACH_REQ);
- break;
- default:
- dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
- }
- break;
- case M_IOCTL:
- /*
- * No ioctl's currently supported. Need to have the NAK since
- * ifconfig calls SIOCGTUNPARAM during the end of plumb
- */
- miocnak(q, mp, 0, EINVAL);
- break;
- case M_FLUSH:
- /* Really nothing to flush since no msgs enqueued */
- if (*mp->b_rptr & FLUSHR) {
- qreply(q, mp);
- } else {
- freemsg(mp);
- }
- break;
- default:
- freemsg(mp);
- break;
- }
- return (0);
-}
diff --git a/usr/src/uts/common/inet/vni/vni_impl.h b/usr/src/uts/common/inet/vni/vni_impl.h
deleted file mode 100644
index ffba1b08bf..0000000000
--- a/usr/src/uts/common/inet/vni/vni_impl.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _INET_VNI_IMPL_H
-#define _INET_VNI_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/modctl.h>
-#include <sys/stream.h>
-
-typedef struct vni_str {
- struct vni_str *st_next; /* next in list */
- t_uscalar_t st_state; /* DLPI state */
- minor_t st_minor; /* corresponding minor */
- uint32_t st_ppa; /* physical point of attachment */
-} vni_str_t;
-
-#define DL_MAXPRIM DL_GET_STATISTICS_ACK
-#define VNIIDNUM 0x2a84
-#define VNINAME "vni"
-#define VNIFLAGS (D_MP|D_MTPERMOD)
-#define VNIHIWAT 1024
-#define VNILOWAT 512
-#define VNIMINPSZ 0
-#define VNIMAXPSZ INFPSZ
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _INET_VNI_IMPL_H */