summaryrefslogtreecommitdiff
path: root/usr
diff options
context:
space:
mode:
Diffstat (limited to 'usr')
-rw-r--r--usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c16
-rw-r--r--usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c7
-rw-r--r--usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c206
-rw-r--r--usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h9
-rw-r--r--usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c9
-rw-r--r--usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c7
-rw-r--r--usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c22
-rw-r--r--usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c5
-rw-r--r--usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c263
-rw-r--r--usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile45
-rw-r--r--usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h14
-rw-r--r--usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c1013
-rw-r--r--usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c1210
-rw-r--r--usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c1331
-rw-r--r--usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h130
-rw-r--r--usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c236
-rw-r--r--usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c66
-rw-r--r--usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c183
-rw-r--r--usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h11
-rw-r--r--usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c10
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/Makefile18
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c717
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile5
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h8
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c1265
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h5
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c27
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h13
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c35
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/Makefile48
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.c1498
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.xcl106
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types10
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c53
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c4
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c76
-rw-r--r--usr/src/cmd/devfsadm/misc_link.c4
-rw-r--r--usr/src/cmd/mdb/common/modules/ip/ip.c7
-rw-r--r--usr/src/cmd/rcm_daemon/Makefile.com4
-rw-r--r--usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c14
-rw-r--r--usr/src/cmd/rcm_daemon/common/ip_rcm.c1798
-rw-r--r--usr/src/cmd/svc/milestone/net-init13
-rw-r--r--usr/src/cmd/svc/milestone/net-loopback11
-rw-r--r--usr/src/cmd/svc/milestone/net-physical147
-rw-r--r--usr/src/cmd/svc/shell/net_include.sh510
-rw-r--r--usr/src/cmd/truss/codes.c9
-rw-r--r--usr/src/cmd/truss/print.c22
-rw-r--r--usr/src/cmd/zoneadmd/vplat.c26
-rw-r--r--usr/src/lib/brand/native/zone/platform.xml3
-rw-r--r--usr/src/lib/brand/sn1/zone/platform.xml3
-rw-r--r--usr/src/lib/libbsm/common/adt.c4
-rw-r--r--usr/src/lib/libdlpi/common/libdlpi.c21
-rw-r--r--usr/src/lib/libinetcfg/common/inetcfg.c38
-rw-r--r--usr/src/lib/libinetutil/Makefile.com14
-rw-r--r--usr/src/lib/libinetutil/common/ifaddrlist.c68
-rw-r--r--usr/src/lib/libinetutil/common/ifaddrlistx.c168
-rw-r--r--usr/src/lib/libinetutil/common/inetutil.c (renamed from usr/src/lib/libinetutil/common/inetutil4.c)36
-rw-r--r--usr/src/lib/libinetutil/common/libinetutil.h55
-rw-r--r--usr/src/lib/libinetutil/common/mapfile-vers7
-rw-r--r--usr/src/lib/libipmp/Makefile6
-rw-r--r--usr/src/lib/libipmp/Makefile.com9
-rw-r--r--usr/src/lib/libipmp/common/ipmp.c20
-rw-r--r--usr/src/lib/libipmp/common/ipmp.h12
-rw-r--r--usr/src/lib/libipmp/common/ipmp_admin.c104
-rw-r--r--usr/src/lib/libipmp/common/ipmp_admin.h50
-rw-r--r--usr/src/lib/libipmp/common/ipmp_mpathd.c26
-rw-r--r--usr/src/lib/libipmp/common/ipmp_mpathd.h81
-rw-r--r--usr/src/lib/libipmp/common/ipmp_query.c498
-rw-r--r--usr/src/lib/libipmp/common/ipmp_query.h112
-rw-r--r--usr/src/lib/libipmp/common/ipmp_query_impl.h41
-rw-r--r--usr/src/lib/libipmp/common/llib-lipmp10
-rw-r--r--usr/src/lib/libipmp/common/mapfile-vers13
-rw-r--r--usr/src/lib/libnsl/nss/netdir_inet_sundry.c92
-rw-r--r--usr/src/lib/libsocket/inet/interface_id.c23
-rw-r--r--usr/src/lib/smbsrv/libsmbns/common/smbns_dyndns.c4
-rw-r--r--usr/src/pkgdefs/SUNWarc/prototype_com4
-rw-r--r--usr/src/pkgdefs/SUNWarcr/prototype_com4
-rw-r--r--usr/src/pkgdefs/SUNWckr/prototype_com4
-rw-r--r--usr/src/pkgdefs/SUNWckr/prototype_i3864
-rw-r--r--usr/src/pkgdefs/SUNWckr/prototype_sparc4
-rw-r--r--usr/src/pkgdefs/SUNWcsd/postinstall3
-rw-r--r--usr/src/pkgdefs/SUNWcsl/prototype_com4
-rw-r--r--usr/src/pkgdefs/SUNWcslr/prototype_com4
-rw-r--r--usr/src/pkgdefs/SUNWcsr/prototype_com6
-rw-r--r--usr/src/pkgdefs/SUNWcsu/prototype_com5
-rw-r--r--usr/src/pkgdefs/SUNWhea/prototype_com3
-rw-r--r--usr/src/tools/scripts/bfu.sh8
-rw-r--r--usr/src/uts/common/Makefile.files6
-rw-r--r--usr/src/uts/common/Makefile.rules11
-rw-r--r--usr/src/uts/common/inet/arp.h13
-rw-r--r--usr/src/uts/common/inet/arp/arp.c607
-rw-r--r--usr/src/uts/common/inet/arp_impl.h7
-rw-r--r--usr/src/uts/common/inet/dlpistub/dlpistub.c370
-rw-r--r--usr/src/uts/common/inet/dlpistub/dlpistub.conf (renamed from usr/src/uts/common/inet/vni/vni.conf)12
-rw-r--r--usr/src/uts/common/inet/dlpistub/dlpistub_impl.h49
-rw-r--r--usr/src/uts/common/inet/ip.h688
-rw-r--r--usr/src/uts/common/inet/ip/icmp.c4
-rw-r--r--usr/src/uts/common/inet/ip/icmp_opt_data.c11
-rw-r--r--usr/src/uts/common/inet/ip/igmp.c246
-rw-r--r--usr/src/uts/common/inet/ip/ip.c2367
-rw-r--r--usr/src/uts/common/inet/ip/ip6.c1170
-rw-r--r--usr/src/uts/common/inet/ip/ip6_if.c535
-rw-r--r--usr/src/uts/common/inet/ip/ip6_ire.c206
-rw-r--r--usr/src/uts/common/inet/ip/ip6_rts.c6
-rw-r--r--usr/src/uts/common/inet/ip/ip_ftable.c136
-rw-r--r--usr/src/uts/common/inet/ip/ip_if.c8887
-rw-r--r--usr/src/uts/common/inet/ip/ip_ire.c548
-rw-r--r--usr/src/uts/common/inet/ip/ip_mroute.c21
-rw-r--r--usr/src/uts/common/inet/ip/ip_multi.c787
-rw-r--r--usr/src/uts/common/inet/ip/ip_ndp.c1040
-rw-r--r--usr/src/uts/common/inet/ip/ip_netinfo.c132
-rw-r--r--usr/src/uts/common/inet/ip/ip_opt_data.c11
-rw-r--r--usr/src/uts/common/inet/ip/ip_rts.c134
-rw-r--r--usr/src/uts/common/inet/ip/ipclassifier.c5
-rw-r--r--usr/src/uts/common/inet/ip/ipmp.c2201
-rw-r--r--usr/src/uts/common/inet/ip/rts.c24
-rw-r--r--usr/src/uts/common/inet/ip/rts_opt_data.c3
-rw-r--r--usr/src/uts/common/inet/ip/spd.c7
-rw-r--r--usr/src/uts/common/inet/ip6.h12
-rw-r--r--usr/src/uts/common/inet/ip_if.h77
-rw-r--r--usr/src/uts/common/inet/ip_impl.h6
-rw-r--r--usr/src/uts/common/inet/ip_ire.h28
-rw-r--r--usr/src/uts/common/inet/ip_multi.h25
-rw-r--r--usr/src/uts/common/inet/ip_ndp.h9
-rw-r--r--usr/src/uts/common/inet/ip_rts.h23
-rw-r--r--usr/src/uts/common/inet/ip_stack.h36
-rw-r--r--usr/src/uts/common/inet/ipclassifier.h7
-rw-r--r--usr/src/uts/common/inet/ipnet/ipnet.c33
-rw-r--r--usr/src/uts/common/inet/ipsec_info.h12
-rw-r--r--usr/src/uts/common/inet/mib2.h20
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_addr.c4
-rw-r--r--usr/src/uts/common/inet/sctp_ip.h4
-rw-r--r--usr/src/uts/common/inet/tcp/tcp.c26
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_fusion.c3
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_opt_data.c5
-rw-r--r--usr/src/uts/common/inet/udp/udp.c17
-rw-r--r--usr/src/uts/common/inet/udp/udp_opt_data.c11
-rw-r--r--usr/src/uts/common/inet/vni/vni.c359
-rw-r--r--usr/src/uts/common/inet/vni/vni_impl.h59
-rw-r--r--usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c301
-rw-r--r--usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c139
-rw-r--r--usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c108
-rw-r--r--usr/src/uts/common/ipp/ipgpc/classifier-objects.h19
-rw-r--r--usr/src/uts/common/ipp/ipgpc/classifier.c72
-rw-r--r--usr/src/uts/common/ipp/ipgpc/classifier.h11
-rw-r--r--usr/src/uts/common/ipp/ipgpc/classifierddi.c40
-rw-r--r--usr/src/uts/common/ipp/ipgpc/filters.c79
-rw-r--r--usr/src/uts/common/ipp/ipgpc/ipgpc.h10
-rw-r--r--usr/src/uts/common/net/if.h52
-rw-r--r--usr/src/uts/common/net/route.h15
-rw-r--r--usr/src/uts/common/netinet/in.h21
-rw-r--r--usr/src/uts/common/rpc/rpcib.c392
-rw-r--r--usr/src/uts/common/sys/dlpi.h7
-rw-r--r--usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h14
-rw-r--r--usr/src/uts/common/sys/socket.h9
-rw-r--r--usr/src/uts/common/sys/sockio.h25
-rw-r--r--usr/src/uts/common/sys/sysevent/eventdefs.h4
-rw-r--r--usr/src/uts/common/sys/sysevent/ipmp.h102
-rw-r--r--usr/src/uts/intel/Makefile.intel.shared2
-rw-r--r--usr/src/uts/intel/dlpistub/Makefile (renamed from usr/src/uts/intel/vni/Makefile)21
-rw-r--r--usr/src/uts/intel/ip/ip.global-objs.debug645
-rw-r--r--usr/src/uts/intel/ip/ip.global-objs.obj645
-rw-r--r--usr/src/uts/intel/os/name_to_major2
-rw-r--r--usr/src/uts/sparc/Makefile.sparc.shared5
-rw-r--r--usr/src/uts/sparc/dlpistub/Makefile (renamed from usr/src/uts/sparc/vni/Makefile)21
-rw-r--r--usr/src/uts/sparc/ip/ip.global-objs.debug645
-rw-r--r--usr/src/uts/sparc/ip/ip.global-objs.obj645
-rw-r--r--usr/src/uts/sparc/os/name_to_major2
168 files changed, 17264 insertions, 18066 deletions
diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c
index 34bb772632..5a4779cfa5 100644
--- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c
+++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -133,6 +133,7 @@ main(int argc, char **argv)
boolean_t is_verbose;
int ipc_fd;
int c;
+ int aware = RTAW_UNDER_IPMP;
struct rlimit rl;
debug_level = df_get_int("", B_FALSE, DF_DEBUG_LEVEL);
@@ -301,6 +302,17 @@ main(int argc, char **argv)
dhcpmsg(MSG_ERR, "cannot open routing socket");
return (EXIT_FAILURE);
}
+
+ /*
+ * We're IPMP-aware and can manage IPMP test addresses, so issue
+ * RT_AWARE to get routing socket messages for interfaces under IPMP.
+ */
+ if (setsockopt(rtsock_fd, SOL_ROUTE, RT_AWARE, &aware,
+ sizeof (aware)) == -1) {
+ dhcpmsg(MSG_ERR, "cannot set RT_AWARE on routing socket");
+ return (EXIT_FAILURE);
+ }
+
if (iu_register_event(eh, rtsock_fd, POLLIN, rtsock_event, 0) == -1) {
dhcpmsg(MSG_ERR, "cannot register routing socket for messages");
return (EXIT_FAILURE);
@@ -1182,7 +1194,7 @@ check_lif(dhcp_lif_t *lif, const struct ifa_msghdr *ifam, int msglen)
lif->lif_name);
lif_mark_decline(lif, "duplicate address");
close_ip_lif(lif);
- (void) open_ip_lif(lif, INADDR_ANY);
+ (void) open_ip_lif(lif, INADDR_ANY, B_TRUE);
}
dad_wait = lif->lif_dad_wait;
diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c
index 4637ecc346..6cfce9f0a9 100644
--- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c
+++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c
@@ -19,14 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* BOUND state of the DHCP client state machine.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/socket.h>
#include <sys/types.h>
#include <string.h>
@@ -358,7 +356,8 @@ dhcp_bound_complete(dhcp_smach_t *dsmp)
lif = dsmp->dsm_lif;
if (router_list != NULL &&
(router_list->len % sizeof (ipaddr_t)) == 0 &&
- strchr(lif->lif_name, ':') == NULL) {
+ strchr(lif->lif_name, ':') == NULL &&
+ !lif->lif_pif->pif_under_ipmp) {
dsmp->dsm_nrouters = router_list->len / sizeof (ipaddr_t);
dsmp->dsm_routers = malloc(router_list->len);
diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c
index 0cfdad40e3..5d2d5fb99e 100644
--- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c
+++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -76,6 +76,7 @@ insert_pif(const char *pname, boolean_t isv6, int *error)
{
dhcp_pif_t *pif;
struct lifreq lifr;
+ lifgroupinfo_t lifgr;
dlpi_handle_t dh = NULL;
int fd = isv6 ? v6_sock_fd : v4_sock_fd;
@@ -127,12 +128,60 @@ insert_pif(const char *pname, boolean_t isv6, int *error)
}
/*
- * For IPv4, use DLPI to determine the hardware type, hardware
- * address, and hardware address length.
+ * Check if the pif is in an IPMP group. Interfaces using IPMP don't
+ * have dedicated hardware addresses, and get their hardware type from
+ * the SIOCGLIFGROUPINFO ioctl rather than DLPI.
*/
- if (!isv6) {
- int rc;
- dlpi_info_t dlinfo;
+ if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) == -1) {
+ *error = DHCP_IPC_E_INT;
+ dhcpmsg(MSG_ERR, "insert_pif: SIOCGLIFGROUPNAME for %s", pname);
+ goto failure;
+ }
+
+ if (lifr.lifr_groupname[0] != '\0') {
+ (void) strlcpy(lifgr.gi_grname, lifr.lifr_groupname,
+ LIFGRNAMSIZ);
+ if (ioctl(fd, SIOCGLIFGROUPINFO, &lifgr) == -1) {
+ *error = DHCP_IPC_E_INT;
+ dhcpmsg(MSG_ERR, "insert_pif: SIOCGLIFGROUPINFO for %s",
+ lifgr.gi_grname);
+ goto failure;
+ }
+
+ pif->pif_hwtype = dlpi_arptype(lifgr.gi_mactype);
+ pif->pif_under_ipmp = (strcmp(pname, lifgr.gi_grifname) != 0);
+ (void) strlcpy(pif->pif_grifname, lifgr.gi_grifname, LIFNAMSIZ);
+
+ /*
+ * For IPMP underlying interfaces, stash the interface index
+ * of the IPMP meta-interface; we'll use it to send/receive
+ * traffic. This is both necessary (since IP_BOUND_IF for
+ * non-unicast traffic won't work on underlying interfaces)
+ * and preferred (since a test address lease will be able to
+ * be maintained as long as another interface in the group is
+ * still functioning).
+ */
+ if (pif->pif_under_ipmp) {
+ (void) strlcpy(lifr.lifr_name, pif->pif_grifname,
+ LIFNAMSIZ);
+
+ if (ioctl(fd, SIOCGLIFINDEX, &lifr) == -1) {
+ *error = DHCP_IPC_E_INT;
+ dhcpmsg(MSG_ERR, "insert_pif: SIOCGLIFINDEX "
+ "for %s", lifr.lifr_name);
+ goto failure;
+ }
+ pif->pif_grindex = lifr.lifr_index;
+ }
+ }
+
+ /*
+ * For IPv4, if the hardware type is still unknown, use DLPI to
+ * determine it, the hardware address, and hardware address length.
+ */
+ if (!isv6 && pif->pif_hwtype == 0) {
+ int rc;
+ dlpi_info_t dlinfo;
if ((rc = dlpi_open(pname, &dh, 0)) != DLPI_SUCCESS) {
dhcpmsg(MSG_ERROR, "insert_pif: dlpi_open: %s",
@@ -661,11 +710,12 @@ verify_lif(const dhcp_lif_t *lif)
boolean_t isv6;
int fd;
struct lifreq lifr;
+ dhcp_pif_t *pif = lif->lif_pif;
(void) memset(&lifr, 0, sizeof (struct lifreq));
(void) strlcpy(lifr.lifr_name, lif->lif_name, LIFNAMSIZ);
- isv6 = lif->lif_pif->pif_isv6;
+ isv6 = pif->pif_isv6;
fd = isv6 ? v6_sock_fd : v4_sock_fd;
if (ioctl(fd, SIOCGLIFFLAGS, &lifr) == -1) {
@@ -689,43 +739,41 @@ verify_lif(const dhcp_lif_t *lif)
}
/*
- * Special case: if the interface has gone down as a duplicate, then
- * this alone does _not_ mean that we're abandoning it just yet. Allow
- * the state machine to handle this normally by trying to get a new
- * lease.
- */
- if ((lifr.lifr_flags & (IFF_UP|IFF_DUPLICATE)) == IFF_DUPLICATE) {
- dhcpmsg(MSG_DEBUG, "verify_lif: duplicate address on %s",
- lif->lif_name);
- return (B_TRUE);
- }
-
- /*
- * If the user has torn down or started up the interface manually, then
- * abandon the lease.
- */
- if ((lif->lif_flags ^ lifr.lifr_flags) & IFF_UP) {
- dhcpmsg(MSG_DEBUG, "verify_lif: user has %s %s",
- lifr.lifr_flags & IFF_UP ? "started up" : "shut down",
- lif->lif_name);
- return (B_FALSE);
- }
-
- /*
* Check for delete and recreate.
*/
if (ioctl(fd, SIOCGLIFINDEX, &lifr) == -1) {
- dhcpmsg(MSG_ERR, "verify_lif: SIOCGLIFINDEX failed on %s",
- lif->lif_name);
+ if (errno != ENXIO) {
+ dhcpmsg(MSG_ERR, "verify_lif: SIOCGLIFINDEX failed "
+ "on %s", lif->lif_name);
+ }
return (B_FALSE);
}
- if (lifr.lifr_index != lif->lif_pif->pif_index) {
+ if (lifr.lifr_index != pif->pif_index) {
dhcpmsg(MSG_DEBUG,
"verify_lif: ifindex on %s changed: %u to %u",
- lif->lif_name, lif->lif_pif->pif_index, lifr.lifr_index);
+ lif->lif_name, pif->pif_index, lifr.lifr_index);
return (B_FALSE);
}
+ if (pif->pif_under_ipmp) {
+ (void) strlcpy(lifr.lifr_name, pif->pif_grifname, LIFNAMSIZ);
+
+ if (ioctl(fd, SIOCGLIFINDEX, &lifr) == -1) {
+ if (errno != ENXIO) {
+ dhcpmsg(MSG_ERR, "verify_lif: SIOCGLIFINDEX "
+ "failed on %s", lifr.lifr_name);
+ }
+ return (B_FALSE);
+ }
+
+ if (lifr.lifr_index != pif->pif_grindex) {
+ dhcpmsg(MSG_DEBUG, "verify_lif: IPMP group ifindex "
+ "on %s changed: %u to %u", lifr.lifr_name,
+ pif->pif_grindex, lifr.lifr_index);
+ return (B_FALSE);
+ }
+ }
+
/*
* If the IP address, netmask, or broadcast address have changed, or
* the interface has been unplumbed, then we act like there has been an
@@ -934,6 +982,13 @@ plumb_lif(dhcp_pif_t *pif, const in6_addr_t *addr)
lifr.lifr_name);
goto failure;
}
+
+ /*
+ * See comment in set_lif_dhcp().
+ */
+ if (pif->pif_under_ipmp && !(lifr.lifr_flags & IFF_NOFAILOVER))
+ lifr.lifr_flags |= IFF_NOFAILOVER | IFF_DEPRECATED;
+
lifr.lifr_flags |= IFF_UP | IFF_DHCPRUNNING;
if (ioctl(v6_sock_fd, SIOCSLIFFLAGS, &lifr) == -1) {
dhcpmsg(MSG_ERR, "plumb_lif: SIOCSLIFFLAGS %s",
@@ -1060,8 +1115,9 @@ set_lif_dhcp(dhcp_lif_t *lif, boolean_t is_adopting)
int fd;
int err;
struct lifreq lifr;
+ dhcp_pif_t *pif = lif->lif_pif;
- fd = lif->lif_pif->pif_isv6 ? v6_sock_fd : v4_sock_fd;
+ fd = pif->pif_isv6 ? v6_sock_fd : v4_sock_fd;
(void) strlcpy(lifr.lifr_name, lif->lif_name, LIFNAMSIZ);
@@ -1098,6 +1154,17 @@ set_lif_dhcp(dhcp_lif_t *lif, boolean_t is_adopting)
"set on %s", lif->lif_name);
}
} else {
+ /*
+ * If the lif is on an interface under IPMP, IFF_NOFAILOVER
+ * must be set or the kernel will prevent us from setting
+ * IFF_DHCPRUNNING (since the subsequent IFF_UP would lead to
+ * migration). We set IFF_DEPRECATED too since the kernel
+ * will set it automatically when setting IFF_NOFAILOVER,
+ * causing our lif_flags value to grow stale.
+ */
+ if (pif->pif_under_ipmp && !(lifr.lifr_flags & IFF_NOFAILOVER))
+ lifr.lifr_flags |= IFF_NOFAILOVER | IFF_DEPRECATED;
+
lifr.lifr_flags |= IFF_DHCPRUNNING;
if (ioctl(fd, SIOCSLIFFLAGS, &lifr) == -1) {
dhcpmsg(MSG_ERR, "set_lif_dhcp: SIOCSLIFFLAGS for %s",
@@ -1207,6 +1274,13 @@ clear_lif_deprecated(dhcp_lif_t *lif)
return (B_FALSE);
}
+ /*
+ * Don't try to clear IFF_DEPRECATED if this is a test address,
+ * since IPMP's use of IFF_DEPRECATED is not compatible with ours.
+ */
+ if (lifr.lifr_flags & IFF_NOFAILOVER)
+ return (B_TRUE);
+
if (!(lifr.lifr_flags & IFF_DEPRECATED))
return (B_TRUE);
@@ -1226,16 +1300,19 @@ clear_lif_deprecated(dhcp_lif_t *lif)
*
* input: dhcp_lif_t *: the logical interface to operate on
* in_addr_t: the address the socket will be bound to (in hbo)
+ * boolean_t: B_TRUE if the address should be brought up (if needed)
* output: boolean_t: B_TRUE if the socket was opened successfully.
*/
boolean_t
-open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo)
+open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo, boolean_t bringup)
{
const char *errmsg;
struct lifreq lifr;
int on = 1;
uchar_t ttl = 255;
+ uint32_t ifindex;
+ dhcp_pif_t *pif = lif->lif_pif;
if (lif->lif_sock_ip_fd != -1) {
dhcpmsg(MSG_WARNING, "open_ip_lif: socket already open on %s",
@@ -1270,7 +1347,7 @@ open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo)
}
if (setsockopt(lif->lif_sock_ip_fd, IPPROTO_IP, IP_DHCPINIT_IF,
- &lif->lif_pif->pif_index, sizeof (int)) == -1) {
+ &pif->pif_index, sizeof (int)) == -1) {
errmsg = "cannot set IP_DHCPINIT_IF";
goto failure;
}
@@ -1288,23 +1365,40 @@ open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo)
goto failure;
}
- if (setsockopt(lif->lif_sock_ip_fd, IPPROTO_IP, IP_BOUND_IF,
- &lif->lif_pif->pif_index, sizeof (int)) == -1) {
+ ifindex = pif->pif_under_ipmp ? pif->pif_grindex : pif->pif_index;
+ if (setsockopt(lif->lif_sock_ip_fd, IPPROTO_IP, IP_BOUND_IF, &ifindex,
+ sizeof (int)) == -1) {
errmsg = "cannot set IP_BOUND_IF";
goto failure;
}
- /*
- * Make sure at least one lif on the interface we used in IP_BOUND_IF
- * is IFF_UP so that we can send and receive IP packets.
- */
(void) strlcpy(lifr.lifr_name, lif->lif_name, LIFNAMSIZ);
if (ioctl(v4_sock_fd, SIOCGLIFFLAGS, &lifr) == -1) {
errmsg = "cannot get interface flags";
goto failure;
}
- if (!(lifr.lifr_flags & IFF_UP)) {
+ /*
+ * If the lif is part of an interface under IPMP, IFF_NOFAILOVER must
+ * be set or the kernel will prevent us from setting IFF_DHCPRUNNING
+ * (since the subsequent IFF_UP would lead to migration). We set
+ * IFF_DEPRECATED too since the kernel will set it automatically when
+ * setting IFF_NOFAILOVER, causing our lif_flags value to grow stale.
+ */
+ if (pif->pif_under_ipmp && !(lifr.lifr_flags & IFF_NOFAILOVER)) {
+ lifr.lifr_flags |= IFF_NOFAILOVER | IFF_DEPRECATED;
+ if (ioctl(v4_sock_fd, SIOCSLIFFLAGS, &lifr) == -1) {
+ errmsg = "cannot set IFF_NOFAILOVER";
+ goto failure;
+ }
+ }
+ lif->lif_flags = lifr.lifr_flags;
+
+ /*
+ * If this is initial bringup, make sure the address we're acquiring a
+ * lease on is IFF_UP.
+ */
+ if (bringup && !(lifr.lifr_flags & IFF_UP)) {
/*
* Start from a clean slate.
*/
@@ -1330,6 +1424,30 @@ open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo)
((struct sockaddr_in *)&lifr.lifr_addr)->sin_addr.s_addr;
}
+ /*
+ * Usually, bringing up the address we're acquiring a lease on is
+ * sufficient to allow packets to be sent and received via the
+ * IP_BOUND_IF we did earlier. However, if we're acquiring a lease on
+ * an underlying IPMP interface, the group interface will be used for
+ * sending and receiving IP packets via IP_BOUND_IF. Thus, ensure at
+ * least one address on the group interface is IFF_UP.
+ */
+ if (bringup && pif->pif_under_ipmp) {
+ (void) strlcpy(lifr.lifr_name, pif->pif_grifname, LIFNAMSIZ);
+ if (ioctl(v4_sock_fd, SIOCGLIFFLAGS, &lifr) == -1) {
+ errmsg = "cannot get IPMP group interface flags";
+ goto failure;
+ }
+
+ if (!(lifr.lifr_flags & IFF_UP)) {
+ lifr.lifr_flags |= IFF_UP;
+ if (ioctl(v4_sock_fd, SIOCSLIFFLAGS, &lifr) == -1) {
+ errmsg = "cannot bring up IPMP group interface";
+ goto failure;
+ }
+ }
+ }
+
lif->lif_packet_id = iu_register_event(eh, lif->lif_sock_ip_fd, POLLIN,
dhcp_packet_lif, lif);
if (lif->lif_packet_id == -1) {
diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h
index a59e3ea68d..46cf30bedb 100644
--- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h
+++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef INTERFACE_H
#define INTERFACE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Interface.[ch] encapsulate all of the agent's knowledge of network
* interfaces from the DHCP agent's perspective. See interface.c for
@@ -66,6 +64,9 @@ struct dhcp_pif_s {
boolean_t pif_running; /* interface is running */
uint_t pif_hold_count; /* reference count */
char pif_name[LIFNAMSIZ];
+ char pif_grifname[LIFNAMSIZ];
+ uint32_t pif_grindex; /* interface index for pif_grifname */
+ boolean_t pif_under_ipmp; /* is an ipmp underlying interface */
};
struct dhcp_lif_s {
@@ -182,7 +183,7 @@ dhcp_lif_t *attach_lif(const char *, boolean_t, int *);
int set_lif_dhcp(dhcp_lif_t *, boolean_t);
void set_lif_deprecated(dhcp_lif_t *);
boolean_t clear_lif_deprecated(dhcp_lif_t *);
-boolean_t open_ip_lif(dhcp_lif_t *, in_addr_t);
+boolean_t open_ip_lif(dhcp_lif_t *, in_addr_t, boolean_t);
void close_ip_lif(dhcp_lif_t *);
void lif_mark_decline(dhcp_lif_t *, const char *);
boolean_t schedule_lif_timer(dhcp_lif_t *, dhcp_timer_t *,
diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c
index 8a32b55ea5..a763530436 100644
--- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c
+++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <string.h>
#include <sys/types.h>
#include <stdlib.h>
@@ -970,7 +968,10 @@ send_pkt_internal(dhcp_smach_t *dsmp)
ipi6->ipi6_addr = lif->lif_v6addr;
else
ipi6->ipi6_addr = my_in6addr_any;
- ipi6->ipi6_ifindex = lif->lif_pif->pif_index;
+ if (lif->lif_pif->pif_under_ipmp)
+ ipi6->ipi6_ifindex = lif->lif_pif->pif_grindex;
+ else
+ ipi6->ipi6_ifindex = lif->lif_pif->pif_index;
cmsg->cmsg_len = (char *)(ipi6 + 1) - (char *)cmsg;
/*
diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c
index a8c05de986..78da07aebf 100644
--- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c
+++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c
@@ -19,14 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* REQUESTING state of the client state machine.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <stdlib.h>
#include <string.h>
#include <search.h>
@@ -1008,7 +1006,8 @@ dhcp_acknak_global(iu_eh_t *ehp, int fd, short events, iu_event_id_t id,
for (dsmp = lookup_smach_by_xid(xid, NULL, isv6); dsmp != NULL;
dsmp = lookup_smach_by_xid(xid, dsmp, isv6)) {
pif = dsmp->dsm_lif->lif_pif;
- if (pif->pif_index == plp->ifindex)
+ if (pif->pif_index == plp->ifindex ||
+ pif->pif_under_ipmp && pif->pif_grindex == plp->ifindex)
break;
}
diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c
index 9ae7fd7aba..852b428551 100644
--- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c
+++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* This module contains core functions for managing DHCP state machine
* instances.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <stdlib.h>
#include <search.h>
#include <string.h>
@@ -151,7 +149,7 @@ insert_smach(dhcp_lif_t *lif, int *error)
/*
* With IPv4 DHCP, we use a socket per lif.
*/
- if (!open_ip_lif(lif, INADDR_ANY)) {
+ if (!open_ip_lif(lif, INADDR_ANY, B_TRUE)) {
dhcpmsg(MSG_ERR, "unable to open socket for %s",
lif->lif_name);
/* This will also dispose of the LIF */
@@ -696,14 +694,15 @@ set_smach_state(dhcp_smach_t *dsmp, DHCPSTATE state)
if (is_bound_state(dsmp->dsm_state)) {
if (!is_bound_state(state)) {
close_ip_lif(lif);
- if (!open_ip_lif(lif, INADDR_ANY))
+ if (!open_ip_lif(lif, INADDR_ANY,
+ B_FALSE))
return (B_FALSE);
}
} else {
if (is_bound_state(state)) {
close_ip_lif(lif);
if (!open_ip_lif(lif,
- ntohl(lif->lif_addr)))
+ ntohl(lif->lif_addr), B_FALSE))
return (B_FALSE);
}
}
@@ -952,11 +951,14 @@ no_specified_id:
* unable to parse it. We need to determine if a Client ID is required
* and, if so, generate one.
*
- * If it's IPv4 and not a logical interface, then we need to preserve
- * backward-compatibility by avoiding new-fangled DUID/IAID
- * construction.
+ * If it's IPv4, not in an IPMP group, and not a logical interface,
+ * then we need to preserve backward-compatibility by avoiding
+ * new-fangled DUID/IAID construction. (Note: even for IPMP test
+ * addresses, we construct a DUID/IAID since we may renew a lease for
+ * an IPMP test address on any functioning IP interface in the group.)
*/
- if (!pif->pif_isv6 && strchr(dsmp->dsm_name, ':') == NULL) {
+ if (!pif->pif_isv6 && pif->pif_grifname[0] == '\0' &&
+ strchr(dsmp->dsm_name, ':') == NULL) {
if (pif->pif_hwtype == ARPHRD_IB) {
/*
* This comes from the DHCP over IPoIB specification.
diff --git a/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c b/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c
index 47e1202b32..d73722cc55 100644
--- a/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c
+++ b/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -8,8 +8,6 @@
* specifies the terms and conditions for redistribution.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Ifparse splits up an ifconfig command line, and was written for use
* with the networking boot scripts; see $SRC/cmd/svc/shell/net_include.sh
@@ -184,6 +182,7 @@ struct cmd {
{ "auto-revarp", 0, AF_INET, PARSEFIXED},
{ "plumb", 0, AF_ANY, PARSENOW },
{ "unplumb", 0, AF_ANY, PARSENOW },
+ { "ipmp", 0, AF_ANY, PARSELOG0 },
{ "subnet", NEXTARG, AF_ANY, 0 },
{ "token", NEXTARG, AF_INET6, PARSELOG0 },
{ "tsrc", NEXTARG, AF_ANY, PARSELOG0 },
diff --git a/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c b/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c
index b9a02b54e7..2d115e221b 100644
--- a/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c
+++ b/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -29,8 +29,6 @@
* MROUTING Revision 3.5
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* simple netstat based on snmp/mib-2 interface to the TCP/IP stack
*
@@ -221,6 +219,7 @@ static char *plural(int n);
static char *pluraly(int n);
static char *plurales(int n);
static void process_filter(char *arg);
+static char *ifindex2str(uint_t, char *);
static boolean_t family_selected(int family);
static void usage(char *);
@@ -680,8 +679,14 @@ mibget(int sd)
tor->OPT_offset = sizeof (struct T_optmgmt_req);
tor->OPT_length = sizeof (struct opthdr);
tor->MGMT_flags = T_CURRENT;
+
+
+ /*
+ * Note: we use the special level value below so that IP will return
+ * us information concerning IRE_MARK_TESTHIDDEN routes.
+ */
req = (struct opthdr *)&tor[1];
- req->level = MIB2_IP; /* any MIB2_xxx value ok here */
+ req->level = EXPER_IP_AND_TESTHIDDEN;
req->name = 0;
req->len = 0;
@@ -712,7 +717,7 @@ mibget(int sd)
stderr);
i = 0;
for (last_item = first_item; last_item;
- last_item = last_item->next_item)
+ last_item = last_item->next_item)
(void) printf("%d %4d %5d %d\n",
++i,
last_item->group,
@@ -1707,19 +1712,19 @@ mib_get_constants(mib_item_t *item)
ipRouteAttributeSize = ip->ipRouteAttributeSize;
transportMLPSize = ip->transportMLPSize;
assert(IS_P2ALIGNED(ipAddrEntrySize,
- sizeof (mib2_ipAddrEntry_t *)) &&
- IS_P2ALIGNED(ipRouteEntrySize,
- sizeof (mib2_ipRouteEntry_t *)) &&
- IS_P2ALIGNED(ipNetToMediaEntrySize,
- sizeof (mib2_ipNetToMediaEntry_t *)) &&
- IS_P2ALIGNED(ipMemberEntrySize,
- sizeof (ip_member_t *)) &&
- IS_P2ALIGNED(ipGroupSourceEntrySize,
- sizeof (ip_grpsrc_t *)) &&
- IS_P2ALIGNED(ipRouteAttributeSize,
- sizeof (mib2_ipAttributeEntry_t *)) &&
- IS_P2ALIGNED(transportMLPSize,
- sizeof (mib2_transportMLPEntry_t *)));
+ sizeof (mib2_ipAddrEntry_t *)));
+ assert(IS_P2ALIGNED(ipRouteEntrySize,
+ sizeof (mib2_ipRouteEntry_t *)));
+ assert(IS_P2ALIGNED(ipNetToMediaEntrySize,
+ sizeof (mib2_ipNetToMediaEntry_t *)));
+ assert(IS_P2ALIGNED(ipMemberEntrySize,
+ sizeof (ip_member_t *)));
+ assert(IS_P2ALIGNED(ipGroupSourceEntrySize,
+ sizeof (ip_grpsrc_t *)));
+ assert(IS_P2ALIGNED(ipRouteAttributeSize,
+ sizeof (mib2_ipAttributeEntry_t *)));
+ assert(IS_P2ALIGNED(transportMLPSize,
+ sizeof (mib2_transportMLPEntry_t *)));
break;
}
case EXPER_DVMRP: {
@@ -1728,8 +1733,9 @@ mib_get_constants(mib_item_t *item)
vifctlSize = mrts->mrts_vifctlSize;
mfcctlSize = mrts->mrts_mfcctlSize;
assert(IS_P2ALIGNED(vifctlSize,
- sizeof (struct vifclt *)) &&
- IS_P2ALIGNED(mfcctlSize, sizeof (struct mfcctl *)));
+ sizeof (struct vifclt *)));
+ assert(IS_P2ALIGNED(mfcctlSize,
+ sizeof (struct mfcctl *)));
break;
}
case MIB2_IP6: {
@@ -1745,17 +1751,17 @@ mib_get_constants(mib_item_t *item)
ipv6GroupSourceEntrySize =
ip6->ipv6GroupSourceEntrySize;
assert(IS_P2ALIGNED(ipv6IfStatsEntrySize,
- sizeof (mib2_ipv6IfStatsEntry_t *)) &&
- IS_P2ALIGNED(ipv6AddrEntrySize,
- sizeof (mib2_ipv6AddrEntry_t *)) &&
- IS_P2ALIGNED(ipv6RouteEntrySize,
- sizeof (mib2_ipv6RouteEntry_t *)) &&
- IS_P2ALIGNED(ipv6NetToMediaEntrySize,
- sizeof (mib2_ipv6NetToMediaEntry_t *)) &&
- IS_P2ALIGNED(ipv6MemberEntrySize,
- sizeof (ipv6_member_t *)) &&
- IS_P2ALIGNED(ipv6GroupSourceEntrySize,
- sizeof (ipv6_grpsrc_t *)));
+ sizeof (mib2_ipv6IfStatsEntry_t *)));
+ assert(IS_P2ALIGNED(ipv6AddrEntrySize,
+ sizeof (mib2_ipv6AddrEntry_t *)));
+ assert(IS_P2ALIGNED(ipv6RouteEntrySize,
+ sizeof (mib2_ipv6RouteEntry_t *)));
+ assert(IS_P2ALIGNED(ipv6NetToMediaEntrySize,
+ sizeof (mib2_ipv6NetToMediaEntry_t *)));
+ assert(IS_P2ALIGNED(ipv6MemberEntrySize,
+ sizeof (ipv6_member_t *)));
+ assert(IS_P2ALIGNED(ipv6GroupSourceEntrySize,
+ sizeof (ipv6_grpsrc_t *)));
break;
}
case MIB2_ICMP6: {
@@ -1774,9 +1780,9 @@ mib_get_constants(mib_item_t *item)
tcpConnEntrySize = tcp->tcpConnTableSize;
tcp6ConnEntrySize = tcp->tcp6ConnTableSize;
assert(IS_P2ALIGNED(tcpConnEntrySize,
- sizeof (mib2_tcpConnEntry_t *)) &&
- IS_P2ALIGNED(tcp6ConnEntrySize,
- sizeof (mib2_tcp6ConnEntry_t *)));
+ sizeof (mib2_tcpConnEntry_t *)));
+ assert(IS_P2ALIGNED(tcp6ConnEntrySize,
+ sizeof (mib2_tcp6ConnEntry_t *)));
break;
}
case MIB2_UDP: {
@@ -1785,9 +1791,9 @@ mib_get_constants(mib_item_t *item)
udpEntrySize = udp->udpEntrySize;
udp6EntrySize = udp->udp6EntrySize;
assert(IS_P2ALIGNED(udpEntrySize,
- sizeof (mib2_udpEntry_t *)) &&
- IS_P2ALIGNED(udp6EntrySize,
- sizeof (mib2_udp6Entry_t *)));
+ sizeof (mib2_udpEntry_t *)));
+ assert(IS_P2ALIGNED(udp6EntrySize,
+ sizeof (mib2_udp6Entry_t *)));
break;
}
case MIB2_SCTP: {
@@ -1843,7 +1849,6 @@ stat_report(mib_item_t *item)
{
int jtemp = 0;
char ifname[LIFNAMSIZ + 1];
- char *ifnamep;
/* 'for' loop 1: */
for (; item; item = item->next_item) {
@@ -1891,12 +1896,10 @@ stat_report(mib_item_t *item)
bzero(&sum6, sizeof (sum6));
/* 'for' loop 2a: */
for (ip6 = (mib2_ipv6IfStatsEntry_t *)item->valp;
- (char *)ip6 < (char *)item->valp
- + item->length;
+ (char *)ip6 < (char *)item->valp + item->length;
/* LINTED: (note 1) */
ip6 = (mib2_ipv6IfStatsEntry_t *)((char *)ip6 +
ipv6IfStatsEntrySize)) {
-
if (ip6->ipv6IfIndex == 0) {
/*
* The "unknown interface" ip6
@@ -1905,19 +1908,10 @@ stat_report(mib_item_t *item)
sum_ip6_stats(ip6, &sum6);
continue; /* 'for' loop 2a */
}
- ifnamep = if_indextoname(
- ip6->ipv6IfIndex,
- ifname);
- if (ifnamep == NULL) {
- (void) printf(
- "Invalid ifindex %d\n",
- ip6->ipv6IfIndex);
- continue; /* 'for' loop 2a */
- }
-
if (Aflag) {
(void) printf("\nIPv6 for %s\n",
- ifnamep);
+ ifindex2str(ip6->ipv6IfIndex,
+ ifname));
print_ip6_stats(ip6);
}
sum_ip6_stats(ip6, &sum6);
@@ -1935,15 +1929,10 @@ stat_report(mib_item_t *item)
break;
bzero(&sum6, sizeof (sum6));
/* 'for' loop 2b: */
- for (icmp6 =
- (mib2_ipv6IfIcmpEntry_t *)item->valp;
- (char *)icmp6 < (char *)item->valp
- + item->length;
- icmp6 =
- /* LINTED: (note 1) */
- (mib2_ipv6IfIcmpEntry_t *)((char *)icmp6
- + ipv6IfIcmpEntrySize)) {
-
+ for (icmp6 = (mib2_ipv6IfIcmpEntry_t *)item->valp;
+ (char *)icmp6 < (char *)item->valp + item->length;
+ icmp6 = (void *)((char *)icmp6 +
+ ipv6IfIcmpEntrySize)) {
if (icmp6->ipv6IfIcmpIfIndex == 0) {
/*
* The "unknown interface" icmp6
@@ -1952,19 +1941,10 @@ stat_report(mib_item_t *item)
sum_icmp6_stats(icmp6, &sum6);
continue; /* 'for' loop 2b: */
}
- ifnamep = if_indextoname(
- icmp6->ipv6IfIcmpIfIndex, ifname);
- if (ifnamep == NULL) {
- (void) printf(
- "Invalid ifindex %d\n",
- icmp6->ipv6IfIcmpIfIndex);
- continue; /* 'for' loop 2b: */
- }
-
if (Aflag) {
- (void) printf(
- "\nICMPv6 for %s\n",
- ifnamep);
+ (void) printf("\nICMPv6 for %s\n",
+ ifindex2str(
+ icmp6->ipv6IfIcmpIfIndex, ifname));
print_icmp6_stats(icmp6);
}
sum_icmp6_stats(icmp6, &sum6);
@@ -2369,51 +2349,49 @@ print_mrt_stats(struct mrtstat *mrts)
{
(void) puts("DVMRP multicast routing:");
(void) printf(" %10u hit%s - kernel forwarding cache hits\n",
- mrts->mrts_mfc_hits, PLURAL(mrts->mrts_mfc_hits));
+ mrts->mrts_mfc_hits, PLURAL(mrts->mrts_mfc_hits));
(void) printf(" %10u miss%s - kernel forwarding cache misses\n",
- mrts->mrts_mfc_misses, PLURALES(mrts->mrts_mfc_misses));
+ mrts->mrts_mfc_misses, PLURALES(mrts->mrts_mfc_misses));
(void) printf(" %10u packet%s potentially forwarded\n",
- mrts->mrts_fwd_in, PLURAL(mrts->mrts_fwd_in));
+ mrts->mrts_fwd_in, PLURAL(mrts->mrts_fwd_in));
(void) printf(" %10u packet%s actually sent out\n",
- mrts->mrts_fwd_out, PLURAL(mrts->mrts_fwd_out));
+ mrts->mrts_fwd_out, PLURAL(mrts->mrts_fwd_out));
(void) printf(" %10u upcall%s - upcalls made to mrouted\n",
- mrts->mrts_upcalls, PLURAL(mrts->mrts_upcalls));
+ mrts->mrts_upcalls, PLURAL(mrts->mrts_upcalls));
(void) printf(" %10u packet%s not sent out due to lack of resources\n",
- mrts->mrts_fwd_drop, PLURAL(mrts->mrts_fwd_drop));
+ mrts->mrts_fwd_drop, PLURAL(mrts->mrts_fwd_drop));
(void) printf(" %10u datagram%s with malformed tunnel options\n",
- mrts->mrts_bad_tunnel, PLURAL(mrts->mrts_bad_tunnel));
+ mrts->mrts_bad_tunnel, PLURAL(mrts->mrts_bad_tunnel));
(void) printf(" %10u datagram%s with no room for tunnel options\n",
- mrts->mrts_cant_tunnel, PLURAL(mrts->mrts_cant_tunnel));
+ mrts->mrts_cant_tunnel, PLURAL(mrts->mrts_cant_tunnel));
(void) printf(" %10u datagram%s arrived on wrong interface\n",
- mrts->mrts_wrong_if, PLURAL(mrts->mrts_wrong_if));
+ mrts->mrts_wrong_if, PLURAL(mrts->mrts_wrong_if));
(void) printf(" %10u datagram%s dropped due to upcall Q overflow\n",
- mrts->mrts_upq_ovflw, PLURAL(mrts->mrts_upq_ovflw));
+ mrts->mrts_upq_ovflw, PLURAL(mrts->mrts_upq_ovflw));
(void) printf(" %10u datagram%s cleaned up by the cache\n",
- mrts->mrts_cache_cleanups, PLURAL(mrts->mrts_cache_cleanups));
+ mrts->mrts_cache_cleanups, PLURAL(mrts->mrts_cache_cleanups));
(void) printf(" %10u datagram%s dropped selectively by ratelimiter\n",
- mrts->mrts_drop_sel, PLURAL(mrts->mrts_drop_sel));
+ mrts->mrts_drop_sel, PLURAL(mrts->mrts_drop_sel));
(void) printf(" %10u datagram%s dropped - bucket Q overflow\n",
- mrts->mrts_q_overflow, PLURAL(mrts->mrts_q_overflow));
+ mrts->mrts_q_overflow, PLURAL(mrts->mrts_q_overflow));
(void) printf(" %10u datagram%s dropped - larger than bkt size\n",
- mrts->mrts_pkt2large, PLURAL(mrts->mrts_pkt2large));
+ mrts->mrts_pkt2large, PLURAL(mrts->mrts_pkt2large));
(void) printf("\nPIM multicast routing:\n");
(void) printf(" %10u datagram%s dropped - bad version number\n",
- mrts->mrts_pim_badversion, PLURAL(mrts->mrts_pim_badversion));
+ mrts->mrts_pim_badversion, PLURAL(mrts->mrts_pim_badversion));
(void) printf(" %10u datagram%s dropped - bad checksum\n",
- mrts->mrts_pim_rcv_badcsum, PLURAL(mrts->mrts_pim_rcv_badcsum));
+ mrts->mrts_pim_rcv_badcsum, PLURAL(mrts->mrts_pim_rcv_badcsum));
(void) printf(" %10u datagram%s dropped - bad register packets\n",
- mrts->mrts_pim_badregisters,
- PLURAL(mrts->mrts_pim_badregisters));
+ mrts->mrts_pim_badregisters, PLURAL(mrts->mrts_pim_badregisters));
(void) printf(
- " %10u datagram%s potentially forwarded - register packets\n",
- mrts->mrts_pim_regforwards, PLURAL(mrts->mrts_pim_regforwards));
+ " %10u datagram%s potentially forwarded - register packets\n",
+ mrts->mrts_pim_regforwards, PLURAL(mrts->mrts_pim_regforwards));
(void) printf(" %10u datagram%s dropped - register send drops\n",
- mrts->mrts_pim_regsend_drops,
- PLURAL(mrts->mrts_pim_regsend_drops));
+ mrts->mrts_pim_regsend_drops, PLURAL(mrts->mrts_pim_regsend_drops));
(void) printf(" %10u datagram%s dropped - packet malformed\n",
- mrts->mrts_pim_malformed, PLURAL(mrts->mrts_pim_malformed));
+ mrts->mrts_pim_malformed, PLURAL(mrts->mrts_pim_malformed));
(void) printf(" %10u datagram%s dropped - no memory to forward\n",
- mrts->mrts_pim_nomemory, PLURAL(mrts->mrts_pim_nomemory));
+ mrts->mrts_pim_nomemory, PLURAL(mrts->mrts_pim_nomemory));
}
static void
@@ -2674,7 +2652,7 @@ if_report(mib_item_t *item, char *matchname,
"Ierrs", "Opkts", "Oerrs",
"Collis", "Queue");
- first = B_FALSE;
+ first = B_FALSE;
}
if_report_ip4(ap, ifname,
logintname, &stat, B_TRUE);
@@ -2717,7 +2695,7 @@ if_report(mib_item_t *item, char *matchname,
+ item->length;
ap++) {
(void) octetstr(&ap->ipAdEntIfIndex,
- 'a', ifname, sizeof (ifname));
+ 'a', ifname, sizeof (ifname));
(void) strtok(ifname, ":");
if (matchname) {
@@ -3387,7 +3365,7 @@ dhcp_walk_interfaces(uint_t flags_on, uint_t flags_off, int af,
*/
(void) memset(&lifn, 0, sizeof (lifn));
lifn.lifn_family = af;
- lifn.lifn_flags = LIFC_ALLZONES | LIFC_NOXMIT;
+ lifn.lifn_flags = LIFC_ALLZONES | LIFC_NOXMIT | LIFC_UNDER_IPMP;
if (ioctl(sock_fd, SIOCGLIFNUM, &lifn) == -1)
n_ifs = LIFN_GUARD_VALUE;
else
@@ -3471,7 +3449,6 @@ group_report(mib_item_t *item)
ip_grpsrc_t *ips;
ipv6_member_t *ipmp6;
ipv6_grpsrc_t *ips6;
- char *ifnamep;
boolean_t first, first_src;
/* 'for' loop 1: */
@@ -3604,7 +3581,7 @@ group_report(mib_item_t *item)
(char *)ipmp6 < (char *)v6grp->valp + v6grp->length;
/* LINTED: (note 1) */
ipmp6 = (ipv6_member_t *)((char *)ipmp6 +
- ipv6MemberEntrySize)) {
+ ipv6MemberEntrySize)) {
if (first) {
(void) puts("Group Memberships: "
"IPv6");
@@ -3615,15 +3592,8 @@ group_report(mib_item_t *item)
first = B_FALSE;
}
- ifnamep = if_indextoname(
- ipmp6->ipv6GroupMemberIfIndex, ifname);
- if (ifnamep == NULL) {
- (void) printf("Invalid ifindex %d\n",
- ipmp6->ipv6GroupMemberIfIndex);
- continue;
- }
(void) printf("%-5s %-27s %5u\n",
- ifnamep,
+ ifindex2str(ipmp6->ipv6GroupMemberIfIndex, ifname),
pr_addr6(&ipmp6->ipv6GroupMemberAddress,
abuf, sizeof (abuf)),
ipmp6->ipv6GroupMemberRefCnt);
@@ -3784,7 +3754,6 @@ ndp_report(mib_item_t *item)
char xbuf[STR_EXPAND * OCTET_LENGTH + 1];
mib2_ipv6NetToMediaEntry_t *np6;
char ifname[LIFNAMSIZ + 1];
- char *ifnamep;
boolean_t first;
if (!(family_selected(AF_INET6)))
@@ -3820,13 +3789,6 @@ ndp_report(mib_item_t *item)
first = B_FALSE;
}
- ifnamep = if_indextoname(np6->ipv6NetToMediaIfIndex,
- ifname);
- if (ifnamep == NULL) {
- (void) printf("Invalid ifindex %d\n",
- np6->ipv6NetToMediaIfIndex);
- continue; /* 'for' loop 2 */
- }
switch (np6->ipv6NetToMediaState) {
case ND_INCOMPLETE:
state = "INCOMPLETE";
@@ -3865,7 +3827,7 @@ ndp_report(mib_item_t *item)
break;
}
(void) printf("%-5s %-17s %-7s %-12s %-27s\n",
- ifnamep,
+ ifindex2str(np6->ipv6NetToMediaIfIndex, ifname),
octetstr(&np6->ipv6NetToMediaPhysAddress, 'h',
xbuf, sizeof (xbuf)),
type,
@@ -4472,7 +4434,7 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first,
(void) printf("%-27s %-27s %-5s %5u%c %5u %3u "
"%-5s %6u %6u %s\n",
pr_prefix6(&rp6->ipv6RouteDest,
- rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)),
+ rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)),
IN6_IS_ADDR_UNSPECIFIED(&rp6->ipv6RouteNextHop) ?
" --" :
pr_addr6(&rp6->ipv6RouteNextHop, gwbuf, sizeof (gwbuf)),
@@ -4489,7 +4451,7 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first,
} else {
(void) printf("%-27s %-27s %-5s %3u %7u %-5s %s\n",
pr_prefix6(&rp6->ipv6RouteDest,
- rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)),
+ rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)),
IN6_IS_ADDR_UNSPECIFIED(&rp6->ipv6RouteNextHop) ?
" --" :
pr_addr6(&rp6->ipv6RouteNextHop, gwbuf, sizeof (gwbuf)),
@@ -4690,9 +4652,9 @@ tcp_report_item_v4(const mib2_tcpConnEntry_t *tp, boolean_t first,
(void) printf("%-20s\n%-20s %5u %08x %08x %5u %08x %08x "
"%5u %5u %s\n",
pr_ap(tp->tcpConnLocalAddress,
- tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)),
+ tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)),
pr_ap(tp->tcpConnRemAddress,
- tp->tcpConnRemPort, "tcp", fname, sizeof (fname)),
+ tp->tcpConnRemPort, "tcp", fname, sizeof (fname)),
tp->tcpConnEntryInfo.ce_swnd,
tp->tcpConnEntryInfo.ce_snxt,
tp->tcpConnEntryInfo.ce_suna,
@@ -4710,9 +4672,9 @@ tcp_report_item_v4(const mib2_tcpConnEntry_t *tp, boolean_t first,
(void) printf("%-20s %-20s %5u %6d %5u %6d %s\n",
pr_ap(tp->tcpConnLocalAddress,
- tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)),
+ tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)),
pr_ap(tp->tcpConnRemAddress,
- tp->tcpConnRemPort, "tcp", fname, sizeof (fname)),
+ tp->tcpConnRemPort, "tcp", fname, sizeof (fname)),
tp->tcpConnEntryInfo.ce_swnd,
(sq >= 0) ? sq : 0,
tp->tcpConnEntryInfo.ce_rwnd,
@@ -4756,9 +4718,9 @@ tcp_report_item_v6(const mib2_tcp6ConnEntry_t *tp6, boolean_t first,
(void) printf("%-33s\n%-33s %5u %08x %08x %5u %08x %08x "
"%5u %5u %-11s %s\n",
pr_ap6(&tp6->tcp6ConnLocalAddress,
- tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)),
+ tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)),
pr_ap6(&tp6->tcp6ConnRemAddress,
- tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)),
+ tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)),
tp6->tcp6ConnEntryInfo.ce_swnd,
tp6->tcp6ConnEntryInfo.ce_snxt,
tp6->tcp6ConnEntryInfo.ce_suna,
@@ -4777,9 +4739,9 @@ tcp_report_item_v6(const mib2_tcp6ConnEntry_t *tp6, boolean_t first,
(void) printf("%-33s %-33s %5u %6d %5u %6d %-11s %s\n",
pr_ap6(&tp6->tcp6ConnLocalAddress,
- tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)),
+ tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)),
pr_ap6(&tp6->tcp6ConnRemAddress,
- tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)),
+ tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)),
tp6->tcp6ConnEntryInfo.ce_swnd,
(sq >= 0) ? sq : 0,
tp6->tcp6ConnEntryInfo.ce_rwnd,
@@ -5112,7 +5074,7 @@ sctp_pr_addr(int type, char *name, int namelen, const in6_addr_t *addr,
* displaying.
*/
switch (type) {
- case MIB2_SCTP_ADDR_V4:
+ case MIB2_SCTP_ADDR_V4:
/* v4 */
v6addr = *addr;
@@ -5124,7 +5086,7 @@ sctp_pr_addr(int type, char *name, int namelen, const in6_addr_t *addr,
}
break;
- case MIB2_SCTP_ADDR_V6:
+ case MIB2_SCTP_ADDR_V6:
/* v6 */
if (port > 0) {
(void) pr_ap6(addr, port, "sctp", name, namelen);
@@ -5133,7 +5095,7 @@ sctp_pr_addr(int type, char *name, int namelen, const in6_addr_t *addr,
}
break;
- default:
+ default:
(void) snprintf(name, namelen, "<unknown addr type>");
break;
}
@@ -5379,7 +5341,7 @@ mrt_report(mib_item_t *item)
case EXPER_DVMRP_MRT:
if (Dflag)
(void) printf("%u records for ipMfcTable:\n",
- item->length/sizeof (struct vifctl));
+ item->length/sizeof (struct vifctl));
if (item->length/sizeof (struct vifctl) == 0) {
(void) puts("\nMulticast Forwarding Cache is "
"empty");
@@ -5402,10 +5364,10 @@ mrt_report(mib_item_t *item)
abuf, sizeof (abuf)));
(void) printf("%-15.15s %6s %3u ",
pr_net(mfccp->mfcc_mcastgrp.s_addr,
- mfccp->mfcc_mcastgrp.s_addr,
- abuf, sizeof (abuf)),
+ mfccp->mfcc_mcastgrp.s_addr,
+ abuf, sizeof (abuf)),
pktscale((int)mfccp->mfcc_pkt_cnt),
- mfccp->mfcc_parent);
+ mfccp->mfcc_parent);
for (vifi = 0; vifi < MAXVIFS; ++vifi) {
if (mfccp->mfcc_ttls[vifi]) {
@@ -5468,7 +5430,7 @@ kmem_cache_stats(char *title, char *name, int prefix, int64_t *total_bytes)
strncmp(ksp->ks_name, "streams_dblk", 12) == 0) {
(void) safe_kstat_read(kc, ksp, NULL);
total_buf_inuse -=
- kstat_named_value(ksp, "buf_constructed");
+ kstat_named_value(ksp, "buf_constructed");
continue; /* 'for' loop 1 */
}
@@ -5501,7 +5463,7 @@ kmem_cache_stats(char *title, char *name, int prefix, int64_t *total_bytes)
if (buf_size == 0) {
(void) printf("%-22s [couldn't find statistics for %s]\n",
- title, name);
+ title, name);
return;
}
@@ -5511,7 +5473,7 @@ kmem_cache_stats(char *title, char *name, int prefix, int64_t *total_bytes)
(void) snprintf(buf, sizeof (buf), "%s", title);
(void) printf("%-22s %6d %9d %11lld %11d\n", buf,
- total_buf_inuse, total_buf_max, total_alloc, total_alloc_fail);
+ total_buf_inuse, total_buf_max, total_alloc, total_alloc_fail);
}
static void
@@ -5534,7 +5496,7 @@ m_report(void)
kmem_cache_stats("qband", "qband_cache", 0, &total_bytes);
(void) printf("\n%lld Kbytes allocated for streams data\n",
- total_bytes / 1024);
+ total_bytes / 1024);
(void) putchar('\n');
(void) fflush(stdout);
@@ -5967,7 +5929,7 @@ portname(uint_t port, char *proto, char *dst, uint_t dstlen)
sp = getservbyport(htons(port), proto);
if (sp || port == 0)
(void) snprintf(dst, dstlen, "%.*s", MAXHOSTNAMELEN,
- sp ? sp->s_name : "*");
+ sp ? sp->s_name : "*");
else
(void) snprintf(dst, dstlen, "%d", port);
dst[dstlen - 1] = 0;
@@ -6161,8 +6123,8 @@ process_filter(char *arg)
*/
if (hp->h_addr_list[0] != NULL &&
/* LINTED: (note 1) */
- IN6_IS_ADDR_V4MAPPED((in6_addr_t
- *)hp->h_addr_list[0])) {
+ IN6_IS_ADDR_V4MAPPED((in6_addr_t *)
+ hp->h_addr_list[0])) {
maxv = IP_ABITS;
} else {
maxv = IPV6_ABITS;
@@ -6226,6 +6188,21 @@ family_selected(int family)
}
/*
+ * Convert the interface index to a string using the buffer `ifname', which
+ * must be at least LIFNAMSIZ bytes.  We first try to map it to a name.  If
+ * that fails (e.g., because we're inside a zone and it does not have access
+ * to the interface for the index in question), just store "if#<num>" in
+ * `ifname'.  Returns `ifname' in both cases.
+ */
+static char *
+ifindex2str(uint_t ifindex, char *ifname)
+{
+	if (if_indextoname(ifindex, ifname) == NULL) {
+		/* ifindex is unsigned, so format it with %u rather than %d. */
+		(void) snprintf(ifname, LIFNAMSIZ, "if#%u", ifindex);
+	}
+
+	return (ifname);
+}
+
+/*
* print the usage line
*/
static void
diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile
index f0c4c03250..f3ce9fae4b 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile
+++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile
@@ -19,51 +19,58 @@
# CDDL HEADER END
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
-
-PROG = in.mpathd
-OBJS = mpd_tables.o mpd_main.o mpd_probe.o
-SRCS = $(OBJS:%.o=%.c)
-DEFAULTFILES = mpathd.dfl
+PROG = in.mpathd
+ROOTFS_PROG = $(PROG)
+OBJS = mpd_tables.o mpd_main.o mpd_probe.o
+SRCS = $(OBJS:%.o=%.c)
+DEFAULTFILES = mpathd.dfl
include ../../../Makefile.cmd
-POFILE = $(PROG).po
-POFILES = $(SRCS:%.c=%.po)
+ROOTCMDDIR = $(ROOT)/lib/inet
+
+POFILE = $(PROG).po
+POFILES = $(SRCS:%.c=%.po)
-C99MODE= $(C99_ENABLE)
+C99MODE = $(C99_ENABLE)
#
# We need access to the ancillary data features which are only available
# via the SUS standards. Further, C99 support requires SUSv3 or higher.
#
CPPFLAGS += -D_XOPEN_SOURCE=600 -D__EXTENSIONS__
-LDLIBS += -lsocket -lnsl -lsysevent -lnvpair -lipmp -lc
+LDLIBS += -lsocket -lnsl -lsysevent -lnvpair -lipmp -linetutil -ldlpi
+LINTFLAGS += -erroff=E_INCONS_ARG_DECL2 -erroff=E_INCONS_ARG_USED2
-LINTFLAGS += -erroff=E_FUNC_DECL_VAR_ARG2 -erroff=E_INCONS_VAL_TYPE_DECL2 \
- -erroff=E_FUNC_USED_VAR_ARG2 -erroff=E_INCONS_ARG_DECL2 \
- -erroff=E_NAME_USED_NOT_DEF2 -erroff=E_INCONS_ARG_USED2 \
- -errtags=yes
+#
+# Instrument in.mpathd with CTF data to ease debugging.
+#
+CTFCONVERT_HOOK = && $(CTFCONVERT_O)
+CTFMERGE_HOOK = && $(CTFMERGE) -L VERSION -o $@ $(OBJS)
+$(OBJS) := CFLAGS += $(CTF_FLAGS)
.KEEP_STATE:
all: $(PROG)
$(PROG): $(OBJS)
- $(LINK.c) -o $@ $(OBJS) $(LDLIBS)
+ $(LINK.c) -o $@ $(OBJS) $(LDLIBS) $(CTFMERGE_HOOK)
$(POST_PROCESS)
include ../Makefile.lib
+$(ROOTLIBINETPROG):
+ $(RM) $@; $(SYMLINK) ../../../lib/inet/$(PROG) $@
+
$(ROOTSBINPROG):
- $(RM) $@; $(SYMLINK) ../usr/lib/inet/$(PROG) $@
+ $(RM) $@; $(SYMLINK) ../lib/inet/$(PROG) $@
-install: all $(ROOTLIBINETPROG) $(ROOTSBINPROG) $(ROOTETCDEFAULTFILES)
+install: all $(ROOTLIBINETPROG) $(ROOTSBINPROG) $(ROOTCMD) \
+ $(ROOTETCDEFAULTFILES)
clean:
$(RM) $(OBJS)
diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h
index 9b07e2a7a3..e7cb096bf7 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h
+++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _MPD_DEFS_H
#define _MPD_DEFS_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -78,12 +76,13 @@ extern "C" {
#include <locale.h>
#include <deflt.h>
+#include <libdlpi.h>
+#include <libinetutil.h>
#include <libnvpair.h>
#include <libsysevent.h>
#include <sys/sysevent.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/ipmp.h>
-#include <zone.h>
#include <ipmp_mpathd.h>
#include <ipmp_query_impl.h>
@@ -92,7 +91,7 @@ extern "C" {
/* Debug flags */
#define D_ALL 0xffff /* enable all debug */
#define D_PROBE 0x0001 /* probe mechanism */
-#define D_FAILOVER 0x0002 /* failover mechanism */
+#define D_FAILREP 0x0002 /* failure/repair mechanism */
#define D_PHYINT 0x0004 /* phyint table */
#define D_LOGINT 0x0008 /* logint table */
#define D_TARGET 0x0010 /* target table */
@@ -199,10 +198,8 @@ extern int user_failure_detection_time; /* User specified fdt */
extern int ifsock_v4; /* IPv4 socket for ioctls */
extern int ifsock_v6; /* IPv6 socket for ioctls */
-extern boolean_t full_scan_required; /* Do full scans */
-
extern int debug; /* debug option */
-
+extern boolean_t cleanup_started; /* true if we're shutting down */
extern boolean_t handle_link_notifications;
/*
@@ -212,6 +209,7 @@ extern void timer_schedule(uint_t delay);
extern void logmsg(int pri, const char *fmt, ...);
extern void logperror(const char *str);
extern int poll_add(int fd);
+extern int poll_remove(int fd);
extern uint64_t getcurrentsec(void);
extern uint_t getcurrenttime(void);
diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c
index aa6a99fb9c..e1e22e12d4 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c
+++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "mpd_defs.h"
#include "mpd_tables.h"
@@ -46,7 +44,6 @@ static int lsock_v6; /* Listen socket to detect mpathd */
static int mibfd = -1; /* fd to get mib info */
static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */
-boolean_t full_scan_required = _B_FALSE;
static uint_t last_initifs_time; /* Time when initifs was last run */
static char **argv0; /* Saved for re-exec on SIGHUP */
boolean_t handle_link_notifications = _B_TRUE;
@@ -58,10 +55,6 @@ static void check_if_removed(struct phyint_instance *pii);
static void select_test_ifs(void);
static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
-static void router_add_v4(mib2_ipRouteEntry_t *rp1,
- struct in_addr nexthop_v4);
-static void router_add_v6(mib2_ipv6RouteEntry_t *rp1,
- struct in6_addr nexthop_v6);
static void router_add_common(int af, char *ifname,
struct in6_addr nexthop);
static void init_router_targets();
@@ -74,17 +67,17 @@ static void check_addr_unique(struct phyint_instance *,
static void init_host_targets(void);
static void dup_host_targets(struct phyint_instance *desired_pii);
static void loopback_cmd(int sock, int family);
-static int poll_remove(int fd);
static boolean_t daemonize(void);
static int closefunc(void *, int);
static unsigned int process_cmd(int newfd, union mi_commands *mpi);
static unsigned int process_query(int fd, mi_query_t *miq);
+static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop);
static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
static unsigned int send_result(int fd, unsigned int error, int syserror);
-struct local_addr *laddr_list = NULL;
+addrlist_t *localaddrs;
/*
* Return the current time in milliseconds (from an arbitrary reference)
@@ -153,7 +146,7 @@ retry:
/*
* Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
*/
-static int
+int
poll_remove(int fd)
{
int i;
@@ -205,17 +198,11 @@ pii_process(int af, char *name, struct phyint_instance **pii_p)
break;
case PI_GROUP_CHANGED:
- /*
- * The phyint has changed group.
- */
- restore_phyint(pii->pii_phyint);
- /* FALLTHRU */
-
case PI_IFINDEX_CHANGED:
/*
- * Interface index has changed. Delete and
- * recreate the phyint as it is quite likely
- * the interface has been unplumbed and replumbed.
+ * Interface index or group membership has changed.
+ * Delete the old state and recreate based on the new
+ * state (it may no longer be in a group).
*/
pii_other = phyint_inst_other(pii);
if (pii_other != NULL)
@@ -249,51 +236,26 @@ pii_process(int af, char *name, struct phyint_instance **pii_p)
}
/*
- * This phyint is leaving the group. Try to restore the phyint to its
- * initial state. Return the addresses that belong to other group members,
- * to the group, and take back any addresses owned by this phyint
- */
-void
-restore_phyint(struct phyint *pi)
-{
- if (pi->pi_group == phyint_anongroup)
- return;
-
- /*
- * Move everthing to some other member in the group.
- * The phyint has changed group in the kernel. But we
- * have yet to do it in our tables.
- */
- if (!pi->pi_empty)
- (void) try_failover(pi, FAILOVER_TO_ANY);
- /*
- * Move all addresses owned by 'pi' back to pi, from each
- * of the other members of the group
- */
- (void) try_failback(pi);
-}
-
-/*
* Scan all interfaces to detect changes as well as new and deleted interfaces
*/
static void
initifs()
{
- int n;
+ int i, nlifr;
int af;
char *cp;
char *buf;
- int numifs;
+ int sockfd;
+ uint64_t flags;
struct lifnum lifn;
struct lifconf lifc;
+ struct lifreq lifreq;
struct lifreq *lifr;
struct logint *li;
struct phyint_instance *pii;
struct phyint_instance *next_pii;
- char pi_name[LIFNAMSIZ + 1];
- boolean_t exists;
- struct phyint *pi;
- struct local_addr *next;
+ struct phyint_group *pg, *next_pg;
+ char pi_name[LIFNAMSIZ + 1];
if (debug & D_PHYINT)
logdebug("initifs: Scanning interfaces\n");
@@ -301,13 +263,9 @@ initifs()
last_initifs_time = getcurrenttime();
/*
- * Free the laddr_list before collecting the local addresses.
+ * Free the existing local address list; we'll build a new list below.
*/
- while (laddr_list != NULL) {
- next = laddr_list->next;
- free(laddr_list);
- laddr_list = next;
- }
+ addrlist_free(&localaddrs);
/*
* Mark the interfaces so that we can find phyints and logints
@@ -326,122 +284,142 @@ initifs()
}
}
+ /*
+ * As above, mark groups so that we can detect IPMP interfaces which
+ * have been removed from the kernel. Also, delete the group address
+ * list since we'll iteratively recreate it below.
+ */
+ for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
+ pg->pg_in_use = _B_FALSE;
+ addrlist_free(&pg->pg_addrs);
+ }
+
lifn.lifn_family = AF_UNSPEC;
- lifn.lifn_flags = LIFC_ALLZONES;
+ lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
+again:
if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
- logperror("initifs: ioctl (get interface numbers)");
+ logperror("initifs: ioctl (get interface count)");
return;
}
- numifs = lifn.lifn_count;
+ /*
+ * Pad the interface count to detect when additional interfaces have
+ * been configured between SIOCGLIFNUM and SIOCGLIFCONF.
+ */
+ lifn.lifn_count += 4;
- buf = (char *)calloc(numifs, sizeof (struct lifreq));
- if (buf == NULL) {
+ if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) {
logperror("initifs: calloc");
return;
}
lifc.lifc_family = AF_UNSPEC;
- lifc.lifc_flags = LIFC_ALLZONES;
- lifc.lifc_len = numifs * sizeof (struct lifreq);
+ lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
+ lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
lifc.lifc_buf = buf;
if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
- /*
- * EINVAL is commonly encountered, when things change
- * underneath us rapidly, (eg. at boot, when new interfaces
- * are plumbed successively) and the kernel finds the buffer
- * size we passed as too small. We will retry again
- * when we see the next routing socket msg, or at worst after
- * IF_SCAN_INTERVAL ms.
- */
- if (errno != EINVAL) {
- logperror("initifs: ioctl"
- " (get interface configuration)");
- }
+ logperror("initifs: ioctl (get interface configuration)");
free(buf);
return;
}
- lifr = (struct lifreq *)lifc.lifc_req;
-
/*
- * For each lifreq returned by SIOGGLIFCONF, call pii_process()
- * and get the state of the corresponding phyint_instance. If it is
- * successful, then call logint_init_from_k() to get the state of the
- * logint.
+ * If every lifc_req slot is taken, then additional interfaces must
+ * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF.
+ * Recalculate to make sure we didn't miss any interfaces.
*/
- for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) {
- int sockfd;
- struct local_addr *taddr;
- struct sockaddr_in *sin;
- struct sockaddr_in6 *sin6;
- struct lifreq lifreq;
+ nlifr = lifc.lifc_len / sizeof (struct lifreq);
+ if (nlifr >= lifn.lifn_count) {
+ free(buf);
+ goto again;
+ }
+ /*
+ * Walk through the lifreqs returned by SIOCGLIFCONF, and refresh the
+ * global list of addresses, phyint groups, phyints, and logints.
+ */
+ for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) {
af = lifr->lifr_addr.ss_family;
-
- /*
- * Collect all local addresses.
- */
sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
- (void) memset(&lifreq, 0, sizeof (lifreq));
- (void) strlcpy(lifreq.lifr_name, lifr->lifr_name,
- sizeof (lifreq.lifr_name));
+ (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ);
if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) {
if (errno != ENXIO)
logperror("initifs: ioctl (SIOCGLIFFLAGS)");
continue;
}
+ flags = lifreq.lifr_flags;
+
+ /*
+ * If the address is IFF_UP, add it to the local address list.
+ * (We ignore addresses that aren't IFF_UP since another node
+ * might legitimately have that address IFF_UP.)
+ */
+ if (flags & IFF_UP) {
+ (void) addrlist_add(&localaddrs, lifr->lifr_name, flags,
+ &lifr->lifr_addr);
+ }
/*
- * Add the interface address to laddr_list.
- * Another node might have the same IP address which is up.
- * In that case, it is appropriate to use the address as a
- * target, even though it is also configured (but not up) on
- * the local system.
- * Hence,the interface address is not added to laddr_list
- * unless it is IFF_UP.
+ * If this address is on an IPMP meta-interface, update our
+ * phyint_group information (either by recording that the group
+ * still exists or creating a new group), and track what
+ * group the address is part of.
*/
- if (lifreq.lifr_flags & IFF_UP) {
- taddr = malloc(sizeof (struct local_addr));
- if (taddr == NULL) {
- logperror("initifs: malloc");
+ if (flags & IFF_IPMP) {
+ if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) {
+ if (errno != ENXIO)
+ logperror("initifs: ioctl "
+ "(SIOCGLIFGROUPNAME)");
continue;
}
- if (af == AF_INET) {
- sin = (struct sockaddr_in *)&lifr->lifr_addr;
- IN6_INADDR_TO_V4MAPPED(&sin->sin_addr,
- &taddr->addr);
- } else {
- sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr;
- taddr->addr = sin6->sin6_addr;
+
+ pg = phyint_group_lookup(lifreq.lifr_groupname);
+ if (pg == NULL) {
+ pg = phyint_group_create(lifreq.lifr_groupname);
+ if (pg == NULL) {
+ logerr("initifs: cannot create group "
+ "%s\n", lifreq.lifr_groupname);
+ continue;
+ }
+ phyint_group_insert(pg);
+ }
+ pg->pg_in_use = _B_TRUE;
+
+ /*
+ * Add this to the group's list of data addresses.
+ */
+ if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags,
+ &lifr->lifr_addr)) {
+ logerr("initifs: insufficient memory to track "
+ "data address information for %s\n",
+ lifr->lifr_name);
}
- taddr->next = laddr_list;
- laddr_list = taddr;
+ continue;
}
/*
- * Need to pass a phyint name to pii_process. Insert the
- * null where the ':' IF_SEPARATOR is found in the logical
- * name.
+ * This isn't an address on an IPMP meta-interface, so it's
+ * either on an underlying interface or not related to any
+ * group. Update our phyint and logint information (via
+ * pii_process() and logint_init_from_k()) -- but first,
+ * convert the logint name to a phyint name so we can call
+ * pii_process().
*/
(void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name));
if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
*cp = '\0';
- exists = pii_process(af, pi_name, &pii);
- if (exists) {
+ if (pii_process(af, pi_name, &pii)) {
/* The phyint is fine. So process the logint */
logint_init_from_k(pii, lifr->lifr_name);
check_addr_unique(pii, &lifr->lifr_addr);
}
-
}
-
free(buf);
/*
- * Scan for phyints and logints that have disappeared from the
+ * Scan for groups, phyints and logints that have disappeared from the
* kernel, and delete them.
*/
for (pii = phyint_instances; pii != NULL; pii = next_pii) {
@@ -449,70 +427,31 @@ initifs()
check_if_removed(pii);
}
+ for (pg = phyint_groups; pg != NULL; pg = next_pg) {
+ next_pg = pg->pg_next;
+ if (!pg->pg_in_use) {
+ phyint_group_delete(pg);
+ continue;
+ }
+ /*
+ * Refresh the group's state. This is necessary since the
+ * group's state is defined by the set of usable interfaces in
+ * the group, and an interface is considered unusable if all
+ * of its addresses are down. When an address goes down/up,
+ * the RTM_DELADDR/RTM_NEWADDR brings us through here.
+ */
+ phyint_group_refresh_state(pg);
+ }
+
/*
* Select a test address for sending probes on each phyint instance
*/
select_test_ifs();
/*
- * Handle link up/down notifications from the NICs.
+ * Handle link up/down notifications.
*/
process_link_state_changes();
-
- for (pi = phyints; pi != NULL; pi = pi->pi_next) {
- /*
- * If this is a case of group failure, we don't have much
- * to do until the group recovers again.
- */
- if (GROUP_FAILED(pi->pi_group))
- continue;
-
- /*
- * Try/Retry any pending failovers / failbacks, that did not
- * not complete, or that could not be initiated previously.
- * This implements the 3 invariants described in the big block
- * comment at the beginning of probe.c
- */
- if (pi->pi_flags & IFF_INACTIVE) {
- if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
- (void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
- } else {
- struct phyint_instance *pii;
-
- /*
- * Skip LINK UP interfaces which are not capable
- * of probing.
- */
- pii = pi->pi_v4;
- if (pii == NULL ||
- (LINK_UP(pi) && !PROBE_CAPABLE(pii))) {
- pii = pi->pi_v6;
- if (pii == NULL ||
- (LINK_UP(pi) && !PROBE_CAPABLE(pii)))
- continue;
- }
-
- /*
- * It is possible that the phyint has started
- * receiving packets, after it has been marked
- * PI_FAILED. Don't initiate failover, if the
- * phyint has started recovering. failure_state()
- * captures this check. A similar logic is used
- * for failback/repair case.
- */
- if (pi->pi_state == PI_FAILED && !pi->pi_empty &&
- (failure_state(pii) == PHYINT_FAILURE)) {
- (void) try_failover(pi, FAILOVER_NORMAL);
- } else if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
- if (try_failback(pi) != IPMP_FAILURE) {
- (void) change_lif_flags(pi, IFF_FAILED,
- _B_FALSE);
- /* Per state diagram */
- pi->pi_empty = 0;
- }
- }
- }
- }
}
/*
@@ -569,7 +508,7 @@ check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss)
* The probe socket is closed on each interface instance, and the
* interface state set to PI_OFFLINE.
*/
-static void
+void
stop_probing(struct phyint *pi)
{
struct phyint_instance *pii;
@@ -631,7 +570,6 @@ select_test_ifs(void)
struct logint *li;
struct logint *probe_logint;
boolean_t target_scan_reqd = _B_FALSE;
- struct target *tg;
int rating;
if (debug & D_PHYINT)
@@ -645,8 +583,8 @@ select_test_ifs(void)
probe_logint = NULL;
/*
- * An interface that is offline, should not be probed.
- * Offline interfaces should always in PI_OFFLINE state,
+ * An interface that is offline should not be probed.
+ * IFF_OFFLINE interfaces should always be PI_OFFLINE
* unless some other entity has set the offline flag.
*/
if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
@@ -659,6 +597,15 @@ select_test_ifs(void)
stop_probing(pii->pii_phyint);
}
continue;
+ } else {
+ /*
+ * If something cleared IFF_OFFLINE (e.g., by accident
+ * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is
+ * inherently racy), the phyint may still be offline.
+ * Just ignore it.
+ */
+ if (pii->pii_phyint->pi_state == PI_OFFLINE)
+ continue;
}
li = pii->pii_probe_logint;
@@ -776,17 +723,6 @@ select_test_ifs(void)
phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
}
- if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
- tg = pii->pii_targets;
- if (tg != NULL)
- target_delete(tg);
- assert(pii->pii_targets == NULL);
- assert(pii->pii_target_next == NULL);
- assert(pii->pii_ntargets == 0);
- target_create(pii, probe_logint->li_dstaddr,
- _B_TRUE);
- }
-
/*
* If no targets are currently known for this phyint
* we need to call init_router_targets. Since
@@ -806,15 +742,16 @@ select_test_ifs(void)
}
/*
- * Check the interface list for any interfaces that are marked
- * PI_FAILED but no longer enabled to send probes, and call
- * phyint_check_for_repair() to see if the link now indicates that the
- * interface should be repaired. Also see the state diagram in
+ * Scan the interface list for any interfaces that are PI_FAILED or
+ * PI_NOTARGETS but no longer enabled to send probes, and call
+ * phyint_check_for_repair() to see if the link state indicates that
+ * the interface should be repaired. Also see the state diagram in
* mpd_probe.c.
*/
for (pi = phyints; pi != NULL; pi = pi->pi_next) {
- if (pi->pi_state == PI_FAILED &&
- !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
+ if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) &&
+ (pi->pi_state == PI_FAILED ||
+ pi->pi_state == PI_NOTARGETS)) {
phyint_check_for_repair(pi);
}
}
@@ -875,15 +812,14 @@ check_testconfig(void)
pi->pi_v6->pii_probe_logint->li_dupaddr)
li = pi->pi_v6->pii_probe_logint;
- if (li != NULL) {
- if (!pi->pi_duptaddrmsg_printed) {
- (void) pr_addr(li->li_phyint_inst->pii_af,
- li->li_addr, abuf, sizeof (abuf));
- logerr("Test address %s is not unique in "
- "group; disabling probe-based failure "
- "detection on %s\n", abuf, pi->pi_name);
- pi->pi_duptaddrmsg_printed = 1;
- }
+ if (li != NULL && li->li_dupaddr) {
+ if (pi->pi_duptaddrmsg_printed)
+ continue;
+ logerr("Test address %s is not unique in group; "
+ "disabling probe-based failure detection on %s\n",
+ pr_addr(li->li_phyint_inst->pii_af,
+ li->li_addr, abuf, sizeof (abuf)), pi->pi_name);
+ pi->pi_duptaddrmsg_printed = 1;
continue;
}
@@ -915,10 +851,10 @@ check_config(void)
boolean_t v6_in_group;
/*
- * All phyints of a group must be homogenous to ensure that
- * failover or failback can be done. If any phyint in a group
- * has IPv4 plumbed, check that all phyints have IPv4 plumbed.
- * Do a similar check for IPv6.
+ * All phyints of a group must be homogeneous to ensure that they can
+ * take over for one another. If any phyint in a group has IPv4
+ * plumbed, check that all phyints have IPv4 plumbed. Do a similar
+ * check for IPv6.
*/
for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
if (pg == phyint_anongroup)
@@ -949,9 +885,9 @@ check_config(void)
if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
if (!pi->pi_cfgmsg_printed) {
- logerr("NIC %s of group %s is"
- " not plumbed for IPv4 and may"
- " affect failover capability\n",
+ logerr("IP interface %s in group %s is"
+ " not plumbed for IPv4, affecting"
+ " IPv4 connectivity\n",
pi->pi_name,
pi->pi_group->pg_name);
pi->pi_cfgmsg_printed = 1;
@@ -959,9 +895,9 @@ check_config(void)
} else if (v6_in_group == _B_TRUE &&
pi->pi_v6 == NULL) {
if (!pi->pi_cfgmsg_printed) {
- logerr("NIC %s of group %s is"
- " not plumbed for IPv6 and may"
- " affect failover capability\n",
+ logerr("IP interface %s in group %s is"
+ " not plumbed for IPv6, affecting"
+ " IPv6 connectivity\n",
pi->pi_name,
pi->pi_group->pg_name);
pi->pi_cfgmsg_printed = 1;
@@ -974,10 +910,10 @@ check_config(void)
* error recovery message
*/
if (pi->pi_cfgmsg_printed) {
- logerr("NIC %s is now consistent with "
- "group %s and failover capability "
- "is restored\n", pi->pi_name,
- pi->pi_group->pg_name);
+				/*
+				 * The literals below are concatenated by the
+				 * compiler; only one of each adjacent pair
+				 * carries the separating space so the logged
+				 * message has no doubled blanks.
+				 */
+				logerr("IP interface %s is now"
+				    " consistent with group %s"
+				    " and connectivity is restored\n",
+				    pi->pi_name, pi->pi_group->pg_name);
pi->pi_cfgmsg_printed = 0;
}
}
@@ -1117,8 +1053,8 @@ run_timeouts(void)
static int eventpipe_read = -1; /* Used for synchronous signal delivery */
static int eventpipe_write = -1;
-static boolean_t cleanup_started = _B_FALSE;
- /* Don't write to eventpipe if in cleanup */
+boolean_t cleanup_started = _B_FALSE; /* true if we're going away */
+
/*
* Ensure that signals are processed synchronously with the rest of
* the code by just writing a one character signal number on the pipe.
@@ -1228,7 +1164,7 @@ in_signal(int fd)
"Number of probes sent %lld\n"
"Number of probe acks received %lld\n"
"Number of probes/acks lost %lld\n"
- "Number of valid unacknowled probes %lld\n"
+ "Number of valid unacknowledged probes %lld\n"
"Number of ambiguous probe acks received %lld\n",
AF_STR(pii->pii_af), pii->pii_name,
sent, acked, lost, unacked, unknown);
@@ -1321,12 +1257,20 @@ setup_rtsock(int af)
{
int s;
int flags;
+ int aware = RTAW_UNDER_IPMP;
s = socket(PF_ROUTE, SOCK_RAW, af);
if (s == -1) {
logperror("setup_rtsock: socket PF_ROUTE");
exit(1);
}
+
+ if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) {
+ logperror("setup_rtsock: setsockopt RT_AWARE");
+ (void) close(s);
+ exit(1);
+ }
+
if ((flags = fcntl(s, F_GETFL, 0)) < 0) {
logperror("setup_rtsock: fcntl F_GETFL");
(void) close(s);
@@ -1347,8 +1291,7 @@ setup_rtsock(int af)
/*
* Process an RTM_IFINFO message received on a routing socket.
* The return value indicates whether a full interface scan is required.
- * Link up/down notifications from the NICs are reflected in the
- * IFF_RUNNING flag.
+ * Link up/down notifications are reflected in the IFF_RUNNING flag.
* If just the state of the IFF_RUNNING interface flag has changed, a
* full interface scan isn't required.
*/
@@ -1400,7 +1343,7 @@ process_rtm_ifinfo(if_msghdr_t *ifm, int type)
/*
* We want to try and avoid doing a full interface scan for
- * link state notifications from the NICs, as indicated
+ * link state notifications from the datalink layer, as indicated
* by the state of the IFF_RUNNING flag. If just the
* IFF_RUNNING flag has changed state, the link state changes
* are processed without a full scan.
@@ -1441,25 +1384,7 @@ process_rtm_ifinfo(if_msghdr_t *ifm, int type)
* types.
*/
if ((old_flags ^ pii->pii_flags) & IFF_STANDBY)
- phyint_newtype(pi);
-
- /*
- * If IFF_INACTIVE has been set, then no data addresses should be
- * hosted on the interface. If IFF_INACTIVE has been cleared, then
- * move previously failed-over addresses back to it, provided it is
- * not failed. For details, see the state diagram in mpd_probe.c.
- */
- if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) {
- if (pii->pii_flags & IFF_INACTIVE) {
- if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
- (void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
- } else {
- if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
- pi->pi_empty = 0;
- (void) try_failback(pi);
- }
- }
- }
+ phyint_changed(pi);
/* Has just the IFF_RUNNING flag changed state ? */
if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
@@ -1620,22 +1545,24 @@ update_router_list(int fd)
t_scalar_t prim;
tor = (struct T_optmgmt_req *)&buf;
-
tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
tor->OPT_offset = sizeof (struct T_optmgmt_req);
tor->OPT_length = sizeof (struct opthdr);
tor->MGMT_flags = T_CURRENT;
+ /*
+ * Note: we use the special level value below so that IP will return
+ * us information concerning IRE_MARK_TESTHIDDEN routes.
+ */
req = (struct opthdr *)&tor[1];
- req->level = MIB2_IP; /* any MIB2_xxx value ok here */
+ req->level = EXPER_IP_AND_TESTHIDDEN;
req->name = 0;
req->len = 0;
ctlbuf.buf = (char *)&buf;
ctlbuf.len = tor->OPT_length + tor->OPT_offset;
ctlbuf.maxlen = sizeof (buf);
- flags = 0;
- if (putmsg(fd, &ctlbuf, NULL, flags) == -1) {
+ if (putmsg(fd, &ctlbuf, NULL, 0) == -1) {
logperror("update_router_list: putmsg(ctl)");
return (_B_FALSE);
}
@@ -1689,7 +1616,8 @@ update_router_list(int fd)
case T_OPTMGMT_ACK:
toa = &buf.uprim.optmgmt_ack;
optp = (struct opthdr *)&toa[1];
- if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) {
+ if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) +
+ sizeof (struct opthdr))) {
logerr("update_router_list: ctlbuf.len %d\n",
ctlbuf.len);
return (_B_FALSE);
@@ -1707,7 +1635,7 @@ update_router_list(int fd)
return (_B_FALSE);
}
- /* Process the T_OPGMGMT_ACK below */
+ /* Process the T_OPTMGMT_ACK below */
assert(prim == T_OPTMGMT_ACK);
switch (status) {
@@ -1717,9 +1645,8 @@ update_router_list(int fd)
* message. If this is the last message (i.e., EOD),
* return; otherwise process the next T_OPTMGMT_ACK message.
*/
- if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) +
- sizeof (struct opthdr)) && optp->len == 0 &&
- optp->name == 0 && optp->level == 0) {
+ if (optp->len == 0 && optp->name == 0 &&
+ optp->level == 0) {
/*
* This is the EOD message. Return
*/
@@ -1747,17 +1674,14 @@ update_router_list(int fd)
databuf.len = 0;
flags = 0;
for (;;) {
- status = getmsg(fd, NULL, &databuf, &flags);
- if (status >= 0) {
+ if (getmsg(fd, NULL, &databuf, &flags) >= 0)
break;
- } else if (errno == EINTR) {
+ if (errno == EINTR)
continue;
- } else {
- logperror("update_router_list:"
- " getmsg(data)");
- free(databuf.buf);
- return (_B_FALSE);
- }
+
+ logperror("update_router_list: getmsg(data)");
+ free(databuf.buf);
+ return (_B_FALSE);
}
if (optp->level == MIB2_IP &&
@@ -1777,18 +1701,35 @@ update_router_list(int fd)
/* NOTREACHED */
}
+
+/*
+ * Convert the interface name held in octet string `octp' to a phyint name
+ * and store it in `ifname', a buffer of `ifsize' bytes.  At most `ifsize - 1'
+ * bytes are copied and the result is always NUL-terminated.  Anything from
+ * the first IF_SEPARATOR onward (the logical interface suffix) is dropped.
+ * If `ifsize' is zero there is no room for even the terminator, so `ifname'
+ * is left untouched.
+ */
+static void
+oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize)
+{
+	char	*cp;
+	size_t	len;
+
+	/*
+	 * Guard against a zero-sized buffer: `ifsize - 1' would otherwise
+	 * wrap around to a huge value and the copy below would be unbounded.
+	 */
+	if (ifsize == 0)
+		return;
+
+	len = MIN(octp->o_length, ifsize - 1);
+
+	/* The octet string need not be NUL-terminated; terminate explicitly. */
+	(void) strncpy(ifname, octp->o_bytes, len);
+	ifname[len] = '\0';
+
+	/* Truncate a logical interface name down to its phyint name. */
+	if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL)
+		*cp = '\0';
+}
+
/*
- * Examine the IPv4 routing table, for default routers. For each default
- * router, populate the list of targets of each phyint that is on the same
- * link as the default router
+ * Examine the IPv4 routing table `buf' for possible targets. For each
+ * possible target, if it's on the same subnet as an interface route, pass
+ * it to router_add_common() for further consideration.
*/
static void
ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
{
- mib2_ipRouteEntry_t *rp;
- mib2_ipRouteEntry_t *rp1;
- struct in_addr nexthop_v4;
- mib2_ipRouteEntry_t *endp;
+ char ifname[LIFNAMSIZ];
+ mib2_ipRouteEntry_t *rp, *rp1, *endp;
+ struct in_addr nexthop_v4;
+ struct in6_addr nexthop;
if (len == 0)
return;
@@ -1797,75 +1738,40 @@ ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
endp = buf + (len / sizeof (mib2_ipRouteEntry_t));
/*
- * Loop thru the routing table entries. Process any IRE_DEFAULT,
- * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
- * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
- * This is a potential target for probing, which we try to add
- * to the list of probe targets.
+ * Scan the routing table entries for any IRE_OFFSUBNET entries, and
+ * cross-reference them with the interface routes to determine if
+ * they're possible probe targets.
*/
for (rp = buf; rp < endp; rp++) {
if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
continue;
- /* Get the nexthop address. */
+ /* Get the nexthop address. */
nexthop_v4.s_addr = rp->ipRouteNextHop;
/*
- * Get the nexthop address. Then determine the outgoing
- * interface, by examining all interface IREs, and picking the
- * match. We don't look at the interface specified in the route
- * because we need to add the router target on all matching
- * interfaces anyway; the goal is to avoid falling back to
- * multicast when some interfaces are in the same subnet but
- * not in the same group.
+ * Rescan the routing table looking for interface routes that
+ * are on the same subnet, and try to add them. If they're
+ * not relevant (e.g., the interface route isn't part of an
+ * IPMP group), router_add_common() will discard them.
*/
for (rp1 = buf; rp1 < endp; rp1++) {
- if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) {
+ if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) ||
+ rp1->ipRouteIfIndex.o_length == 0)
continue;
- }
- /*
- * Determine the interface IRE that matches the nexthop.
- * i.e. (IRE addr & IRE mask) == (nexthop & IRE mask)
- */
- if ((rp1->ipRouteDest & rp1->ipRouteMask) ==
- (nexthop_v4.s_addr & rp1->ipRouteMask)) {
- /*
- * We found the interface ire
- */
- router_add_v4(rp1, nexthop_v4);
- }
+ if ((rp1->ipRouteDest & rp1->ipRouteMask) !=
+ (nexthop_v4.s_addr & rp1->ipRouteMask))
+ continue;
+
+ oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ);
+ IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
+ router_add_common(AF_INET, ifname, nexthop);
}
}
}
void
-router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4)
-{
- char *cp;
- char ifname[LIFNAMSIZ + 1];
- struct in6_addr nexthop;
- int len;
-
- if (debug & D_TARGET)
- logdebug("router_add_v4()\n");
-
- len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1);
- (void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len);
- ifname[len] = '\0';
-
- if (ifname[0] == '\0')
- return;
-
- cp = strchr(ifname, IF_SEPARATOR);
- if (cp != NULL)
- *cp = '\0';
-
- IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
- router_add_common(AF_INET, ifname, nexthop);
-}
-
-void
router_add_common(int af, char *ifname, struct in6_addr nexthop)
{
struct phyint_instance *pii;
@@ -1906,16 +1812,17 @@ router_add_common(int af, char *ifname, struct in6_addr nexthop)
}
/*
- * Examine the IPv6 routing table, for default routers. For each default
- * router, populate the list of targets of each phyint that is on the same
- * link as the default router
+ * Examine the IPv6 routing table `buf' for possible link-local targets, and
+ * pass any contenders to router_add_common() for further consideration.
*/
static void
ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
{
- mib2_ipv6RouteEntry_t *rp;
- mib2_ipv6RouteEntry_t *endp;
- struct in6_addr nexthop_v6;
+ struct lifreq lifr;
+ char ifname[LIFNAMSIZ];
+ char grname[LIFGRNAMSIZ];
+ mib2_ipv6RouteEntry_t *rp, *rp1, *endp;
+ struct in6_addr nexthop_v6;
if (debug & D_TARGET)
logdebug("ire_process_v6(len %d)\n", len);
@@ -1927,62 +1834,51 @@ ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t));
/*
- * Loop thru the routing table entries. Process any IRE_DEFAULT,
- * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
- * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
- * This is a potential target for probing, which we try to add
- * to the list of probe targets.
+ * Scan the routing table entries for any IRE_OFFSUBNET entries, and
+ * cross-reference them with the interface routes to determine if
+ * they're possible probe targets.
*/
for (rp = buf; rp < endp; rp++) {
- if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET))
+ if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) ||
+ !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop))
continue;
- /*
- * We have the outgoing interface in ipv6RouteIfIndex
- * if ipv6RouteIfindex.o_length is non-zero. The outgoing
- * interface must be present for link-local addresses. Since
- * we use only link-local addreses for probing, we don't
- * consider the case when the outgoing interface is not
- * known and we need to scan interface ires
- */
+ /* Get the nexthop address. */
nexthop_v6 = rp->ipv6RouteNextHop;
- if (rp->ipv6RouteIfIndex.o_length != 0) {
- /*
- * We already have the outgoing interface
- * in ipv6RouteIfIndex.
- */
- router_add_v6(rp, nexthop_v6);
- }
- }
-}
-
-void
-router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6)
-{
- char ifname[LIFNAMSIZ + 1];
- char *cp;
- int len;
-
- if (debug & D_TARGET)
- logdebug("router_add_v6()\n");
-
- len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1);
- (void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len);
- ifname[len] = '\0';
+ /*
+ * The interface name should always exist for link-locals;
+ * we use it to map this entry to an IPMP group name.
+ */
+ if (rp->ipv6RouteIfIndex.o_length == 0)
+ continue;
- if (ifname[0] == '\0')
- return;
+ oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ);
+ if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 ||
+ strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) {
+ continue;
+ }
- cp = strchr(ifname, IF_SEPARATOR);
- if (cp != NULL)
- *cp = '\0';
+ /*
+ * Rescan the list of routes for interface routes, and add the
+ * above target to any interfaces in the same IPMP group.
+ */
+ for (rp1 = buf; rp1 < endp; rp1++) {
+ if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) ||
+ rp1->ipv6RouteIfIndex.o_length == 0) {
+ continue;
+ }
+ oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ);
+ (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ);
- router_add_common(AF_INET6, ifname, nexthop_v6);
+ if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 &&
+ strcmp(lifr.lifr_groupname, grname) == 0) {
+ router_add_common(AF_INET6, ifname, nexthop_v6);
+ }
+ }
+ }
}
-
-
/*
* Build a list of target routers, by scanning the routing tables.
* It is assumed that interface routes exist, to reach the routers.
@@ -2001,11 +1897,9 @@ init_router_targets(void)
for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
pi = pii->pii_phyint;
/*
- * Exclude ptp and host targets. Set tg_in_use to false,
- * only for router targets.
+ * Set tg_in_use to false only for router targets.
*/
- if (!pii->pii_targets_are_routers ||
- (pi->pi_flags & IFF_POINTOPOINT))
+ if (!pii->pii_targets_are_routers)
continue;
for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
@@ -2026,15 +1920,21 @@ init_router_targets(void)
}
for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
- if (!pii->pii_targets_are_routers ||
- (pi->pi_flags & IFF_POINTOPOINT))
+ pi = pii->pii_phyint;
+ if (!pii->pii_targets_are_routers)
continue;
for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
next_tg = tg->tg_next;
- if (!tg->tg_in_use) {
+ /*
+ * If the group has failed, it's likely the route was
+ * removed by an application affected by that failure.
+ * In that case, we keep the target so that we can
+ * reliably repair, at which point we'll refresh the
+ * target list again.
+ */
+ if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group))
target_delete(tg);
- }
}
}
}
@@ -2140,7 +2040,7 @@ getdefault(char *name)
* Command line options below
*/
boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */
-boolean_t track_all_phyints = _B_FALSE; /* option to track all NICs */
+boolean_t track_all_phyints = _B_FALSE; /* track all IP interfaces */
static boolean_t adopt = _B_FALSE;
static boolean_t foreground = _B_FALSE;
@@ -2149,6 +2049,7 @@ main(int argc, char *argv[])
{
int i;
int c;
+ struct phyint *pi;
struct phyint_instance *pii;
char *value;
@@ -2173,14 +2074,15 @@ main(int argc, char *argv[])
if (user_failure_detection_time <= 0) {
user_failure_detection_time = FAILURE_DETECTION_TIME;
logerr("Invalid failure detection time %s, assuming "
- "default %d\n", value, user_failure_detection_time);
+ "default of %d ms\n", value,
+ user_failure_detection_time);
} else if (user_failure_detection_time <
MIN_FAILURE_DETECTION_TIME) {
user_failure_detection_time =
MIN_FAILURE_DETECTION_TIME;
logerr("Too small failure detection time of %s, "
- "assuming minimum %d\n", value,
+ "assuming minimum of %d ms\n", value,
user_failure_detection_time);
}
free(value);
@@ -2211,9 +2113,9 @@ main(int argc, char *argv[])
*/
value = getdefault("FAILBACK");
if (value != NULL) {
- if (strncasecmp(value, "yes", 3) == 0)
+ if (strcasecmp(value, "yes") == 0)
failback_enabled = _B_TRUE;
- else if (strncasecmp(value, "no", 2) == 0)
+ else if (strcasecmp(value, "no") == 0)
failback_enabled = _B_FALSE;
else
logerr("Invalid value for FAILBACK %s\n", value);
@@ -2229,9 +2131,9 @@ main(int argc, char *argv[])
*/
value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
if (value != NULL) {
- if (strncasecmp(value, "yes", 3) == 0)
+ if (strcasecmp(value, "yes") == 0)
track_all_phyints = _B_FALSE;
- else if (strncasecmp(value, "no", 2) == 0)
+ else if (strcasecmp(value, "no") == 0)
track_all_phyints = _B_TRUE;
else
logerr("Invalid value for "
@@ -2340,12 +2242,6 @@ main(int argc, char *argv[])
initifs();
- /* Inform kernel whether failback is enabled or disabled */
- if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) {
- logperror("main: ioctl (SIOCSIPMPFAILBACK)");
- exit(1);
- }
-
/*
* If we're operating in "adopt" mode and no interfaces need to be
* tracked, shut down (ifconfig(1M) will restart us on demand if
@@ -2379,6 +2275,7 @@ main(int argc, char *argv[])
process_rtsock(rtsock_v4, rtsock_v6);
break;
}
+
for (pii = phyint_instances; pii != NULL;
pii = pii->pii_next) {
if (pollfds[i].fd == pii->pii_probe_sock) {
@@ -2389,15 +2286,21 @@ main(int argc, char *argv[])
break;
}
}
+
+ for (pi = phyints; pi != NULL; pi = pi->pi_next) {
+ if (pi->pi_notes != 0 &&
+ pollfds[i].fd == dlpi_fd(pi->pi_dh)) {
+ (void) dlpi_recv(pi->pi_dh, NULL, NULL,
+ NULL, NULL, 0, NULL);
+ break;
+ }
+ }
+
if (pollfds[i].fd == lsock_v4)
loopback_cmd(lsock_v4, AF_INET);
else if (pollfds[i].fd == lsock_v6)
loopback_cmd(lsock_v6, AF_INET6);
}
- if (full_scan_required) {
- initifs();
- full_scan_required = _B_FALSE;
- }
}
/* NOTREACHED */
return (EXIT_SUCCESS);
@@ -2481,29 +2384,23 @@ static struct {
{ "MI_PING", sizeof (uint32_t) },
{ "MI_OFFLINE", sizeof (mi_offline_t) },
{ "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) },
- { "MI_SETOINDEX", sizeof (mi_setoindex_t) },
{ "MI_QUERY", sizeof (mi_query_t) }
};
/*
- * Commands received over the loopback interface come here. Currently
- * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP
- * module. ifconfig only makes a connection, and closes it to check if
- * in.mpathd is running.
- * if_mpadm sends commands in the format specified by the mpathd_interface
- * structure.
+ * Commands received over the loopback interface come here (via libipmp).
*/
static void
loopback_cmd(int sock, int family)
{
int newfd;
ssize_t len;
+ boolean_t is_priv = _B_FALSE;
struct sockaddr_storage peer;
struct sockaddr_in *peer_sin;
struct sockaddr_in6 *peer_sin6;
socklen_t peerlen;
union mi_commands mpi;
- struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
char abuf[INET6_ADDRSTRLEN];
uint_t cmd;
int retval;
@@ -2528,10 +2425,11 @@ loopback_cmd(int sock, int family)
return;
}
peer_sin = (struct sockaddr_in *)&peer;
- if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) ||
- (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) {
- (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
- abuf, sizeof (abuf));
+ is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED;
+ (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
+ abuf, sizeof (abuf));
+
+ if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) {
logerr("Attempt to connect from addr %s port %d\n",
abuf, ntohs(peer_sin->sin_port));
(void) close(newfd);
@@ -2551,11 +2449,10 @@ loopback_cmd(int sock, int family)
* talking to us.
*/
peer_sin6 = (struct sockaddr_in6 *)&peer;
- if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) ||
- (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr,
- &loopback_addr))) {
- (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
- sizeof (abuf));
+ is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED;
+ (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
+ sizeof (abuf));
+ if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) {
logerr("Attempt to connect from addr %s port %d\n",
abuf, ntohs(peer_sin6->sin6_port));
(void) close(newfd);
@@ -2575,15 +2472,6 @@ loopback_cmd(int sock, int family)
len = read(newfd, &mpi, sizeof (mpi));
/*
- * ifconfig does not send any data. Just tests to see if mpathd
- * is already running.
- */
- if (len <= 0) {
- (void) close(newfd);
- return;
- }
-
- /*
* In theory, we can receive any sized message for a stream socket,
* but we don't expect that to happen for a small message over a
* loopback connection.
@@ -2591,6 +2479,8 @@ loopback_cmd(int sock, int family)
-	if (len < sizeof (uint32_t)) {
+	/*
+	 * Compare as signed: `len' is the signed return value of read(), so
+	 * against an unsigned sizeof a -1 error return would be converted to
+	 * a huge value, slip past this check, and let us go on to interpret
+	 * a command buffer that was never filled in.
+	 */
+	if (len < (ssize_t)sizeof (uint32_t)) {
 		logerr("loopback_cmd: bad command format or read returns "
 		    "partial data %d\n", len);
+		(void) close(newfd);
+		return;
 	}
cmd = mpi.mi_command;
@@ -2600,6 +2490,16 @@ loopback_cmd(int sock, int family)
return;
}
+ /*
+ * Only MI_PING and MI_QUERY can come from unprivileged sources.
+ */
+ if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) {
+ logerr("Unprivileged request from %s for privileged "
+ "command %s\n", abuf, commands[cmd].name);
+ (void) close(newfd);
+ return;
+ }
+
if (len < commands[cmd].size) {
logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
commands[cmd].name, commands[cmd].size, len);
@@ -2615,179 +2515,46 @@ loopback_cmd(int sock, int family)
(void) close(newfd);
}
-extern int global_errno; /* set by failover() or failback() */
-
/*
- * Process the offline, undo offline and set original index commands,
- * received from if_mpadm(1M)
+ * Process the commands received via libipmp.
*/
static unsigned int
process_cmd(int newfd, union mi_commands *mpi)
{
- uint_t nif = 0;
- uint32_t cmd;
struct phyint *pi;
- struct phyint *pi2;
- struct phyint_group *pg;
- boolean_t success;
- int error;
struct mi_offline *mio;
struct mi_undo_offline *miu;
- struct lifreq lifr;
- int ifsock;
- struct mi_setoindex *mis;
+ unsigned int retval;
- cmd = mpi->mi_command;
+ switch (mpi->mi_command) {
+ case MI_PING:
+ return (send_result(newfd, IPMP_SUCCESS, 0));
- switch (cmd) {
case MI_OFFLINE:
mio = &mpi->mi_ocmd;
- /*
- * Lookup the interface that needs to be offlined.
- * If it does not exist, return a suitable error.
- */
+
pi = phyint_lookup(mio->mio_ifname);
if (pi == NULL)
- return (send_result(newfd, IPMP_FAILURE, EINVAL));
-
- /*
- * Verify that the minimum redundancy requirements are met.
- * The multipathing group must have at least the specified
- * number of functional interfaces after offlining the
- * requested interface. Otherwise return a suitable error.
- */
- pg = pi->pi_group;
- nif = 0;
- if (pg != phyint_anongroup) {
- for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL;
- pi2 = pi2->pi_pgnext) {
- if ((pi2->pi_state == PI_RUNNING) ||
- (pg->pg_groupfailed &&
- !(pi2->pi_flags & IFF_OFFLINE)))
- nif++;
- }
- }
- if (nif < mio->mio_min_redundancy)
- return (send_result(newfd, IPMP_EMINRED, 0));
+ return (send_result(newfd, IPMP_EUNKIF, 0));
- /*
- * The order of operation is to set IFF_OFFLINE, followed by
- * failover. Setting IFF_OFFLINE ensures that no new ipif's
- * can be created. Subsequent failover moves everything on
- * the OFFLINE interface to some other functional interface.
- */
- success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE);
- if (success) {
- if (!pi->pi_empty) {
- error = try_failover(pi, FAILOVER_NORMAL);
- if (error != 0) {
- if (!change_lif_flags(pi, IFF_OFFLINE,
- _B_FALSE)) {
- logerr("process_cmd: couldn't"
- " clear OFFLINE flag on"
- " %s\n", pi->pi_name);
- /*
- * Offline interfaces should
- * not be probed.
- */
- stop_probing(pi);
- }
- return (send_result(newfd, error,
- global_errno));
- }
- }
- } else {
+ retval = phyint_offline(pi, mio->mio_min_redundancy);
+ if (retval == IPMP_FAILURE)
return (send_result(newfd, IPMP_FAILURE, errno));
- }
- /*
- * The interface is now Offline, so stop probing it.
- * Note that if_mpadm(1M) will down the test addresses,
- * after receiving a success reply from us. The routing
- * socket message will then make us close the socket used
- * for sending probes. But it is more logical that an
- * offlined interface must not be probed, even if it has
- * test addresses.
- */
- stop_probing(pi);
- return (send_result(newfd, IPMP_SUCCESS, 0));
+ return (send_result(newfd, retval, 0));
case MI_UNDO_OFFLINE:
miu = &mpi->mi_ucmd;
- /*
- * Undo the offline command. As usual lookup the interface.
- * Send an error if it does not exist or is not offline.
- */
- pi = phyint_lookup(miu->miu_ifname);
- if (pi == NULL || pi->pi_state != PI_OFFLINE)
- return (send_result(newfd, IPMP_FAILURE, EINVAL));
-
- /*
- * Reset the state of the interface based on the current link
- * state; if this phyint subsequently acquires a test address,
- * the state will be updated later as a result of the probes.
- */
- if (LINK_UP(pi))
- phyint_chstate(pi, PI_RUNNING);
- else
- phyint_chstate(pi, PI_FAILED);
-
- if (pi->pi_state == PI_RUNNING) {
- /*
- * Note that the success of MI_UNDO_OFFLINE is not
- * contingent on actually failing back; in the odd
- * case where we cannot do it here, we will try again
- * in initifs() since pi->pi_full will still be zero.
- */
- if (do_failback(pi) != IPMP_SUCCESS) {
- logdebug("process_cmd: cannot failback from "
- "%s during MI_UNDO_OFFLINE\n", pi->pi_name);
- }
- }
-
- /*
- * Clear the IFF_OFFLINE flag. We have to do this last
- * because do_failback() relies on it being set to decide
- * when to display messages.
- */
- (void) change_lif_flags(pi, IFF_OFFLINE, _B_FALSE);
-
- /*
- * Give the requestor time to configure test addresses
- * before complaining that they're missing.
- */
- pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
-
- return (send_result(newfd, IPMP_SUCCESS, 0));
-
- case MI_SETOINDEX:
- mis = &mpi->mi_scmd;
- /* Get the socket for doing ioctls */
- ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6;
-
- /*
- * Get index of new original interface.
- * The index is returned in lifr.lifr_index.
- */
- (void) strlcpy(lifr.lifr_name, mis->mis_new_pifname,
- sizeof (lifr.lifr_name));
+ pi = phyint_lookup(miu->miu_ifname);
+ if (pi == NULL)
+ return (send_result(newfd, IPMP_EUNKIF, 0));
- if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0)
+ retval = phyint_undo_offline(pi);
+ if (retval == IPMP_FAILURE)
return (send_result(newfd, IPMP_FAILURE, errno));
- /*
- * Set new original interface index.
- * The new index was put into lifr.lifr_index by the
- * SIOCGLIFINDEX ioctl.
- */
- (void) strlcpy(lifr.lifr_name, mis->mis_lifname,
- sizeof (lifr.lifr_name));
-
- if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0)
- return (send_result(newfd, IPMP_FAILURE, errno));
-
- return (send_result(newfd, IPMP_SUCCESS, 0));
+ return (send_result(newfd, retval, 0));
case MI_QUERY:
return (process_query(newfd, &mpi->mi_qcmd));
@@ -2806,6 +2573,8 @@ process_cmd(int newfd, union mi_commands *mpi)
static unsigned int
process_query(int fd, mi_query_t *miq)
{
+ ipmp_addrinfo_t *adinfop;
+ ipmp_addrinfolist_t *adlp;
ipmp_groupinfo_t *grinfop;
ipmp_groupinfolist_t *grlp;
ipmp_grouplist_t *grlistp;
@@ -2815,6 +2584,19 @@ process_query(int fd, mi_query_t *miq)
unsigned int retval;
switch (miq->miq_inforeq) {
+ case IPMP_ADDRINFO:
+ retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr,
+ &adinfop);
+ if (retval != IPMP_SUCCESS)
+ return (send_result(fd, retval, errno));
+
+ retval = send_result(fd, IPMP_SUCCESS, 0);
+ if (retval == IPMP_SUCCESS)
+ retval = send_addrinfo(fd, adinfop);
+
+ ipmp_freeaddrinfo(adinfop);
+ return (retval);
+
case IPMP_GROUPLIST:
retval = getgrouplist(&grlistp);
if (retval != IPMP_SUCCESS)
@@ -2829,7 +2611,7 @@ process_query(int fd, mi_query_t *miq)
case IPMP_GROUPINFO:
miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
- retval = getgroupinfo(miq->miq_ifname, &grinfop);
+ retval = getgroupinfo(miq->miq_grname, &grinfop);
if (retval != IPMP_SUCCESS)
return (send_result(fd, retval, errno));
@@ -2854,6 +2636,11 @@ process_query(int fd, mi_query_t *miq)
return (retval);
case IPMP_SNAP:
+ /*
+ * Before taking the snapshot, sync with the kernel.
+ */
+ initifs();
+
retval = getsnap(&snap);
if (retval != IPMP_SUCCESS)
return (send_result(fd, retval, errno));
@@ -2883,6 +2670,13 @@ process_query(int fd, mi_query_t *miq)
if (retval != IPMP_SUCCESS)
goto out;
}
+
+ adlp = snap->sn_adinfolistp;
+ for (; adlp != NULL; adlp = adlp->adl_next) {
+ retval = send_addrinfo(fd, adlp->adl_adinfop);
+ if (retval != IPMP_SUCCESS)
+ goto out;
+ }
out:
ipmp_snap_free(snap);
return (retval);
@@ -2902,14 +2696,20 @@ static unsigned int
send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
{
ipmp_iflist_t *iflistp = grinfop->gr_iflistp;
+ ipmp_addrlist_t *adlistp = grinfop->gr_adlistp;
unsigned int retval;
retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop);
if (retval != IPMP_SUCCESS)
return (retval);
- return (ipmp_writetlv(fd, IPMP_IFLIST,
- IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp));
+ retval = ipmp_writetlv(fd, IPMP_IFLIST,
+ IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp);
+ if (retval != IPMP_SUCCESS)
+ return (retval);
+
+ return (ipmp_writetlv(fd, IPMP_ADDRLIST,
+ IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp));
}
/*
@@ -2919,7 +2719,31 @@ send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
static unsigned int
send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
{
- return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop));
+ ipmp_addrlist_t *adlist4p = ifinfop->if_targinfo4.it_targlistp;
+ ipmp_addrlist_t *adlist6p = ifinfop->if_targinfo6.it_targlistp;
+ unsigned int retval;
+
+ retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop);
+ if (retval != IPMP_SUCCESS)
+ return (retval);
+
+ retval = ipmp_writetlv(fd, IPMP_ADDRLIST,
+ IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p);
+ if (retval != IPMP_SUCCESS)
+ return (retval);
+
+ return (ipmp_writetlv(fd, IPMP_ADDRLIST,
+ IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p));
+}
+
+/*
+ * Send the address information pointed to by `adinfop' on file descriptor
+ * `fd'. Returns an IPMP error code.
+ */
+static unsigned int
+send_addrinfo(int fd, ipmp_addrinfo_t *adinfop)
+{
+ return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop));
}
/*
@@ -3109,3 +2933,32 @@ close_probe_socket(struct phyint_instance *pii, boolean_t polled)
pii->pii_probe_sock = -1;
pii->pii_basetime_inited = 0;
}
+
+boolean_t
+addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags,
+ struct sockaddr_storage *ssp)
+{
+ addrlist_t *addrp;
+
+ if ((addrp = malloc(sizeof (addrlist_t))) == NULL)
+ return (_B_FALSE);
+
+ (void) strlcpy(addrp->al_name, name, LIFNAMSIZ);
+ addrp->al_flags = flags;
+ addrp->al_addr = *ssp;
+ addrp->al_next = *addrsp;
+ *addrsp = addrp;
+ return (_B_TRUE);
+}
+
+void
+addrlist_free(addrlist_t **addrsp)
+{
+ addrlist_t *addrp, *next_addrp;
+
+ for (addrp = *addrsp; addrp != NULL; addrp = next_addrp) {
+ next_addrp = addrp->al_next;
+ free(addrp);
+ }
+ *addrsp = NULL;
+}
diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c
index a2ff76a983..cf327fbaff 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c
+++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -20,8 +20,6 @@
* WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "mpd_defs.h"
#include "mpd_tables.h"
@@ -45,7 +43,7 @@ struct pr_icmp
uint16_t pr_icmp_cksum; /* checksum field */
uint16_t pr_icmp_id; /* Identification */
uint16_t pr_icmp_seq; /* sequence number */
- uint32_t pr_icmp_timestamp; /* Time stamp */
+ uint64_t pr_icmp_timestamp; /* Time stamp (in ns) */
uint32_t pr_icmp_mtype; /* Message type */
};
@@ -58,11 +56,12 @@ static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */
-static void *find_ancillary(struct msghdr *msg, int cmsg_type);
-static void pi_set_crtt(struct target *tg, int m,
+static void *find_ancillary(struct msghdr *msg, int cmsg_level,
+ int cmsg_type);
+static void pi_set_crtt(struct target *tg, int64_t m,
boolean_t is_probe_uni);
static void incoming_echo_reply(struct phyint_instance *pii,
- struct pr_icmp *reply, struct in6_addr fromaddr);
+ struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
static void incoming_rtt_reply(struct phyint_instance *pii,
struct pr_icmp *reply, struct in6_addr fromaddr);
static void incoming_mcast_reply(struct phyint_instance *pii,
@@ -78,13 +77,11 @@ static void probe_success_info(struct phyint_instance *pii,
struct target *cur_tg, struct probe_success_count *psinfo);
static boolean_t phyint_repaired(struct phyint *pi);
-static int failover(struct phyint *from, struct phyint *to);
-static int failback(struct phyint *from, struct phyint *to);
-static struct phyint *get_failover_dst(struct phyint *pi, int failover_type);
-
static boolean_t highest_ack_tg(uint16_t seq, struct target *tg);
static int in_cksum(ushort_t *addr, int len);
static void reset_snxt_basetimes(void);
+static int ns2ms(int64_t ns);
+static int64_t tv2ns(struct timeval *);
/*
* CRTT - Conservative Round Trip Time Estimate
@@ -104,7 +101,7 @@ static void reset_snxt_basetimes(void);
* Phyint state diagram
*
* The state of a phyint that is capable of being probed, is completely
- * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
+ * specified by the 3-tuple <pi_state, pg_state, I>.
*
* A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
* of the link (according to the driver). If the phyint is also configured
@@ -117,8 +114,8 @@ static void reset_snxt_basetimes(void);
* state, which indicates that the link is apparently functional but that
* in.mpathd is unable to send probes to verify functionality (in this case,
* in.mpathd makes the optimistic assumption that the interface is working
- * correctly and thus does not perform a failover, but reports the interface
- * as IPMP_IF_UNKNOWN through the async events and query interfaces).
+ * correctly and thus does not mark the interface FAILED, but reports it as
+ * IPMP_IF_UNKNOWN through the async events and query interfaces).
*
* At any point, a phyint may be administratively marked offline via if_mpadm.
* In this case, the interface always transitions to PI_OFFLINE, regardless
@@ -131,8 +128,11 @@ static void reset_snxt_basetimes(void);
* PI_RUNNING: The failure detection logic says the phyint is good.
* PI_FAILED: The failure detection logic says the phyint has failed.
*
- * pg_groupfailed - Group failure, all interfaces in the group have failed.
- * The pi_state may be either PI_FAILED or PI_NOTARGETS.
+ * pg_state - PG_OK, PG_DEGRADED, or PG_FAILED.
+ * PG_OK: All interfaces in the group are OK.
+ * PG_DEGRADED: Some interfaces in the group are unusable.
+ * PG_FAILED: All interfaces in the group are unusable.
+ *
* In the case of router targets, we assume that the current list of
* targets obtained from the routing table, is still valid, so the
* phyint stat is PI_FAILED. In the case of host targets, we delete the
@@ -140,144 +140,46 @@ static void reset_snxt_basetimes(void);
* target list. So the phyints are in the PI_NOTARGETS state.
*
* I - value of (pi_flags & IFF_INACTIVE)
- * IFF_INACTIVE: No failovers have been done to this phyint, from
- * other phyints. This phyint is inactive. Phyint can be a Standby.
- * When failback has been disabled (FAILOVER=no configured),
- * phyint can also be a non-STANDBY. In this case IFF_INACTIVE
- * is set when phyint subsequently recovers after a failure.
- *
- * pi_empty
- * This phyint has failed over successfully to another phyint, and
- * this phyint is currently "empty". It does not host any addresses or
- * multicast membership etc. This is the state of a phyint after a
- * failover from the phyint has completed successfully and no subsequent
- * 'failover to' or 'failback to' has occurred on the phyint.
- * IP guarantees that no new logicals will be hosted nor any multicast
- * joins permitted on the phyint, since the phyint is either failed or
- * inactive. pi_empty is set implies the phyint is either failed or
- * inactive.
- *
- * pi_full
- * The phyint hosts all of its own addresses that it "owns". If the
- * phyint was previously failed or inactive, failbacks to the phyint
- * has completed successfully. i.e. No more failbacks to this phyint
- * can produce any change in system state whatsoever.
- *
- * Not all 32 possible combinations of the above 5-tuple are possible.
- * Furthermore some of the above combinations are transient. They may occur
- * only because the failover or failback did not complete successfully. The
- * failover/failback will be retried and eventually a stable state will be
- * reached.
- *
- * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd.
- * The following are the state machines. 'from' and 'to' are the src and
- * dst of the failover/failback, below
- *
- * pi_empty state machine
- * ---------------------------------------------------------------------------
- * Event State -> New State
- * ---------------------------------------------------------------------------
- * successful completion from.pi_empty = 0 -> from.pi_empty = 1
- * of failover
+ * IFF_INACTIVE: This phyint will not send or receive packets.
+ * Usually, inactive is tied to standby interfaces that are not yet
+ * needed (e.g., no non-standby interfaces in the group have failed).
+ * When failback has been disabled (FAILBACK=no configured), phyint can
+ * also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
+ * subsequently recovers after a failure.
*
- * Initiate failover to.pi_empty = X -> to.pi_empty = 0
+ * Not all 9 possible combinations of the above 3-tuple are possible.
*
- * Initiate failback to.pi_empty = X -> to.pi_empty = 0
- *
- * group failure pi_empty = X -> pi_empty = 0
- * ---------------------------------------------------------------------------
- *
- * pi_full state machine
- * ---------------------------------------------------------------------------
- * Event State -> New State
- * ---------------------------------------------------------------------------
- * successful completion to.pi_full = 0 -> to.pi_full = 1
- * of failback from
- * each of the other phyints
- *
- * Initiate failover from.pi_full = X -> from.pi_full = 0
- *
- * group failure pi_full = X -> pi_full = 0
- * ---------------------------------------------------------------------------
+ * I is tracked by IP. pi_state is tracked by mpathd.
*
* pi_state state machine
* ---------------------------------------------------------------------------
* Event State New State
* Action:
* ---------------------------------------------------------------------------
- * NIC failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
+ * IP interface failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
* detection : set IFF_FAILED on this phyint
- * : failover from this phyint to another
*
- * NIC failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
+ * IP interface failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
* detection : set IFF_FAILED on this phyint
*
- * NIC repair (PI_FAILED, I == 0, FAILBACK=yes)
+ * IP interface repair (PI_FAILED, I == 0, FAILBACK=yes)
* detection -> (PI_RUNNING, I == 0)
- * : to.pi_empty = 0
* : clear IFF_FAILED on this phyint
- * : failback to this phyint if enabled
*
- * NIC repair (PI_FAILED, I == 0, FAILBACK=no)
+ * IP interface repair (PI_FAILED, I == 0, FAILBACK=no)
* detection -> (PI_RUNNING, I == 1)
- * : to.pi_empty = 0
* : clear IFF_FAILED on this phyint
* : if failback is disabled set I == 1
*
* Group failure (perform on all phyints in the group)
* detection PI_RUNNING PI_FAILED
* (Router targets) : set IFF_FAILED
- * : clear pi_empty and pi_full
*
* Group failure (perform on all phyints in the group)
* detection PI_RUNNING PI_NOTARGETS
* (Host targets) : set IFF_FAILED
- * : clear pi_empty and pi_full
* : delete the target list on all phyints
* ---------------------------------------------------------------------------
- *
- * I state machine
- * ---------------------------------------------------------------------------
- * Event State Action:
- * ---------------------------------------------------------------------------
- * Turn on I pi_empty == 0, STANDBY : failover from standby
- *
- * Turn off I PI_RUNNING, STANDBY : pi_empty = 0
- * pi_full == 0 : failback to this if enabled
- * ---------------------------------------------------------------------------
- *
- * Assertions: (Read '==>' as implies)
- *
- * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED)
- * (pi_empty == 1) ==> (pi_full == 0)
- * (pi_full == 1) ==> (pi_empty == 0)
- *
- * Invariants
- *
- * pg_groupfailed = 0 &&
- * 1. (I == 1, pi_empty == 0) ==> initiate failover from standby
- * 2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint
- * 3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint
- *
- * 1. says that an inactive standby, that is not empty, has to be failed
- * over. For a standby to be truly inactive, it should not host any
- * addresses. So we move them to some other phyint. Usually we catch the
- * turn on of IFF_INACTIVE, and perform this action. However if the failover
- * did not complete successfully, then subsequently we have lost the edge
- * trigger, and this invariant kicks in and completes the action.
- *
- * 2. says that any failed phyint that is not empty must be failed over.
- * Usually we do the failover when we detect NIC failure. However if the
- * failover does not complete successfully, this invariant kicks in and
- * completes the failover. We exclude inactive standby which is covered by 1.
- *
- * 3. says that any running phyint that is not full must be failed back.
- * Usually we do the failback when we detect NIC repair. However if the
- * failback does not complete successfully, this invariant kicks in and
- * completes the failback. Note that we don't want to failback to an inactive
- * standby.
- *
- * The invariants 1 - 3 and the actions are in initifs().
*/
struct probes_missed probes_missed;
@@ -295,7 +197,7 @@ struct probes_missed probes_missed;
* not less than the current CRTT. pii_probes[] stores data
* about these probes. These packets consume sequence number space.
*
- * PROBE_RTT: This type is used to make only rtt measurments. Normally these
+ * PROBE_RTT: This type is used to make only rtt measurements. Normally these
* are not used. Under heavy network load, the rtt may go up very high,
* due to a spike, or may appear to go high, due to extreme scheduling
* delays. Once the network stress is removed, mpathd takes long time to
@@ -310,17 +212,19 @@ struct probes_missed probes_missed;
* no targets are known. The packet is multicast to the all hosts addr.
*/
static void
-probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time)
+probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime)
{
+ hrtime_t sent_hrtime;
+ struct timeval sent_tv;
struct pr_icmp probe_pkt; /* Probe packet */
- struct sockaddr_in6 whereto6; /* target address IPv6 */
- struct sockaddr_in whereto; /* target address IPv4 */
+ struct sockaddr_storage targ; /* target address */
+ uint_t targaddrlen; /* target address length */
int pr_ndx; /* probe index in pii->pii_probes[] */
boolean_t sent = _B_TRUE;
if (debug & D_TARGET) {
- logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af),
- pii->pii_name, probe_type, cur_time);
+ logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af),
+ pii->pii_name, probe_type, start_hrtime);
}
assert(pii->pii_probe_sock != -1);
@@ -339,7 +243,7 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time)
* network byte order at initialization itself.
*/
probe_pkt.pr_icmp_id = pii->pii_icmpid;
- probe_pkt.pr_icmp_timestamp = htonl(cur_time);
+ probe_pkt.pr_icmp_timestamp = htonll(start_hrtime);
probe_pkt.pr_icmp_mtype = htonl(probe_type);
/*
@@ -349,38 +253,34 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time)
assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
pii->pii_rtt_target_next != NULL));
+ bzero(&targ, sizeof (targ));
+ targ.ss_family = pii->pii_af;
+
if (pii->pii_af == AF_INET6) {
- bzero(&whereto6, sizeof (whereto6));
- whereto6.sin6_family = AF_INET6;
+ struct in6_addr *addr6;
+
+ addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr;
+ targaddrlen = sizeof (struct sockaddr_in6);
if (probe_type == PROBE_MULTI) {
- whereto6.sin6_addr = all_nodes_mcast_v6;
+ *addr6 = all_nodes_mcast_v6;
} else if (probe_type == PROBE_UNI) {
- whereto6.sin6_addr = pii->pii_target_next->tg_address;
- } else {
- /* type is PROBE_RTT */
- whereto6.sin6_addr =
- pii->pii_rtt_target_next->tg_address;
- }
- if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
- sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6,
- sizeof (whereto6)) != sizeof (probe_pkt)) {
- logperror_pii(pii, "probe: probe sendto");
- sent = _B_FALSE;
+ *addr6 = pii->pii_target_next->tg_address;
+ } else { /* type is PROBE_RTT */
+ *addr6 = pii->pii_rtt_target_next->tg_address;
}
} else {
- bzero(&whereto, sizeof (whereto));
- whereto.sin_family = AF_INET;
+ struct in_addr *addr4;
+
+ addr4 = &((struct sockaddr_in *)&targ)->sin_addr;
+ targaddrlen = sizeof (struct sockaddr_in);
if (probe_type == PROBE_MULTI) {
- whereto.sin_addr = all_nodes_mcast_v4;
+ *addr4 = all_nodes_mcast_v4;
} else if (probe_type == PROBE_UNI) {
IN6_V4MAPPED_TO_INADDR(
- &pii->pii_target_next->tg_address,
- &whereto.sin_addr);
- } else {
- /* type is PROBE_RTT */
+ &pii->pii_target_next->tg_address, addr4);
+ } else { /* type is PROBE_RTT */
IN6_V4MAPPED_TO_INADDR(
- &pii->pii_rtt_target_next->tg_address,
- &whereto.sin_addr);
+ &pii->pii_rtt_target_next->tg_address, addr4);
}
/*
@@ -388,12 +288,18 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time)
*/
probe_pkt.pr_icmp_cksum =
in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
- if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
- sizeof (probe_pkt), 0, (struct sockaddr *)&whereto,
- sizeof (whereto)) != sizeof (probe_pkt)) {
- logperror_pii(pii, "probe: probe sendto");
- sent = _B_FALSE;
- }
+ }
+
+ /*
+ * Use the current time as the time we sent. Not atomic, but the best
+ * we can do from here.
+ */
+ sent_hrtime = gethrtime();
+ (void) gettimeofday(&sent_tv, NULL);
+ if (sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0,
+ (struct sockaddr *)&targ, targaddrlen) != sizeof (probe_pkt)) {
+ logperror_pii(pii, "probe: probe sendto");
+ sent = _B_FALSE;
}
/*
@@ -415,9 +321,13 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time)
pii->pii_cum_stats.acked++;
pii->pii_cum_stats.sent++;
- pii->pii_probes[pr_ndx].pr_status = PR_UNACKED;
+ pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt;
+ pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv;
+ pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime;
+ pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime;
pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
- pii->pii_probes[pr_ndx].pr_time_sent = cur_time;
+ probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED);
+
pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
pii->pii_target_next = target_next(pii->pii_target_next);
assert(pii->pii_target_next != NULL);
@@ -448,33 +358,42 @@ in_data(struct phyint_instance *pii)
{
struct sockaddr_in from;
struct in6_addr fromaddr;
- uint_t fromlen;
- static uint_t in_packet[(IP_MAXPACKET + 1)/4];
+ static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
+ static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
struct ip *ip;
int iphlen;
int len;
char abuf[INET_ADDRSTRLEN];
- struct pr_icmp *reply;
+ struct msghdr msg;
+ struct iovec iov;
+ struct pr_icmp *reply;
+ struct timeval *recv_tvp;
if (debug & D_PROBE) {
logdebug("in_data(%s %s)\n",
AF_STR(pii->pii_af), pii->pii_name);
}
+ iov.iov_base = (char *)in_packet;
+ iov.iov_len = sizeof (in_packet);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_name = (struct sockaddr *)&from;
+ msg.msg_namelen = sizeof (from);
+ msg.msg_control = ancillary_data;
+ msg.msg_controllen = sizeof (ancillary_data);
+
/*
* Poll has already told us that a message is waiting,
* on this socket. Read it now. We should not block.
*/
- fromlen = sizeof (from);
- len = recvfrom(pii->pii_probe_sock, (char *)in_packet,
- sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen);
- if (len < 0) {
- logperror_pii(pii, "in_data: recvfrom");
+ if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
+ logperror_pii(pii, "in_data: recvmsg");
return;
}
/*
- * If the NIC has indicated the link is down, don't go
+ * If the datalink has indicated the link is down, don't go
* any further.
*/
if (LINK_DOWN(pii->pii_phyint))
@@ -483,6 +402,15 @@ in_data(struct phyint_instance *pii)
/* Get the printable address for error reporting */
(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
+ /* Ignore packets > 64k or control buffers that don't fit */
+ if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
+ if (debug & D_PKTBAD) {
+ logdebug("Truncated message: msg_flags 0x%x from %s\n",
+ msg.msg_flags, abuf);
+ }
+ return;
+ }
+
/* Make sure packet contains at least minimum ICMP header */
ip = (struct ip *)in_packet;
iphlen = ip->ip_hl << 2;
@@ -528,10 +456,17 @@ in_data(struct phyint_instance *pii)
return;
}
+ recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
+ if (recv_tvp == NULL) {
+ logtrace("message without timestamp from %s on %s\n",
+ abuf, pii->pii_name);
+ return;
+ }
+
IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
/* Unicast probe reply */
- incoming_echo_reply(pii, reply, fromaddr);
+ incoming_echo_reply(pii, reply, fromaddr, recv_tvp);
else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
/* Multicast reply */
incoming_mcast_reply(pii, reply, fromaddr);
@@ -543,7 +478,6 @@ in_data(struct phyint_instance *pii)
reply->pr_icmp_mtype, abuf, pii->pii_name);
return;
}
-
}
/*
@@ -559,8 +493,9 @@ in6_data(struct phyint_instance *pii)
char abuf[INET6_ADDRSTRLEN];
struct msghdr msg;
struct iovec iov;
- uchar_t *opt;
+ void *opt;
struct pr_icmp *reply;
+ struct timeval *recv_tvp;
if (debug & D_PROBE) {
logdebug("in6_data(%s %s)\n",
@@ -577,12 +512,12 @@ in6_data(struct phyint_instance *pii)
msg.msg_controllen = sizeof (ancillary_data);
if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
- logperror_pii(pii, "in6_data: recvfrom");
+ logperror_pii(pii, "in6_data: recvmsg");
return;
}
/*
- * If the NIC has indicated that the link is down, don't go
+ * If the datalink has indicated that the link is down, don't go
* any further.
*/
if (LINK_DOWN(pii->pii_phyint))
@@ -623,13 +558,14 @@ in6_data(struct phyint_instance *pii)
"%s on %s\n", abuf, pii->pii_name);
return;
}
- opt = find_ancillary(&msg, IPV6_RTHDR);
+ opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR);
if (opt != NULL) {
/* Can't allow routing headers in probe replies */
logtrace("message with routing header from %s on %s\n",
abuf, pii->pii_name);
return;
}
+
if (reply->pr_icmp_code != 0) {
logtrace("probe reply code: %d from %s on %s\n",
reply->pr_icmp_code, abuf, pii->pii_name);
@@ -640,8 +576,16 @@ in6_data(struct phyint_instance *pii)
len, abuf, pii->pii_name);
return;
}
+
+ recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
+ if (recv_tvp == NULL) {
+ logtrace("message without timestamp from %s on %s\n",
+ abuf, pii->pii_name);
+ return;
+ }
+
if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
- incoming_echo_reply(pii, reply, from.sin6_addr);
+ incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp);
} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
incoming_mcast_reply(pii, reply, from.sin6_addr);
} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
@@ -663,11 +607,9 @@ static void
incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
struct in6_addr fromaddr)
{
- int m; /* rtt measurment in ms */
- uint32_t cur_time; /* in ms from some arbitrary point */
+ int64_t m; /* rtt measurement in ns */
char abuf[INET6_ADDRSTRLEN];
struct target *target;
- uint32_t pr_icmp_timestamp;
struct phyint_group *pg;
/* Get the printable address for error reporting */
@@ -683,10 +625,7 @@ incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
if (target == NULL)
return;
- pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp);
- cur_time = getcurrenttime();
- m = (int)(cur_time - pr_icmp_timestamp);
-
+ m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp));
/* Invalid rtt. It has wrapped around */
if (m < 0)
return;
@@ -754,29 +693,30 @@ incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
*/
static void
incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
- struct in6_addr fromaddr)
+ struct in6_addr fromaddr, struct timeval *recv_tvp)
{
- int m; /* rtt measurment in ms */
- uint32_t cur_time; /* in ms from some arbitrary point */
+ int64_t m; /* rtt measurement in ns */
+ hrtime_t cur_hrtime; /* in ns from some arbitrary point */
char abuf[INET6_ADDRSTRLEN];
int pr_ndx;
struct target *target;
boolean_t exception;
- uint32_t pr_icmp_timestamp;
+ uint64_t pr_icmp_timestamp;
uint16_t pr_icmp_seq;
+ struct probe_stats *pr_statp;
struct phyint_group *pg = pii->pii_phyint->pi_group;
/* Get the printable address for error reporting */
(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
if (debug & D_PROBE) {
- logdebug("incoming_echo_reply: %s %s %s seq %u\n",
+ logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
AF_STR(pii->pii_af), pii->pii_name, abuf,
- ntohs(reply->pr_icmp_seq));
+ ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp));
}
- pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp);
- pr_icmp_seq = ntohs(reply->pr_icmp_seq);
+ pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp);
+ pr_icmp_seq = ntohs(reply->pr_icmp_seq);
/* Reject out of window probe replies */
if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
@@ -786,15 +726,16 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
pii->pii_cum_stats.unknown++;
return;
}
- cur_time = getcurrenttime();
- m = (int)(cur_time - pr_icmp_timestamp);
+
+ cur_hrtime = gethrtime();
+ m = (int64_t)(cur_hrtime - pr_icmp_timestamp);
if (m < 0) {
/*
* This is a ridiculously high value of rtt. rtt has wrapped
* around. Log a message, and ignore the rtt.
*/
- logerr("incoming_echo_reply: rtt wraparound cur_time %u reply "
- "timestamp %u\n", cur_time, pr_icmp_timestamp);
+ logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
+ "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp);
}
/*
@@ -868,10 +809,10 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
* debugger, or the system was hung or too busy for a
* substantial time that we didn't get a chance to run.
*/
- if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) {
+ if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) {
/*
- * If the probe corresponding to this receieved response
- * was truly sent 'm' ms. ago, then this response must
+ * If the probe corresponding to this received response
+ * was truly sent 'm' ns. ago, then this response must
* have been rejected by the sequence number checks. The
* fact that it has passed the sequence number checks
* means that the measured rtt is wrong. We were probably
@@ -947,7 +888,7 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
* adjusts pii->pii_target_next
*/
target_delete(target);
- probe(pii, PROBE_MULTI, cur_time);
+ probe(pii, PROBE_MULTI, cur_hrtime);
}
} else {
/*
@@ -999,8 +940,12 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
}
}
out:
- pii->pii_probes[pr_ndx].pr_status = PR_ACKED;
- pii->pii_probes[pr_ndx].pr_time_acked = cur_time;
+ pr_statp = &pii->pii_probes[pr_ndx];
+ pr_statp->pr_hrtime_ackproc = cur_hrtime;
+ pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent +
+ (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent));
+
+ probe_chstate(pr_statp, pii, PR_ACKED);
/*
* Update pii->pii_rack, i.e. the sequence number of the last received
@@ -1240,13 +1185,13 @@ incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
*
* New scaled average and deviation are passed back via sap and svp
*/
-static int
-compute_crtt(int *sap, int *svp, int m)
+static int64_t
+compute_crtt(int64_t *sap, int64_t *svp, int64_t m)
{
- int sa = *sap;
- int sv = *svp;
- int crtt;
- int saved_m = m;
+ int64_t sa = *sap;
+ int64_t sv = *svp;
+ int64_t crtt;
+ int64_t saved_m = m;
assert(*sap >= -1);
assert(*svp >= 0);
@@ -1285,8 +1230,8 @@ compute_crtt(int *sap, int *svp, int m)
crtt = (sa >> 3) + sv;
if (debug & D_PROBE) {
- logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = "
- "%d\n", saved_m, sa, sv, crtt);
+ logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> "
+ "crtt = %lld\n", saved_m, sa, sv, crtt);
}
*sap = sa;
@@ -1300,22 +1245,22 @@ compute_crtt(int *sap, int *svp, int m)
}
static void
-pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni)
+pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni)
{
struct phyint_instance *pii = tg->tg_phyint_inst;
int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
- int sa = tg->tg_rtt_sa;
- int sv = tg->tg_rtt_sd;
+ int64_t sa = tg->tg_rtt_sa;
+ int64_t sv = tg->tg_rtt_sd;
int new_crtt;
int i;
if (debug & D_PROBE)
- logdebug("pi_set_crtt: target - m %d\n", m);
+ logdebug("pi_set_crtt: target - m %lld\n", m);
/* store the round trip time, in case we need to defer computation */
tg->tg_deferred[tg->tg_num_deferred] = m;
- new_crtt = compute_crtt(&sa, &sv, m);
+ new_crtt = ns2ms(compute_crtt(&sa, &sv, m));
/*
* If this probe's round trip time would singlehandedly cause an
@@ -1342,8 +1287,8 @@ pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni)
}
for (i = 0; i <= tg->tg_num_deferred; i++) {
- tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa,
- &tg->tg_rtt_sd, tg->tg_deferred[i]);
+ tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa,
+ &tg->tg_rtt_sd, tg->tg_deferred[i]));
}
tg->tg_num_deferred = 0;
@@ -1373,13 +1318,13 @@ pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni)
* If not found return NULL.
*/
static void *
-find_ancillary(struct msghdr *msg, int cmsg_type)
+find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type)
{
struct cmsghdr *cmsg;
for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
cmsg = CMSG_NXTHDR(msg, cmsg)) {
- if (cmsg->cmsg_level == IPPROTO_IPV6 &&
+ if (cmsg->cmsg_level == cmsg_level &&
cmsg->cmsg_type == cmsg_type) {
return (CMSG_DATA(cmsg));
}
@@ -1388,107 +1333,194 @@ find_ancillary(struct msghdr *msg, int cmsg_type)
}
/*
- * See if a previously failed interface has started working again.
+ * Try to activate another INACTIVE interface in the same group as `pi'.
+ * Prefer STANDBY INACTIVE to just INACTIVE.
*/
void
-phyint_check_for_repair(struct phyint *pi)
+phyint_activate_another(struct phyint *pi)
{
- if (phyint_repaired(pi)) {
- if (pi->pi_group == phyint_anongroup) {
- logerr("NIC repair detected on %s\n", pi->pi_name);
- } else {
- logerr("NIC repair detected on %s of group %s\n",
- pi->pi_name, pi->pi_group->pg_name);
- }
+ struct phyint *pi2;
+ struct phyint *inactivepi = NULL;
- /*
- * If the interface is offline, just clear the FAILED flag,
- * delaying the state change and failback operation until it
- * is brought back online.
- */
- if (pi->pi_state == PI_OFFLINE) {
- (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
- return;
- }
+ if (pi->pi_group == phyint_anongroup)
+ return;
- if (pi->pi_flags & IFF_STANDBY) {
- (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
- } else {
- if (try_failback(pi) != IPMP_FAILURE) {
- (void) change_lif_flags(pi,
- IFF_FAILED, _B_FALSE);
- /* Per state diagram */
- pi->pi_empty = 0;
+ for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
+ if (pi == pi2 || pi2->pi_state != PI_RUNNING ||
+ !(pi2->pi_flags & IFF_INACTIVE))
+ continue;
+
+ inactivepi = pi2;
+ if (pi2->pi_flags & IFF_STANDBY)
+ break;
+ }
+
+ if (inactivepi != NULL)
+ (void) change_pif_flags(inactivepi, 0, IFF_INACTIVE);
+}
+
+/*
+ * Transition a phyint back to PI_RUNNING (from PI_FAILED or PI_OFFLINE). The
+ * caller must ensure that the transition is appropriate. Clears IFF_OFFLINE
+ * or IFF_FAILED, as appropriate. Also sets IFF_INACTIVE on this or other
+ * interfaces as appropriate (see comment below). Finally, also updates the
+ * phyint's group state to account for the change.
+ */
+void
+phyint_transition_to_running(struct phyint *pi)
+{
+ struct phyint *pi2;
+ struct phyint *actstandbypi = NULL;
+ uint_t nactive = 0, nnonstandby = 0;
+ boolean_t onlining = (pi->pi_state == PI_OFFLINE);
+ uint64_t set, clear;
+
+ /*
+ * The interface is running again, but should it or another interface
+ * in the group end up INACTIVE? There are three cases:
+ *
+ * 1. If it's a STANDBY interface, it should end up INACTIVE if
+ * the group is operating at capacity (i.e., there are at least as
+ * many active interfaces as non-STANDBY interfaces in the group).
+ * No other interfaces should be changed.
+ *
+ * 2. If it's a non-STANDBY interface and we're onlining it or
+ * FAILBACK is enabled, then it should *not* end up INACTIVE.
+ * Further, if the group is above capacity as a result of this
+ * interface, then an active STANDBY interface in the group should
+ * end up INACTIVE.
+ *
+ * 3. If it's a non-STANDBY interface, we're repairing it, and
+ * FAILBACK is disabled, then it should end up INACTIVE *unless*
+ * the group was failed (in which case we have no choice but to
+ * use it). No other interfaces should be changed.
+ */
+ if (pi->pi_group != phyint_anongroup) {
+ pi2 = pi->pi_group->pg_phyint;
+ for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
+ if (!(pi2->pi_flags & IFF_STANDBY))
+ nnonstandby++;
+
+ if (pi2->pi_state == PI_RUNNING) {
+ if (!(pi2->pi_flags & IFF_INACTIVE)) {
+ nactive++;
+ if (pi2->pi_flags & IFF_STANDBY)
+ actstandbypi = pi2;
+ }
}
}
+ }
- phyint_chstate(pi, PI_RUNNING);
+ set = 0;
+ clear = (onlining ? IFF_OFFLINE : IFF_FAILED);
- if (GROUP_FAILED(pi->pi_group)) {
- /*
- * This is the 1st phyint to receive a response
- * after group failure.
- */
- logerr("At least 1 interface (%s) of group %s has "
- "repaired\n", pi->pi_name, pi->pi_group->pg_name);
- phyint_group_chstate(pi->pi_group, PG_RUNNING);
- }
+ if (pi->pi_flags & IFF_STANDBY) { /* case 1 */
+ if (nactive >= nnonstandby)
+ set |= IFF_INACTIVE;
+ else
+ clear |= IFF_INACTIVE;
+ } else if (onlining || failback_enabled) { /* case 2 */
+ if (nactive >= nnonstandby && actstandbypi != NULL)
+ (void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0);
+ } else if (!GROUP_FAILED(pi->pi_group)) { /* case 3 */
+ set |= IFF_INACTIVE;
+ }
+ (void) change_pif_flags(pi, set, clear);
+
+ phyint_chstate(pi, PI_RUNNING);
+
+ /*
+ * Update the group state to account for the change.
+ */
+ phyint_group_refresh_state(pi->pi_group);
+}
+
+/*
+ * See if a previously failed interface has started working again.
+ */
+void
+phyint_check_for_repair(struct phyint *pi)
+{
+ if (!phyint_repaired(pi))
+ return;
+
+ if (pi->pi_group == phyint_anongroup) {
+ logerr("IP interface repair detected on %s\n", pi->pi_name);
+ } else {
+ logerr("IP interface repair detected on %s of group %s\n",
+ pi->pi_name, pi->pi_group->pg_name);
}
+
+ /*
+ * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet.
+ * So just clear IFF_FAILED and defer phyint_transition_to_running()
+ * until it is brought back online.
+ */
+ if (pi->pi_state == PI_OFFLINE) {
+ (void) change_pif_flags(pi, 0, IFF_FAILED);
+ return;
+ }
+
+ phyint_transition_to_running(pi); /* calls phyint_chstate() */
}
/*
- * See if a previously functioning interface has failed, or if the
- * whole group of interfaces has failed.
+ * See if an interface has failed, or if the whole group of interfaces has
+ * failed.
*/
static void
phyint_inst_check_for_failure(struct phyint_instance *pii)
{
- struct phyint *pi;
- struct phyint *pi2;
-
- pi = pii->pii_phyint;
+ struct phyint *pi = pii->pii_phyint;
+ struct phyint *pi2;
+ boolean_t was_active;
switch (failure_state(pii)) {
case PHYINT_FAILURE:
- (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
+ was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
+
+ (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
if (pi->pi_group == phyint_anongroup) {
- logerr("NIC failure detected on %s\n", pii->pii_name);
+ logerr("IP interface failure detected on %s\n",
+ pii->pii_name);
} else {
- logerr("NIC failure detected on %s of group %s\n",
- pii->pii_name, pi->pi_group->pg_name);
+ logerr("IP interface failure detected on %s of group"
+ " %s\n", pii->pii_name, pi->pi_group->pg_name);
}
+
/*
- * Do the failover, unless the interface is offline (in
- * which case we've already failed over).
+ * If the interface is offline, the state change will be
+ * noted when it comes back online.
*/
if (pi->pi_state != PI_OFFLINE) {
+ /*
+ * If the failed interface was active, activate
+ * another INACTIVE interface in the group if
+ * possible. (If the interface is PI_OFFLINE,
+ * we already activated another.)
+ */
+ if (was_active)
+ phyint_activate_another(pi);
+
phyint_chstate(pi, PI_FAILED);
reset_crtt_all(pi);
- if (!(pi->pi_flags & IFF_INACTIVE))
- (void) try_failover(pi, FAILOVER_NORMAL);
}
break;
case GROUP_FAILURE:
- logerr("All Interfaces in group %s have failed\n",
- pi->pi_group->pg_name);
- for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL;
- pi2 = pi2->pi_pgnext) {
- if (pi2->pi_flags & IFF_OFFLINE)
+ pi2 = pi->pi_group->pg_phyint;
+ for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
+ (void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE);
+ if (pi2->pi_state == PI_OFFLINE) /* see comment above */
continue;
- (void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE);
- reset_crtt_all(pi2);
+ reset_crtt_all(pi2);
/*
- * In the case of host targets, we
- * would have flushed the targets,
- * and gone to PI_NOTARGETS state.
+ * In the case of host targets, we would have flushed
+ * the targets, and gone to PI_NOTARGETS state.
*/
if (pi2->pi_state == PI_RUNNING)
phyint_chstate(pi2, PI_FAILED);
-
- pi2->pi_empty = 0;
- pi2->pi_full = 0;
}
break;
@@ -1519,7 +1551,8 @@ phyint_inst_timer(struct phyint_instance *pii)
hrtime_t cur_hrtime;
int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
- cur_time = getcurrenttime();
+ cur_hrtime = gethrtime();
+ cur_time = ns2ms(cur_hrtime);
if (debug & D_TIMER) {
logdebug("phyint_inst_timer(%s %s)\n",
@@ -1621,7 +1654,7 @@ phyint_inst_timer(struct phyint_instance *pii)
* the failure detection (fd) probe timer has not yet fired.
* Need to send only an rtt probe. The probe type is PROBE_RTT.
*/
- probe(pii, PROBE_RTT, cur_time);
+ probe(pii, PROBE_RTT, cur_hrtime);
return (interval);
}
/*
@@ -1651,7 +1684,7 @@ phyint_inst_timer(struct phyint_instance *pii)
* We can have at most, the latest 2 probes that we sent, in
* the PR_UNACKED state. All previous probes sent, are either
* PR_LOST or PR_ACKED. An unacknowledged probe is considered
- * timed out if the probe's time_sent + the CRTT < currenttime.
+ * timed out if the probe's time_start + the CRTT < currenttime.
* For each of the last 2 probes, examine whether it has timed
* out. If so, mark it PR_LOST. The probe stats is a circular array.
*/
@@ -1686,16 +1719,15 @@ phyint_inst_timer(struct phyint_instance *pii)
* not available use group's probe interval,
* which is a worst case estimate.
*/
+ timeout = ns2ms(pr_statp->pr_hrtime_start);
if (cur_tg->tg_crtt != 0) {
- timeout = pr_statp->pr_time_sent +
- cur_tg->tg_crtt;
+ timeout += cur_tg->tg_crtt;
} else {
- timeout = pr_statp->pr_time_sent +
- probe_interval;
+ timeout += probe_interval;
}
if (TIME_LT(timeout, cur_time)) {
- pr_statp->pr_status = PR_LOST;
pr_statp->pr_time_lost = timeout;
+ probe_chstate(pr_statp, pii, PR_LOST);
} else if (i == 1) {
/*
* We are forced to consider this probe
@@ -1711,8 +1743,8 @@ phyint_inst_timer(struct phyint_instance *pii)
* when the timer fires, we find 2 valid
* unacked probes, and they are yet to timeout
*/
- pr_statp->pr_status = PR_LOST;
pr_statp->pr_time_lost = cur_time;
+ probe_chstate(pr_statp, pii, PR_LOST);
} else {
/*
* Only the most recent probe can enter
@@ -1740,16 +1772,15 @@ phyint_inst_timer(struct phyint_instance *pii)
* The timer has fired. Take appropriate action depending
* on the current state of the phyint.
*
- * PI_RUNNING state - Failure detection and failover
- * PI_FAILED state - Repair detection and failback
+ * PI_RUNNING state - Failure detection
+ * PI_FAILED state - Repair detection
*/
switch (pii->pii_phyint->pi_state) {
case PI_FAILED:
/*
* If the most recent probe (excluding unacked probes that
* are yet to time out) has been acked, check whether the
- * phyint is now repaired. If the phyint is repaired, then
- * attempt failback, unless it is an inactive standby.
+ * phyint is now repaired.
*/
if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
phyint_check_for_repair(pii->pii_phyint);
@@ -1760,10 +1791,8 @@ phyint_inst_timer(struct phyint_instance *pii)
/*
* It's possible our probes have been lost because of a
* spanning-tree mandated quiet period on the switch. If so,
- * ignore the lost probes and consider the interface to still
- * be functioning.
+ * ignore the lost probes.
*/
- cur_hrtime = gethrtime();
if (pii->pii_fd_hrtime - cur_hrtime > 0)
break;
@@ -1771,8 +1800,7 @@ phyint_inst_timer(struct phyint_instance *pii)
/*
* We have 1 or more failed probes (excluding unacked
* probes that are yet to time out). Determine if the
- * phyint has failed. If so attempt a failover,
- * unless it is an inactive standby
+ * phyint has failed.
*/
phyint_inst_check_for_failure(pii);
}
@@ -1790,16 +1818,16 @@ phyint_inst_timer(struct phyint_instance *pii)
* was called, the target list may be empty.
*/
if (pii->pii_target_next != NULL) {
- probe(pii, PROBE_UNI, cur_time);
+ probe(pii, PROBE_UNI, cur_hrtime);
/*
* If we have just the one probe target, and we're not using
* router targets, try to find another as we presently have
* no resilience.
*/
if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
- probe(pii, PROBE_MULTI, cur_time);
+ probe(pii, PROBE_MULTI, cur_hrtime);
} else {
- probe(pii, PROBE_MULTI, cur_time);
+ probe(pii, PROBE_MULTI, cur_hrtime);
}
return (interval);
}
@@ -1859,8 +1887,8 @@ process_link_state_down(struct phyint *pi)
/*
* Clear the probe statistics arrays, we don't want the repair
- * detection logic relying on probes that were succesful prior
- * to the link going down.
+ * detection logic relying on probes that were successful prior
+ * to the link going down.
*/
if (PROBE_CAPABLE(pi->pi_v4))
clear_pii_probe_stats(pi->pi_v4);
@@ -2016,7 +2044,7 @@ phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
pii->pii_target_next = target_next(cur_tg);
} else {
target_delete(cur_tg);
- probe(pii, PROBE_MULTI, getcurrenttime());
+ probe(pii, PROBE_MULTI, gethrtime());
}
return (PHYINT_OK);
}
@@ -2065,13 +2093,13 @@ failure_state(struct phyint_instance *pii)
struct probe_success_count psinfo;
uint_t pi2_tls; /* time last success */
uint_t pi_tff; /* time first fail */
- struct phyint *pi2;
+ struct phyint *pi2;
struct phyint *pi;
struct phyint_instance *pii2;
struct phyint_group *pg;
- boolean_t alone;
+ int retval;
- if (debug & D_FAILOVER)
+ if (debug & D_FAILREP)
logdebug("phyint_failed(%s)\n", pii->pii_name);
pi = pii->pii_phyint;
@@ -2082,24 +2110,13 @@ failure_state(struct phyint_instance *pii)
return (PHYINT_OK);
/*
- * At this point, the link is down, or the phyint is suspect,
- * as it has lost NUM_PROBE_FAILS or more probes. If the phyint
- * does not belong to any group, or is the only member of the
- * group capable of being probed, return PHYINT_FAILURE.
+ * At this point, the link is down, or the phyint is suspect, as it
+ * has lost NUM_PROBE_FAILS or more probes. If the phyint does not
+ * belong to any group, this is a PHYINT_FAILURE. Otherwise, continue
+ * on to determine whether this should be considered a PHYINT_FAILURE
+ * or GROUP_FAILURE.
*/
- alone = _B_TRUE;
- if (pg != phyint_anongroup) {
- for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
- if (pi2 == pi)
- continue;
- if (PROBE_CAPABLE(pi2->pi_v4) ||
- PROBE_CAPABLE(pi2->pi_v6)) {
- alone = _B_FALSE;
- break;
- }
- }
- }
- if (alone)
+ if (pg == phyint_anongroup)
return (PHYINT_FAILURE);
/*
@@ -2116,6 +2133,7 @@ failure_state(struct phyint_instance *pii)
* after it was received, so there is no point looking at the tls
* of other phyints.
*/
+ retval = GROUP_FAILURE;
for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
/* Exclude ourself from comparison */
if (pi2 == pi)
@@ -2123,76 +2141,86 @@ failure_state(struct phyint_instance *pii)
if (LINK_DOWN(pi)) {
/*
- * We use FLAGS_TO_LINK_STATE() to test the
- * flags directly, rather then LINK_UP() or
- * LINK_DOWN(), as we may not have got round
- * to processing the link state for the other
- * phyints in the group yet.
+ * We use FLAGS_TO_LINK_STATE() to test the flags
+ * directly, rather than LINK_UP() or LINK_DOWN(), as
+ * we may not have got round to processing the link
+ * state for the other phyints in the group yet.
*
- * The check for PI_RUNNING and group
- * failure handles the case when the
- * group begins to recover. The first
- * phyint to recover should not trigger
- * a failover from the soon-to-recover
- * other phyints to the first recovered
- * phyint. PI_RUNNING will be set, and
- * pg_groupfailed cleared only after
- * receipt of NUM_PROBE_REPAIRS, by
- * which time the other phyints should
- * have received at least 1 packet,
- * and so will not have NUM_PROBE_FAILS.
+ * The check for PI_RUNNING and group failure handles
+ * the case when the group begins to recover.
+ * PI_RUNNING will be set, and group failure cleared
+ * only after receipt of NUM_PROBE_REPAIRS, by which
+ * time the other phyints should have received at
+ * least 1 packet, and so will not have NUM_PROBE_FAILS.
*/
if ((pi2->pi_state == PI_RUNNING) &&
- !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2))
- return (PHYINT_FAILURE);
- } else {
- /*
- * Need to compare against both IPv4 and
- * IPv6 instances.
- */
- pii2 = pi2->pi_v4;
- if (pii2 != NULL) {
- probe_success_info(pii2, NULL, &psinfo);
- if (psinfo.ps_tls_valid) {
- pi2_tls = psinfo.ps_tls;
- /*
- * See comment above regarding check
- * for PI_RUNNING and group failure.
- */
- if (TIME_GT(pi2_tls, pi_tff) &&
- (pi2->pi_state == PI_RUNNING) &&
- !GROUP_FAILED(pg) &&
- FLAGS_TO_LINK_STATE(pi2))
- return (PHYINT_FAILURE);
+ !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) {
+ retval = PHYINT_FAILURE;
+ break;
+ }
+ continue;
+ }
+
+ if (LINK_DOWN(pi2))
+ continue;
+
+ /*
+ * If there's no probe-based failure detection on this
+ * interface, and its link is still up, then it's still
+ * working and thus the group has not failed.
+ */
+ if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) {
+ retval = PHYINT_FAILURE;
+ break;
+ }
+
+ /*
+ * Need to compare against both IPv4 and IPv6 instances.
+ */
+ pii2 = pi2->pi_v4;
+ if (pii2 != NULL) {
+ probe_success_info(pii2, NULL, &psinfo);
+ if (psinfo.ps_tls_valid) {
+ pi2_tls = psinfo.ps_tls;
+ /*
+ * See comment above regarding check
+ * for PI_RUNNING and group failure.
+ */
+ if (TIME_GT(pi2_tls, pi_tff) &&
+ (pi2->pi_state == PI_RUNNING) &&
+ !GROUP_FAILED(pg) &&
+ FLAGS_TO_LINK_STATE(pi2)) {
+ retval = PHYINT_FAILURE;
+ break;
}
}
+ }
- pii2 = pi2->pi_v6;
- if (pii2 != NULL) {
- probe_success_info(pii2, NULL, &psinfo);
- if (psinfo.ps_tls_valid) {
- pi2_tls = psinfo.ps_tls;
- /*
- * See comment above regarding check
- * for PI_RUNNING and group failure.
- */
- if (TIME_GT(pi2_tls, pi_tff) &&
- (pi2->pi_state == PI_RUNNING) &&
- !GROUP_FAILED(pg) &&
- FLAGS_TO_LINK_STATE(pi2))
- return (PHYINT_FAILURE);
+ pii2 = pi2->pi_v6;
+ if (pii2 != NULL) {
+ probe_success_info(pii2, NULL, &psinfo);
+ if (psinfo.ps_tls_valid) {
+ pi2_tls = psinfo.ps_tls;
+ /*
+ * See comment above regarding check
+ * for PI_RUNNING and group failure.
+ */
+ if (TIME_GT(pi2_tls, pi_tff) &&
+ (pi2->pi_state == PI_RUNNING) &&
+ !GROUP_FAILED(pg) &&
+ FLAGS_TO_LINK_STATE(pi2)) {
+ retval = PHYINT_FAILURE;
+ break;
}
}
}
}
/*
- * Change the group state to PG_FAILED if it's not already.
+ * Update the group state to account for the changes.
*/
- if (!GROUP_FAILED(pg))
- phyint_group_chstate(pg, PG_FAILED);
-
- return (GROUP_FAILURE);
+ phyint_group_refresh_state(pg);
+ return (retval);
}
/*
@@ -2215,7 +2243,7 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
uint_t timeout;
struct target *tg;
- if (debug & D_FAILOVER)
+ if (debug & D_FAILREP)
logdebug("probe_success_info(%s)\n", pii->pii_name);
bzero(psinfo, sizeof (*psinfo));
@@ -2248,10 +2276,11 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
* not available use the value of the group's probe
* interval which is a worst case estimate.
*/
+ timeout = ns2ms(pr_statp->pr_hrtime_start);
if (tg->tg_crtt != 0) {
- timeout = pr_statp->pr_time_sent + tg->tg_crtt;
+ timeout += tg->tg_crtt;
} else {
- timeout = pr_statp->pr_time_sent +
+ timeout +=
pii->pii_phyint->pi_group->pg_probeint;
}
@@ -2261,7 +2290,7 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
* recent consecutive successes.
*/
pr_statp->pr_time_lost = timeout;
- pr_statp->pr_status = PR_LOST;
+ probe_chstate(pr_statp, pii, PR_LOST);
pi_found_failure = _B_TRUE;
if (cur_tg != NULL && tg == cur_tg) {
/*
@@ -2292,7 +2321,8 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
* the most recent probe success.
*/
if (!psinfo->ps_tls_valid) {
- psinfo->ps_tls = pr_statp->pr_time_acked;
+ psinfo->ps_tls =
+ ns2ms(pr_statp->pr_hrtime_ackproc);
psinfo->ps_tls_valid = _B_TRUE;
}
break;
@@ -2339,7 +2369,7 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
uint_t timeout;
struct target *tg;
- if (debug & D_FAILOVER)
+ if (debug & D_FAILREP)
logdebug("probe_fail_info(%s)\n", pii->pii_name);
bzero(pfinfo, sizeof (*pfinfo));
@@ -2377,10 +2407,11 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
* not available use the group's probe interval,
* which is a worst case estimate.
*/
+ timeout = ns2ms(pr_statp->pr_hrtime_start);
if (tg->tg_crtt != 0) {
- timeout = pr_statp->pr_time_sent + tg->tg_crtt;
+ timeout += tg->tg_crtt;
} else {
- timeout = pr_statp->pr_time_sent +
+ timeout +=
pii->pii_phyint->pi_group->pg_probeint;
}
@@ -2388,7 +2419,7 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
break;
pr_statp->pr_time_lost = timeout;
- pr_statp->pr_status = PR_LOST;
+ probe_chstate(pr_statp, pii, PR_LOST);
/* FALLTHRU */
case PR_LOST:
@@ -2421,6 +2452,19 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
}
/*
+ * Change the state of probe `pr' on phyint_instance `pii' to state `state'.
+ */
+void
+probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state)
+{
+ if (pr->pr_status == state)
+ return;
+
+ pr->pr_status = state;
+ (void) probe_state_event(pr, pii);
+}
+
+/*
* Check if the phyint has been repaired. If no test address has been
* configured, then consider the interface repaired if the link is up (unless
* the link is flapping; see below). Otherwise, look for proof of probes
@@ -2436,7 +2480,7 @@ phyint_repaired(struct phyint *pi)
int pr_ndx;
uint_t cur_time;
- if (debug & D_FAILOVER)
+ if (debug & D_FAILREP)
logdebug("phyint_repaired(%s)\n", pi->pi_name);
if (LINK_DOWN(pi))
@@ -2458,7 +2502,7 @@ phyint_repaired(struct phyint *pi)
}
if (!pi->pi_lfmsg_printed) {
logerr("The link has come up on %s more than %d times "
- "in the last minute; disabling failback until it "
+ "in the last minute; disabling repair until it "
"stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
pi->pi_lfmsg_printed = 1;
}
@@ -2490,354 +2534,41 @@ phyint_repaired(struct phyint *pi)
}
/*
- * Try failover from phyint 'pi' to a suitable destination.
- */
-int
-try_failover(struct phyint *pi, int failover_type)
-{
- struct phyint *dst;
- int err;
-
- if (debug & D_FAILOVER)
- logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type);
-
- /*
- * Attempt to find a failover destination 'dst'.
- * dst will be null if any of the following is true
- * Phyint is not part of a group OR
- * Phyint is the only member of a group OR
- * No suitable failover dst was available
- */
- dst = get_failover_dst(pi, failover_type);
- if (dst == NULL)
- return (IPMP_EMINRED);
-
- dst->pi_empty = 0; /* Per state diagram */
- pi->pi_full = 0; /* Per state diagram */
-
- err = failover(pi, dst);
-
- if (debug & D_FAILOVER) {
- logdebug("failed over from %s to %s ret %d\n",
- pi->pi_name, dst->pi_name, err);
- }
- if (err == 0) {
- pi->pi_empty = 1; /* Per state diagram */
- /*
- * we don't want to print out this message if a
- * phyint is leaving the group, nor for failover from
- * standby
- */
- if (failover_type == FAILOVER_NORMAL) {
- logerr("Successfully failed over from NIC %s to NIC "
- "%s\n", pi->pi_name, dst->pi_name);
- }
- return (0);
- } else {
- /*
- * The failover did not succeed. We must retry the failover
- * only after resyncing our state based on the kernel's.
- * For eg. either the src or the dst might have been unplumbed
- * causing this failure. initifs() will be called again,
- * from main, since full_scan_required has been set to true
- * by failover();
- */
- return (IPMP_FAILURE);
- }
-}
-
-/*
- * global_errno captures the errno value, if failover() or failback()
- * fails. This is sent to if_mpadm(1M).
- */
-int global_errno;
-
-/*
- * Attempt failover from phyint 'from' to phyint 'to'.
- * IP moves everything from phyint 'from' to phyint 'to'.
- */
-static int
-failover(struct phyint *from, struct phyint *to)
-{
- struct lifreq lifr;
- int ret;
-
- if (debug & D_FAILOVER) {
- logdebug("failing over from %s to %s\n",
- from->pi_name, to->pi_name);
- }
-
- /*
- * Perform the failover. Both IPv4 and IPv6 are failed over
- * using a single ioctl by passing in AF_UNSPEC family.
- */
- lifr.lifr_addr.ss_family = AF_UNSPEC;
- (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
- lifr.lifr_movetoindex = to->pi_ifindex;
-
- ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr);
- if (ret < 0) {
- global_errno = errno;
- logperror("failover: ioctl (failover)");
- }
-
- /*
- * Set full_scan_required to true. This will make us read
- * the state from the kernel in initifs() and update our tables,
- * to reflect the current state after the failover. If the
- * failover has failed it will then reissue the failover.
- */
- full_scan_required = _B_TRUE;
- return (ret);
-}
-
-/*
- * phyint 'pi' has recovered. Attempt failback from every phyint in the same
- * group as phyint 'pi' that is a potential failback source, to phyint 'pi'.
- * Return values:
- * IPMP_SUCCESS: Failback successful from each of the other
- * phyints in the group.
- * IPMP_EFBPARTIAL: Failback successful from some of the other
- * phyints in the group.
- * IPMP_FAILURE: Failback syscall failed with some error.
- *
- * Note that failback is attempted regardless of the setting of the
- * failback_enabled flag.
- */
-int
-do_failback(struct phyint *pi)
-{
- struct phyint *from;
- boolean_t done;
- boolean_t partial;
- boolean_t attempted_failback = _B_FALSE;
-
- if (debug & D_FAILOVER)
- logdebug("do_failback(%s)\n", pi->pi_name);
-
- /* If this phyint is not part of a named group, return. */
- if (pi->pi_group == phyint_anongroup) {
- pi->pi_full = 1;
- return (IPMP_SUCCESS);
- }
-
- /*
- * Attempt failback from every phyint in the group to 'pi'.
- * The reason for doing this, instead of only from the
- * phyint to which we did the failover is given below.
- *
- * After 'pi' failed, if any app. tries to join on a multicast
- * address (IPv6), on the failed phyint, IP picks any arbitrary
- * non-failed phyint in the group, instead of the failed phyint,
- * in.mpathd is not aware of this. Thus failing back only from the
- * interface to which 'pi' failed over, will failback the ipif's
- * but not the ilm's. So we need to failback from all members of
- * the phyint group
- */
- done = _B_TRUE;
- partial = _B_FALSE;
- for (from = pi->pi_group->pg_phyint; from != NULL;
- from = from->pi_pgnext) {
- /* Exclude ourself as a failback src */
- if (from == pi)
- continue;
-
- /*
- * If the 'from' phyint has IPv4 plumbed, the 'to'
- * phyint must also have IPv4 plumbed. Similar check
- * for IPv6. IP makes the same check. Otherwise the
- * failback will fail.
- */
- if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) ||
- (from->pi_v6 != NULL && pi->pi_v6 == NULL)) {
- partial = _B_TRUE;
- continue;
- }
-
- pi->pi_empty = 0; /* Per state diagram */
- attempted_failback = _B_TRUE;
- if (failback(from, pi) != 0) {
- done = _B_FALSE;
- break;
- }
- }
-
- /*
- * We are done. No more phyint from which we can src the failback
- */
- if (done) {
- if (!partial)
- pi->pi_full = 1; /* Per state diagram */
- /*
- * Don't print out a message unless there is a
- * transition from FAILED to RUNNING. For eg.
- * we don't want to print out this message if a
- * phyint is leaving the group, or at startup
- */
- if (attempted_failback && (pi->pi_flags &
- (IFF_FAILED | IFF_OFFLINE))) {
- logerr("Successfully failed back to NIC %s\n",
- pi->pi_name);
- }
- return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
- }
-
- return (IPMP_FAILURE);
-}
-
-/*
- * This function is similar to do_failback() above, but respects the
- * failback_enabled flag for phyints in named groups.
- */
-int
-try_failback(struct phyint *pi)
-{
- if (debug & D_FAILOVER)
- logdebug("try_failback(%s)\n", pi->pi_name);
-
- if (pi->pi_group != phyint_anongroup && !failback_enabled)
- return (IPMP_EFBDISABLED);
-
- return (do_failback(pi));
-}
-
-/*
- * Failback everything from phyint 'from' that has the same ifindex
- * as phyint to's ifindex.
- */
-static int
-failback(struct phyint *from, struct phyint *to)
-{
- struct lifreq lifr;
- int ret;
-
- if (debug & D_FAILOVER)
- logdebug("failback(%s %s)\n", from->pi_name, to->pi_name);
-
- lifr.lifr_addr.ss_family = AF_UNSPEC;
- (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
- lifr.lifr_movetoindex = to->pi_ifindex;
-
- ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr);
- if (ret < 0) {
- global_errno = errno;
- logperror("failback: ioctl (failback)");
- }
-
- /*
- * Set full_scan_required to true. This will make us read
- * the state from the kernel in initifs() and update our tables,
- * to reflect the current state after the failback. If the
- * failback has failed it will then reissue the failback.
- */
- full_scan_required = _B_TRUE;
-
- return (ret);
-}
-
-/*
- * Select a target phyint for failing over from 'pi'.
- * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred
- * target phyint is chosen as follows,
- * 1. Pick any inactive standby interface.
- * 2. If no inactive standby is available, select any phyint in the
- * same group that has the least number of logints, (excluding
- * IFF_NOFAILOVER and !IFF_UP logints)
- * If we are failing over from a standby, failover_type is
- * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination.
- * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY,
- * and we won't return NULL, as long as there is at least 1 other phyint
- * in the group.
- */
-static struct phyint *
-get_failover_dst(struct phyint *pi, int failover_type)
-{
- struct phyint *maybe = NULL;
- struct phyint *pi2;
- struct phyint *last_choice = NULL;
-
- if (pi->pi_group == phyint_anongroup)
- return (NULL);
-
- /*
- * Loop thru the phyints in the group, and pick the preferred
- * phyint for the target.
- */
- for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
- /* Exclude ourself and offlined interfaces */
- if (pi2 == pi || pi2->pi_state == PI_OFFLINE)
- continue;
-
- /*
- * The chosen target phyint must have IPv4 instance
- * plumbed, if the src phyint has IPv4 plumbed. Similarly
- * for IPv6.
- */
- if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) ||
- (pi2->pi_v6 == NULL && pi->pi_v6 != NULL))
- continue;
-
- /* The chosen target must be PI_RUNNING. */
- if (pi2->pi_state != PI_RUNNING) {
- last_choice = pi2;
- continue;
- }
-
- if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) &&
- (failover_type != FAILOVER_TO_NONSTANDBY)) {
- return (pi2);
- } else {
- if (maybe == NULL)
- maybe = pi2;
- else if (logint_upcount(pi2) < logint_upcount(maybe))
- maybe = pi2;
- }
- }
- if (maybe == NULL && failover_type == FAILOVER_TO_ANY)
- return (last_choice);
- else
- return (maybe);
-}
-
-/*
* Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
*/
boolean_t
-change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl)
+change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear)
{
int ifsock;
struct lifreq lifr;
uint64_t old_flags;
- if (debug & D_FAILOVER) {
- logdebug("change_lif_flags(%s): flags %llx setfl %d\n",
- pi->pi_name, flags, (int)setfl);
+ if (debug & D_FAILREP) {
+ logdebug("change_pif_flags(%s): set %llx clear %llx\n",
+ pi->pi_name, set, clear);
}
- if (pi->pi_v4 != NULL) {
+ if (pi->pi_v4 != NULL)
ifsock = ifsock_v4;
- } else {
+ else
ifsock = ifsock_v6;
- }
/*
* Get the current flags from the kernel, and set/clear the
* desired phyint flags. Since we set only phyint flags, we can
* do it on either IPv4 or IPv6 instance.
*/
- (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
- lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
+ (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
+
if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
if (errno != ENXIO)
- logperror("change_lif_flags: ioctl (get flags)");
+ logperror("change_pif_flags: ioctl (get flags)");
return (_B_FALSE);
}
old_flags = lifr.lifr_flags;
- if (setfl)
- lifr.lifr_flags |= flags;
- else
- lifr.lifr_flags &= ~flags;
+ lifr.lifr_flags |= set;
+ lifr.lifr_flags &= ~clear;
if (old_flags == lifr.lifr_flags) {
/* No change in the flags. No need to send ioctl */
@@ -2846,7 +2577,7 @@ change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl)
if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
if (errno != ENXIO)
- logperror("change_lif_flags: ioctl (set flags)");
+ logperror("change_pif_flags: ioctl (set flags)");
return (_B_FALSE);
}
@@ -2854,15 +2585,13 @@ change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl)
* Keep pi_flags in synch. with actual flags. Assumes flags are
* phyint flags.
*/
- if (setfl)
- pi->pi_flags |= flags;
- else
- pi->pi_flags &= ~flags;
+ pi->pi_flags |= set;
+ pi->pi_flags &= ~clear;
- if (pi->pi_v4)
+ if (pi->pi_v4 != NULL)
pi->pi_v4->pii_flags = pi->pi_flags;
- if (pi->pi_v6)
+ if (pi->pi_v6 != NULL)
pi->pi_v6->pii_flags = pi->pi_flags;
return (_B_TRUE);
@@ -2928,18 +2657,31 @@ reset_snxt_basetimes(void)
* and it is up, it is not possible to detect the interface failure.
* SIOCTMYADDR also doesn't consider local zone address as own address.
* So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
- * are stored in laddr_list.
+ * are stored in `localaddrs'.
*/
-
boolean_t
own_address(struct in6_addr addr)
{
- struct local_addr *taddr = laddr_list;
+ addrlist_t *addrp;
+ struct sockaddr_storage ss;
+ int af = IN6_IS_ADDR_V4MAPPED(&addr) ? AF_INET : AF_INET6;
- for (; taddr != NULL; taddr = taddr->next) {
- if (IN6_ARE_ADDR_EQUAL(&addr, &taddr->addr)) {
+ addr2storage(af, &addr, &ss);
+ for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) {
+ if (sockaddrcmp(&ss, &addrp->al_addr))
return (_B_TRUE);
- }
}
return (_B_FALSE);
}
+
+/*
+ * Convert the nanosecond count `ns' to milliseconds by dividing it by
+ * NANOSEC / MILLISEC. The integer division discards any fractional
+ * millisecond, and the 64-bit quotient is narrowed to an int on return.
+ */
+static int
+ns2ms(int64_t ns)
+{
+ return (ns / (NANOSEC / MILLISEC));
+}
+
+/*
+ * Convert the timeval pointed to by `tvp' to a count of nanoseconds.
+ *
+ * Both fields are widened to int64_t before multiplying so that the
+ * arithmetic is always done in 64 bits, regardless of the width of
+ * tv_sec/tv_usec or the type of the NANOSEC constant. Without the
+ * widening, a build with 32-bit time_t and an int-typed NANOSEC would
+ * overflow for any tv_sec greater than 2.
+ */
+static int64_t
+tv2ns(struct timeval *tvp)
+{
+	int64_t sec = tvp->tv_sec;
+	int64_t usec = tvp->tv_usec;
+
+	return (sec * NANOSEC + usec * 1000);
+}
diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c
index b56648cf12..def08d39ce 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c
+++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "mpd_defs.h"
#include "mpd_tables.h"
@@ -47,11 +45,7 @@ static void phyint_inst_print(struct phyint_instance *pii);
static void phyint_insert(struct phyint *pi, struct phyint_group *pg);
static void phyint_delete(struct phyint *pi);
-
-static void phyint_group_insert(struct phyint_group *pg);
-static void phyint_group_delete(struct phyint_group *pg);
-static struct phyint_group *phyint_group_lookup(const char *pg_name);
-static struct phyint_group *phyint_group_create(const char *pg_name);
+static boolean_t phyint_is_usable(struct phyint *pi);
static void logint_print(struct logint *li);
static void logint_insert(struct phyint_instance *pii, struct logint *li);
@@ -68,16 +62,13 @@ static void reset_pii_probes(struct phyint_instance *pii, struct target *tg);
static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii);
static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii);
-static void ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask);
-static boolean_t prefix_equal(struct in6_addr p1, struct in6_addr p2,
- int prefix_len);
-
static int phyint_state_event(struct phyint_group *pg, struct phyint *pi);
static int phyint_group_state_event(struct phyint_group *pg);
static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t);
static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
ipmp_if_op_t op);
+static int logint_upcount(struct phyint *pi);
static uint64_t gensig(void);
/* Initialize any per-file global state. Returns 0 on success, -1 on failure */
@@ -110,6 +101,183 @@ phyint_lookup(const char *name)
return (pi);
}
+/*
+ * Lookup a phyint in the group that has the same hardware address as `pi', or
+ * NULL if there's none. If `online_only' is set, then only online phyints
+ * are considered when matching. Otherwise, phyints that had been offlined
+ * due to a duplicate hardware address will also be considered.
+ */
+static struct phyint *
+phyint_lookup_hwaddr(struct phyint *pi, boolean_t online_only)
+{
+	struct phyint_group *pg = pi->pi_group;
+	struct phyint *cand;
+
+	/* Members of the anonymous group are never matched. */
+	if (pg == phyint_anongroup)
+		return (NULL);
+
+	for (cand = pg->pg_phyint; cand != NULL; cand = cand->pi_pgnext) {
+		/* A phyint is never a duplicate of itself. */
+		if (cand == pi)
+			continue;
+
+		/*
+		 * An offline candidate only counts when the caller accepts
+		 * offline phyints (online_only is B_FALSE) and it was
+		 * offlined for being a dup. Administratively offlined
+		 * phyints are always skipped; they'll be flagged as dups,
+		 * if need be, when they're brought back online.
+		 */
+		if (cand->pi_state == PI_OFFLINE) {
+			if (online_only || !cand->pi_hwaddrdup)
+				continue;
+		}
+
+		/* Addresses of different lengths cannot match. */
+		if (cand->pi_hwaddrlen != pi->pi_hwaddrlen)
+			continue;
+
+		if (bcmp(cand->pi_hwaddr, pi->pi_hwaddr,
+		    pi->pi_hwaddrlen) == 0)
+			return (cand);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Respond to DLPI notifications. Currently, this only processes physical
+ * address changes for the phyint passed via `arg' by onlining or offlining
+ * phyints in the group.
+ */
+/* ARGSUSED */
+static void
+phyint_link_notify(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg)
+{
+ struct phyint *pi = arg;
+ struct phyint *oduppi = NULL, *duppi = NULL;
+
+ /* The notification must be one of the types recorded in pi_notes. */
+ assert((dnip->dni_note & pi->pi_notes) != 0);
+
+ /* Only hardware address changes are acted upon here. */
+ if (dnip->dni_note != DL_NOTE_PHYS_ADDR)
+ return;
+
+ /* The new address is copied into pi_hwaddr below; check it fits. */
+ assert(dnip->dni_physaddrlen <= DLPI_PHYSADDR_MAX);
+
+ /*
+ * If our hardware address hasn't changed, there's nothing to do.
+ */
+ if (pi->pi_hwaddrlen == dnip->dni_physaddrlen &&
+ bcmp(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen) == 0)
+ return;
+
+ /*
+ * Order matters: `oduppi' is looked up against our old address before
+ * it is overwritten, and `duppi' against the new address afterwards.
+ * Both lookups pass _B_FALSE for online_only, so phyints that were
+ * offlined as duplicates are considered too.
+ */
+ oduppi = phyint_lookup_hwaddr(pi, _B_FALSE);
+ pi->pi_hwaddrlen = dnip->dni_physaddrlen;
+ (void) memcpy(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen);
+ duppi = phyint_lookup_hwaddr(pi, _B_FALSE);
+
+ if (oduppi != NULL || pi->pi_hwaddrdup) {
+ /*
+ * Our old hardware address was a duplicate. If we'd been
+ * offlined because of it, and our new hardware address is not
+ * a duplicate, then bring us online. Otherwise, `oduppi'
+ * must've been the one brought offline; bring it online.
+ */
+ if (pi->pi_hwaddrdup) {
+ if (duppi == NULL)
+ (void) phyint_undo_offline(pi);
+ } else {
+ /* Reached only with oduppi != NULL (see outer test). */
+ assert(oduppi->pi_hwaddrdup);
+ (void) phyint_undo_offline(oduppi);
+ }
+ }
+
+ if (duppi != NULL && !pi->pi_hwaddrdup) {
+ /*
+ * Our new hardware address was a duplicate and we're not
+ * yet flagged as a duplicate; bring us offline.
+ */
+ pi->pi_hwaddrdup = _B_TRUE;
+ (void) phyint_offline(pi, 0);
+ }
+}
+
+/*
+ * Initialize information about the underlying link for `pi', and set us
+ * up to be notified about future changes. Returns _B_TRUE on success.
+ */
+boolean_t
+phyint_link_init(struct phyint *pi)
+{
+ int retval;
+ uint_t notes;
+ const char *errmsg;
+ dlpi_notifyid_t id;
+
+ /*
+ * Start with no notifications recorded; bits are added to pi_notes
+ * below only as each dlpi_enabnotify() call succeeds.
+ */
+ pi->pi_notes = 0;
+ retval = dlpi_open(pi->pi_name, &pi->pi_dh, 0);
+ if (retval != DLPI_SUCCESS) {
+ /* Make sure the `failed' path does not try to close the handle. */
+ pi->pi_dh = NULL;
+ errmsg = "cannot open";
+ goto failed;
+ }
+
+ /*
+ * NOTE(review): pi_hwaddrlen is presumably an in/out argument (buffer
+ * capacity in, actual address length out) -- confirm against
+ * dlpi_get_physaddr(3DLPI).
+ */
+ pi->pi_hwaddrlen = DLPI_PHYSADDR_MAX;
+ retval = dlpi_get_physaddr(pi->pi_dh, DL_CURR_PHYS_ADDR, pi->pi_hwaddr,
+ &pi->pi_hwaddrlen);
+ if (retval != DLPI_SUCCESS) {
+ errmsg = "cannot get hardware address";
+ goto failed;
+ }
+
+ retval = dlpi_bind(pi->pi_dh, DLPI_ANY_SAP, NULL);
+ if (retval != DLPI_SUCCESS) {
+ errmsg = "cannot bind to DLPI_ANY_SAP";
+ goto failed;
+ }
+
+ /*
+ * Check if the link supports DLPI link state notifications. For
+ * historical reasons, the actual changes are tracked through routing
+ * sockets, so we immediately disable the notification upon success.
+ */
+ notes = DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN;
+ retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
+ if (retval == DLPI_SUCCESS) {
+ (void) dlpi_disabnotify(pi->pi_dh, id, NULL);
+ pi->pi_notes |= notes;
+ }
+
+ /*
+ * Enable notification of hardware address changes to keep pi_hwaddr
+ * up-to-date and track if we need to offline/undo-offline phyints.
+ * DL_NOTE_PHYS_ADDR is recorded in pi_notes only if the DLPI file
+ * descriptor could also be added to the poll set.
+ */
+ notes = DL_NOTE_PHYS_ADDR;
+ retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
+ if (retval == DLPI_SUCCESS && poll_add(dlpi_fd(pi->pi_dh)) == 0)
+ pi->pi_notes |= notes;
+
+ /*
+ * Failing to enable either notification is not treated as an error;
+ * only the open, get-physaddr and bind failures reach `failed'.
+ */
+ return (_B_TRUE);
+failed:
+ logerr("%s: %s: %s\n", pi->pi_name, errmsg, dlpi_strerror(retval));
+ if (pi->pi_dh != NULL) {
+ dlpi_close(pi->pi_dh);
+ pi->pi_dh = NULL;
+ }
+ return (_B_FALSE);
+}
+
+/*
+ * Close use of link on `pi'.
+ */
+void
+phyint_link_close(struct phyint *pi)
+{
+	/*
+	 * The link may already be closed: phyint_offline() closes it for
+	 * phyints that are not hardware address duplicates, and
+	 * phyint_delete() later calls us unconditionally. Make the repeated
+	 * call an explicit no-op rather than handing a NULL handle to
+	 * libdlpi.
+	 */
+	if (pi->pi_dh == NULL)
+		return;
+
+	/*
+	 * If hardware address notifications were enabled, the DLPI file
+	 * descriptor was added to the poll set; remove it before the
+	 * descriptor goes away.
+	 */
+	if (pi->pi_notes & DL_NOTE_PHYS_ADDR) {
+		(void) poll_remove(dlpi_fd(pi->pi_dh));
+		pi->pi_notes &= ~DL_NOTE_PHYS_ADDR;
+	}
+
+	/*
+	 * NOTE: we don't clear the remaining (link state) bits of pi_notes
+	 * here so that iflinkstate() can still properly report the link
+	 * state even when offline (which is possible since we use
+	 * IFF_RUNNING to track link state).
+	 */
+	dlpi_close(pi->pi_dh);
+	pi->pi_dh = NULL;
+}
+
/* Return the phyint instance with the given name and the given family */
struct phyint_instance *
phyint_inst_lookup(int af, char *name)
@@ -128,7 +296,7 @@ phyint_inst_lookup(int af, char *name)
return (PHYINT_INSTANCE(pi, af));
}
-static struct phyint_group *
+struct phyint_group *
phyint_group_lookup(const char *pg_name)
{
struct phyint_group *pg;
@@ -173,6 +341,9 @@ phyint_insert(struct phyint *pi, struct phyint_group *pg)
pi->pi_pgnext->pi_pgprev = pi;
pg->pg_phyint = pi;
+ /* Refresh the group state now that this phyint has been added */
+ phyint_group_refresh_state(pg);
+
pg->pg_sig++;
(void) phyint_group_member_event(pg, pi, IPMP_IF_ADD);
}
@@ -214,24 +385,24 @@ phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex,
}
/*
- * Record the phyint values. Also insert the phyint into the
- * phyint group by calling phyint_insert().
+ * Record the phyint values.
*/
(void) strlcpy(pi->pi_name, pi_name, sizeof (pi->pi_name));
pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
pi->pi_ifindex = ifindex;
- pi->pi_icmpid =
- htons(((getpid() & 0xFF) << 8) | (pi->pi_ifindex & 0xFF));
+ pi->pi_icmpid = htons(((getpid() & 0xFF) << 8) | (ifindex & 0xFF));
+
/*
- * We optimistically start in the PI_RUNNING state. Later (in
- * process_link_state_changes()), we will readjust this to match the
+ * If the interface is offline, we set the state to PI_OFFLINE.
+ * Otherwise, we optimistically start in the PI_RUNNING state. Later
+ * (in process_link_state_changes()), we will adjust this to match the
* current state of the link. Further, if test addresses are
* subsequently assigned, we will transition to PI_NOTARGETS and then
- * either PI_RUNNING or PI_FAILED, depending on the result of the test
- * probes.
+ * to either PI_RUNNING or PI_FAILED depending on the probe results.
*/
- pi->pi_state = PI_RUNNING;
+ pi->pi_state = (flags & IFF_OFFLINE) ? PI_OFFLINE : PI_RUNNING;
pi->pi_flags = PHYINT_FLAGS(flags);
+
/*
* Initialise the link state. The link state is initialised to
* up, so that if the link is down when IPMP starts monitoring
@@ -241,19 +412,17 @@ phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex,
*/
INIT_LINK_STATE(pi);
+ if (!phyint_link_init(pi)) {
+ free(pi);
+ return (NULL);
+ }
+
/*
* Insert the phyint in the list of all phyints, and the
* list of phyint group members
*/
phyint_insert(pi, pg);
- /*
- * If we are joining a failed group, mark the interface as
- * failed.
- */
- if (GROUP_FAILED(pg))
- (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
-
return (pi);
}
@@ -313,15 +482,14 @@ phyint_chstate(struct phyint *pi, enum pi_state state)
return;
pi->pi_state = state;
- pi->pi_group->pg_sig++;
- (void) phyint_state_event(pi->pi_group, pi);
+ phyint_changed(pi);
}
/*
- * Note that the type of phyint `pi' has changed.
+ * Note that `pi' has changed state.
*/
void
-phyint_newtype(struct phyint *pi)
+phyint_changed(struct phyint *pi)
{
pi->pi_group->pg_sig++;
(void) phyint_state_event(pi->pi_group, pi);
@@ -331,7 +499,7 @@ phyint_newtype(struct phyint *pi)
* Insert the phyint group in the linked list of all phyint groups
* at the head of the list
*/
-static void
+void
phyint_group_insert(struct phyint_group *pg)
{
pg->pg_next = phyint_groups;
@@ -347,7 +515,7 @@ phyint_group_insert(struct phyint_group *pg)
/*
* Create a new phyint group called 'name'.
*/
-static struct phyint_group *
+struct phyint_group *
phyint_group_create(const char *name)
{
struct phyint_group *pg;
@@ -363,9 +531,16 @@ phyint_group_create(const char *name)
(void) strlcpy(pg->pg_name, name, sizeof (pg->pg_name));
pg->pg_sig = gensig();
-
pg->pg_fdt = user_failure_detection_time;
pg->pg_probeint = user_probe_interval;
+ pg->pg_in_use = _B_TRUE;
+
+ /*
+ * Normal groups always start in the PG_FAILED state since they
+ * have no active interfaces. In contrast, anonymous groups are
+ * heterogeneous and thus always PG_OK.
+ */
+ pg->pg_state = (name[0] == '\0' ? PG_OK : PG_FAILED);
return (pg);
}
@@ -378,10 +553,20 @@ phyint_group_chstate(struct phyint_group *pg, enum pg_state state)
{
assert(pg != phyint_anongroup);
+ /*
+ * To simplify things, some callers always set a given state
+ * regardless of the previous state of the group (e.g., setting
+ * PG_DEGRADED when it's already set). We shouldn't bother
+ * generating an event or consuming a signature for these, since
+ * the actual state of the group is unchanged.
+ */
+ if (pg->pg_state == state)
+ return;
+
+ pg->pg_state = state;
+
switch (state) {
case PG_FAILED:
- pg->pg_groupfailed = 1;
-
/*
* We can never know with certainty that a group has
* failed. It is possible that all known targets have
@@ -392,16 +577,15 @@ phyint_group_chstate(struct phyint_group *pg, enum pg_state state)
* hosts, we have to discover it by multicast. So flush
* all the host targets. The next probe will send out a
* multicast echo request. If this is a group failure, we
- * will still not see any response, otherwise we will
- * clear the pg_groupfailed flag after we get
- * NUM_PROBE_REPAIRS consecutive unicast replies on any
- * phyint.
+ * will still not see any response, otherwise the group
+ * will be repaired after we get NUM_PROBE_REPAIRS
+ * consecutive unicast replies on any phyint.
*/
target_flush_hosts(pg);
break;
- case PG_RUNNING:
- pg->pg_groupfailed = 0;
+ case PG_OK:
+ case PG_DEGRADED:
break;
default:
@@ -432,7 +616,6 @@ phyint_inst_init_from_k(int af, char *pi_name)
struct lifreq lifr;
struct phyint *pi;
struct phyint_instance *pii;
- boolean_t pg_created;
boolean_t pi_created;
struct phyint_group *pg;
@@ -441,7 +624,6 @@ retry:
pi = NULL;
pg = NULL;
pi_created = _B_FALSE;
- pg_created = _B_FALSE;
if (debug & D_PHYINT) {
logdebug("phyint_inst_init_from_k(%s %s)\n",
@@ -454,11 +636,11 @@ retry:
ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
/*
- * Get the interface flags. Ignore loopback and multipoint
- * interfaces.
+ * Get the interface flags. Ignore virtual interfaces, IPMP
+ * meta-interfaces, point-to-point interfaces, and interfaces
+ * that can't support multicast.
*/
- (void) strncpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name));
- lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
+ (void) strlcpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name));
if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
if (errno != ENXIO) {
logperror("phyint_inst_init_from_k:"
@@ -467,7 +649,8 @@ retry:
return (NULL);
}
flags = lifr.lifr_flags;
- if (!(flags & IFF_MULTICAST) || (flags & IFF_LOOPBACK))
+ if (!(flags & IFF_MULTICAST) ||
+ (flags & (IFF_VIRTUAL|IFF_IPMP|IFF_POINTOPOINT)))
return (NULL);
/*
@@ -493,8 +676,7 @@ retry:
}
return (NULL);
}
- (void) strncpy(pg_name, lifr.lifr_groupname, sizeof (pg_name));
- pg_name[sizeof (pg_name) - 1] = '\0';
+ (void) strlcpy(pg_name, lifr.lifr_groupname, sizeof (pg_name));
/*
* If the phyint is not part of any group, pg_name is the
@@ -503,12 +685,13 @@ retry:
*/
if (pg_name[0] == '\0' && !track_all_phyints) {
/*
- * If the IFF_FAILED or IFF_OFFLINE flags are set, reset
- * them. These flags shouldn't be set if IPMP isn't
- * tracking the interface.
+ * If the IFF_FAILED, IFF_INACTIVE, or IFF_OFFLINE flags are
+ * set, reset them. These flags shouldn't be set if in.mpathd
+ * isn't tracking the interface.
*/
- if ((flags & (IFF_FAILED | IFF_OFFLINE)) != 0) {
- lifr.lifr_flags = flags & ~(IFF_FAILED | IFF_OFFLINE);
+ if ((flags & (IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE))) {
+ lifr.lifr_flags = flags &
+ ~(IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE);
if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
if (errno != ENXIO) {
logperror("phyint_inst_init_from_k:"
@@ -520,21 +703,20 @@ retry:
}
/*
- * We need to create a new phyint instance. A phyint instance
- * belongs to a phyint, and the phyint belongs to a phyint group.
- * So we first lookup the 'parents' and if they don't exist then
- * we create them.
+ * We need to create a new phyint instance. We may also need to
+ * create the group if e.g. the SIOCGLIFCONF loop in initifs() found
+ * an underlying interface before it found its IPMP meta-interface.
+ * Note that we keep any created groups even if
+ * phyint_inst_init_from_k() fails since a group's existence is not
+ * dependent on the ability of in.mpathd to track the group's interfaces.
*/
- pg = phyint_group_lookup(pg_name);
- if (pg == NULL) {
- pg = phyint_group_create(pg_name);
- if (pg == NULL) {
- logerr("phyint_inst_init_from_k:"
- " unable to create group %s\n", pg_name);
+ if ((pg = phyint_group_lookup(pg_name)) == NULL) {
+ if ((pg = phyint_group_create(pg_name)) == NULL) {
+ logerr("phyint_inst_init_from_k: cannot create group "
+ "%s\n", pg_name);
return (NULL);
}
phyint_group_insert(pg);
- pg_created = _B_TRUE;
}
/*
@@ -546,8 +728,6 @@ retry:
if (pi == NULL) {
logerr("phyint_inst_init_from_k:"
" unable to create phyint %s\n", pi_name);
- if (pg_created)
- phyint_group_delete(pg);
return (NULL);
}
pi_created = _B_TRUE;
@@ -564,8 +744,6 @@ retry:
* while we are yet to update our tables. Do it now.
*/
if (pi->pi_ifindex != ifindex) {
- if (pg_created)
- phyint_group_delete(pg);
phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af)));
goto retry;
}
@@ -577,9 +755,6 @@ retry:
* changed, while we are yet to update our tables. Do it now.
*/
if (strcmp(pi->pi_group->pg_name, pg_name) != 0) {
- if (pg_created)
- phyint_group_delete(pg);
- restore_phyint(pi);
phyint_inst_delete(PHYINT_INSTANCE(pi,
AF_OTHER(af)));
goto retry;
@@ -594,16 +769,25 @@ retry:
if (pii == NULL) {
logerr("phyint_inst_init_from_k: unable to create"
"phyint inst %s\n", pi->pi_name);
- if (pi_created) {
- /*
- * Deleting the phyint will delete the phyint group
- * if this is the last phyint in the group.
- */
+ if (pi_created)
phyint_delete(pi);
- }
+
return (NULL);
}
+ if (pi_created) {
+ /*
+ * If this phyint does not have a unique hardware address in its
+ * group, offline it. (The change_pif_flags() implementation
+ * requires that we defer this until after the phyint_instance
+ * is created.)
+ */
+ if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
+ pi->pi_hwaddrdup = _B_TRUE;
+ (void) phyint_offline(pi, 0);
+ }
+ }
+
return (pii);
}
@@ -677,16 +861,16 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii)
{
icmp6_filter_t filter;
int hopcount = 1;
- int int_op;
+ int off = 0;
+ int on = 1;
struct sockaddr_in6 testaddr;
/*
* Open a raw socket with ICMPv6 protocol.
*
- * Use IPV6_DONTFAILOVER_IF to make sure that probes go out
- * on the specified phyint only, and are not subject to load
- * balancing. Bind to the src address chosen will ensure that
- * the responses are received only on the specified phyint.
+ * Use IPV6_BOUND_IF to make sure that probes are sent and received on
+ * the specified phyint only. Bind to the test address to ensure that
+ * the responses are sent to the specified phyint.
*
* Set the hopcount to 1 so that probe packets are not routed.
* Disable multicast loopback. Set the receive filter to
@@ -696,7 +880,7 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii)
if (pii->pii_probe_sock < 0) {
logperror_pii(pii, "phyint_inst_v6_sockinit: socket");
return (_B_FALSE);
-}
+ }
bzero(&testaddr, sizeof (testaddr));
testaddr.sin6_family = AF_INET6;
@@ -709,14 +893,17 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii)
return (_B_FALSE);
}
- /*
- * IPV6_DONTFAILOVER_IF option takes precedence over setting
- * IP_MULTICAST_IF. So we don't set IPV6_MULTICAST_IF again.
- */
- if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_DONTFAILOVER_IF,
+ if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_IF,
(char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) {
logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
- " IPV6_DONTFAILOVER_IF");
+ " IPV6_MULTICAST_IF");
+ return (_B_FALSE);
+ }
+
+ if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_BOUND_IF,
+ &pii->pii_ifindex, sizeof (uint_t)) < 0) {
+ logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
+ " IPV6_BOUND_IF");
return (_B_FALSE);
}
@@ -734,9 +921,8 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii)
return (_B_FALSE);
}
- int_op = 0; /* used to turn off option */
if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
- (char *)&int_op, sizeof (int_op)) < 0) {
+ (char *)&off, sizeof (off)) < 0) {
logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
" IPV6_MULTICAST_LOOP");
return (_B_FALSE);
@@ -755,15 +941,22 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii)
return (_B_FALSE);
}
- /* Enable receipt of ancillary data */
- int_op = 1;
+ /* Enable receipt of hoplimit */
if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT,
- (char *)&int_op, sizeof (int_op)) < 0) {
+ &on, sizeof (on)) < 0) {
logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
" IPV6_RECVHOPLIMIT");
return (_B_FALSE);
}
+ /* Enable receipt of timestamp */
+ if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP,
+ &on, sizeof (on)) < 0) {
+ logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
+ " SO_TIMESTAMP");
+ return (_B_FALSE);
+ }
+
return (_B_TRUE);
}
@@ -775,20 +968,20 @@ static boolean_t
phyint_inst_v4_sockinit(struct phyint_instance *pii)
{
struct sockaddr_in testaddr;
- char char_op;
+ char char_off = 0;
int ttl = 1;
char char_ttl = 1;
+ int on = 1;
/*
* Open a raw socket with ICMPv4 protocol.
*
- * Use IP_DONTFAILOVER_IF to make sure that probes go out
- * on the specified phyint only, and are not subject to load
- * balancing. Bind to the src address chosen will ensure that
- * the responses are received only on the specified phyint.
+ * Use IP_BOUND_IF to make sure that probes are sent and received on
+ * the specified phyint only. Bind to the test address to ensure that
+ * the responses are sent to the specified phyint.
*
* Set the ttl to 1 so that probe packets are not routed.
- * Disable multicast loopback.
+ * Disable multicast loopback. Enable receipt of timestamp.
*/
pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP);
if (pii->pii_probe_sock < 0) {
@@ -808,14 +1001,17 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii)
return (_B_FALSE);
}
- /*
- * IP_DONTFAILOVER_IF option takes precedence over setting
- * IP_MULTICAST_IF. So we don't set IP_MULTICAST_IF again.
- */
- if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_DONTFAILOVER_IF,
+ if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_BOUND_IF,
+ &pii->pii_ifindex, sizeof (uint_t)) < 0) {
+ logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
+ " IP_BOUND_IF");
+ return (_B_FALSE);
+ }
+
+ if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_IF,
(char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) {
logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
- " IP_DONTFAILOVER");
+ " IP_MULTICAST_IF");
return (_B_FALSE);
}
@@ -826,9 +1022,8 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii)
return (_B_FALSE);
}
- char_op = 0; /* used to turn off option */
if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP,
- (char *)&char_op, sizeof (char_op)) == -1) {
+ (char *)&char_off, sizeof (char_off)) == -1) {
logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
" IP_MULTICAST_LOOP");
return (_B_FALSE);
@@ -841,6 +1036,13 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii)
return (_B_FALSE);
}
+ if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP, &on,
+ sizeof (on)) < 0) {
+ logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
+ " SO_TIMESTAMP");
+ return (_B_FALSE);
+ }
+
return (_B_TRUE);
}
@@ -848,7 +1050,7 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii)
* Remove the phyint group from the list of 'all phyint groups'
* and free it.
*/
-static void
+void
phyint_group_delete(struct phyint_group *pg)
{
/*
@@ -881,10 +1083,69 @@ phyint_group_delete(struct phyint_group *pg)
phyint_grouplistsig++;
(void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE);
+ addrlist_free(&pg->pg_addrs);
free(pg);
}
/*
+ * Refresh the state of `pg' based on its current members.
+ */
+void
+phyint_group_refresh_state(struct phyint_group *pg)
+{
+	enum pg_state state;
+	enum pg_state origstate = pg->pg_state;
+	/*
+	 * `usablepi' is only read when at least one usable phyint was found,
+	 * but it is initialized anyway so that the invariant does not have
+	 * to be proven to the compiler or lint.
+	 */
+	struct phyint *pi, *usablepi = NULL;
+	uint_t nif = 0, nusable = 0;
+
+	/*
+	 * Anonymous groups never change state.
+	 */
+	if (pg == phyint_anongroup)
+		return;
+
+	/*
+	 * Count the group's members and how many of them are usable,
+	 * remembering the last usable one visited for the log message below.
+	 */
+	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
+		nif++;
+		if (phyint_is_usable(pi)) {
+			nusable++;
+			usablepi = pi;
+		}
+	}
+
+	/*
+	 * No usable members (including an empty group) means failed; all
+	 * members usable means OK; anything in between is degraded.
+	 */
+	if (nusable == 0)
+		state = PG_FAILED;
+	else if (nif == nusable)
+		state = PG_OK;
+	else
+		state = PG_DEGRADED;
+
+	phyint_group_chstate(pg, state);
+
+	/*
+	 * If we're shutting down, skip logging messages since otherwise our
+	 * shutdown housecleaning will make us report that groups are unusable.
+	 */
+	if (cleanup_started)
+		return;
+
+	/*
+	 * NOTE: We use pg_failmsg_printed rather than origstate since
+	 * otherwise at startup we'll log a "now usable" message when the
+	 * first usable phyint is added to an empty group.
+	 */
+	if (state != PG_FAILED && pg->pg_failmsg_printed) {
+		assert(origstate == PG_FAILED);
+		logerr("At least 1 IP interface (%s) in group %s is now "
+		    "usable\n", usablepi->pi_name, pg->pg_name);
+		pg->pg_failmsg_printed = _B_FALSE;
+	} else if (origstate != PG_FAILED && state == PG_FAILED) {
+		logerr("All IP interfaces in group %s are now unusable\n",
+		    pg->pg_name);
+		pg->pg_failmsg_printed = _B_TRUE;
+	}
+}
+
+/*
* Extract information from the kernel about the desired phyint.
* Look only for properties of the phyint and not properties of logints.
* Take appropriate action on the changes.
@@ -998,28 +1259,16 @@ phyint_inst_update_from_k(struct phyint_instance *pii)
if (pi->pi_v6 != NULL)
pi->pi_v6->pii_flags = pi->pi_flags;
+ /*
+ * Make sure the IFF_FAILED flag is set if and only if we think
+ * the interface should be failed.
+ */
if (pi->pi_flags & IFF_FAILED) {
- /*
- * If we are in the running and full state, we have
- * completed failbacks successfully and we would have
- * expected IFF_FAILED to have been clear. That it is
- * set means there was a race condition. Some other
- * process turned on the IFF_FAILED flag. Since the
- * flag setting is not atomic, i.e. a get ioctl followed
- * by a set ioctl, and since there is no way to set an
- * individual flag bit, this could have occurred.
- */
- if (pi->pi_state == PI_RUNNING && pi->pi_full)
- (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
+ if (pi->pi_state == PI_RUNNING)
+ (void) change_pif_flags(pi, 0, IFF_FAILED);
} else {
- /*
- * If we are in the failed state, there was a race.
- * we have completed failover successfully because our
- * state is failed and empty. Some other process turned
- * off the IFF_FAILED flag. Same comment as above
- */
- if (pi->pi_state == PI_FAILED && pi->pi_empty)
- (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
+ if (pi->pi_state == PI_FAILED)
+ (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
}
/* No change in phyint status */
@@ -1028,12 +1277,12 @@ phyint_inst_update_from_k(struct phyint_instance *pii)
/*
* Delete the phyint. Remove it from the list of all phyints, and the
- * list of phyint group members. If the group becomes empty, delete the
- * group also.
+ * list of phyint group members.
*/
static void
phyint_delete(struct phyint *pi)
{
+ struct phyint *pi2;
struct phyint_group *pg = pi->pi_group;
if (debug & D_PHYINT)
@@ -1065,6 +1314,9 @@ phyint_delete(struct phyint *pi)
pi->pi_pgnext = NULL;
pi->pi_pgprev = NULL;
+ /* Refresh the group state now that this phyint has been removed */
+ phyint_group_refresh_state(pg);
+
/* Remove the phyint from the global list of phyints */
if (pi->pi_prev == NULL) {
/* Phyint is the 1st in the list */
@@ -1077,11 +1329,153 @@ phyint_delete(struct phyint *pi)
pi->pi_next = NULL;
pi->pi_prev = NULL;
+ /*
+ * See if another phyint in the group had been offlined because
+ * it was a dup of `pi' -- and if so, online it.
+ */
+ if (!pi->pi_hwaddrdup &&
+ (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
+ assert(pi2->pi_hwaddrdup);
+ (void) phyint_undo_offline(pi2);
+ }
+ phyint_link_close(pi);
free(pi);
+}
+
+/*
+ * Offline phyint `pi' if at least `minred' usable interfaces remain in the
+ * group. Returns an IPMP error code.
+ */
+int
+phyint_offline(struct phyint *pi, uint_t minred)
+{
+ unsigned int nusable = 0;
+ struct phyint *pi2;
+ struct phyint_group *pg = pi->pi_group;
+
+ /*
+ * Verify that enough usable interfaces in the group would remain.
+ * As a special case, if the group has failed, allow any non-offline
+ * phyints to be offlined.
+ */
+ if (pg != phyint_anongroup) {
+ for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
+ if (pi2 == pi)
+ continue;
+ if (phyint_is_usable(pi2) ||
+ (GROUP_FAILED(pg) && pi2->pi_state != PI_OFFLINE))
+ nusable++;
+ }
+ }
+ /*
+ * For the anonymous group the loop above is skipped and nusable stays
+ * 0, so any nonzero `minred' is refused there.
+ */
+ if (nusable < minred)
+ return (IPMP_EMINRED);
+
+ if (!change_pif_flags(pi, IFF_OFFLINE, 0))
+ return (IPMP_FAILURE);
+
+ /*
+ * The interface is now offline, so stop probing it. Note that
+ * if_mpadm(1M) will down the test addresses, after receiving a
+ * success reply from us. The routing socket message will then make us
+ * close the socket used for sending probes. But it is more logical
+ * that an offlined interface must not be probed, even if it has test
+ * addresses.
+ *
+ * NOTE: stop_probing() also sets PI_OFFLINE.
+ */
+ stop_probing(pi);
+
+ /*
+ * If we're offlining the phyint because it has a duplicate hardware
+ * address, print a warning -- and leave the link open so that we can
+ * be notified of hardware address changes that make it usable again.
+ * Otherwise, close the link so that we won't prevent a detach.
+ */
+ if (pi->pi_hwaddrdup) {
+ logerr("IP interface %s has a hardware address which is not "
+ "unique in group %s; offlining\n", pi->pi_name,
+ pg->pg_name);
+ } else {
+ phyint_link_close(pi);
+ }
+
+ /*
+ * If this phyint was preventing another phyint with a duplicate
+ * hardware address from being online, bring that one online now.
+ */
+ if (!pi->pi_hwaddrdup &&
+ (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
+ assert(pi2->pi_hwaddrdup);
+ (void) phyint_undo_offline(pi2);
+ }
- /* Delete the phyint_group if the last phyint has been deleted */
- if (pg->pg_phyint == NULL)
- phyint_group_delete(pg);
+ /*
+ * If this interface was active, try to activate another INACTIVE
+ * interface in the group.
+ */
+ if (!(pi->pi_flags & IFF_INACTIVE))
+ phyint_activate_another(pi);
+
+ return (IPMP_SUCCESS);
+}
+
+/*
+ * Undo a previous offline of `pi'. Returns an IPMP error code.
+ */
+int
+phyint_undo_offline(struct phyint *pi)
+{
+ /* Only a phyint that is currently PI_OFFLINE can be brought back. */
+ if (pi->pi_state != PI_OFFLINE) {
+ errno = EINVAL;
+ return (IPMP_FAILURE);
+ }
+
+ /*
+ * If necessary, reinitialize our link information and verify that its
+ * hardware address is still unique across the group.
+ */
+ if (pi->pi_dh == NULL && !phyint_link_init(pi)) {
+ errno = EIO;
+ return (IPMP_FAILURE);
+ }
+
+ /*
+ * An online phyint in the group already has our hardware address:
+ * flag ourselves as a duplicate and stay offline. The link is not
+ * closed on this path.
+ */
+ if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
+ pi->pi_hwaddrdup = _B_TRUE;
+ return (IPMP_EHWADDRDUP);
+ }
+
+ if (pi->pi_hwaddrdup) {
+ logerr("IP interface %s now has a unique hardware address in "
+ "group %s; onlining\n", pi->pi_name, pi->pi_group->pg_name);
+ pi->pi_hwaddrdup = _B_FALSE;
+ }
+
+ if (!change_pif_flags(pi, 0, IFF_OFFLINE))
+ return (IPMP_FAILURE);
+
+ /*
+ * While the interface was offline, it may have failed (e.g. the link
+ * may have gone down). phyint_inst_check_for_failure() will have
+ * already set pi_flags with IFF_FAILED, so we can use that to decide
+ * whether the phyint should transition to running. Note that after
+ * we transition to running, we will start sending probes again (if
+ * test addresses are configured), which may also reveal that the
+ * interface is in fact failed.
+ */
+ if (pi->pi_flags & IFF_FAILED) {
+ phyint_chstate(pi, PI_FAILED);
+ } else {
+ /* calls phyint_chstate() */
+ phyint_transition_to_running(pi);
+ }
+
+ /*
+ * Give the requestor time to configure test addresses before
+ * complaining that they're missing.
+ */
+ pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
+
+ return (IPMP_SUCCESS);
}
/*
@@ -1166,11 +1560,10 @@ phyint_inst_print(struct phyint_instance *pii)
}
logdebug("\nPhyint instance: %s %s index %u state %x flags %llx "
- "sock %x in_use %d empty %x full %x\n",
+ "sock %x in_use %d\n",
AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex,
pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock,
- pii->pii_in_use, pii->pii_phyint->pi_empty,
- pii->pii_phyint->pi_full);
+ pii->pii_in_use);
for (li = pii->pii_logint; li != NULL; li = li->li_next)
logint_print(li);
@@ -1211,9 +1604,11 @@ phyint_inst_print(struct phyint_instance *pii)
} else {
logdebug("#%d target NULL ", i);
}
- logdebug("time_sent %u status %d time_ack/lost %u\n",
- pii->pii_probes[i].pr_time_sent,
+ logdebug("time_start %lld status %d "
+ "time_ackproc %lld time_lost %u",
+ pii->pii_probes[i].pr_hrtime_start,
pii->pii_probes[i].pr_status,
+ pii->pii_probes[i].pr_hrtime_ackproc,
pii->pii_probes[i].pr_time_lost);
i = PROBE_INDEX_PREV(i);
} while (i != most_recent);
@@ -1293,7 +1688,6 @@ logint_init_from_k(struct phyint_instance *pii, char *li_name)
struct logint *li;
struct lifreq lifr;
struct in6_addr test_subnet;
- struct in6_addr test_subnet_mask;
struct in6_addr testaddr;
int test_subnet_len;
struct sockaddr_in6 *sin6;
@@ -1373,55 +1767,21 @@ logint_init_from_k(struct phyint_instance *pii, char *li_name)
testaddr = sin6->sin6_addr;
}
- if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
- ptp = _B_TRUE;
- if (ioctl(ifsock, SIOCGLIFDSTADDR, (char *)&lifr) < 0) {
- if (errno != ENXIO) {
- logperror_li(li, "logint_init_from_k:"
- " (get dstaddr)");
- }
- goto error;
- }
- if (pii->pii_af == AF_INET) {
- sin = (struct sockaddr_in *)&lifr.lifr_addr;
- IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &tgaddr);
- } else {
- sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
- tgaddr = sin6->sin6_addr;
- }
- } else {
- if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) {
- /* Interface may have vanished */
- if (errno != ENXIO) {
- logperror_li(li, "logint_init_from_k:"
- " (get subnet)");
- }
- goto error;
- }
- if (lifr.lifr_subnet.ss_family == AF_INET6) {
- sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet;
- test_subnet = sin6->sin6_addr;
- test_subnet_len = lifr.lifr_addrlen;
- } else {
- sin = (struct sockaddr_in *)&lifr.lifr_subnet;
- IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet);
- test_subnet_len = lifr.lifr_addrlen +
- (IPV6_ABITS - IP_ABITS);
- }
- (void) ip_index_to_mask_v6(test_subnet_len, &test_subnet_mask);
- }
-
- /*
- * Also record the OINDEX for completeness. This information is
- * not used.
- */
- if (ioctl(ifsock, SIOCGLIFOINDEX, (char *)&lifr) < 0) {
- if (errno != ENXIO) {
- logperror_li(li, "logint_init_from_k:"
- " (get lifoindex)");
- }
+ if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) {
+ /* Interface may have vanished */
+ if (errno != ENXIO)
+ logperror_li(li, "logint_init_from_k: (get subnet)");
goto error;
}
+ if (lifr.lifr_subnet.ss_family == AF_INET6) {
+ sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet;
+ test_subnet = sin6->sin6_addr;
+ test_subnet_len = lifr.lifr_addrlen;
+ } else {
+ sin = (struct sockaddr_in *)&lifr.lifr_subnet;
+ IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet);
+ test_subnet_len = lifr.lifr_addrlen + (IPV6_ABITS - IP_ABITS);
+ }
/*
* If this is the logint corresponding to the test address used for
@@ -1454,7 +1814,6 @@ logint_init_from_k(struct phyint_instance *pii, char *li_name)
/* Update the logint with the values obtained from the kernel. */
li->li_addr = testaddr;
li->li_in_use = 1;
- li->li_oifindex = lifr.lifr_index;
if (ptp) {
li->li_dstaddr = tgaddr;
li->li_subnet_len = (pii->pii_af == AF_INET) ?
@@ -1530,15 +1889,12 @@ static void
logint_print(struct logint *li)
{
char abuf[INET6_ADDRSTRLEN];
- int af;
-
- af = li->li_phyint_inst->pii_af;
+ int af = li->li_phyint_inst->pii_af;
logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name,
pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len);
- logdebug("\tFlags: %llx in_use %d oifindex %d\n",
- li->li_flags, li->li_in_use, li->li_oifindex);
+ logdebug("\tFlags: %llx in_use %d\n", li->li_flags, li->li_in_use);
}
char *
@@ -1555,6 +1911,33 @@ pr_addr(int af, struct in6_addr addr, char *abuf, int len)
return (abuf);
}
+/*
+ * Convert the [`af',`addr'] pair into a sockaddr of the matching family and
+ * store it in `ssp'.  in.mpathd keeps every address internally as an
+ * in6_addr (IPv4 addresses in v4-mapped form); this routine keeps that
+ * representation from leaking to consumers.  `af' must be AF_INET or
+ * AF_INET6.
+ */
+void
+addr2storage(int af, const struct in6_addr *addr, struct sockaddr_storage *ssp)
+{
+	assert(af == AF_INET || af == AF_INET6);
+
+	if (af == AF_INET) {
+		struct sockaddr_in *v4 = (struct sockaddr_in *)ssp;
+
+		(void) memset(v4, 0, sizeof (*v4));
+		v4->sin_family = AF_INET;
+		/* Pull the IPv4 address back out of its v4-mapped form. */
+		IN6_V4MAPPED_TO_INADDR(addr, &v4->sin_addr);
+	} else if (af == AF_INET6) {
+		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)ssp;
+
+		(void) memset(v6, 0, sizeof (*v6));
+		v6->sin6_family = AF_INET6;
+		v6->sin6_addr = *addr;
+	}
+}
+
/* Lookup target on its address */
struct target *
target_lookup(struct phyint_instance *pii, struct in6_addr addr)
@@ -1686,7 +2069,7 @@ target_select_best(struct phyint_instance *pii)
if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
slow_recovered = tg;
/*
- * Promote the slow_recoverd to unused
+ * Promote the slow_recovered to unused
*/
tg->tg_status = TG_UNUSED;
} else {
@@ -1698,7 +2081,7 @@ target_select_best(struct phyint_instance *pii)
if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
dead_recovered = tg;
/*
- * Promote the dead_recoverd to slow
+ * Promote the dead_recovered to slow
*/
tg->tg_status = TG_SLOW;
tg->tg_latime = now;
@@ -1798,11 +2181,9 @@ target_create(struct phyint_instance *pii, struct in6_addr addr,
/*
* If there are multiple subnets associated with an interface, then
- * add the target to this phyint instance, only if it belongs to the
- * same subnet as the test address. The reason is that interface
- * routes derived from non-test-addresses i.e. non-IFF_NOFAILOVER
- * addresses, will disappear after failover, and the targets will not
- * be reachable from this interface.
+ * add the target to this phyint instance only if it belongs to the
+ * same subnet as the test address. This assures us that we will
+ * be able to reach this target through our routing table.
*/
if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len))
return;
@@ -1906,11 +2287,12 @@ target_add(struct phyint_instance *pii, struct in6_addr addr,
/*
* If the target does not exist, create it; target_create() will set
- * tg_in_use to true. If it exists already, and it is a router
- * target, set tg_in_use to to true, so that init_router_targets()
- * won't delete it
+ * tg_in_use to true. Even if it exists already, if it's a router
+ * target and we'd previously learned of it through multicast, then we
+ * need to recreate it as a router target. Otherwise, just set
+ * tg_in_use to true so that init_router_targets() won't delete it.
*/
- if (tg == NULL)
+ if (tg == NULL || (is_router && !pii->pii_targets_are_routers))
target_create(pii, addr, is_router);
else if (is_router)
tg->tg_in_use = 1;
@@ -2034,16 +2416,17 @@ target_delete(struct target *tg)
* relevant any longer.
*/
assert(pii->pii_targets == NULL);
+ pii->pii_targets_are_routers = _B_FALSE;
clear_pii_probe_stats(pii);
pii_other = phyint_inst_other(pii);
/*
- * If there are no targets on both instances and the interface is
- * online, go back to PI_NOTARGETS state, since we cannot probe this
- * phyint any more. For more details, please see phyint state
- * diagram in mpd_probe.c.
+ * If there are no targets on both instances and the interface would
+ * otherwise be considered PI_RUNNING, go back to PI_NOTARGETS state,
+ * since we cannot probe this phyint any more. For more details,
+ * please see phyint state diagram in mpd_probe.c.
*/
- if (!PROBE_CAPABLE(pii_other) &&
+ if (!PROBE_CAPABLE(pii_other) && LINK_UP(pii->pii_phyint) &&
pii->pii_phyint->pi_state != PI_OFFLINE)
phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
}
@@ -2101,9 +2484,11 @@ reset_pii_probes(struct phyint_instance *pii, struct target *tg)
for (i = 0; i < PROBE_STATS_COUNT; i++) {
if (pii->pii_probes[i].pr_target == tg) {
+ if (pii->pii_probes[i].pr_status == PR_UNACKED) {
+ probe_chstate(&pii->pii_probes[i], pii,
+ PR_LOST);
+ }
pii->pii_probes[i].pr_target = NULL;
- if (pii->pii_probes[i].pr_status == PR_UNACKED)
- pii->pii_probes[i].pr_status = PR_LOST;
}
}
@@ -2132,7 +2517,7 @@ target_print(struct target *tg)
af = tg->tg_phyint_inst->pii_af;
logdebug("Target on %s %s addr %s\n"
- "status %d rtt_sa %d rtt_sd %d crtt %d tg_in_use %d\n",
+ "status %d rtt_sa %lld rtt_sd %lld crtt %d tg_in_use %d\n",
AF_STR(af), tg->tg_phyint_inst->pii_name,
pr_addr(af, tg->tg_address, abuf, sizeof (abuf)),
tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd,
@@ -2158,35 +2543,16 @@ phyint_inst_print_all(void)
}
/*
- * Convert length for a mask to the mask.
- */
-static void
-ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask)
-{
- int j;
-
- assert(masklen <= IPV6_ABITS);
- bzero((char *)bitmask, sizeof (*bitmask));
-
- /* Make the 'masklen' leftmost bits one */
- for (j = 0; masklen > 8; masklen -= 8, j++)
- bitmask->s6_addr[j] = 0xff;
-
- bitmask->s6_addr[j] = 0xff << (8 - masklen);
-
-}
-
-/*
* Compare two prefixes that have the same prefix length.
* Fails if the prefix length is unreasonable.
*/
-static boolean_t
-prefix_equal(struct in6_addr p1, struct in6_addr p2, int prefix_len)
+boolean_t
+prefix_equal(struct in6_addr p1, struct in6_addr p2, uint_t prefix_len)
{
uchar_t mask;
int j;
- if (prefix_len < 0 || prefix_len > IPV6_ABITS)
+ if (prefix_len > IPV6_ABITS)
return (_B_FALSE);
for (j = 0; prefix_len > 8; prefix_len -= 8, j++)
@@ -2202,35 +2568,25 @@ prefix_equal(struct in6_addr p1, struct in6_addr p2, int prefix_len)
}
/*
- * Get the number of UP logints (excluding IFF_NOFAILOVERs), on both
- * IPv4 and IPv6 put together. The phyint with the least such number
- * will be used as the failover destination, if no standby interface is
- * available
+ * Get the number of UP logints on phyint `pi'.
*/
-int
+static int
logint_upcount(struct phyint *pi)
{
struct logint *li;
- struct phyint_instance *pii;
int count = 0;
- pii = pi->pi_v4;
- if (pii != NULL) {
- for (li = pii->pii_logint; li != NULL; li = li->li_next) {
- if ((li->li_flags &
- (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) {
+ if (pi->pi_v4 != NULL) {
+ for (li = pi->pi_v4->pii_logint; li != NULL; li = li->li_next) {
+ if (li->li_flags & IFF_UP)
count++;
- }
}
}
- pii = pi->pi_v6;
- if (pii != NULL) {
- for (li = pii->pii_logint; li != NULL; li = li->li_next) {
- if ((li->li_flags &
- (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) {
+ if (pi->pi_v6 != NULL) {
+ for (li = pi->pi_v6->pii_logint; li != NULL; li = li->li_next) {
+ if (li->li_flags & IFF_UP)
count++;
- }
}
}
@@ -2250,6 +2606,28 @@ phyint_inst_other(struct phyint_instance *pii)
}
/*
+ * Check whether a phyint is functioning: it is either PI_RUNNING, or it is
+ * PI_NOTARGETS and has not been flagged IFF_FAILED.
+ */
+static boolean_t
+phyint_is_functioning(struct phyint *pi)
+{
+	if (pi->pi_state == PI_RUNNING)
+		return (_B_TRUE);
+
+	if (pi->pi_state != PI_NOTARGETS)
+		return (_B_FALSE);
+
+	/* No targets to probe; fall back on the IFF_FAILED flag. */
+	return ((pi->pi_flags & IFF_FAILED) ? _B_FALSE : _B_TRUE);
+}
+
+/*
+ * Check whether a phyint is usable: it must have at least one UP logint and
+ * must also be functioning.
+ */
+static boolean_t
+phyint_is_usable(struct phyint *pi)
+{
+	if (logint_upcount(pi) != 0)
+		return (phyint_is_functioning(pi));
+
+	return (_B_FALSE);
+}
+
+/*
* Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'.
* Before sending the event, it prepends the current version of the IPMP
* sysevent API. Returns 0 on success, -1 on failure (in either case,
@@ -2258,16 +2636,18 @@ phyint_inst_other(struct phyint_instance *pii)
static int
post_event(const char *subclass, nvlist_t *nvl)
{
- sysevent_id_t eid;
+ static evchan_t *evchp = NULL;
/*
- * Since sysevents don't work yet in non-global zones, there cannot
- * possibly be any consumers yet, so don't bother trying to generate
- * them. (Otherwise, we'll spew warnings.)
+ * Initialize the event channel if we haven't already done so.
*/
- if (getzoneid() != GLOBAL_ZONEID) {
- nvlist_free(nvl);
- return (0);
+ if (evchp == NULL) {
+ errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &evchp, EVCH_CREAT);
+ if (errno != 0) {
+ logerr("cannot create event channel `%s': %s\n",
+ IPMP_EVENT_CHAN, strerror(errno));
+ goto failed;
+ }
}
errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION,
@@ -2278,8 +2658,9 @@ post_event(const char *subclass, nvlist_t *nvl)
goto failed;
}
- if (sysevent_post_event(EC_IPMP, (char *)subclass, SUNW_VENDOR,
- "in.mpathd", nvl, &eid) == -1) {
+ errno = sysevent_evc_publish(evchp, EC_IPMP, subclass, "com.sun",
+ "in.mpathd", nvl, EVCH_NOSLEEP);
+ if (errno != 0) {
logerr("cannot send `%s' event: %s\n", subclass,
strerror(errno));
goto failed;
@@ -2300,6 +2681,8 @@ ifstate(struct phyint *pi)
{
switch (pi->pi_state) {
case PI_NOTARGETS:
+ if (pi->pi_flags & IFF_FAILED)
+ return (IPMP_IF_FAILED);
return (IPMP_IF_UNKNOWN);
case PI_OFFLINE:
@@ -2330,12 +2713,203 @@ iftype(struct phyint *pi)
}
/*
+ * Map phyint `pi' to its external IPMP link state.  The state is unknown
+ * unless at least one of the DLPI link up/down notifications is enabled.
+ */
+static ipmp_if_linkstate_t
+iflinkstate(struct phyint *pi)
+{
+	if ((pi->pi_notes & (DL_NOTE_LINK_UP|DL_NOTE_LINK_DOWN)) == 0)
+		return (IPMP_LINK_UNKNOWN);
+
+	if (LINK_DOWN(pi))
+		return (IPMP_LINK_DOWN);
+
+	return (IPMP_LINK_UP);
+}
+
+/*
+ * Map phyint `pi' to its external IPMP probe state.  The checks are made in
+ * this order: probing disabled on both instances, phyint in PI_FAILED,
+ * neither instance probe-capable, and otherwise OK.
+ */
+static ipmp_if_probestate_t
+ifprobestate(struct phyint *pi)
+{
+	if (!(PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)))
+		return (IPMP_PROBE_DISABLED);
+
+	if (pi->pi_state == PI_FAILED)
+		return (IPMP_PROBE_FAILED);
+
+	if (PROBE_CAPABLE(pi->pi_v4) || PROBE_CAPABLE(pi->pi_v6))
+		return (IPMP_PROBE_OK);
+
+	return (IPMP_PROBE_UNKNOWN);
+}
+
+/*
+ * Map phyint instance `pii' to its external IPMP target mode: disabled when
+ * probing is not enabled, routes when pii_targets_are_routers is set, and
+ * multicast otherwise.
+ */
+static ipmp_if_targmode_t
+iftargmode(struct phyint_instance *pii)
+{
+	if (!PROBE_ENABLED(pii))
+		return (IPMP_TARG_DISABLED);
+
+	return (pii->pii_targets_are_routers ?
+	    IPMP_TARG_ROUTES : IPMP_TARG_MULTICAST);
+}
+
+/*
+ * Compute the external IPMP flags for phyint `pi'.  IPMP_IFFLAG_ACTIVE is
+ * reported only when none of the other flags apply and the phyint is
+ * functioning.
+ */
+static ipmp_if_flags_t
+ifflags(struct phyint *pi)
+{
+	ipmp_if_flags_t result = 0;
+
+	if (pi->pi_hwaddrdup)
+		result |= IPMP_IFFLAG_HWADDRDUP;
+	if (pi->pi_flags & IFF_INACTIVE)
+		result |= IPMP_IFFLAG_INACTIVE;
+	if (logint_upcount(pi) == 0)
+		result |= IPMP_IFFLAG_DOWN;
+
+	/* Active is mutually exclusive with every flag set above. */
+	if (result == 0 && phyint_is_functioning(pi))
+		result = IPMP_IFFLAG_ACTIVE;
+
+	return (result);
+}
+
+/*
+ * Store the test address used on phyint instance `pii' in `ssp' and return
+ * `ssp'.  If probing is not enabled on `pii' there is no test address, and
+ * the unspecified address in6addr_any is stored instead, tagged as AF_INET6
+ * regardless of the instance's own address family.
+ */
+static struct sockaddr_storage *
+iftestaddr(struct phyint_instance *pii, struct sockaddr_storage *ssp)
+{
+	if (!PROBE_ENABLED(pii)) {
+		addr2storage(AF_INET6, &in6addr_any, ssp);
+		return (ssp);
+	}
+
+	addr2storage(pii->pii_af, &pii->pii_probe_logint->li_addr, ssp);
+	return (ssp);
+}
+
+/*
* Return the external IPMP group state associated with phyint group `pg'.
*/
static ipmp_group_state_t
groupstate(struct phyint_group *pg)
{
- return (GROUP_FAILED(pg) ? IPMP_GROUP_FAILED : IPMP_GROUP_OK);
+	/* Each internal PG_* state maps to exactly one IPMP_GROUP_* state. */
+	if (pg->pg_state == PG_OK)
+		return (IPMP_GROUP_OK);
+	if (pg->pg_state == PG_DEGRADED)
+		return (IPMP_GROUP_DEGRADED);
+	if (pg->pg_state == PG_FAILED)
+		return (IPMP_GROUP_FAILED);
+
+	/* Any other value means our state is corrupt; do not continue. */
+	logerr("groupstate: unknown state %d; aborting\n", pg->pg_state);
+	abort();
+	/* NOTREACHED */
+}
+
+/*
+ * Map probe `ps' to its external IPMP probe state.  Both PR_UNUSED and
+ * PR_LOST are reported as IPMP_PROBE_LOST.
+ */
+static ipmp_probe_state_t
+probestate(struct probe_stats *ps)
+{
+	if (ps->pr_status == PR_UNUSED || ps->pr_status == PR_LOST)
+		return (IPMP_PROBE_LOST);
+	if (ps->pr_status == PR_UNACKED)
+		return (IPMP_PROBE_SENT);
+	if (ps->pr_status == PR_ACKED)
+		return (IPMP_PROBE_ACKED);
+
+	/* Any other value means our state is corrupt; do not continue. */
+	logerr("probestate: unknown state %d; aborting\n", ps->pr_status);
+	abort();
+	/* NOTREACHED */
+}
+
+/*
+ * Generate an ESC_IPMP_PROBE_STATE sysevent for the probe described by `pr'
+ * on phyint instance `pii'.  Returns 0 on success, -1 on failure.
+ *
+ * The probe may not have a target (pr_target == NULL).  In that case the
+ * target address is reported as the unspecified address, and the target's
+ * RTT average and deviation are reported as zero.
+ */
+int
+probe_state_event(struct probe_stats *pr, struct phyint_instance *pii)
+{
+	nvlist_t *nvl;
+	hrtime_t proc_time = 0, recv_time = 0;
+	int64_t rttavg = 0, rttdev = 0;
+	struct sockaddr_storage ss;
+	struct target *tg = pr->pr_target;
+
+	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
+	if (errno != 0) {
+		logperror("cannot create `probe state' event");
+		return (-1);
+	}
+
+	errno = nvlist_add_uint32(nvl, IPMP_PROBE_ID, pr->pr_id);
+	if (errno != 0)
+		goto failed;
+
+	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pii->pii_phyint->pi_name);
+	if (errno != 0)
+		goto failed;
+
+	errno = nvlist_add_uint32(nvl, IPMP_PROBE_STATE, probestate(pr));
+	if (errno != 0)
+		goto failed;
+
+	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_START_TIME,
+	    pr->pr_hrtime_start);
+	if (errno != 0)
+		goto failed;
+
+	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_SENT_TIME,
+	    pr->pr_hrtime_sent);
+	if (errno != 0)
+		goto failed;
+
+	/* The ack timestamps are reported as zero unless the probe was acked. */
+	if (pr->pr_status == PR_ACKED) {
+		recv_time = pr->pr_hrtime_ackrecv;
+		proc_time = pr->pr_hrtime_ackproc;
+	}
+
+	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKRECV_TIME, recv_time);
+	if (errno != 0)
+		goto failed;
+
+	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKPROC_TIME, proc_time);
+	if (errno != 0)
+		goto failed;
+
+	/*
+	 * Everything derived from the target must be guarded by the same
+	 * NULL check.  tg_rtt_sa and tg_rtt_sd are stored scaled, so divide
+	 * them down before publishing.
+	 */
+	if (tg != NULL) {
+		addr2storage(pii->pii_af, &tg->tg_address, &ss);
+		rttavg = tg->tg_rtt_sa / 8;
+		rttdev = tg->tg_rtt_sd / 4;
+	} else {
+		addr2storage(pii->pii_af, &in6addr_any, &ss);
+	}
+
+	errno = nvlist_add_byte_array(nvl, IPMP_PROBE_TARGET, (uchar_t *)&ss,
+	    sizeof (ss));
+	if (errno != 0)
+		goto failed;
+
+	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTAVG, rttavg);
+	if (errno != 0)
+		goto failed;
+
+	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTDEV, rttdev);
+	if (errno != 0)
+		goto failed;
+
+	return (post_event(ESC_IPMP_PROBE_STATE, nvl));
+failed:
+	logperror("cannot create `probe state' event");
+	nvlist_free(nvl);
+	return (-1);
}
/*
@@ -2529,10 +3103,15 @@ gensig(void)
unsigned int
getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp)
{
- struct phyint_group *pg;
struct phyint *pi;
+ struct phyint_group *pg;
char (*ifs)[LIFNAMSIZ];
- unsigned int nif, i;
+ unsigned int i, j;
+ unsigned int nif = 0, naddr = 0;
+ lifgroupinfo_t lifgr;
+ addrlist_t *addrp;
+ struct sockaddr_storage *addrs;
+ int fdt = 0;
pg = phyint_group_lookup(grname);
if (pg == NULL)
@@ -2540,39 +3119,143 @@ getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp)
/*
* Tally up the number of interfaces, allocate an array to hold them,
- * and insert their names into the array.
+ * and insert their names into the array. While we're at it, if any
+ * interface is actually enabled to send probes, save the group fdt.
*/
- for (nif = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext)
+ for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext)
nif++;
ifs = alloca(nif * sizeof (*ifs));
for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) {
assert(i < nif);
(void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ);
+ if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6))
+ fdt = pg->pg_fdt;
}
assert(i == nif);
- *grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig,
- groupstate(pg), nif, ifs);
+ /*
+ * If this is the anonymous group, there's no other information to
+ * collect (since there's no IPMP interface).
+ */
+ if (pg == phyint_anongroup) {
+ *grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
+ groupstate(pg), nif, ifs, "", "", "", "", 0, NULL);
+ return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
+ }
+
+ /*
+ * Grab some additional information about the group from the kernel.
+ * (NOTE: since SIOCGLIFGROUPINFO does not look up by interface name,
+ * we can use ifsock_v4 even for a V6-only group.)
+ */
+ (void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ);
+ if (ioctl(ifsock_v4, SIOCGLIFGROUPINFO, &lifgr) == -1) {
+ if (errno == ENOENT)
+ return (IPMP_EUNKGROUP);
+
+ logperror("getgroupinfo: SIOCGLIFGROUPINFO");
+ return (IPMP_FAILURE);
+ }
+
+ /*
+ * Tally up the number of data addresses, allocate an array to hold
+ * them, and insert their values into the array.
+ */
+ for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next)
+ naddr++;
+
+ addrs = alloca(naddr * sizeof (*addrs));
+ i = 0;
+ for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
+ /*
+ * It's possible to have duplicate addresses (if some are
+ * down). Weed the dups out to avoid confusing consumers.
+ * (If groups start having tons of addresses, we'll need a
+ * better algorithm here.)
+ */
+ for (j = 0; j < i; j++) {
+ if (sockaddrcmp(&addrs[j], &addrp->al_addr))
+ break;
+ }
+ if (j == i) {
+ assert(i < naddr);
+ addrs[i++] = addrp->al_addr;
+ }
+ }
+ naddr = i;
+
+ *grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
+ groupstate(pg), nif, ifs, lifgr.gi_grifname, lifgr.gi_m4ifname,
+ lifgr.gi_m6ifname, lifgr.gi_bcifname, naddr, addrs);
return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
}
/*
+ * Store the target information associated with phyint instance `pii' into a
+ * dynamically allocated structure pointed to by `*targinfopp'. Returns an
+ * IPMP error code.
+ */
+unsigned int
+gettarginfo(struct phyint_instance *pii, const char *name,
+ ipmp_targinfo_t **targinfopp)
+{
+ uint_t ntarg = 0;
+ struct target *tg;
+ struct sockaddr_storage ss;
+ struct sockaddr_storage *targs = NULL;
+
+ if (PROBE_CAPABLE(pii)) {
+ targs = alloca(pii->pii_ntargets * sizeof (*targs));
+ tg = pii->pii_target_next;
+ do {
+ if (tg->tg_status == TG_ACTIVE) {
+ assert(ntarg < pii->pii_ntargets);
+ addr2storage(pii->pii_af, &tg->tg_address,
+ &targs[ntarg++]);
+ }
+ if ((tg = tg->tg_next) == NULL)
+ tg = pii->pii_targets;
+ } while (tg != pii->pii_target_next);
+
+ assert(ntarg == pii->pii_ntargets);
+ }
+
+ *targinfopp = ipmp_targinfo_create(name, iftestaddr(pii, &ss),
+ iftargmode(pii), ntarg, targs);
+ return (*targinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
+}
+
+/*
* Store the information associated with interface `ifname' into a dynamically
* allocated structure pointed to by `*ifinfopp'. Returns an IPMP error code.
*/
unsigned int
getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp)
{
+	int retval;
struct phyint *pi;
+	/*
+	 * Both must start out NULL: if the first gettarginfo() call fails the
+	 * second one is never made, yet the cleanup code at `out' examines
+	 * both pointers.
+	 */
+	ipmp_targinfo_t *targinfo4 = NULL;
+	ipmp_targinfo_t *targinfo6 = NULL;
pi = phyint_lookup(ifname);
if (pi == NULL)
return (IPMP_EUNKIF);
+	/* Gather the per-instance target information (IPv4, then IPv6). */
+	if ((retval = gettarginfo(pi->pi_v4, pi->pi_name, &targinfo4)) != 0 ||
+	    (retval = gettarginfo(pi->pi_v6, pi->pi_name, &targinfo6)) != 0)
+		goto out;
+
*ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name,
- ifstate(pi), iftype(pi));
- return (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
+	    ifstate(pi), iftype(pi), iflinkstate(pi), ifprobestate(pi),
+	    ifflags(pi), targinfo4, targinfo6);
+	retval = (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
+out:
+	/* The target info is released on the success and failure paths alike. */
+	if (targinfo4 != NULL)
+		ipmp_freetarginfo(targinfo4);
+	if (targinfo6 != NULL)
+		ipmp_freetarginfo(targinfo6);
+	return (retval);
}
/*
@@ -2605,6 +3288,54 @@ getgrouplist(ipmp_grouplist_t **grlistpp)
}
/*
+ * Look up data address `ssp' in group `grname' and store its address
+ * information into a dynamically allocated structure pointed to by
+ * `*adinfopp'.  Returns an IPMP error code; IPMP_EUNKADDR is returned both
+ * when the group is unknown and when the address is not found in it.  (This
+ * would be named getaddrinfo(), but that would conflict with
+ * getaddrinfo(3SOCKET)).
+ */
+unsigned int
+getgraddrinfo(const char *grname, struct sockaddr_storage *ssp,
+    ipmp_addrinfo_t **adinfopp)
+{
+	struct phyint_group *pg;
+	addrlist_t *cur, *match = NULL;
+	ipmp_addr_state_t state;
+	struct lifreq lifr;
+	const char *binding = "";
+	int ifsock;
+
+	pg = phyint_group_lookup(grname);
+	if (pg == NULL)
+		return (IPMP_EUNKADDR);
+
+	/*
+	 * Several data addresses may match since some may be down.  Keep the
+	 * most recent match, but stop at the first one that is up, since an
+	 * up address is preferred when one exists.
+	 */
+	for (cur = pg->pg_addrs; cur != NULL; cur = cur->al_next) {
+		if (!sockaddrcmp(ssp, &cur->al_addr))
+			continue;
+
+		match = cur;
+		if (cur->al_flags & IFF_UP)
+			break;
+	}
+
+	if (match == NULL)
+		return (IPMP_EUNKADDR);
+
+	if (match->al_flags & IFF_UP) {
+		state = IPMP_ADDR_UP;
+
+		/*
+		 * Only an up address has its binding queried; if the ioctl
+		 * fails the binding is left as the empty string.
+		 */
+		ifsock = (ssp->ss_family == AF_INET) ? ifsock_v4 : ifsock_v6;
+		(void) strlcpy(lifr.lifr_name, match->al_name, LIFNAMSIZ);
+		if (ioctl(ifsock, SIOCGLIFBINDING, &lifr) >= 0)
+			binding = lifr.lifr_binding;
+	} else {
+		state = IPMP_ADDR_DOWN;
+	}
+
+	*adinfopp = ipmp_addrinfo_create(ssp, state, pg->pg_name, binding);
+	if (*adinfopp == NULL)
+		return (IPMP_ENOMEM);
+
+	return (IPMP_SUCCESS);
+}
+
+/*
* Store a snapshot of the IPMP subsystem into a dynamically allocated
* structure pointed to by `*snapp'. Returns an IPMP error code.
*/
@@ -2613,10 +3344,12 @@ getsnap(ipmp_snap_t **snapp)
{
ipmp_grouplist_t *grlistp;
ipmp_groupinfo_t *grinfop;
+ ipmp_addrinfo_t *adinfop;
+ ipmp_addrlist_t *adlistp;
ipmp_ifinfo_t *ifinfop;
ipmp_snap_t *snap;
struct phyint *pi;
- unsigned int i;
+ unsigned int i, j;
int retval;
snap = ipmp_snap_create();
@@ -2627,26 +3360,37 @@ getsnap(ipmp_snap_t **snapp)
* Add group list.
*/
retval = getgrouplist(&snap->sn_grlistp);
- if (retval != IPMP_SUCCESS) {
- ipmp_snap_free(snap);
- return (retval);
- }
+ if (retval != IPMP_SUCCESS)
+ goto failed;
/*
- * Add information for each group in the list.
+ * Add information for each group in the list, along with all of its
+ * data addresses.
*/
grlistp = snap->sn_grlistp;
for (i = 0; i < grlistp->gl_ngroup; i++) {
retval = getgroupinfo(grlistp->gl_groups[i], &grinfop);
- if (retval != IPMP_SUCCESS) {
- ipmp_snap_free(snap);
- return (retval);
- }
+ if (retval != IPMP_SUCCESS)
+ goto failed;
+
retval = ipmp_snap_addgroupinfo(snap, grinfop);
if (retval != IPMP_SUCCESS) {
ipmp_freegroupinfo(grinfop);
- ipmp_snap_free(snap);
- return (retval);
+ goto failed;
+ }
+
+ adlistp = grinfop->gr_adlistp;
+ for (j = 0; j < adlistp->al_naddr; j++) {
+ retval = getgraddrinfo(grinfop->gr_name,
+ &adlistp->al_addrs[j], &adinfop);
+ if (retval != IPMP_SUCCESS)
+ goto failed;
+
+ retval = ipmp_snap_addaddrinfo(snap, adinfop);
+ if (retval != IPMP_SUCCESS) {
+ ipmp_freeaddrinfo(adinfop);
+ goto failed;
+ }
}
}
@@ -2655,18 +3399,19 @@ getsnap(ipmp_snap_t **snapp)
*/
for (pi = phyints; pi != NULL; pi = pi->pi_next) {
retval = getifinfo(pi->pi_name, &ifinfop);
- if (retval != IPMP_SUCCESS) {
- ipmp_snap_free(snap);
- return (retval);
- }
+ if (retval != IPMP_SUCCESS)
+ goto failed;
+
retval = ipmp_snap_addifinfo(snap, ifinfop);
if (retval != IPMP_SUCCESS) {
ipmp_freeifinfo(ifinfop);
- ipmp_snap_free(snap);
- return (retval);
+ goto failed;
}
}
*snapp = snap;
return (IPMP_SUCCESS);
+failed:
+ ipmp_snap_free(snap);
+ return (retval);
}
diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h
index e4be3ccb30..39da2c3f1b 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h
+++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _MPD_TABLES_H
#define _MPD_TABLES_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -47,20 +45,11 @@ extern "C" {
* switch AND
* (ii) share the same phyint group name.
* Load spreading and failover occur across members of the same phyint group.
- * phyint group members must be homogenous. i.e. if a phyint belonging to a
+ * phyint group members must be homogeneous. i.e. if a phyint belonging to a
* phyint group has a IPv6 protocol instance, then all members of the phyint
* group, must have IPv6 protocol instances. (struct phyint_group)
*/
-/*
- * Parameter passed to try_failover(), indicating the type of failover
- * that is requested.
- */
-#define FAILOVER_NORMAL 1 /* Failover to another phyint */
- /* that is preferably a standby */
-#define FAILOVER_TO_NONSTANDBY 2 /* Failover to non-standby phyint */
-#define FAILOVER_TO_ANY 3 /* Failover to any available phyint */
-
#define MAXDEFERREDRTT 1 /* Maximum number of deferred rtts */
/*
@@ -79,15 +68,9 @@ extern "C" {
#define PI_IOCTL_ERROR 4 /* Some ioctl error */
#define PI_GROUP_CHANGED 5 /* The phyint has changed group. */
-/*
- * Though IFF_POINTOPOINT is a logint property, for the purpose of
- * failover, we treat it as a phyint property. Note that we cannot failover
- * individual logints.
- */
+/*
+ * Mask `flags' down to IFF_STANDBY, IFF_INACTIVE, IFF_FAILED, IFF_OFFLINE and
+ * IFF_RUNNING.  If handle_link_notifications is zero, IFF_RUNNING is always
+ * set in the result.
+ */
#define PHYINT_FLAGS(flags) \
- (((flags) & (IFF_STANDBY | IFF_INACTIVE | IFF_FAILED | IFF_OFFLINE | \
- IFF_POINTOPOINT | IFF_RUNNING)) | (handle_link_notifications ? \
- 0 : IFF_RUNNING))
+ (((flags) & (IFF_STANDBY | IFF_INACTIVE | IFF_FAILED | IFF_OFFLINE | \
+ IFF_RUNNING)) | (handle_link_notifications ? 0 : IFF_RUNNING))
/* A Phyint can have up to 2 instances, the IPv4 and the IPv6 instance */
#define PHYINT_INSTANCE(pi, af) \
@@ -152,29 +135,32 @@ extern "C" {
* Phyint group states; see below for the phyint group definition.
*/
enum pg_state {
- PG_RUNNING = 1, /* at least one interface in group is working */
- PG_FAILED = 2 /* group has failed completely */
+ /* Only PG_OK is given an explicit value; the others follow consecutively. */
+ PG_OK = 1, /* all interfaces in the group are working */
+ PG_DEGRADED, /* some interfaces in the group are unusable */
+ PG_FAILED /* all interfaces in the group are unusable */
};
/*
* Convenience macro to check if the whole group has failed.
*/
-#define GROUP_FAILED(pg) ((pg)->pg_groupfailed)
+#define GROUP_FAILED(pg) ((pg)->pg_state == PG_FAILED)
/*
* A doubly linked list of all phyint groups in the system.
* A phyint group is identified by its group name.
*/
struct phyint_group {
- char pg_name[LIFNAMSIZ + 1]; /* Phyint group name */
+ char pg_name[LIFGRNAMSIZ]; /* Phyint group name */
struct phyint *pg_phyint; /* List of phyints in this group */
struct phyint_group *pg_next; /* Next phyint group */
struct phyint_group *pg_prev; /* Prev phyint group */
- uint64_t pg_sig; /* Current signature of this group */
- int pg_probeint; /* Interval between probes */
- int pg_fdt; /* Time needed to detect failure */
- uint_t
- pg_groupfailed : 1; /* The whole group has failed */
+ uint64_t pg_sig; /* Current signature of this group */
+ int pg_probeint; /* Interval between probes */
+ int pg_fdt; /* Time needed to detect failure */
+ /* pg_state is the field that GROUP_FAILED() compares against PG_FAILED */
+ enum pg_state pg_state; /* Current group state */
+ boolean_t pg_in_use; /* To detect removed groups */
+ struct addrlist *pg_addrs; /* Data addresses in this group */
+ boolean_t pg_failmsg_printed; /* Group failure msg printed */
};
/*
@@ -207,6 +193,11 @@ struct phyint {
uint16_t pi_icmpid; /* icmp id in icmp echo request */
uint64_t pi_taddrthresh; /* time (in secs) to delay logging */
/* about missing test addresses */
+ dlpi_handle_t pi_dh; /* DLPI handle to underlying link */
+ uint_t pi_notes; /* enabled DLPI notifications */
+ uchar_t pi_hwaddr[DLPI_PHYSADDR_MAX]; /* phyint's hw address */
+ size_t pi_hwaddrlen; /* phyint's hw address length */
+
/*
* The pi_whenup array is a circular buffer of the most recent
* times (in milliseconds since some arbitrary point of time in
@@ -217,14 +208,12 @@ struct phyint {
unsigned int pi_whendx;
uint_t
- pi_empty : 1, /* failover done, empty */
- pi_full : 1, /* failback done, full */
- /* More details in probe.c */
pi_taddrmsg_printed : 1, /* testaddr msg printed */
pi_duptaddrmsg_printed : 1, /* dup testaddr msg printed */
pi_cfgmsg_printed : 1, /* bad config msg printed */
pi_lfmsg_printed : 1, /* link-flapping msg printed */
- pi_link_state : 1; /* interface link state */
+ pi_link_state : 1, /* interface link state */
+ pi_hwaddrdup : 1; /* disabled due to dup hw address */
};
/*
@@ -260,19 +249,19 @@ struct phyint_instance {
uint64_t pii_flags; /* Phyint flags from kernel */
struct probe_stats {
- struct target *pr_target; /* Probe Target */
- uint_t pr_time_sent; /* Time probe was sent */
+ uint_t pr_id; /* Full ID of probe */
+ struct target *pr_target; /* Probe Target */
+ uint_t pr_time_lost; /* Time probe declared lost */
+ struct timeval pr_tv_sent; /* Wall time probe was sent */
+ hrtime_t pr_hrtime_start; /* hrtime probe op started */
+ hrtime_t pr_hrtime_sent; /* hrtime probe was sent */
+ hrtime_t pr_hrtime_ackrecv; /* hrtime probe ack received */
+ hrtime_t pr_hrtime_ackproc; /* hrtime probe ack processed */
uint_t pr_status; /* probe status as below */
#define PR_UNUSED 0 /* Probe slot unused */
#define PR_UNACKED 1 /* Probe is unacknowledged */
#define PR_ACKED 2 /* Probe has been acknowledged */
#define PR_LOST 3 /* Probe is declared lost */
- union {
- uint_t tl; /* time probe is declared lost */
- uint_t ta; /* time probe is acked */
- } prt;
-#define pr_time_lost prt.tl
-#define pr_time_acked prt.ta
} pii_probes[PROBE_STATS_COUNT];
uint_t
@@ -319,7 +308,6 @@ struct logint {
struct in6_addr li_subnet; /* prefix / subnet */
uint_t li_subnet_len; /* prefix / subnet length */
uint64_t li_flags; /* IFF_* flags */
- uint_t li_oifindex; /* original ifindex (SIOCGLIFOINDEX) */
uint_t
li_in_use : 1, /* flag to detect deleted logints */
li_dupaddr : 1; /* test address is not unique */
@@ -345,12 +333,12 @@ struct target {
#define TG_DEAD 4 /* Target is not responding */
hrtime_t tg_latime; /* Target's last active time */
- int tg_rtt_sa; /* Scaled round trip time(RTT) avg. */
- int tg_rtt_sd; /* Scaled RTT deviation */
- int tg_crtt; /* Conservative RTT = A + 4D */
+ int64_t tg_rtt_sa; /* Scaled RTT average (in ns) */
+ int64_t tg_rtt_sd; /* Scaled RTT deviation (in ns) */
+ int tg_crtt; /* Conservative RTT = A + 4D (in ms) */
uint32_t
tg_in_use : 1; /* In use flag */
- int tg_deferred[MAXDEFERREDRTT + 1];
+ int64_t tg_deferred[MAXDEFERREDRTT + 1];
/* Deferred rtt data points */
int tg_num_deferred;
/* Number of deferred rtt data points */
@@ -393,19 +381,20 @@ struct probe_success_count
struct probes_missed
{
uint_t pm_nprobes; /* Cumulative number of missed probes */
- uint_t pm_ntimes; /* Total number of occassions */
+ uint_t pm_ntimes; /* Total number of occasions */
};
-struct local_addr
-{
- struct in6_addr addr;
- struct local_addr *next;
-};
+typedef struct addrlist {
+ struct addrlist *al_next; /* next address */
+ char al_name[LIFNAMSIZ]; /* address lif name */
+ uint64_t al_flags; /* address flags */
+ struct sockaddr_storage al_addr; /* address */
+} addrlist_t;
/*
* Globals
*/
-extern struct local_addr *laddr_list;
+extern addrlist_t *localaddrs;
/* List of all local addresses, including local zones */
extern struct phyint *phyints; /* List of all phyints */
extern struct phyint_group *phyint_groups; /* List of all phyint groups */
@@ -428,10 +417,19 @@ extern void phyint_inst_delete(struct phyint_instance *pii);
extern uint_t phyint_inst_timer(struct phyint_instance *pii);
extern boolean_t phyint_inst_sockinit(struct phyint_instance *pii);
-extern void phyint_newtype(struct phyint *pi);
+extern void phyint_changed(struct phyint *pi);
extern void phyint_chstate(struct phyint *pi, enum pi_state state);
extern void phyint_group_chstate(struct phyint_group *pg, enum pg_state state);
+extern struct phyint_group *phyint_group_create(const char *pg_name);
+extern struct phyint_group *phyint_group_lookup(const char *pg_name);
+extern void phyint_group_insert(struct phyint_group *pg);
+extern void phyint_group_delete(struct phyint_group *pg);
+extern void phyint_group_refresh_state(struct phyint_group *pg);
extern void phyint_check_for_repair(struct phyint *pi);
+extern void phyint_transition_to_running(struct phyint *pi);
+extern void phyint_activate_another(struct phyint *pi);
+extern int phyint_offline(struct phyint *pi, unsigned int);
+extern int phyint_undo_offline(struct phyint *pi);
extern void logint_init_from_k(struct phyint_instance *pii, char *li_name);
extern void logint_delete(struct logint *li);
@@ -448,34 +446,40 @@ extern void target_add(struct phyint_instance *pii, struct in6_addr addr,
extern void in_data(struct phyint_instance *pii);
extern void in6_data(struct phyint_instance *pii);
-extern int try_failover(struct phyint *pi, int failover_type);
-extern int try_failback(struct phyint *pi);
-extern int do_failback(struct phyint *pi);
-extern boolean_t change_lif_flags(struct phyint *pi, uint64_t flags,
- boolean_t setfl);
-
extern void logperror_pii(struct phyint_instance *pii, const char *str);
extern void logperror_li(struct logint *li, const char *str);
extern char *pr_addr(int af, struct in6_addr addr, char *abuf, int len);
+extern void addr2storage(int af, const struct in6_addr *addr,
+ struct sockaddr_storage *ssp);
extern void phyint_inst_print_all(void);
+extern boolean_t prefix_equal(struct in6_addr, struct in6_addr, uint_t);
-extern int logint_upcount(struct phyint *pi);
-extern void restore_phyint(struct phyint *pi);
extern void reset_crtt_all(struct phyint *pi);
extern int failure_state(struct phyint_instance *pii);
extern void process_link_state_changes(void);
extern void clear_pii_probe_stats(struct phyint_instance *pii);
extern void start_timer(struct phyint_instance *pii);
+extern void stop_probing(struct phyint *pi);
extern boolean_t own_address(struct in6_addr addr);
+extern boolean_t change_pif_flags(struct phyint *pi, uint64_t set,
+ uint64_t clear);
extern void close_probe_socket(struct phyint_instance *pii, boolean_t flag);
+extern int probe_state_event(struct probe_stats *, struct phyint_instance *);
+extern void probe_chstate(struct probe_stats *, struct phyint_instance *, int);
+extern unsigned int getgraddrinfo(const char *, struct sockaddr_storage *,
+ ipmp_addrinfo_t **);
extern unsigned int getifinfo(const char *, ipmp_ifinfo_t **);
extern unsigned int getgroupinfo(const char *, ipmp_groupinfo_t **);
extern unsigned int getgrouplist(ipmp_grouplist_t **);
extern unsigned int getsnap(ipmp_snap_t **);
+extern boolean_t addrlist_add(addrlist_t **, const char *, uint64_t,
+ struct sockaddr_storage *);
+extern void addrlist_free(addrlist_t **);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c
index 27716cabce..703ddcfaad 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c
+++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c
@@ -17,14 +17,11 @@
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "defs.h"
#include "tables.h"
#include <fcntl.h>
@@ -122,7 +119,7 @@ sendpacket(struct sockaddr_in6 *sin6, int sock, int size, int flags)
char abuf[INET6_ADDRSTRLEN];
cc = sendto(sock, (char *)packet, size, flags,
- (struct sockaddr *)sin6, sizeof (*sin6));
+ (struct sockaddr *)sin6, sizeof (*sin6));
if (cc < 0 || cc != size) {
if (cc < 0) {
logperror("sendpacket: sendto");
@@ -135,6 +132,32 @@ sendpacket(struct sockaddr_in6 *sin6, int sock, int size, int flags)
}
}
+/*
+ * If possible, place an ND_OPT_SOURCE_LINKADDR option at `optp'.
+ * Return the number of bytes placed in the option.
+ */
+static uint_t
+add_opt_lla(struct phyint *pi, struct nd_opt_lla *optp)
+{
+ uint_t optlen;
+ uint_t hwaddrlen;
+ struct lifreq lifr;
+
+ /* If this phyint doesn't have a link-layer address, bail */
+ if (phyint_get_lla(pi, &lifr) == -1)
+ return (0);
+
+ hwaddrlen = lifr.lifr_nd.lnr_hdw_len;
+ /* roundup to multiple of 8 and make padding zero */
+ optlen = ((sizeof (struct nd_opt_hdr) + hwaddrlen + 7) / 8) * 8;
+ bzero(optp, optlen);
+ optp->nd_opt_lla_type = ND_OPT_SOURCE_LINKADDR;
+ optp->nd_opt_lla_len = optlen / 8;
+ bcopy(lifr.lifr_nd.lnr_hdw_addr, optp->nd_opt_lla_hdw_addr, hwaddrlen);
+
+ return (optlen);
+}
+
/* Send a Router Solicitation */
static void
solicit(struct sockaddr_in6 *sin6, struct phyint *pi)
@@ -151,24 +174,8 @@ solicit(struct sockaddr_in6 *sin6, struct phyint *pi)
packetlen += sizeof (*rs);
pptr += sizeof (*rs);
- /* Attach any options */
- if (pi->pi_hdw_addr_len != 0) {
- struct nd_opt_lla *lo = (struct nd_opt_lla *)pptr;
- int optlen;
-
- /* roundup to multiple of 8 and make padding zero */
- optlen = ((sizeof (struct nd_opt_hdr) +
- pi->pi_hdw_addr_len + 7) / 8) * 8;
- bzero(pptr, optlen);
-
- lo->nd_opt_lla_type = ND_OPT_SOURCE_LINKADDR;
- lo->nd_opt_lla_len = optlen / 8;
- bcopy((char *)pi->pi_hdw_addr,
- (char *)lo->nd_opt_lla_hdw_addr,
- pi->pi_hdw_addr_len);
- packetlen += optlen;
- pptr += optlen;
- }
+ /* add options */
+ packetlen += add_opt_lla(pi, (struct nd_opt_lla *)pptr);
if (debug & D_PKTOUT) {
print_route_sol("Sending solicitation to ", pi, rs, packetlen,
@@ -224,24 +231,9 @@ advertise(struct sockaddr_in6 *sin6, struct phyint *pi, boolean_t no_prefixes)
return;
}
- /* Attach any options */
- if (pi->pi_hdw_addr_len != 0) {
- struct nd_opt_lla *lo = (struct nd_opt_lla *)pptr;
- int optlen;
-
- /* roundup to multiple of 8 and make padding zero */
- optlen = ((sizeof (struct nd_opt_hdr) +
- pi->pi_hdw_addr_len + 7) / 8) * 8;
- bzero(pptr, optlen);
-
- lo->nd_opt_lla_type = ND_OPT_SOURCE_LINKADDR;
- lo->nd_opt_lla_len = optlen / 8;
- bcopy((char *)pi->pi_hdw_addr,
- (char *)lo->nd_opt_lla_hdw_addr,
- pi->pi_hdw_addr_len);
- packetlen += optlen;
- pptr += optlen;
- }
+ /* add options */
+ packetlen += add_opt_lla(pi, (struct nd_opt_lla *)pptr);
+ pptr = (char *)packet + packetlen;
if (pi->pi_AdvLinkMTU != 0) {
struct nd_opt_mtu *mo = (struct nd_opt_mtu *)pptr;
@@ -1671,10 +1663,10 @@ process_rtsock(int rtsock)
return;
}
- if (ifm->ifm_flags != pi->pi_flags) {
+ if (ifm->ifm_flags != (uint_t)pi->pi_flags) {
if (debug & D_IFSCAN) {
logmsg(LOG_DEBUG, "process_rtsock: clr for "
- "%s old flags 0x%x new flags 0x%x\n",
+ "%s old flags 0x%llx new flags 0x%x\n",
pi->pi_name, pi->pi_flags, ifm->ifm_flags);
}
}
@@ -1825,141 +1817,67 @@ process_mibsock(int mibsock)
}
/*
- * Check whether the address formed by pr->pr_prefix and pi_token
- * exists in the kernel. Cannot call SIOCTMYADDR/ONLINK as it
- * does not check for down addresses. This function should not
- * be called for onlink prefixes.
- */
-static boolean_t
-is_address_present(struct phyint *pi, struct prefix *pr, uint64_t flags)
-{
- int s;
- in6_addr_t addr, *token;
- int i;
- int ret;
- struct sockaddr_in6 sin6;
-
- s = socket(AF_INET6, SOCK_DGRAM, 0);
- if (s < 0) {
- logperror("is_address_present: socket");
- /*
- * By returning B_TRUE, we make the caller delete
- * the prefix from the internal table. In the worst
- * case the next RA will create the prefix.
- */
- return (_B_TRUE);
- }
- if (flags & IFF_TEMPORARY)
- token = &pi->pi_tmp_token;
- else
- token = &pi->pi_token;
- for (i = 0; i < 16; i++) {
- /*
- * prefix_create ensures that pr_prefix has all-zero
- * bits after prefixlen.
- */
- addr.s6_addr[i] = pr->pr_prefix.s6_addr[i] | token->s6_addr[i];
- }
- (void) memset(&sin6, 0, sizeof (struct sockaddr_in6));
- sin6.sin6_family = AF_INET6;
- sin6.sin6_addr = addr;
- ret = bind(s, (struct sockaddr *)&sin6, sizeof (struct sockaddr_in6));
- (void) close(s);
- if (ret < 0 && errno == EADDRNOTAVAIL)
- return (_B_FALSE);
- else
- return (_B_TRUE);
-}
-
-/*
* Look if the phyint or one of its prefixes have been removed from
* the kernel and take appropriate action.
- * Uses {pi,pr}_in_use.
+ * Uses pr_in_use and pi{,_kernel}_state.
*/
static void
check_if_removed(struct phyint *pi)
{
- struct prefix *pr;
- struct prefix *next_pr;
+ struct prefix *pr, *next_pr;
/*
- * Detect phyints that have been removed from the kernel.
- * Since we can't recreate it here (would require ifconfig plumb
- * logic) we just terminate use of that phyint.
- */
- if (!(pi->pi_kernel_state & PI_PRESENT) &&
- (pi->pi_state & PI_PRESENT)) {
- logmsg(LOG_ERR, "Interface %s has been removed from kernel. "
- "in.ndpd will no longer use it\n", pi->pi_name);
- /*
- * Clear state so that should the phyint reappear
- * we will start with initial advertisements or
- * solicitations.
- */
- phyint_cleanup(pi);
- }
- /*
* Detect prefixes which are removed.
- *
- * We remove the prefix in all of the following cases :
- *
- * 1) Static prefixes are not the ones we create. So,
- * just remove it from our tables.
- *
- * 2) On-link prefixes potentially move to a different
- * phyint during failover. As it does not have
- * an address, we can't use the logic in is_address_present
- * to detect whether it is present in the kernel or not.
- * Thus when it is manually removed we don't recreate it.
- *
- * 3) If there is a token mis-match and this prefix is not
- * in the kernel, it means we don't need this prefix on
- * this interface anymore. It must have been moved to a
- * different interface by in.mpathd. This normally
- * happens after a failover followed by a failback (or
- * another failover) and we re-read the network
- * configuration. For the failover from A to B, we would
- * have created state on B about A's address, which will
- * not be in use after the subsequent failback. So, we
- * remove that prefix here.
- *
- * 4) If the physical interface is not present, then remove
- * the prefix. In the cases where we are advertising
- * prefixes, the state is kept in advertisement prefix and
- * hence we can delete the prefix.
- *
- * 5) Similar to case (3), when we failover from A to B, the
- * prefix in A will not be in use as it has been moved to B.
- * We will delete it from our tables and recreate it when
- * it fails back. is_address_present makes sure that the
- * address is still valid in kernel.
- *
- * If none of the above is true, we recreate the prefix as it
- * has been manually removed. We do it only when the interface
- * is not FAILED or INACTIVE or OFFLINE.
+ * Static prefixes are just removed from our tables.
+ * Non-static prefixes are recreated, i.e., in.ndpd takes precedence
+ * over manually removing prefixes via ifconfig.
*/
for (pr = pi->pi_prefix_list; pr != NULL; pr = next_pr) {
next_pr = pr->pr_next;
if (!pr->pr_in_use) {
- /* Clear PR_AUTO and PR_ONLINK */
+ /* Clear everything except PR_STATIC */
pr->pr_kernel_state &= PR_STATIC;
- if ((pr->pr_state & PR_STATIC) ||
- !(pr->pr_state & PR_AUTO) ||
- !(prefix_token_match(pi, pr, pr->pr_flags)) ||
- (!(pi->pi_kernel_state & PI_PRESENT)) ||
- (is_address_present(pi, pr, pr->pr_flags))) {
+ pr->pr_name[0] = '\0';
+ if (pr->pr_state & PR_STATIC) {
prefix_delete(pr);
- } else if (!(pi->pi_flags &
- (IFF_FAILED|IFF_INACTIVE|IFF_OFFLINE)) &&
- pr->pr_state != pr->pr_kernel_state) {
- pr->pr_name[0] = '\0';
+ } else if (!(pi->pi_kernel_state & PI_PRESENT)) {
+ /*
+ * Ensure that there are no future attempts to
+ * run prefix_update_k since the phyint is gone.
+ */
+ pr->pr_state = pr->pr_kernel_state;
+ } else if (pr->pr_state != pr->pr_kernel_state) {
logmsg(LOG_INFO, "Prefix manually removed "
- "on %s - recreating it!\n",
- pi->pi_name);
+ "on %s; recreating\n", pi->pi_name);
prefix_update_k(pr);
}
}
}
+
+ /*
+ * Detect phyints that have been removed from the kernel, and tear
+ * down any prefixes we created that are associated with that phyint.
+ * (NOTE: IPMP depends on in.ndpd tearing down these prefixes so an
+ * administrator can easily place an IP interface with ADDRCONF'd
+ * addresses into an IPMP group.)
+ */
+ if (!(pi->pi_kernel_state & PI_PRESENT) &&
+ (pi->pi_state & PI_PRESENT)) {
+ logmsg(LOG_ERR, "Interface %s has been removed from kernel. "
+ "in.ndpd will no longer use it\n", pi->pi_name);
+
+ for (pr = pi->pi_prefix_list; pr != NULL; pr = next_pr) {
+ next_pr = pr->pr_next;
+ if (pr->pr_state & PR_AUTO)
+ prefix_delete(pr);
+ }
+
+ /*
+ * Clear state so that should the phyint reappear we will
+ * start with initial advertisements or solicitations.
+ */
+ phyint_cleanup(pi);
+ }
}
diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c
index 5d64a9303d..0a9e1e6a13 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c
+++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -383,29 +383,12 @@ incoming_ra(struct phyint *pi, struct nd_router_advert *ra, int len,
if (no_loopback && loopback)
return;
- /*
- * If the interface is FAILED or INACTIVE or OFFLINE, don't
- * create any addresses on them. in.mpathd assumes that no new
- * addresses will appear on these. This implies that we
- * won't create any new prefixes advertised by the router
- * on FAILED/INACTIVE/OFFLINE interfaces. When the state changes,
- * the next RA will create the prefix on this interface.
- */
- if (pi->pi_flags & (IFF_FAILED|IFF_INACTIVE|IFF_OFFLINE))
- return;
+ bzero(&lifr, sizeof (lifr));
+ (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
- (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
- lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
- if (ioctl(pi->pi_sock, SIOCGLIFLNKINFO, (char *)&lifr) < 0) {
- if (errno == ENXIO)
- return;
- logperror_pi(pi, "incoming_ra: SIOCGLIFLNKINFO");
- return;
- }
if (ra->nd_ra_curhoplimit != CURHOP_UNSPECIFIED &&
ra->nd_ra_curhoplimit != pi->pi_CurHopLimit) {
pi->pi_CurHopLimit = ra->nd_ra_curhoplimit;
-
lifr.lifr_ifinfo.lir_maxhops = pi->pi_CurHopLimit;
set_needed = _B_TRUE;
}
@@ -460,7 +443,7 @@ incoming_ra(struct phyint *pi, struct nd_router_advert *ra, int len,
logmsg(LOG_DEBUG,
"incoming_ra: trigger dhcp %s on %s\n",
(ra->nd_ra_flags_reserved & ~pi->pi_ra_flags &
- ND_RA_FLAG_MANAGED) ? "MANAGED" : "OTHER",
+ ND_RA_FLAG_MANAGED) ? "MANAGED" : "OTHER",
pi->pi_name);
}
pi->pi_ra_flags |= ra->nd_ra_flags_reserved;
@@ -999,11 +982,9 @@ incoming_prefix_addrconf_process(struct phyint *pi, struct prefix *pr,
* Delete this prefix structure as kernel
* does not allow duplicated addresses
*/
-
logmsg(LOG_ERR, "incoming_prefix_addrconf_process: "
- "Duplicate prefix %s received on interface %s\n",
- inet_ntop(AF_INET6,
- (void *)&po->nd_opt_pi_prefix, abuf,
+ "Duplicate prefix %s received on interface %s\n",
+ inet_ntop(AF_INET6, &po->nd_opt_pi_prefix, abuf,
sizeof (abuf)), pi->pi_name);
logmsg(LOG_ERR, "incoming_prefix_addrconf_process: "
"Prefix already exists in interface %s\n",
@@ -1129,12 +1110,8 @@ incoming_mtu_opt(struct phyint *pi, uchar_t *opt,
}
pi->pi_LinkMTU = mtu;
- (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
- lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
- if (ioctl(pi->pi_sock, SIOCGLIFLNKINFO, (char *)&lifr) < 0) {
- logperror_pi(pi, "incoming_mtu_opt: SIOCGLIFLNKINFO");
- return;
- }
+ bzero(&lifr, sizeof (lifr));
+ (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
lifr.lifr_ifinfo.lir_maxmtu = pi->pi_LinkMTU;
if (ioctl(pi->pi_sock, SIOCSLIFLNKINFO, (char *)&lifr) < 0) {
logperror_pi(pi, "incoming_mtu_opt: SIOCSLIFLNKINFO");
@@ -1155,33 +1132,33 @@ incoming_lla_opt(struct phyint *pi, uchar_t *opt,
struct sockaddr_in6 *sin6;
int max_content_len;
- if (pi->pi_hdw_addr_len == 0)
+ /*
+ * Get our link-layer address length. We may not have one, in which
+ * case we can just bail.
+ */
+ if (phyint_get_lla(pi, &lifr) != 0)
return;
/*
* Can't remove padding since it is link type specific.
- * However, we check against the length of our link-layer
- * address.
- * Note: assumes that all links have a fixed lengh address.
+ * However, we check against the length of our link-layer address.
+ * Note: assumes that all links have a fixed length address.
*/
max_content_len = lo->nd_opt_lla_len * 8 - sizeof (struct nd_opt_hdr);
- if (max_content_len < pi->pi_hdw_addr_len ||
+ if (max_content_len < lifr.lifr_nd.lnr_hdw_len ||
(max_content_len >= 8 &&
- max_content_len - 7 > pi->pi_hdw_addr_len)) {
+ max_content_len - 7 > lifr.lifr_nd.lnr_hdw_len)) {
char abuf[INET6_ADDRSTRLEN];
(void) inet_ntop(AF_INET6, (void *)&from->sin6_addr,
abuf, sizeof (abuf));
logmsg(LOG_INFO, "lla option from %s on %s too long with bad "
- "physaddr length (%d vs. %d bytes)\n",
- abuf, pi->pi_name,
- max_content_len, pi->pi_hdw_addr_len);
+ "physaddr length (%d vs. %d bytes)\n", abuf, pi->pi_name,
+ max_content_len, lifr.lifr_nd.lnr_hdw_len);
return;
}
- lifr.lifr_nd.lnr_hdw_len = pi->pi_hdw_addr_len;
- bcopy((char *)lo->nd_opt_lla_hdw_addr,
- (char *)lifr.lifr_nd.lnr_hdw_addr,
+ bcopy(lo->nd_opt_lla_hdw_addr, lifr.lifr_nd.lnr_hdw_addr,
lifr.lifr_nd.lnr_hdw_len);
sin6 = (struct sockaddr_in6 *)&lifr.lifr_nd.lnr_addr;
@@ -1196,8 +1173,7 @@ incoming_lla_opt(struct phyint *pi, uchar_t *opt,
lifr.lifr_nd.lnr_state_same_lla = ND_UNCHANGED;
lifr.lifr_nd.lnr_state_diff_lla = ND_STALE;
lifr.lifr_nd.lnr_flags = isrouter;
- (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
- lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
+ (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
if (ioctl(pi->pi_sock, SIOCLIFSETND, (char *)&lifr) < 0) {
logperror_pi(pi, "incoming_lla_opt: SIOCLIFSETND");
return;
diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c
index c8fc6381b7..09e6137965 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c
+++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "defs.h"
#include "tables.h"
@@ -171,6 +169,7 @@ phyint_init_from_k(struct phyint *pi)
struct ipv6_mreq v6mcastr;
struct lifreq lifr;
int fd;
+ int save_errno;
boolean_t newsock;
uint_t ttl;
struct sockaddr_in6 *sin6;
@@ -297,30 +296,6 @@ start_over:
pi->pi_dst_token = in6addr_any;
}
- /* Get link-layer address */
- if (!(pi->pi_flags & IFF_MULTICAST) ||
- (pi->pi_flags & IFF_POINTOPOINT)) {
- pi->pi_hdw_addr_len = 0;
- } else {
- sin6 = (struct sockaddr_in6 *)&lifr.lifr_nd.lnr_addr;
- bzero(sin6, sizeof (struct sockaddr_in6));
- sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = pi->pi_ifaddr;
-
- if (ioctl(fd, SIOCLIFGETND, (char *)&lifr) < 0) {
- logperror_pi(pi, "phyint_init_from_k: SIOCLIFGETND");
- goto error;
- }
-
- pi->pi_hdw_addr_len = lifr.lifr_nd.lnr_hdw_len;
-
- if (lifr.lifr_nd.lnr_hdw_len != 0) {
- bcopy((char *)lifr.lifr_nd.lnr_hdw_addr,
- (char *)pi->pi_hdw_addr,
- lifr.lifr_nd.lnr_hdw_len);
- }
- }
-
if (newsock) {
icmp6_filter_t filter;
int on = 1;
@@ -360,8 +335,21 @@ start_over:
v6mcastr.ipv6mr_interface = pi->pi_index;
if (setsockopt(fd, IPPROTO_IPV6, IPV6_JOIN_GROUP,
(char *)&v6mcastr, sizeof (v6mcastr)) < 0) {
- logperror_pi(pi, "phyint_init_from_k: "
- "setsockopt IPV6_JOIN_GROUP");
+ /*
+ * One benign reason IPV6_JOIN_GROUP could fail is
+ * when `pi' has been placed into an IPMP group and we
+ * haven't yet processed the routing socket message
+ * informing us of its disappearance. As such, if
+ * it's now in a group, don't print an error.
+ */
+ save_errno = errno;
+ (void) strlcpy(lifr.lifr_name, pi->pi_name, LIFNAMSIZ);
+ if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) == -1 ||
+ lifr.lifr_groupname[0] == '\0') {
+ errno = save_errno;
+ logperror_pi(pi, "phyint_init_from_k: "
+ "setsockopt IPV6_JOIN_GROUP");
+ }
goto error;
}
pi->pi_state |= PI_JOINED_ALLNODES;
@@ -403,8 +391,17 @@ start_over:
v6mcastr.ipv6mr_interface = pi->pi_index;
if (setsockopt(fd, IPPROTO_IPV6, IPV6_JOIN_GROUP,
(char *)&v6mcastr, sizeof (v6mcastr)) < 0) {
- logperror_pi(pi, "phyint_init_from_k: setsockopt "
- "IPV6_JOIN_GROUP");
+ /*
+ * See IPV6_JOIN_GROUP comment above.
+ */
+ save_errno = errno;
+ (void) strlcpy(lifr.lifr_name, pi->pi_name, LIFNAMSIZ);
+ if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) == -1 ||
+ lifr.lifr_groupname[0] == '\0') {
+ errno = save_errno;
+ logperror_pi(pi, "phyint_init_from_k: "
+ "setsockopt IPV6_JOIN_GROUP");
+ }
goto error;
}
pi->pi_state |= PI_JOINED_ALLROUTERS;
@@ -569,22 +566,16 @@ phyint_print(struct phyint *pi)
struct adv_prefix *adv_pr;
struct router *dr;
char abuf[INET6_ADDRSTRLEN];
- char llabuf[BUFSIZ];
logmsg(LOG_DEBUG, "Phyint %s index %d state %x, kernel %x, "
"num routers %d\n",
pi->pi_name, pi->pi_index, pi->pi_state, pi->pi_kernel_state,
pi->pi_num_k_routers);
- logmsg(LOG_DEBUG, "\taddress: %s flags %x\n",
+ logmsg(LOG_DEBUG, "\taddress: %s flags %llx\n",
inet_ntop(AF_INET6, (void *)&pi->pi_ifaddr,
abuf, sizeof (abuf)), pi->pi_flags);
- logmsg(LOG_DEBUG, "\tsock %d mtu %d hdw_addr len %d <%s>\n",
- pi->pi_sock, pi->pi_mtu, pi->pi_hdw_addr_len,
- ((pi->pi_hdw_addr_len != 0) ?
- fmt_lla(llabuf, sizeof (llabuf), pi->pi_hdw_addr,
- pi->pi_hdw_addr_len) : "none"));
- logmsg(LOG_DEBUG, "\ttoken: len %d %s\n",
- pi->pi_token_length,
+ logmsg(LOG_DEBUG, "\tsock %d mtu %d\n", pi->pi_sock, pi->pi_mtu);
+ logmsg(LOG_DEBUG, "\ttoken: len %d %s\n", pi->pi_token_length,
inet_ntop(AF_INET6, (void *)&pi->pi_token,
abuf, sizeof (abuf)));
if (pi->pi_TmpAddrsEnabled) {
@@ -632,6 +623,43 @@ phyint_print(struct phyint *pi)
logmsg(LOG_DEBUG, "\n");
}
+
+/*
+ * Store the LLA for the phyint `pi' in `lifrp'. Returns 0 on success, or
+ * -1 on failure.
+ *
+ * Note that we do not cache the hardware address since there's no reliable
+ * mechanism to determine when it's become stale.
+ */
+int
+phyint_get_lla(struct phyint *pi, struct lifreq *lifrp)
+{
+ struct sockaddr_in6 *sin6;
+
+ /* If this phyint doesn't have a link-layer address, bail */
+ if (!(pi->pi_flags & IFF_MULTICAST) ||
+ (pi->pi_flags & IFF_POINTOPOINT)) {
+ return (-1);
+ }
+
+ (void) strlcpy(lifrp->lifr_name, pi->pi_name, LIFNAMSIZ);
+ sin6 = (struct sockaddr_in6 *)&(lifrp->lifr_nd.lnr_addr);
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_addr = pi->pi_ifaddr;
+ if (ioctl(pi->pi_sock, SIOCLIFGETND, lifrp) < 0) {
+ /*
+ * For IPMP interfaces, don't report ESRCH errors since that
+ * merely indicates that there are no active interfaces in the
+ * IPMP group (and thus there's no working hardware address),
+ * and the packet will thus never make it out anyway.
+ */
+ if (!(pi->pi_flags & IFF_IPMP) || errno != ESRCH)
+ logperror_pi(pi, "phyint_get_lla: SIOCLIFGETND");
+ return (-1);
+ }
+ return (0);
+}
+
/*
* Randomize pi->pi_ReachableTime.
* Done periodically when there are no RAs and at a maximum frequency when
@@ -642,20 +670,14 @@ phyint_print(struct phyint *pi)
void
phyint_reach_random(struct phyint *pi, boolean_t set_needed)
{
+ struct lifreq lifr;
+
pi->pi_ReachableTime = GET_RANDOM(
(int)(ND_MIN_RANDOM_FACTOR * pi->pi_BaseReachableTime),
(int)(ND_MAX_RANDOM_FACTOR * pi->pi_BaseReachableTime));
if (set_needed) {
- struct lifreq lifr;
-
- (void) strncpy(lifr.lifr_name, pi->pi_name,
- sizeof (lifr.lifr_name));
- pi->pi_name[sizeof (pi->pi_name) - 1] = '\0';
- if (ioctl(pi->pi_sock, SIOCGLIFLNKINFO, (char *)&lifr) < 0) {
- logperror_pi(pi,
- "phyint_reach_random: SIOCGLIFLNKINFO");
- return;
- }
+ bzero(&lifr, sizeof (lifr));
+ (void) strlcpy(lifr.lifr_name, pi->pi_name, LIFNAMSIZ);
lifr.lifr_ifinfo.lir_reachtime = pi->pi_ReachableTime;
if (ioctl(pi->pi_sock, SIOCSLIFLNKINFO, (char *)&lifr) < 0) {
logperror_pi(pi,
@@ -1386,12 +1408,12 @@ prefix_modify_flags(struct prefix *pr, uint64_t onflags, uint64_t offflags)
(void) strncpy(lifr.lifr_name, pr->pr_name, sizeof (lifr.lifr_name));
lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
if (ioctl(pi->pi_sock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
- logperror_pr(pr, "prefix_modify_flags: SIOCGLIFFLAGS");
- logmsg(LOG_ERR, "prefix_modify_flags(%s, %s) old 0x%llx "
- "on 0x%llx off 0x%llx\n",
- pr->pr_physical->pi_name,
- pr->pr_name,
- pr->pr_flags, onflags, offflags);
+ if (errno != ENXIO) {
+ logperror_pr(pr, "prefix_modify_flags: SIOCGLIFFLAGS");
+ logmsg(LOG_ERR, "prefix_modify_flags(%s, %s) old 0x%llx"
+ " on 0x%llx off 0x%llx\n", pr->pr_physical->pi_name,
+ pr->pr_name, pr->pr_flags, onflags, offflags);
+ }
return (-1);
}
old_flags = lifr.lifr_flags;
@@ -1399,12 +1421,13 @@ prefix_modify_flags(struct prefix *pr, uint64_t onflags, uint64_t offflags)
lifr.lifr_flags &= ~offflags;
pr->pr_flags = lifr.lifr_flags;
if (ioctl(pi->pi_sock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
- logperror_pr(pr, "prefix_modify_flags: SIOCSLIFFLAGS");
- logmsg(LOG_ERR, "prefix_modify_flags(%s, %s) old 0x%llx "
- "new 0x%llx on 0x%llx off 0x%llx\n",
- pr->pr_physical->pi_name,
- pr->pr_name,
- old_flags, lifr.lifr_flags, onflags, offflags);
+ if (errno != ENXIO) {
+ logperror_pr(pr, "prefix_modify_flags: SIOCSLIFFLAGS");
+ logmsg(LOG_ERR, "prefix_modify_flags(%s, %s) old 0x%llx"
+ " new 0x%llx on 0x%llx off 0x%llx\n",
+ pr->pr_physical->pi_name, pr->pr_name,
+ old_flags, lifr.lifr_flags, onflags, offflags);
+ }
return (-1);
}
return (0);
@@ -1540,7 +1563,8 @@ prefix_update_k(struct prefix *pr)
/* Remove logical interface based on pr_name */
lifr.lifr_addr.ss_family = AF_UNSPEC;
- if (ioctl(pi->pi_sock, SIOCLIFREMOVEIF, (char *)&lifr) < 0) {
+ if (ioctl(pi->pi_sock, SIOCLIFREMOVEIF, (char *)&lifr) < 0 &&
+ errno != ENXIO) {
logperror_pr(pr, "prefix_update_k: SIOCLIFREMOVEIF");
}
pr->pr_kernel_state = 0;
@@ -1865,36 +1889,6 @@ prefix_print(struct prefix *pr)
}
/*
- * Does the address formed by pr->pr_prefix and pi->pi_token match
- * pr->pr_address. It does not match if a failover has happened
- * earlier (done by in.mpathd) from a different pi. Should not
- * be called for onlink prefixes.
- */
-boolean_t
-prefix_token_match(struct phyint *pi, struct prefix *pr, uint64_t flags)
-{
- int i;
- in6_addr_t addr, *token;
-
- if (flags & IFF_TEMPORARY)
- token = &pi->pi_tmp_token;
- else
- token = &pi->pi_token;
- for (i = 0; i < 16; i++) {
- /*
- * prefix_create ensures that pr_prefix has all-zero
- * bits after prefixlen.
- */
- addr.s6_addr[i] = pr->pr_prefix.s6_addr[i] | token->s6_addr[i];
- }
- if (IN6_ARE_ADDR_EQUAL(&pr->pr_address, &addr)) {
- return (_B_TRUE);
- } else {
- return (_B_FALSE);
- }
-}
-
-/*
* Lookup advertisement prefix structure that matches the prefix and
* prefix length.
* Assumes that the bits after prefixlen might not be zero.
@@ -2305,8 +2299,7 @@ phyint_print_all(void)
}
void
-phyint_cleanup(pi)
- struct phyint *pi;
+phyint_cleanup(struct phyint *pi)
{
pi->pi_state = 0;
pi->pi_kernel_state = 0;
diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h
index 409600a402..dfc5414d5d 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h
+++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _NDPD_TABLES_H
#define _NDPD_TABLES_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -58,9 +56,7 @@ struct phyint {
char pi_name[LIFNAMSIZ]; /* Used to identify it */
int pi_sock; /* For sending and receiving */
struct in6_addr pi_ifaddr; /* Local address */
- uint_t pi_flags; /* IFF_* flags */
- uint_t pi_hdw_addr_len;
- uchar_t pi_hdw_addr[ND_MAX_HDW_LEN];
+ uint64_t pi_flags; /* IFF_* flags */
uint_t pi_mtu; /* From SIOCGLIFMTU */
struct in6_addr pi_token;
uint_t pi_token_length;
@@ -256,6 +252,7 @@ extern int phyint_init_from_k(struct phyint *pi);
extern void phyint_delete(struct phyint *pi);
extern uint_t phyint_timer(struct phyint *pi, uint_t elapsed);
extern void phyint_print_all(void);
+extern int phyint_get_lla(struct phyint *pi, struct lifreq *lifrp);
extern void phyint_reach_random(struct phyint *pi,
boolean_t set_needed);
extern void phyint_cleanup(struct phyint *pi);
@@ -280,8 +277,6 @@ extern void prefix_update_k(struct prefix *pr);
extern uint_t prefix_timer(struct prefix *pr, uint_t elapsed);
extern uint_t adv_prefix_timer(struct adv_prefix *adv_pr,
uint_t elapsed);
-extern boolean_t prefix_token_match(struct phyint *pi,
- struct prefix *pr, uint64_t flags);
extern struct prefix *prefix_lookup_addr(struct phyint *pi,
struct in6_addr prefix);
diff --git a/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c b/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c
index 15db1b7539..b76341e303 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c
+++ b/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c
@@ -1,3 +1,7 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
/* -*- Mode: C; tab-width: 4 -*-
*
* Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
@@ -130,8 +134,6 @@ First checkin
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "mDNSUNP.h"
#include "mDNSDebug.h"
@@ -398,13 +400,11 @@ select_src_ifi_info_solaris(int sockfd, int numifs,
continue;
/*
* Avoid address if any of the following flags are set:
- * IFF_NOFAILOVER: IPMP test address for use only by in.mpathd
* IFF_NOXMIT: no packets transmitted over interface
* IFF_NOLOCAL: no address
* IFF_PRIVATE: is not advertised
*/
- if (ifflags & (IFF_NOFAILOVER | IFF_NOXMIT
- | IFF_NOLOCAL | IFF_PRIVATE))
+ if (ifflags & (IFF_NOXMIT | IFF_NOLOCAL | IFF_PRIVATE))
continue;
if (*best_lifr != NULL) {
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/Makefile
index d91d113347..e29c1765ec 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/Makefile
+++ b/usr/src/cmd/cmd-inet/usr.sbin/Makefile
@@ -20,7 +20,7 @@
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -65,12 +65,13 @@ K5TELNETOBJS= in.telnetd.o
SRCS= $(PROGSRCS) $(OTHERSRC)
SUBDIRS= bootconfchk htable ifconfig in.ftpd in.rdisc in.routed \
- in.talkd inetadm inetconv ipqosconf kssl/kssladm kssl/ksslcfg \
- ping routeadm snoop sppptun traceroute wificonfig ipsecutils
+ in.talkd inetadm inetconv ipmpstat ipqosconf ipsecutils \
+ kssl/kssladm kssl/ksslcfg ping routeadm snoop sppptun \
+ traceroute wificonfig
MSGSUBDIRS= bootconfchk htable ifconfig in.ftpd in.routed in.talkd inetadm \
- inetconv ipqosconf kssl/ksslcfg routeadm sppptun snoop \
- wificonfig ipsecutils
+ inetconv ipmpstat ipqosconf ipsecutils kssl/ksslcfg routeadm \
+ sppptun snoop wificonfig
# As programs get lint-clean, add them here and to the 'lint' target.
# Eventually this hack should go away, and all in PROG should be
@@ -83,7 +84,8 @@ LINTCLEAN= 6to4relay arp in.rlogind in.rshd in.telnetd in.tftpd \
# with SUBDIRS. Also (sigh) deal with the commented-out build lines
# for the lint rule.
LINTSUBDIRS= bootconfchk in.rdisc in.routed in.talkd inetadm inetconv \
- ipqosconf ping routeadm sppptun traceroute wificonfig ipsecutils
+ ipmpstat ipqosconf ipsecutils ping routeadm sppptun traceroute \
+ wificonfig
# And as programs are verified not to attempt to write into constants,
# -xstrconst should be used to ensure they stay that way.
CONSTCLEAN=
@@ -144,6 +146,8 @@ LDLIBS += $(K5LIBS)
$(TSNETPROG) := LDLIBS += -ltsnet
in.rarpd := LDLIBS += -linetutil -ldlpi
+if_mpadm := LDLIBS += -linetutil -lipmp
+if_mpadm.po := XGETFLAGS += -a
route := CPPFLAGS += -DNDEBUG
ndd := LDLIBS += -ldladm
gettable in.comsat := LDFLAGS += $(MAPFILE.NGB:%=-M%)
@@ -245,7 +249,7 @@ lint: $(LINTSUBDIRS)
-I$(SRC)/lib/gss_mechs/mech_krb5/include \
-I$(SRC)/lib/pam_modules/krb5 \
in.telnetd.c $(LDLIBS) -lbsm -lpam -lsocket -lnsl
- $(LINT.c) if_mpadm.c $(LDLIBS) -lsocket -lnsl -lipmp
+ $(LINT.c) if_mpadm.c $(LDLIBS) -lsocket -lnsl -lipmp -linetutil
$(LINT.c) ipaddrsel.c $(LDLIBS) -lsocket -lnsl
$(LINT.c) route.c $(LDLIBS) -lsocket -lnsl -ltsnet
$(LINT.c) syncinit.c $(LDLIBS) -ldlpi
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c b/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c
index d4874135fd..7c5d73c796 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,660 +19,250 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <unistd.h>
-#include <stdlib.h>
+#include <errno.h>
+#include <ipmp_admin.h>
+#include <libinetutil.h>
+#include <locale.h>
+#include <net/if.h>
+#include <stdarg.h>
#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
#include <sys/socket.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
#include <sys/sockio.h>
-#include <net/if.h>
-#include <errno.h>
-#include <strings.h>
-#include <ipmp_mpathd.h>
-#include <libintl.h>
+#include <sys/types.h>
-static int if_down(int ifsock, struct lifreq *lifr);
-static int if_up(int ifsock, struct lifreq *lifr);
-static void send_cmd(int cmd, char *ifname);
-static int connect_to_mpathd(sa_family_t family);
-static void do_offline(char *ifname);
-static void undo_offline(char *ifname);
-static boolean_t offline_set(char *ifname);
+typedef void offline_func_t(const char *, ipmp_handle_t);
-#define IF_SEPARATOR ':'
-#define MAX_RETRIES 3
+static const char *progname;
+static int sioc4fd, sioc6fd;
+static offline_func_t do_offline, undo_offline;
+static boolean_t set_lifflags(const char *, uint64_t);
+static boolean_t is_offline(const char *);
+static void warn(const char *, ...);
+static void die(const char *, ...);
static void
usage()
{
- (void) fprintf(stderr, "Usage : if_mpadm [-d | -r] <interface_name>\n");
+ (void) fprintf(stderr, "Usage: %s [-d | -r] <interface>\n", progname);
+ exit(1);
}
-static void
-print_mpathd_error_msg(uint32_t error)
+static const char *
+mpadm_errmsg(uint32_t error)
{
switch (error) {
- case MPATHD_MIN_RED_ERROR:
- (void) fprintf(stderr, gettext(
- "Offline failed as there is no other functional "
- "interface available in the multipathing group "
- "for failing over the network access.\n"));
- break;
-
- case MPATHD_FAILBACK_PARTIAL:
- (void) fprintf(stderr, gettext(
- "Offline cannot be undone because multipathing "
- "configuration is not consistent across all the "
- "interfaces in the group.\n"));
- break;
-
+ case IPMP_EUNKIF:
+ return ("not a physical interface or not in an IPMP group");
+ case IPMP_EMINRED:
+ return ("no other functioning interfaces are in its IPMP "
+ "group");
default:
- /*
- * We shouldn't get here. All errors should have a
- * meaningful error message, as shown in the above
- * cases. If we get here, someone has made a mistake.
- */
- (void) fprintf(stderr, gettext(
- "Operation returned an unrecognized error: %u\n"),
- error);
- break;
+ return (ipmp_errmsg(error));
}
}
int
main(int argc, char **argv)
{
- char *ifname;
- int cmd = 0;
+ int retval;
+ ipmp_handle_t handle;
+ offline_func_t *ofuncp = NULL;
+ const char *ifname;
int c;
-#if !defined(TEXT_DOMAIN)
-#define TEXT_DOMAIN "SYS_TEST"
-#endif
+ if ((progname = strrchr(argv[0], '/')) != NULL)
+ progname++;
+ else
+ progname = argv[0];
+
+ (void) setlocale(LC_ALL, "");
(void) textdomain(TEXT_DOMAIN);
while ((c = getopt(argc, argv, "d:r:")) != EOF) {
switch (c) {
case 'd':
ifname = optarg;
- cmd = MI_OFFLINE;
- if (offline_set(ifname)) {
- (void) fprintf(stderr, gettext("Interface "
- "already offlined\n"));
- exit(1);
- }
+ ofuncp = do_offline;
break;
case 'r':
ifname = optarg;
- cmd = MI_UNDO_OFFLINE;
- if (!offline_set(ifname)) {
- (void) fprintf(stderr, gettext("Interface not "
- "offlined\n"));
- exit(1);
- }
+ ofuncp = undo_offline;
break;
default :
usage();
- exit(1);
}
}
- if (cmd == 0) {
+ if (ofuncp == NULL)
usage();
- exit(1);
- }
/*
- * Send the command to in.mpathd which is generic to
- * both the commands. send_cmd returns only if there
- * is no error.
+ * Create the global V4 and V6 socket ioctl descriptors.
*/
- send_cmd(cmd, ifname);
- if (cmd == MI_OFFLINE) {
- do_offline(ifname);
- } else {
- undo_offline(ifname);
- }
+ sioc4fd = socket(AF_INET, SOCK_DGRAM, 0);
+ sioc6fd = socket(AF_INET6, SOCK_DGRAM, 0);
+ if (sioc4fd == -1 || sioc6fd == -1)
+ die("cannot create sockets");
- return (0);
-}
+ if ((retval = ipmp_open(&handle)) != IPMP_SUCCESS)
+ die("cannot create ipmp handle: %s\n", ipmp_errmsg(retval));
-/*
- * Is IFF_OFFLINE set ?
- * Returns B_FALSE on failure and B_TRUE on success.
- */
-boolean_t
-offline_set(char *ifname)
-{
- struct lifreq lifr;
- int s4;
- int s6;
- int ret;
-
- s4 = socket(AF_INET, SOCK_DGRAM, 0);
- if (s4 < 0) {
- perror("socket");
- exit(1);
- }
- s6 = socket(AF_INET6, SOCK_DGRAM, 0);
- if (s6 < 0) {
- perror("socket");
- exit(1);
- }
- (void) strncpy(lifr.lifr_name, ifname, sizeof (lifr.lifr_name));
- ret = ioctl(s4, SIOCGLIFFLAGS, (caddr_t)&lifr);
- if (ret < 0) {
- if (errno != ENXIO) {
- perror("ioctl: SIOCGLIFFLAGS");
- exit(1);
- }
- ret = ioctl(s6, SIOCGLIFFLAGS, (caddr_t)&lifr);
- if (ret < 0) {
- perror("ioctl: SIOCGLIFFLAGS");
- exit(1);
- }
- }
- (void) close(s4);
- (void) close(s6);
- if (lifr.lifr_flags & IFF_OFFLINE)
- return (B_TRUE);
- else
- return (B_FALSE);
+ (*ofuncp)(ifname, handle);
+
+ ipmp_close(handle);
+ (void) close(sioc4fd);
+ (void) close(sioc6fd);
+
+ return (EXIT_SUCCESS);
}
/*
- * Sends the command to in.mpathd. If not successful, prints
- * an error message and exits.
+ * Checks whether IFF_OFFLINE is set on `ifname'.
*/
-void
-send_cmd(int cmd, char *ifname)
+boolean_t
+is_offline(const char *ifname)
{
- struct mi_offline mio;
- struct mi_undo_offline miu;
- struct mi_result me;
- int ret;
- int cmd_len;
- int i;
- int s;
-
- for (i = 0; i < MAX_RETRIES; i++) {
- s = connect_to_mpathd(AF_INET);
- if (s == -1) {
- s = connect_to_mpathd(AF_INET6);
- if (s == -1) {
- (void) fprintf(stderr, gettext("Cannot "
- "establish communication with "
- "in.mpathd.\n"));
- exit(1);
- }
- }
- switch (cmd) {
- case MI_OFFLINE :
- cmd_len = sizeof (struct mi_offline);
- bzero(&mio, cmd_len);
- mio.mio_command = cmd;
- (void) strncpy(mio.mio_ifname, ifname, LIFNAMSIZ);
- mio.mio_min_redundancy = 1;
- ret = write(s, &mio, cmd_len);
- if (ret != cmd_len) {
- /* errno is set only when ret is -1 */
- if (ret == -1)
- perror("write");
- (void) fprintf(stderr, gettext("Failed to "
- "successfully send command to "
- "in.mpathd.\n"));
- exit(1);
- }
- break;
- case MI_UNDO_OFFLINE:
- cmd_len = sizeof (struct mi_undo_offline);
- bzero(&miu, cmd_len);
- miu.miu_command = cmd;
- (void) strncpy(miu.miu_ifname, ifname, LIFNAMSIZ);
- ret = write(s, &miu, cmd_len);
- if (ret != cmd_len) {
- /* errno is set only when ret is -1 */
- if (ret == -1)
- perror("write");
- (void) fprintf(stderr, gettext("Failed to "
- "successfully send command to "
- "in.mpathd.\n"));
- exit(1);
- }
- break;
- default :
- (void) fprintf(stderr, "Unknown command \n");
- exit(1);
- }
+ struct lifreq lifr = { 0 };
- /* Read the result from mpathd */
- ret = read(s, &me, sizeof (me));
- if (ret != sizeof (me)) {
- /* errno is set only when ret is -1 */
- if (ret == -1)
- perror("read");
- (void) fprintf(stderr, gettext("Failed to successfully "
- "read result from in.mpathd.\n"));
- exit(1);
+ (void) strlcpy(lifr.lifr_name, ifname, sizeof (lifr.lifr_name));
+ if (ioctl(sioc4fd, SIOCGLIFFLAGS, &lifr) == -1) {
+ if (errno != ENXIO ||
+ ioctl(sioc6fd, SIOCGLIFFLAGS, &lifr) == -1) {
+ die("cannot get interface flags on %s", ifname);
}
- if (me.me_mpathd_error == 0) {
- if (i != 0) {
- /*
- * We retried at least once. Tell the user
- * that things succeeded now.
- */
- (void) fprintf(stderr,
- gettext("Retry Successful.\n"));
- }
- return; /* Successful */
- }
-
- if (me.me_mpathd_error == MPATHD_SYS_ERROR) {
- if (me.me_sys_error == EAGAIN) {
- (void) close(s);
- (void) sleep(1);
- (void) fprintf(stderr,
- gettext("Retrying ...\n"));
- continue; /* Retry */
- }
- errno = me.me_sys_error;
- perror("if_mpadm");
- } else {
- print_mpathd_error_msg(me.me_mpathd_error);
- }
- exit(1);
}
- /*
- * We come here only if we retry the operation multiple
- * times and did not succeed. Let the user try it again
- * later.
- */
- (void) fprintf(stderr,
- gettext("Device busy. Retry the operation later.\n"));
- exit(1);
+
+ return ((lifr.lifr_flags & IFF_OFFLINE) != 0);
}
static void
-do_offline(char *ifname)
+do_offline(const char *ifname, ipmp_handle_t handle)
{
- struct lifreq lifr;
- struct lifreq *lifcr;
- struct lifnum lifn;
- struct lifconf lifc;
- char *buf;
- int numifs;
- int n;
- char pi_name[LIFNAMSIZ + 1];
- char *cp;
- int ifsock_v4;
- int ifsock_v6;
- int af;
- int ret;
+ ifaddrlistx_t *ifaddrp, *ifaddrs;
+ int retval;
+
+ if (is_offline(ifname))
+ die("interface %s is already offline\n", ifname);
+
+ if ((retval = ipmp_offline(handle, ifname, 1)) != IPMP_SUCCESS)
+ die("cannot offline %s: %s\n", ifname, mpadm_errmsg(retval));
/*
- * Verify whether IFF_OFFLINE is not set as a sanity check.
- */
- if (!offline_set(ifname)) {
- (void) fprintf(stderr, gettext("Operation failed : in.mpathd "
- "has not set IFF_OFFLINE on %s\n"), ifname);
- exit(1);
- }
- /*
- * Get both the sockets as we may need to bring both
- * IPv4 and IPv6 interfaces down.
- */
- ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
- if (ifsock_v4 < 0) {
- perror("socket");
- exit(1);
- }
- ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
- if (ifsock_v6 < 0) {
- perror("socket");
- exit(1);
- }
- /*
- * Get all the logicals for "ifname" and mark them down.
- * There is no easy way of doing this. We get all the
- * interfaces in the system using SICGLIFCONF and mark the
- * ones matching the name down.
+ * Get all the up addresses for `ifname' and bring them down.
*/
- lifn.lifn_family = AF_UNSPEC;
- lifn.lifn_flags = 0;
- if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
- perror("ioctl : SIOCGLIFNUM");
- exit(1);
- }
- numifs = lifn.lifn_count;
-
- buf = calloc(numifs, sizeof (struct lifreq));
- if (buf == NULL) {
- perror("calloc");
- exit(1);
- }
+ if (ifaddrlistx(ifname, IFF_UP, 0, &ifaddrs) == -1)
+ die("cannot get addresses on %s", ifname);
- lifc.lifc_family = AF_UNSPEC;
- lifc.lifc_flags = 0;
- lifc.lifc_len = numifs * sizeof (struct lifreq);
- lifc.lifc_buf = buf;
+ for (ifaddrp = ifaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) {
+ if (!(ifaddrp->ia_flags & IFF_OFFLINE))
+ warn("IFF_OFFLINE vanished on %s\n", ifaddrp->ia_name);
- if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
- perror("ioctl : SIOCGLIFCONF");
- exit(1);
+ if (!set_lifflags(ifaddrp->ia_name,
+ ifaddrp->ia_flags & ~IFF_UP))
+ warn("cannot bring down address on %s\n",
+ ifaddrp->ia_name);
}
- lifcr = (struct lifreq *)lifc.lifc_req;
- for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifcr++) {
- af = lifcr->lifr_addr.ss_family;
- (void) strncpy(pi_name, lifcr->lifr_name,
- sizeof (pi_name));
- pi_name[sizeof (pi_name) - 1] = '\0';
- if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
- *cp = '\0';
- if (strcmp(pi_name, ifname) == 0) {
- /* It matches the interface name that was offlined */
- (void) strncpy(lifr.lifr_name, lifcr->lifr_name,
- sizeof (lifr.lifr_name));
- if (af == AF_INET)
- ret = if_down(ifsock_v4, &lifr);
- else
- ret = if_down(ifsock_v6, &lifr);
- if (ret != 0) {
- (void) fprintf(stderr, gettext("Bringing down "
- "the interfaces failed.\n"));
- exit(1);
- }
- }
- }
+ ifaddrlistx_free(ifaddrs);
}
static void
-undo_offline(char *ifname)
+undo_offline(const char *ifname, ipmp_handle_t handle)
{
- struct lifreq lifr;
- struct lifreq *lifcr;
- struct lifnum lifn;
- struct lifconf lifc;
- char *buf;
- int numifs;
- int n;
- char pi_name[LIFNAMSIZ + 1];
- char *cp;
- int ifsock_v4;
- int ifsock_v6;
- int af;
- int ret;
+ ifaddrlistx_t *ifaddrp, *ifaddrs;
+ int retval;
+
+ if (!is_offline(ifname))
+ die("interface %s is not offline\n", ifname);
/*
- * Verify whether IFF_OFFLINE is set as a sanity check.
- */
- if (offline_set(ifname)) {
- (void) fprintf(stderr, gettext("Operation failed : in.mpathd "
- "has not cleared IFF_OFFLINE on %s\n"), ifname);
- exit(1);
- }
- /*
- * Get both the sockets as we may need to bring both
- * IPv4 and IPv6 interfaces UP.
- */
- ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
- if (ifsock_v4 < 0) {
- perror("socket");
- exit(1);
- }
- ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
- if (ifsock_v6 < 0) {
- perror("socket");
- exit(1);
- }
- /*
- * Get all the logicals for "ifname" and mark them up.
- * There is no easy way of doing this. We get all the
- * interfaces in the system using SICGLIFCONF and mark the
- * ones matching the name up.
+ * Get all the down addresses for `ifname' and bring them up.
*/
- lifn.lifn_family = AF_UNSPEC;
- lifn.lifn_flags = 0;
- if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
- perror("ioctl : SIOCGLIFNUM");
- exit(1);
- }
- numifs = lifn.lifn_count;
-
- buf = calloc(numifs, sizeof (struct lifreq));
- if (buf == NULL) {
- perror("calloc");
- exit(1);
- }
+ if (ifaddrlistx(ifname, 0, IFF_UP, &ifaddrs) == -1)
+ die("cannot get addresses for %s", ifname);
- lifc.lifc_family = AF_UNSPEC;
- lifc.lifc_flags = 0;
- lifc.lifc_len = numifs * sizeof (struct lifreq);
- lifc.lifc_buf = buf;
+ for (ifaddrp = ifaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) {
+ if (!(ifaddrp->ia_flags & IFF_OFFLINE))
+ warn("IFF_OFFLINE vanished on %s\n", ifaddrp->ia_name);
- if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
- perror("ioctl : SIOCGLIFCONF");
- exit(1);
+ if (!set_lifflags(ifaddrp->ia_name, ifaddrp->ia_flags | IFF_UP))
+ warn("cannot bring up address on %s\n",
+ ifaddrp->ia_name);
}
- lifcr = (struct lifreq *)lifc.lifc_req;
- for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifcr++) {
- af = lifcr->lifr_addr.ss_family;
- (void) strncpy(pi_name, lifcr->lifr_name,
- sizeof (pi_name));
- pi_name[sizeof (pi_name) - 1] = '\0';
- if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
- *cp = '\0';
-
- if (strcmp(pi_name, ifname) == 0) {
- /* It matches the interface name that was offlined */
- (void) strncpy(lifr.lifr_name, lifcr->lifr_name,
- sizeof (lifr.lifr_name));
- if (af == AF_INET)
- ret = if_up(ifsock_v4, &lifr);
- else
- ret = if_up(ifsock_v6, &lifr);
- if (ret != 0) {
- (void) fprintf(stderr, gettext("Bringing up "
- "the interfaces failed.\n"));
- exit(1);
- }
- }
- }
-}
+ ifaddrlistx_free(ifaddrs);
-/*
- * Returns -1 on failure. Returns the socket file descriptor on
- * success.
- */
-static int
-connect_to_mpathd(sa_family_t family)
-{
- int s;
- struct sockaddr_storage ss;
- struct sockaddr_in *sin = (struct sockaddr_in *)&ss;
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss;
- struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
- int addrlen;
- int ret;
- int on;
-
- s = socket(family, SOCK_STREAM, 0);
- if (s < 0) {
- perror("socket");
- return (-1);
- }
- bzero((char *)&ss, sizeof (ss));
- ss.ss_family = family;
/*
- * Need to bind to a privileged port. For non-root, this
- * will fail. in.mpathd verifies that only commands coming
- * from privileged ports succeed so that the ordinary user
- * can't issue offline commands.
+ * Undo the offline.
*/
- on = 1;
- if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on,
- sizeof (on)) < 0) {
- perror("setsockopt : TCP_ANONPRIVBIND");
- exit(1);
- }
- switch (family) {
- case AF_INET:
- sin->sin_port = 0;
- sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
- addrlen = sizeof (struct sockaddr_in);
- break;
- case AF_INET6:
- sin6->sin6_port = 0;
- sin6->sin6_addr = loopback_addr;
- addrlen = sizeof (struct sockaddr_in6);
- break;
- }
- ret = bind(s, (struct sockaddr *)&ss, addrlen);
- if (ret != 0) {
- perror("bind");
- return (-1);
- }
- switch (family) {
- case AF_INET:
- sin->sin_port = htons(MPATHD_PORT);
- break;
- case AF_INET6:
- sin6->sin6_port = htons(MPATHD_PORT);
- break;
+ if ((retval = ipmp_undo_offline(handle, ifname)) != IPMP_SUCCESS) {
+ die("cannot undo-offline %s: %s\n", ifname,
+ mpadm_errmsg(retval));
}
- ret = connect(s, (struct sockaddr *)&ss, addrlen);
- if (ret != 0) {
- perror("connect");
- return (-1);
- }
- on = 0;
- if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on,
- sizeof (on)) < 0) {
- perror("setsockopt : TCP_ANONPRIVBIND");
- return (-1);
- }
- return (s);
+
+ /*
+ * Verify that in.mpathd has cleared IFF_OFFLINE as a sanity check.
+ */
+ if (is_offline(ifname))
+ warn("in.mpathd has not cleared IFF_OFFLINE on %s\n", ifname);
}
/*
- * Bring down the interface specified by the name lifr->lifr_name.
- *
- * Returns -1 on failure. Returns 0 on success.
+ * Change `lifname' to have `flags' set. Returns B_TRUE on success.
*/
-static int
-if_down(int ifsock, struct lifreq *lifr)
+static boolean_t
+set_lifflags(const char *lifname, uint64_t flags)
{
- int ret;
+ struct lifreq lifr = { 0 };
+ int fd = (flags & IFF_IPV4) ? sioc4fd : sioc6fd;
- ret = ioctl(ifsock, SIOCGLIFFLAGS, (caddr_t)lifr);
- if (ret < 0) {
- perror("ioctl: SIOCGLIFFLAGS");
- return (-1);
- }
+ (void) strlcpy(lifr.lifr_name, lifname, LIFNAMSIZ);
+ lifr.lifr_flags = flags;
- /* IFF_OFFLINE was set to start with. Is it still there ? */
- if (!(lifr->lifr_flags & (IFF_OFFLINE))) {
- (void) fprintf(stderr, gettext("IFF_OFFLINE disappeared on "
- "%s\n"), lifr->lifr_name);
- return (-1);
- }
- lifr->lifr_flags &= ~IFF_UP;
- ret = ioctl(ifsock, SIOCSLIFFLAGS, (caddr_t)lifr);
- if (ret < 0) {
- perror("ioctl: SIOCSLIFFLAGS");
- return (-1);
- }
- return (0);
+ return (ioctl(fd, SIOCSLIFFLAGS, &lifr) >= 0);
}
-/*
- * Bring up the interface specified by the name lifr->lifr_name.
- *
- * Returns -1 on failure. Returns 0 on success.
- */
-static int
-if_up(int ifsock, struct lifreq *lifr)
+/* PRINTFLIKE1 */
+static void
+die(const char *format, ...)
{
- int ret;
- boolean_t zeroaddr = B_FALSE;
- struct sockaddr_in *addr;
-
- ret = ioctl(ifsock, SIOCGLIFADDR, lifr);
- if (ret < 0) {
- perror("ioctl: SIOCGLIFADDR");
- return (-1);
- }
+ va_list alist;
+ char *errstr = strerror(errno);
- addr = (struct sockaddr_in *)&lifr->lifr_addr;
- switch (addr->sin_family) {
- case AF_INET:
- zeroaddr = (addr->sin_addr.s_addr == INADDR_ANY);
- break;
+ format = gettext(format);
+ (void) fprintf(stderr, gettext("%s: fatal: "), progname);
- case AF_INET6:
- zeroaddr = IN6_IS_ADDR_UNSPECIFIED(
- &((struct sockaddr_in6 *)addr)->sin6_addr);
- break;
+ va_start(alist, format);
+ (void) vfprintf(stderr, format, alist);
+ va_end(alist);
- default:
- break;
- }
+ if (strchr(format, '\n') == NULL)
+ (void) fprintf(stderr, ": %s\n", errstr);
- ret = ioctl(ifsock, SIOCGLIFFLAGS, lifr);
- if (ret < 0) {
- perror("ioctl: SIOCGLIFFLAGS");
- return (-1);
- }
- /*
- * Don't affect the state of addresses that failed back.
- *
- * XXX Link local addresses that are not marked IFF_NOFAILOVER
- * will not be brought up. Link local addresses never failover.
- * When the interface was offlined, we brought the link local
- * address down. We will not bring it up now if IFF_NOFAILOVER
- * is not marked. We check for IFF_NOFAILOVER below so that
- * we want to maintain the state of all other addresses as it
- * was before offline. Normally link local addresses are marked
- * IFF_NOFAILOVER and hence this is not an issue. These can
- * be fixed in future with RCM and it is beyond the scope
- * of if_mpadm to maintain state and do this correctly.
- */
- if (!(lifr->lifr_flags & IFF_NOFAILOVER))
- return (0);
+ exit(EXIT_FAILURE);
+}
- /*
- * When a data address associated with the physical interface itself
- * is failed over (e.g., qfe0, rather than qfe0:1), the kernel must
- * fill the ipif data structure for qfe0 with a placeholder entry (the
- * "replacement ipif"). Replacement ipif's cannot be brought IFF_UP
- * (nor would it make any sense to do so), so we must be careful to
- * skip them; thankfully they can be easily identified since they
- * all have a zeroed address.
- */
- if (zeroaddr)
- return (0);
-
- /* IFF_OFFLINE was not set to start with. Is it there ? */
- if (lifr->lifr_flags & IFF_OFFLINE) {
- (void) fprintf(stderr,
- gettext("IFF_OFFLINE set wrongly on %s\n"),
- lifr->lifr_name);
- return (-1);
- }
- lifr->lifr_flags |= IFF_UP;
- ret = ioctl(ifsock, SIOCSLIFFLAGS, lifr);
- if (ret < 0) {
- perror("ioctl: SIOCSLIFFLAGS");
- return (-1);
- }
- return (0);
+/* PRINTFLIKE1 */
+static void
+warn(const char *format, ...)
+{
+ va_list alist;
+ char *errstr = strerror(errno);
+
+ format = gettext(format);
+ (void) fprintf(stderr, gettext("%s: warning: "), progname);
+
+ va_start(alist, format);
+ (void) vfprintf(stderr, format, alist);
+ va_end(alist);
+
+ if (strchr(format, '\n') == NULL)
+ (void) fprintf(stderr, ": %s\n", errstr);
}
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile
index 69e91758ea..e99f2945a7 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile
@@ -19,10 +19,9 @@
# CDDL HEADER END
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#
PROG = ifconfig
ROOTFS_PROG = $(PROG)
@@ -38,7 +37,7 @@ COMMONSRCS= $(CMDINETCOMMONDIR)/$(COMMONOBJS:%.o=%.c)
SRCS= $(LOCALSRCS) $(COMMONSRCS)
CPPFLAGS += -I$(CMDINETCOMMONDIR) -I$(SRC)/common/net/dhcp
-LDLIBS += -ldhcpagent -linetcfg -ldlpi -ldladm
+LDLIBS += -ldhcpagent -ldlpi -linetutil -linetcfg -lipmp -ldladm
LINTFLAGS += -m
ROOTUSRSBINLINKS = $(PROG:%=$(ROOTUSRSBIN)/%)
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h
index c993baeb02..4aa1aa0ed7 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -11,13 +11,12 @@
#ifndef _DEFS_H
#define _DEFS_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
#include <errno.h>
+#include <limits.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
@@ -54,7 +53,10 @@ extern "C" {
#include <assert.h>
#include <ipmp_mpathd.h>
+#include <ipmp_admin.h>
#include <inetcfg.h>
+#include <libinetutil.h>
+#include <alloca.h>
#ifdef __cplusplus
}
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
index f49fca249c..d5517a4700 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -23,6 +23,7 @@
#define TUN_NAME "tun"
#define ATUN_NAME "atun"
#define TUN6TO4_NAME "6to4tun"
+#define IPMPSTUB (void *)-1
typedef struct if_flags {
uint64_t iff_value;
@@ -67,7 +68,20 @@ static if_flags_t if_flags_tbl[] = {
{ IFF_TEMPORARY, "TEMPORARY" },
{ IFF_FIXEDMTU, "FIXEDMTU" },
{ IFF_VIRTUAL, "VIRTUAL" },
- { IFF_DUPLICATE, "DUPLICATE" }
+ { IFF_DUPLICATE, "DUPLICATE" },
+ { IFF_IPMP, "IPMP"}
+};
+
+typedef struct {
+ const char *ia_app;
+ uint64_t ia_flag;
+ uint_t ia_tries;
+} if_appflags_t;
+
+static const if_appflags_t if_appflags_tbl[] = {
+ { "dhcpagent(1M)", IFF_DHCPRUNNING, 1 },
+ { "in.ndpd(1M)", IFF_ADDRCONF, 3 },
+ { NULL, 0, 0 }
};
static struct lifreq lifr;
@@ -75,7 +89,6 @@ static struct lifreq lifr;
static char name[LIFNAMSIZ];
/* foreach interface saved name */
static char origname[LIFNAMSIZ];
-static char savedname[LIFNAMSIZ]; /* For addif */
static int setaddr;
/*
@@ -89,20 +102,7 @@ static int setaddr;
#define NO_ESP_AALG 256
#define NO_ESP_EALG 256
-/*
- * iface_t
- * used by setifether to create a list of interfaces to mark
- * down-up when changing the ethernet address of an interface
- */
-typedef struct iface {
- struct lifreq lifr;
- struct iface *next; /* pointer to the next list element */
-} iface_t;
-
-static iface_t *logifs = NULL; /* list of logical interfaces */
-static iface_t *phyif = NULL; /* physical interface */
-
-int s;
+int s, s4, s6;
int af = AF_INET; /* default address family */
int debug = 0;
int all = 0; /* setifdhcp() needs to know this */
@@ -113,6 +113,7 @@ int v4compat = 0; /* Compatible printing format */
* Function prototypes for command functions.
*/
static int addif(char *arg, int64_t param);
+static int inetipmp(char *arg, int64_t param);
static int inetplumb(char *arg, int64_t param);
static int inetunplumb(char *arg, int64_t param);
static int removeif(char *arg, int64_t param);
@@ -141,7 +142,7 @@ static int modinsert(char *arg, int64_t param);
static int modremove(char *arg, int64_t param);
static int setifgroupname(char *arg, int64_t param);
static int configinfo(char *arg, int64_t param);
-static void print_config_flags(uint64_t flags);
+static void print_config_flags(int af, uint64_t flags);
static void print_flags(uint64_t flags);
static void print_ifether(char *ifname);
static int set_tun_encap_limit(char *arg, int64_t param);
@@ -150,6 +151,7 @@ static int set_tun_hop_limit(char *arg, int64_t param);
static int setzone(char *arg, int64_t param);
static int setallzones(char *arg, int64_t param);
static int setifsrc(char *arg, int64_t param);
+static int lifnum(const char *ifname);
/*
* Address family specific function prototypes.
@@ -179,19 +181,22 @@ static int settaddr(char *, int (*)(icfg_handle_t,
static void status(void);
static void ifstatus(const char *);
static void usage(void);
-static int strioctl(int s, int cmd, char *buf, int buflen);
+static int strioctl(int s, int cmd, void *buf, int buflen);
static int setifdhcp(const char *caller, const char *ifname,
int argc, char *argv[]);
static int ip_domux2fd(int *, int *, int *, int *, int *);
static int ip_plink(int, int, int, int, int);
static int modop(char *arg, char op);
-static void selectifs(int argc, char *argv[], int af,
- struct lifreq *lifrp);
-static int updownifs(iface_t *ifs, int up);
static int find_all_global_interfaces(struct lifconf *lifcp, char **buf,
int64_t lifc_flags);
static int find_all_zone_interfaces(struct lifconf *lifcp, char **buf,
int64_t lifc_flags);
+static int create_ipmp(const char *grname, int af, const char *ifname,
+ boolean_t implicit);
+static int create_ipmp_peer(int af, const char *ifname);
+static void start_ipmp_daemon(void);
+static boolean_t ifaddr_up(ifaddrlistx_t *ifaddrp);
+static boolean_t ifaddr_down(ifaddrlistx_t *ifaddrp);
#define max(a, b) ((a) < (b) ? (b) : (a))
@@ -251,6 +256,7 @@ struct cmd {
{ "index", NEXTARG, setifindex, 0, AF_ANY },
{ "broadcast", NEXTARG, setifbroadaddr, 0, AF_INET },
{ "auto-revarp", 0, setifrevarp, 1, AF_INET },
+ { "ipmp", 0, inetipmp, 1, AF_ANY },
{ "plumb", 0, inetplumb, 1, AF_ANY },
{ "unplumb", 0, inetunplumb, 0, AF_ANY },
{ "subnet", NEXTARG, setifsubnet, 0, AF_ANY },
@@ -297,22 +303,30 @@ struct cmd {
typedef struct if_config_cmd {
uint64_t iff_flag;
+ int iff_af;
char *iff_name;
} if_config_cmd_t;
+/*
+ * NOTE: print_config_flags() processes this table in order, so we put "up"
+ * last so that we can be sure "-failover" will take effect first. Otherwise,
+ * IPMP test addresses will erroneously migrate to the IPMP interface.
+ */
static if_config_cmd_t if_config_cmd_tbl[] = {
- { IFF_UP, "up" },
- { IFF_NOTRAILERS, "-trailers" },
- { IFF_PRIVATE, "private" },
- { IFF_NOXMIT, "-xmit" },
- { IFF_ANYCAST, "anycast" },
- { IFF_NOLOCAL, "-local" },
- { IFF_DEPRECATED, "deprecated" },
- { IFF_NOFAILOVER, "-failover" },
- { IFF_STANDBY, "standby" },
- { IFF_FAILED, "failed" },
- { IFF_PREFERRED, "preferred" },
- { 0, 0 },
+ { IFF_NOTRAILERS, AF_UNSPEC, "-trailers" },
+ { IFF_PRIVATE, AF_UNSPEC, "private" },
+ { IFF_NOXMIT, AF_UNSPEC, "-xmit" },
+ { IFF_ANYCAST, AF_INET6, "anycast" },
+ { IFF_NOLOCAL, AF_UNSPEC, "-local" },
+ { IFF_DEPRECATED, AF_UNSPEC, "deprecated" },
+ { IFF_NOFAILOVER, AF_UNSPEC, "-failover" },
+ { IFF_STANDBY, AF_UNSPEC, "standby" },
+ { IFF_FAILED, AF_UNSPEC, "failed" },
+ { IFF_PREFERRED, AF_UNSPEC, "preferred" },
+ { IFF_NONUD, AF_INET6, "-nud" },
+ { IFF_NOARP, AF_INET, "-arp" },
+ { IFF_UP, AF_UNSPEC, "up" },
+ { 0, 0, NULL },
};
typedef struct ni {
@@ -345,10 +359,11 @@ struct afswtch *afp; /* the address family being set or asked about */
int
main(int argc, char *argv[])
{
- /* Include IFF_NOXMIT, IFF_TEMPORARY and all zone interfaces */
- int64_t lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
+ int64_t lifc_flags;
char *default_ip_str;
+ lifc_flags = LIFC_NOXMIT|LIFC_TEMPORARY|LIFC_ALLZONES|LIFC_UNDER_IPMP;
+
if (argc < 2) {
usage();
exit(1);
@@ -388,9 +403,10 @@ main(int argc, char *argv[])
}
s = socket(SOCKET_AF(af), SOCK_DGRAM, 0);
- if (s < 0) {
+ s4 = socket(AF_INET, SOCK_DGRAM, 0);
+ s6 = socket(AF_INET6, SOCK_DGRAM, 0);
+ if (s == -1 || s4 == -1 || s6 == -1)
Perror0_exit("socket");
- }
/*
* Special interface names is any combination of these flags.
@@ -1441,39 +1457,38 @@ setifdstaddr(char *addr, int64_t param)
static int
setifflags(char *val, int64_t value)
{
- int phyintlen, origphyintlen;
+ struct lifreq lifrl; /* local lifreq struct */
+ boolean_t bringup = _B_FALSE;
(void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0)
Perror0_exit("setifflags: SIOCGLIFFLAGS");
- if (value == IFF_NOFAILOVER) {
- /*
- * Fail if '-failover' is set after a prior addif created the
- * alias on a different interface. This can happen when the
- * interface is part of an IPMP group.
- */
- phyintlen = strcspn(name, ":");
- origphyintlen = strcspn(origname, ":");
- if (phyintlen != origphyintlen ||
- strncmp(name, origname, phyintlen) != 0) {
- (void) fprintf(stderr, "ifconfig: can't set -failover "
- "on failed/standby/offlined interface %s\n",
- origname);
- exit(1);
- }
- }
-
if (value < 0) {
value = -value;
+
+ if ((value & IFF_NOFAILOVER) && (lifr.lifr_flags & IFF_UP)) {
+ /*
+ * The kernel does not allow administratively up test
+ * addresses to be converted to data addresses. Bring
+ * the address down first, then bring it up after it's
+ * been converted to a data address.
+ */
+ lifr.lifr_flags &= ~IFF_UP;
+ (void) ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr);
+ bringup = _B_TRUE;
+ }
+
lifr.lifr_flags &= ~value;
- if ((value & IFF_UP) && (lifr.lifr_flags & IFF_DUPLICATE)) {
+ if ((value & (IFF_UP | IFF_NOFAILOVER)) &&
+ (lifr.lifr_flags & IFF_DUPLICATE)) {
/*
* If the user is trying to mark an interface with a
- * duplicate address as "down," then fetch the address
- * and set it. This will cause IP to clear the
- * IFF_DUPLICATE flag and stop the automatic recovery
- * timer.
+ * duplicate address as "down," or convert a duplicate
+ * test address to a data address, then fetch the
+ * address and set it. This will cause IP to clear
+ * the IFF_DUPLICATE flag and stop the automatic
+ * recovery timer.
*/
value = lifr.lifr_flags;
if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) >= 0)
@@ -1483,10 +1498,48 @@ setifflags(char *val, int64_t value)
} else {
lifr.lifr_flags |= value;
}
+
+ /*
+ * If we're about to bring up an underlying physical IPv6 interface in
+ * an IPMP group, ensure the IPv6 IPMP interface is also up. This is
+ * for backward compatibility with legacy configurations in which
+ * there are no explicit hostname files for IPMP interfaces. (For
+ * IPv4, this is automatically handled by the kernel when migrating
+ * the underlying interface's data address to the IPMP interface.)
+ */
+ (void) strlcpy(lifrl.lifr_name, name, LIFNAMSIZ);
+
+ if (lifnum(lifr.lifr_name) == 0 &&
+ (lifr.lifr_flags & (IFF_UP|IFF_IPV6)) == (IFF_UP|IFF_IPV6) &&
+ ioctl(s, SIOCGLIFGROUPNAME, &lifrl) == 0 &&
+ lifrl.lifr_groupname[0] != '\0') {
+ lifgroupinfo_t lifgr;
+
+ (void) strlcpy(lifgr.gi_grname, lifrl.lifr_groupname,
+ LIFGRNAMSIZ);
+ if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == -1)
+ Perror0_exit("setifflags: SIOCGLIFGROUPINFO");
+
+ (void) strlcpy(lifrl.lifr_name, lifgr.gi_grifname, LIFNAMSIZ);
+ if (ioctl(s, SIOCGLIFFLAGS, &lifrl) == -1)
+ Perror0_exit("setifflags: SIOCGLIFFLAGS");
+ if (!(lifrl.lifr_flags & IFF_UP)) {
+ lifrl.lifr_flags |= IFF_UP;
+ if (ioctl(s, SIOCSLIFFLAGS, &lifrl) == -1)
+ Perror0_exit("setifflags: SIOCSLIFFLAGS");
+ }
+ }
+
(void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
- if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) {
+ if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0)
Perror0_exit("setifflags: SIOCSLIFFLAGS");
+
+ if (bringup) {
+ lifr.lifr_flags |= IFF_UP;
+ if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0)
+ Perror0_exit("setifflags: SIOCSLIFFLAGS IFF_UP");
}
+
return (0);
}
@@ -1524,12 +1577,21 @@ setifindex(char *val, int64_t param)
}
/* ARGSUSED */
+static void
+notifycb(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg)
+{
+ /*
+ * Intentionally empty: setifether() registers this callback with
+ * dlpi_enabnotify() only to find out whether the link supports
+ * DL_NOTE_PHYS_ADDR, and disables the notification again as soon
+ * as the registration succeeds.
+ */
+}
+
+/* ARGSUSED */
static int
setifether(char *addr, int64_t param)
{
- uchar_t *ea;
- iface_t *current;
- int maclen;
+ uchar_t *hwaddr;
+ int hwaddrlen;
+ int retval;
+ ifaddrlistx_t *ifaddrp, *ifaddrs = NULL;
+ dlpi_handle_t dh;
+ dlpi_notifyid_t id;
if (addr == NULL) {
ifstatus(name);
@@ -1537,9 +1599,6 @@ setifether(char *addr, int64_t param)
return (0);
}
- phyif = NULL;
- logifs = NULL;
-
/*
* if the IP interface in the arguments is a logical
* interface, exit with an error now.
@@ -1550,79 +1609,68 @@ setifether(char *addr, int64_t param)
exit(1);
}
- ea = _link_aton(addr, &maclen);
- if (ea == NULL) {
- if (maclen == -1)
+ if ((hwaddr = _link_aton(addr, &hwaddrlen)) == NULL) {
+ /*
+ * hwaddr is NULL on this path, so report the user-supplied
+ * string `addr' rather than the (failed) conversion result.
+ */
+ if (hwaddrlen == -1)
(void) fprintf(stderr,
- "ifconfig: %s: bad address\n", addr);
+ "ifconfig: %s: bad address\n", addr);
else
(void) fprintf(stderr, "ifconfig: malloc() failed\n");
exit(1);
}
- (void) strncpy(savedname, name, sizeof (savedname));
+ if ((retval = dlpi_open(name, &dh, 0)) != DLPI_SUCCESS)
+ Perrdlpi_exit("cannot dlpi_open() link", name, retval);
- /*
- * Call selectifs only for the IP interfaces that are ipv4.
- * offflags == IFF_IPV6 because you should not change the
- * Ethernet address of an ipv6 interface
- */
- foreachinterface(selectifs, 0, (char **)NULL, 0, 0, IFF_IPV6, 0);
+ if ((retval = dlpi_bind(dh, DLPI_ANY_SAP, NULL)) != DLPI_SUCCESS)
+ Perrdlpi_exit("cannot dlpi_bind() link", name, retval);
- /* If physical interface not found, exit now */
- if (phyif == NULL) {
- (void) fprintf(stderr,
- "ifconfig: interface %s not found\n", savedname);
- exit(1);
- }
-
- /* Restore */
- (void) strncpy(name, savedname, sizeof (name));
- (void) strncpy(origname, savedname, sizeof (origname));
- (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
-
- /*
- * close and reopen the socket
- * we don't know which type of socket we have now
- */
- (void) close(s);
- s = socket(SOCKET_AF(AF_UNSPEC), SOCK_DGRAM, 0);
- if (s < 0) {
- Perror0_exit("socket");
- }
-
- /*
- * mark down the logical interfaces first,
- * and then the physical interface
- */
- if (updownifs(logifs, 0) < 0 || updownifs(phyif, 0) < 0) {
- Perror0_exit("mark down interface failed");
+ retval = dlpi_enabnotify(dh, DL_NOTE_PHYS_ADDR, notifycb, NULL, &id);
+ if (retval == DLPI_SUCCESS) {
+ (void) dlpi_disabnotify(dh, id, NULL);
+ } else {
+ /*
+ * This link does not support DL_NOTE_PHYS_ADDR: bring down
+ * all of the addresses to flush the old hardware address
+ * information out of IP.
+ *
+ * NOTE: Skipping this when DL_NOTE_PHYS_ADDR is supported is
+ * more than an optimization: in.mpathd will set IFF_OFFLINE
+ * if it's notified and the new address is a duplicate of
+ * another in the group -- but the flags manipulation in
+ * ifaddr_{down,up}() cannot be atomic and thus might clobber
+ * IFF_OFFLINE, confusing in.mpathd.
+ */
+ if (ifaddrlistx(name, IFF_UP, 0, &ifaddrs) == -1)
+ Perror2_exit(name, "cannot get address list");
+
+ ifaddrp = ifaddrs;
+ for (; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) {
+ if (!ifaddr_down(ifaddrp)) {
+ Perror2_exit(ifaddrp->ia_name,
+ "cannot bring down");
+ }
+ }
}
/*
- * Change the physical address
+ * Change the hardware address.
*/
- if (dlpi_set_address(savedname, ea, maclen) == -1) {
+ retval = dlpi_set_physaddr(dh, DL_CURR_PHYS_ADDR, hwaddr, hwaddrlen);
+ if (retval != DLPI_SUCCESS) {
(void) fprintf(stderr,
- "ifconfig: failed setting mac address on %s\n",
- savedname);
+ "ifconfig: failed setting mac address on %s\n", name);
}
+ dlpi_close(dh);
/*
- * if any interfaces were marked down before changing the
- * ethernet address, put them up again.
- * First the physical interface, then the logical ones.
+ * If any addresses were brought down before changing the hardware
+ * address, bring them up again.
*/
- if (updownifs(phyif, 1) < 0 || updownifs(logifs, 1) < 0) {
- Perror0_exit("mark down interface failed");
- }
-
- /* Free the memory allocated by selectifs */
- free(phyif);
- for (current = logifs; current != NULL; current = logifs) {
- logifs = logifs->next;
- free(current);
+ for (ifaddrp = ifaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) {
+ if (!ifaddr_up(ifaddrp))
+ Perror2_exit(ifaddrp->ia_name, "cannot bring up");
}
+ ifaddrlistx_free(ifaddrs);
return (0);
}
@@ -1655,8 +1703,8 @@ print_ifether(char *ifname)
}
(void) close(fd);
- /* Virtual interfaces don't have MAC addresses */
- if (lifr.lifr_flags & IFF_VIRTUAL)
+ /* VNI and IPMP interfaces don't have MAC addresses */
+ if (lifr.lifr_flags & (IFF_VIRTUAL|IFF_IPMP))
return;
/*
@@ -1685,104 +1733,6 @@ print_ifether(char *ifname)
}
/*
- * static void selectifs(int argc, char *argv[], int af, struct lifreq *rp)
- *
- * Called inside setifether() to create a list of interfaces to
- * mark down/up when changing the Ethernet address.
- * If the current interface is the physical interface passed
- * as an argument to ifconfig, update phyif.
- * If the current interface is a logical interface associated
- * to the physical interface, add it to the logifs list.
- */
-/* ARGSUSED */
-static void
-selectifs(int argc, char *argv[], int af, struct lifreq *rp)
-{
- char *colonp;
- int length;
- iface_t *current;
-
- /*
- * savedname= name of the IP interface to which you want to
- * change ethernet address
- * name= name of the current IP interface
- */
- colonp = strchr(name, ':');
- if (colonp == NULL)
- length = max(strlen(savedname), strlen(name));
- else
- length = max(strlen(savedname), colonp - name);
- if (strncmp(savedname, name, length) == 0) {
- (void) strcpy(lifr.lifr_name, name);
- if (ioctl(s, SIOCGLIFFLAGS, &lifr) < 0) {
- Perror0("selectifs: SIOCGLIFFLAGS");
- return;
- }
-
- if ((current = malloc(sizeof (iface_t))) == NULL) {
- Perror0_exit("selectifs: malloc failed\n");
- }
-
- if (colonp == NULL) {
- /* this is the physical interface */
- phyif = current;
- bcopy(&lifr, &phyif->lifr, sizeof (struct lifreq));
- phyif->next = NULL;
- } else {
- /* this is a logical interface */
- bcopy(&lifr, &current->lifr, sizeof (struct lifreq));
- current->next = logifs;
- logifs = current;
- }
- }
-}
-
-/*
- * static int updownifs(iface_t *ifs, int up)
- *
- * It takes in input a list of IP interfaces (ifs)
- * and a flag (up).
- * It marks each interface in the list down (up = 0)
- * or up (up > 0). This is done ONLY if the IP
- * interface was originally up.
- *
- * Return values:
- * 0 = everything OK
- * -1 = problem
- */
-static int
-updownifs(iface_t *ifs, int up)
-{
- iface_t *current;
- int ret = 0;
- int save_errno;
- char savename[LIFNAMSIZ];
- uint64_t orig_flags;
-
- for (current = ifs; current != NULL; current = current->next) {
- if (current->lifr.lifr_flags & IFF_UP) {
- orig_flags = current->lifr.lifr_flags;
- if (!up)
- current->lifr.lifr_flags &= ~IFF_UP;
- if (ioctl(s, SIOCSLIFFLAGS, &current->lifr) < 0) {
- save_errno = errno;
- (void) strcpy(savename,
- current->lifr.lifr_name);
- ret = -1;
- }
- if (!up) /* restore the original flags */
- current->lifr.lifr_flags = orig_flags;
- }
- }
-
- if (ret == -1) {
- (void) strcpy(lifr.lifr_name, savename);
- errno = save_errno;
- }
- return (ret);
-}
-
-/*
* static int find_all_global_interfaces(struct lifconf *lifcp, char **buf,
* int64_t lifc_flags)
*
@@ -2109,130 +2059,217 @@ setiftoken(char *addr, int64_t param)
return (0);
}
-/*
- * Return value: 0 on success, -1 on failure.
- */
-static int
-connect_to_mpathd(int family)
-{
- int s;
- struct sockaddr_storage ss;
- struct sockaddr_in *sin = (struct sockaddr_in *)&ss;
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss;
- struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
- int addrlen;
- int ret;
- int on;
-
- s = socket(family, SOCK_STREAM, 0);
- if (s < 0) {
- Perror0_exit("connect_to_mpathd: socket");
- }
- (void) bzero((char *)&ss, sizeof (ss));
- ss.ss_family = family;
- /*
- * Need to bind to a privileged port. For non-root, this
- * will fail. in.mpathd verifies that only commands coming
- * from privileged ports succeed so that ordinary users
- * can't connect and start talking to in.mpathd
- */
- on = 1;
- if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on,
- sizeof (on)) < 0) {
- Perror0_exit("connect_to_mpathd: setsockopt");
- }
- switch (family) {
- case AF_INET:
- sin->sin_port = 0;
- sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
- addrlen = sizeof (struct sockaddr_in);
- break;
- case AF_INET6:
- sin6->sin6_port = 0;
- sin6->sin6_addr = loopback_addr;
- addrlen = sizeof (struct sockaddr_in6);
- break;
- }
- ret = bind(s, (struct sockaddr *)&ss, addrlen);
- if (ret != 0) {
- (void) close(s);
- return (-1);
- }
-
- switch (family) {
- case AF_INET:
- sin->sin_port = htons(MPATHD_PORT);
- break;
- case AF_INET6:
- sin6->sin6_port = htons(MPATHD_PORT);
- break;
- }
- ret = connect(s, (struct sockaddr *)&ss, addrlen);
- (void) close(s);
- return (ret);
-}
-
/* ARGSUSED */
static int
-setifgroupname(char *grpname, int64_t param)
+setifgroupname(char *grname, int64_t param)
{
+ lifgroupinfo_t lifgr;
+ struct lifreq lifrl;
+ ifaddrlistx_t *ifaddrp, *nextifaddrp;
+ ifaddrlistx_t *ifaddrs = NULL, *downaddrs = NULL;
+ int af;
+
if (debug) {
(void) printf("Setting groupname %s on interface %s\n",
- grpname, name);
- }
- (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
- (void) strncpy(lifr.lifr_groupname, grpname,
- sizeof (lifr.lifr_groupname));
- if (ioctl(s, SIOCSLIFGROUPNAME, (caddr_t)&lifr) < 0) {
- Perror0_exit("setifgroupname: SIOCSLIFGROUPNAME");
+ grname, name);
}
- /*
- * If the SUNW_NO_MPATHD environment variable is set then don't
- * bother starting up in.mpathd. See PSARC/2002/249 for the
- * depressing details on this bit of stupidity.
- */
- if (getenv("SUNW_NO_MPATHD") != NULL) {
- return (0);
+ (void) strlcpy(lifrl.lifr_name, name, LIFNAMSIZ);
+ (void) strlcpy(lifrl.lifr_groupname, grname, LIFGRNAMSIZ);
+
+ while (ioctl(s, SIOCSLIFGROUPNAME, &lifrl) == -1) {
+ switch (errno) {
+ case ENOENT:
+ /*
+ * The group doesn't yet exist; create it and repeat.
+ */
+ af = afp->af_af;
+ if (create_ipmp(grname, af, NULL, _B_TRUE) == -1) {
+ if (errno == EEXIST)
+ continue;
+
+ Perror2(grname, "cannot create IPMP group");
+ goto fail;
+ }
+ continue;
+
+ case EALREADY:
+ /*
+ * The interface is already in another group; must
+ * remove existing membership first.
+ */
+ lifrl.lifr_groupname[0] = '\0';
+ if (ioctl(s, SIOCSLIFGROUPNAME, &lifrl) == -1) {
+ Perror2(name, "cannot remove existing "
+ "IPMP group membership");
+ goto fail;
+ }
+ (void) strlcpy(lifrl.lifr_groupname, grname,
+ LIFGRNAMSIZ);
+ continue;
+
+ case EAFNOSUPPORT:
+ /*
+ * The group exists, but it's not configured with the
+ * address families the interface needs. Since only
+ * two address families are currently supported, just
+ * configure the "other" address family. Note that we
+ * may race with group deletion or creation by another
+ * process (ENOENT or EEXIST); in such cases we repeat
+ * our original SIOCSLIFGROUPNAME.
+ */
+ (void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ);
+ if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == -1) {
+ if (errno == ENOENT)
+ continue;
+
+ Perror2(grname, "SIOCGLIFGROUPINFO");
+ goto fail;
+ }
+
+ af = lifgr.gi_v4 ? AF_INET6 : AF_INET;
+ if (create_ipmp(grname, af, lifgr.gi_grifname,
+ _B_TRUE) == -1) {
+ if (errno == EEXIST)
+ continue;
+
+ Perror2(grname, "cannot configure IPMP group");
+ goto fail;
+ }
+ continue;
+
+ case EADDRINUSE:
+ /*
+ * Some addresses are in-use (or under control of DAD).
+ * Bring them down and retry the group join operation.
+ * We will bring them back up after the interface has
+ * been placed in the group.
+ */
+ if (ifaddrlistx(lifrl.lifr_name, IFF_UP|IFF_DUPLICATE,
+ 0, &ifaddrs) == -1) {
+ Perror2(grname, "cannot get address list");
+ goto fail;
+ }
+
+ ifaddrp = ifaddrs;
+ for (; ifaddrp != NULL; ifaddrp = nextifaddrp) {
+ if (!ifaddr_down(ifaddrp)) {
+ ifaddrs = ifaddrp;
+ goto fail;
+ }
+ nextifaddrp = ifaddrp->ia_next;
+ ifaddrp->ia_next = downaddrs;
+ downaddrs = ifaddrp;
+ }
+ ifaddrs = NULL;
+ continue;
+
+ case EADDRNOTAVAIL: {
+ /*
+ * Some data addresses are under application control.
+ * For some of these (e.g., ADDRCONF), the application
+ * should remove the address, in which case we retry a
+ * few times (since the application's action is not
+ * atomic with respect to us) before bailing out and
+ * informing the user.
+ */
+ int ntries, nappaddr = 0;
+ const if_appflags_t *iap = if_appflags_tbl;
+
+ for (; iap->ia_app != NULL; iap++) {
+ ntries = 0;
+again:
+ if (ifaddrlistx(lifrl.lifr_name, iap->ia_flag,
+ IFF_NOFAILOVER, &ifaddrs) == -1) {
+ (void) fprintf(stderr, "ifconfig: %s: "
+ "cannot get data addresses managed "
+ "by %s\n", lifrl.lifr_name,
+ iap->ia_app);
+ goto fail;
+ }
+
+ if (ifaddrs == NULL)
+ continue;
+
+ ifaddrlistx_free(ifaddrs);
+ ifaddrs = NULL;
+
+ if (++ntries < iap->ia_tries) {
+ (void) poll(NULL, 0, 100);
+ goto again;
+ }
+
+ (void) fprintf(stderr, "ifconfig: cannot join "
+ "IPMP group: %s has data addresses managed "
+ "by %s\n", lifrl.lifr_name, iap->ia_app);
+ nappaddr++;
+ }
+ if (nappaddr > 0)
+ goto fail;
+ continue;
+ }
+ default:
+ Perror2(name, "SIOCSLIFGROUPNAME");
+ goto fail;
+ }
}
/*
- * Try to connect to in.mpathd using IPv4. If we succeed,
- * we conclude that in.mpathd is running, and quit.
+ * If there were addresses that we had to bring down, it's time to
+ * bring them up again. As part of bringing them up, the kernel will
+ * automatically move them to the new IPMP interface.
*/
- if (connect_to_mpathd(AF_INET) == 0) {
- /* connect succeeded, mpathd is already running */
- return (0);
+ for (ifaddrp = downaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) {
+ if (!ifaddr_up(ifaddrp) && errno != ENXIO) {
+ (void) fprintf(stderr, "ifconfig: cannot bring back up "
+ "%s: %s\n", ifaddrp->ia_name, strerror(errno));
+ }
}
+ ifaddrlistx_free(downaddrs);
+ return (0);
+fail:
/*
- * Try to connect to in.mpathd using IPv6. If we succeed,
- * we conclude that in.mpathd is running, and quit.
+ * Attempt to bring back up any interfaces that we downed.
*/
- if (connect_to_mpathd(AF_INET6) == 0) {
- /* connect succeeded, mpathd is already running */
- return (0);
+ for (ifaddrp = downaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) {
+ if (!ifaddr_up(ifaddrp) && errno != ENXIO) {
+ (void) fprintf(stderr, "ifconfig: cannot bring back up "
+ "%s: %s\n", ifaddrp->ia_name, strerror(errno));
+ }
}
+ ifaddrlistx_free(downaddrs);
+ ifaddrlistx_free(ifaddrs);
/*
- * in.mpathd may not be running. Start it now. If it is already
- * running, in.mpathd will take care of handling multiple incarnations
- * of itself. ifconfig only tries to optimize performance by not
- * starting another incarnation of in.mpathd.
+ * We'd return -1, but foreachinterface() doesn't propagate the error
+ * into the exit status, so we're forced to explicitly exit().
*/
- switch (fork()) {
+ exit(1);
+ /* NOTREACHED */
+}
- case -1:
- Perror0_exit("setifgroupname: fork");
- /* NOTREACHED */
- case 0:
- (void) execl(MPATHD_PATH, MPATHD_PATH, NULL);
- _exit(1);
- /* NOTREACHED */
- default:
- return (0);
+/*
+ * Check whether STREAMS module operations may be performed on `ifname'.
+ * Used by modlist() and modop() before they touch the interface's streams.
+ *
+ * Copies `ifname' into the shared `lifr' (callers rely on lifr.lifr_name
+ * being set afterwards) and fetches the interface flags with SIOCGLIFFLAGS.
+ * Returns _B_FALSE, after printing a diagnostic, if the ioctl fails or if
+ * the interface has IFF_IPMP or IFF_VIRTUAL set; returns _B_TRUE otherwise.
+ */
+static boolean_t
+modcheck(const char *ifname)
+{
+ (void) strlcpy(lifr.lifr_name, ifname, sizeof (lifr.lifr_name));
+
+ if (ioctl(s, SIOCGLIFFLAGS, &lifr) < 0) {
+ Perror0("SIOCGLIFFLAGS");
+ return (_B_FALSE);
}
-}
+ if (lifr.lifr_flags & IFF_IPMP) {
+ (void) fprintf(stderr, "ifconfig: %s: module operations not"
+ " supported on IPMP interfaces\n", ifname);
+ return (_B_FALSE);
+ }
+ if (lifr.lifr_flags & IFF_VIRTUAL) {
+ (void) fprintf(stderr, "ifconfig: %s: module operations not"
+ " supported on virtual IP interfaces\n", ifname);
+ return (_B_FALSE);
+ }
+ return (_B_TRUE);
+}
/*
* To list all the modules above a given network interface.
@@ -2250,7 +2287,13 @@ modlist(char *null, int64_t param)
struct str_list strlist;
int orig_arpid;
- (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
+ /*
+ * We'd return -1, but foreachinterface() doesn't propagate the error
+ * into the exit status, so we're forced to explicitly exit().
+ */
+ if (!modcheck(name))
+ exit(1);
+
if (ip_domux2fd(&muxfd, &muxid_fd, &ipfd_lowstr, &arpfd_lowstr,
&orig_arpid) < 0) {
return (-1);
@@ -2354,8 +2397,8 @@ open_arp_on_udp(char *udp_dev_name)
* Return:
* -1 if operation fails, 0 otherwise.
*
- * Please see the big block comment above plumb_one_device()
- * for the logic of the PLINK/PUNLINK
+ * Please see the big block comment above ifplumb() for the logic of the
+ * PLINK/PUNLINK
*/
static int
ip_domux2fd(int *muxfd, int *muxid_fd, int *ipfd_lowstr, int *arpfd_lowstr,
@@ -2467,8 +2510,8 @@ ip_domux2fd(int *muxfd, int *muxid_fd, int *ipfd_lowstr, int *arpfd_lowstr,
* Return:
* -1 if operation fails, 0 otherwise.
*
- * Please see the big block comment above plumb_one_device()
- * for the logic of the PLINK/PUNLINK
+ * Please see the big block comment above ifplumb() for the logic of the
+ * PLINK/PUNLINK
*/
static int
ip_plink(int muxfd, int muxid_fd, int ipfd_lowstr, int arpfd_lowstr,
@@ -2530,7 +2573,12 @@ modop(char *arg, char op)
char *arg_str;
int orig_arpid;
- (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
+ /*
+ * We'd return -1, but foreachinterface() doesn't propagate the error
+ * into the exit status, so we're forced to explicitly exit().
+ */
+ if (!modcheck(name))
+ exit(1);
/* Need to save the original string for -a option. */
if ((arg_str = malloc(strlen(arg) + 1)) == NULL) {
@@ -3067,13 +3115,14 @@ status(void)
static int
configinfo(char *null, int64_t param)
{
+ char *cp;
struct afswtch *p = afp;
uint64_t flags;
- char phydevname[LIFNAMSIZ];
+ char lifname[LIFNAMSIZ];
char if_usesrc_name[LIFNAMSIZ];
- char *cp;
(void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
+
if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0) {
Perror0_exit("status: SIOCGLIFFLAGS");
}
@@ -3084,13 +3133,13 @@ configinfo(char *null, int64_t param)
name, flags, p != NULL ? p->af_af : -1);
}
- /* remove LIF component */
- (void) strncpy(phydevname, name, sizeof (phydevname));
- cp = strchr(phydevname, ':');
- if (cp) {
- *cp = 0;
- }
- phydevname[sizeof (phydevname) - 1] = '\0';
+ /*
+ * Build the interface name to print (we can't directly use `name'
+ * because one cannot "plumb" ":0" interfaces).
+ */
+ (void) strlcpy(lifname, name, LIFNAMSIZ);
+ if ((cp = strchr(lifname, ':')) != NULL && atoi(cp + 1) == 0)
+ *cp = '\0';
/*
* if the interface is IPv4
@@ -3105,7 +3154,7 @@ configinfo(char *null, int64_t param)
if (v4compat)
flags &= ~IFF_IPV4;
- (void) printf("%s inet plumb", phydevname);
+ (void) printf("%s inet plumb", lifname);
} else if (flags & IFF_IPV6) {
/*
* else if the interface is IPv6
@@ -3117,7 +3166,7 @@ configinfo(char *null, int64_t param)
if (v4compat)
return (-1);
- (void) printf("%s inet6 plumb", phydevname);
+ (void) printf("%s inet6 plumb", lifname);
}
(void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
@@ -3131,8 +3180,8 @@ configinfo(char *null, int64_t param)
ioctl(s, SIOCGLIFMTU, (caddr_t)&lifr) >= 0)
(void) printf(" mtu %d", lifr.lifr_metric);
- /* don't print index when in compatibility mode */
- if (!v4compat) {
+ /* Index only applies to the zeroth interface */
+ if (lifnum(name) == 0) {
if (ioctl(s, SIOCGLIFINDEX, (caddr_t)&lifr) >= 0)
(void) printf(" index %d", lifr.lifr_index);
}
@@ -3162,7 +3211,6 @@ configinfo(char *null, int64_t param)
}
(void) printf("\n");
-
return (0);
}
@@ -3398,15 +3446,11 @@ in_status(int force, uint64_t flags)
inet_ntoa(sin->sin_addr));
}
}
- /* If there is a groupname, print it for lun 0 alone */
+ /* If there is a groupname, print it for only the physical interface */
if (strchr(name, ':') == NULL) {
- (void) memset(lifr.lifr_groupname, 0,
- sizeof (lifr.lifr_groupname));
- if (ioctl(s, SIOCGLIFGROUPNAME, (caddr_t)&lifr) >= 0) {
- if (strlen(lifr.lifr_groupname) > 0) {
- (void) printf("\n\tgroupname %s",
- lifr.lifr_groupname);
- }
+ if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) >= 0 &&
+ lifr.lifr_groupname[0] != '\0') {
+ (void) printf("\n\tgroupname %s", lifr.lifr_groupname);
}
}
(void) putchar('\n');
@@ -3550,11 +3594,7 @@ in_configinfo(int force, uint64_t flags)
Perror0_exit("in_configinfo: SIOCGLIFADDR");
}
sin = (struct sockaddr_in *)&lifr.lifr_addr;
- if (strchr(name, ':') != NULL) {
- (void) printf(" addif %s ", inet_ntoa(sin->sin_addr));
- } else {
- (void) printf(" set %s ", inet_ntoa(sin->sin_addr));
- }
+ (void) printf(" set %s ", inet_ntoa(sin->sin_addr));
laddr = sin;
}
@@ -3614,8 +3654,8 @@ in_configinfo(int force, uint64_t flags)
}
}
- /* If there is a groupname, print it for only the physical interface */
- if (strchr(name, ':') == NULL) {
+ /* If there is a groupname, print it for only the zeroth interface */
+ if (lifnum(name) == 0) {
if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) >= 0 &&
lifr.lifr_groupname[0] != '\0') {
(void) printf(" group %s ", lifr.lifr_groupname);
@@ -3623,12 +3663,7 @@ in_configinfo(int force, uint64_t flags)
}
/* Print flags to configure */
- print_config_flags(flags);
-
- /* IFF_NOARP applies to AF_INET only */
- if (flags & IFF_NOARP) {
- (void) printf("-arp ");
- }
+ print_config_flags(AF_INET, flags);
}
static void
@@ -3657,17 +3692,9 @@ in6_configinfo(int force, uint64_t flags)
Perror0_exit("in6_configinfo: SIOCGLIFADDR");
}
sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
- if (strchr(name, ':') != NULL) {
- (void) printf(" addif %s/%d ",
- inet_ntop(AF_INET6, (void *)&sin6->sin6_addr,
- abuf, sizeof (abuf)),
- lifr.lifr_addrlen);
- } else {
- (void) printf(" set %s/%d ",
- inet_ntop(AF_INET6, (void *)&sin6->sin6_addr,
- abuf, sizeof (abuf)),
- lifr.lifr_addrlen);
- }
+ (void) printf(" set %s/%d ",
+ inet_ntop(AF_INET6, &sin6->sin6_addr, abuf, sizeof (abuf)),
+ lifr.lifr_addrlen);
laddr6 = sin6;
}
(void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
@@ -3720,8 +3747,8 @@ in6_configinfo(int force, uint64_t flags)
lifr.lifr_addrlen);
}
- /* If there is a groupname, print it for only the physical interface */
- if (strchr(name, ':') == NULL) {
+ /* If there is a groupname, print it for only the zeroth interface */
+ if (lifnum(name) == 0) {
if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) >= 0 &&
lifr.lifr_groupname[0] != '\0') {
(void) printf(" group %s ", lifr.lifr_groupname);
@@ -3729,12 +3756,7 @@ in6_configinfo(int force, uint64_t flags)
}
/* Print flags to configure */
- print_config_flags(flags);
-
- /* IFF_NONUD applies to AF_INET6 only */
- if (flags & IFF_NONUD) {
- (void) printf("-nud ");
- }
+ print_config_flags(AF_INET6, flags);
}
/*
@@ -3768,31 +3790,41 @@ in6_configinfo(int force, uint64_t flags)
* compatibility for other utilities like atmifconfig etc. In this case
* the utility must use SIOCSLIFMUXID.
*/
-static void
-plumb_one_device(int af)
+static int
+ifplumb(const char *linkname, const char *ifname, boolean_t genppa, int af)
{
int arp_muxid = -1, ip_muxid;
int mux_fd, ip_fd, arp_fd;
int retval;
- uint_t ppa;
char *udp_dev_name;
- char provider[DLPI_LINKNAME_MAX];
+ uint64_t flags;
+ uint_t dlpi_flags;
dlpi_handle_t dh_arp, dh_ip;
/*
- * We use DLPI_NOATTACH because the ip module will do the attach
- * itself for DLPI style-2 devices.
+ * Always dlpi_open() with DLPI_NOATTACH because the IP and ARP module
+ * will do the attach themselves for DLPI style-2 links.
*/
- retval = dlpi_open(name, &dh_ip, DLPI_NOATTACH);
- if (retval != DLPI_SUCCESS)
- Perrdlpi_exit("cannot open link", name, retval);
+ dlpi_flags = DLPI_NOATTACH;
- if ((retval = dlpi_parselink(name, provider, &ppa)) != DLPI_SUCCESS)
- Perrdlpi_exit("dlpi_parselink", name, retval);
+ /*
+ * If `linkname' is the special token IPMPSTUB, then this is a request
+ * to create an IPMP interface atop /dev/ipmpstub0. (We can't simply
+ * pass "ipmpstub0" as `linkname' since an admin *could* have a normal
+ * vanity-named link named "ipmpstub0" that they'd like to plumb.)
+ */
+ if (linkname == IPMPSTUB) {
+ linkname = "ipmpstub0";
+ dlpi_flags |= DLPI_DEVONLY;
+ }
+
+ retval = dlpi_open(linkname, &dh_ip, dlpi_flags);
+ if (retval != DLPI_SUCCESS)
+ Perrdlpi_exit("cannot open link", linkname, retval);
if (debug) {
- (void) printf("ifconfig: plumb_one_device: provider %s,"
- " ppa %u\n", provider, ppa);
+ (void) printf("ifconfig: ifplumb: link %s, ifname %s, "
+ "genppa %u\n", linkname, ifname, genppa);
}
ip_fd = dlpi_fd(dh_ip);
@@ -3812,29 +3844,106 @@ plumb_one_device(int af)
Perror2_exit("I_PUSH", ARP_MOD_NAME);
/*
- * Set IFF_IPV4/IFF_IPV6 flags.
- * At this point in time the kernel also allows an
- * override of the CANTCHANGE flags.
+ * Prepare to set IFF_IPV4/IFF_IPV6 flags as part of SIOCSLIFNAME.
+ * (At this point in time the kernel also allows an override of the
+ * IFF_CANTCHANGE flags.)
*/
lifr.lifr_name[0] = '\0';
if (ioctl(ip_fd, SIOCGLIFFLAGS, (char *)&lifr) == -1)
- Perror0_exit("plumb_one_device: SIOCGLIFFLAGS");
+ Perror0_exit("ifplumb: SIOCGLIFFLAGS");
- /* Set the name string and the IFF_IPV* flag */
if (af == AF_INET6) {
- lifr.lifr_flags |= IFF_IPV6;
- lifr.lifr_flags &= ~(IFF_BROADCAST | IFF_IPV4);
+ flags = lifr.lifr_flags | IFF_IPV6;
+ flags &= ~(IFF_BROADCAST | IFF_IPV4);
} else {
- lifr.lifr_flags |= IFF_IPV4;
- lifr.lifr_flags &= ~IFF_IPV6;
+ flags = lifr.lifr_flags | IFF_IPV4;
+ flags &= ~IFF_IPV6;
}
- /* record the device and module names as interface name */
- lifr.lifr_ppa = ppa;
- (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
+ /*
+ * Set the interface name. If we've been asked to generate the PPA,
+ * then find the lowest available PPA (only currently used for IPMP
+ * interfaces). Otherwise, use the interface name as-is.
+ */
+ if (genppa) {
+ uint_t ppa; /* unsigned: same type as the UINT_MAX bound */
+
+ /*
+ * We'd like to just set lifr_ppa to UINT_MAX and have the
+ * kernel pick a PPA. Unfortunately, that would mishandle
+ * two cases:
+ *
+ * 1. If the PPA is available but the groupname is taken
+ * (e.g., the "ipmp2" IP interface name is available
+ * but the "ipmp2" groupname is taken) then the
+ * auto-assignment by the kernel will fail.
+ *
+ * 2. If we're creating (e.g.) an IPv6-only IPMP
+ * interface, and there's already an IPv4-only IPMP
+ * interface, the kernel will allow us to accidentally
+ * reuse the IPv6 IPMP interface name (since
+ * SIOCSLIFNAME uniqueness is per-interface-type).
+ * This will cause administrative confusion.
+ *
+ * Thus, we instead take a brute-force approach of checking
+ * whether the IPv4 or IPv6 name is already in-use before
+ * attempting the SIOCSLIFNAME. As per (1) above, the
+ * SIOCSLIFNAME may still fail, in which case we just proceed
+ * to the next one. If this approach becomes too slow, we
+ * can add a new SIOC* to handle this case in the kernel.
+ */
+ for (ppa = 0; ppa < UINT_MAX; ppa++) {
+ (void) snprintf(lifr.lifr_name, LIFNAMSIZ, "%s%u",
+ ifname, ppa);
+
+ if (ioctl(s4, SIOCGLIFFLAGS, &lifr) != -1 ||
+ errno != ENXIO)
+ continue;
+
+ if (ioctl(s6, SIOCGLIFFLAGS, &lifr) != -1 ||
+ errno != ENXIO)
+ continue;
+
+ lifr.lifr_ppa = ppa;
+ lifr.lifr_flags = flags;
+ retval = ioctl(ip_fd, SIOCSLIFNAME, &lifr);
+ if (retval != -1 || errno != EEXIST)
+ break;
+ }
+ } else {
+ ifspec_t ifsp;
+
+ /*
+ * The interface name could have come from the command-line;
+ * check it.
+ */
+ if (!ifparse_ifspec(ifname, &ifsp) || ifsp.ifsp_lunvalid)
+ Perror2_exit("invalid IP interface name", ifname);
+
+ /*
+ * Before we call SIOCSLIFNAME, ensure that the IPMP group
+ * interface for this address family exists. Otherwise, the
+ * kernel will kick the interface out of the group when we do
+ * the SIOCSLIFNAME.
+ *
+ * Example: suppose bge0 is plumbed for IPv4 and in group "a".
+ * If we're now plumbing bge0 for IPv6, but the IPMP group
+ * interface for "a" is not plumbed for IPv6, the SIOCSLIFNAME
+ * will kick bge0 out of group "a", which is undesired.
+ */
+ if (create_ipmp_peer(af, ifname) == -1) {
+ (void) fprintf(stderr, "ifconfig: warning: cannot "
+ "create %s IPMP group; %s will be removed from "
+ "group\n", af == AF_INET ? "IPv4" : "IPv6", ifname);
+ }
- /* set the interface name */
- if (ioctl(ip_fd, SIOCSLIFNAME, (char *)&lifr) == -1) {
+ lifr.lifr_ppa = ifsp.ifsp_ppa;
+ lifr.lifr_flags = flags;
+ (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ);
+ retval = ioctl(ip_fd, SIOCSLIFNAME, &lifr);
+ }
+
+ if (retval == -1) {
if (errno != EEXIST)
Perror0_exit("SIOCSLIFNAME for ip");
/*
@@ -3847,15 +3956,15 @@ plumb_one_device(int af)
* called for EEXIST.
*/
Perror0("SIOCSLIFNAME for ip");
- return;
+ return (-1);
}
/* Get the full set of existing flags for this stream */
if (ioctl(ip_fd, SIOCGLIFFLAGS, (char *)&lifr) == -1)
- Perror0_exit("plumb_one_device: SIOCFLIFFLAGS");
+ Perror0_exit("ifplumb: SIOCGLIFFLAGS");
if (debug) {
- (void) printf("ifconfig: plumb_one_device: %s got flags:\n",
+ (void) printf("ifconfig: ifplumb: %s got flags:\n",
lifr.lifr_name);
print_flags(lifr.lifr_flags);
(void) putchar('\n');
@@ -3890,7 +3999,7 @@ plumb_one_device(int af)
if ((ip_muxid = ioctl(mux_fd, I_PLINK, ip_fd)) == -1)
Perror0_exit("I_PLINK for ip");
(void) close(mux_fd);
- return;
+ return (lifr.lifr_ppa);
}
/*
@@ -3901,15 +4010,11 @@ plumb_one_device(int af)
* only on the interface stream, not on the ARP stream.
*/
if (debug)
- (void) printf("ifconfig: plumb_one_device: ifname: %s\n", name);
+ (void) printf("ifconfig: ifplumb: interface %s", ifname);
- /*
- * We use DLPI_NOATTACH because the arp module will do the attach
- * itself for DLPI style-2 devices.
- */
- retval = dlpi_open(name, &dh_arp, DLPI_NOATTACH);
+ retval = dlpi_open(linkname, &dh_arp, dlpi_flags);
if (retval != DLPI_SUCCESS)
- Perrdlpi_exit("cannot open link", name, retval);
+ Perrdlpi_exit("cannot open link", linkname, retval);
arp_fd = dlpi_fd(dh_arp);
if (ioctl(arp_fd, I_PUSH, ARP_MOD_NAME) == -1)
@@ -3919,16 +4024,13 @@ plumb_one_device(int af)
* Tell ARP the name and unit number for this interface.
* Note that arp has no support for transparent ioctls.
*/
- if (strioctl(arp_fd, SIOCSLIFNAME, (char *)&lifr,
- sizeof (lifr)) == -1) {
+ if (strioctl(arp_fd, SIOCSLIFNAME, &lifr, sizeof (lifr)) == -1) {
if (errno != EEXIST)
Perror0_exit("SIOCSLIFNAME for arp");
Perror0("SIOCSLIFNAME for arp");
- dlpi_close(dh_arp);
- dlpi_close(dh_ip);
- (void) close(mux_fd);
- return;
+ goto out;
}
+
/*
* PLINK the IP and ARP streams so that ifconfig can exit
* without tearing down the stream.
@@ -3942,12 +4044,13 @@ plumb_one_device(int af)
if (debug)
(void) printf("arp muxid = %d\n", arp_muxid);
+out:
dlpi_close(dh_ip);
dlpi_close(dh_arp);
(void) close(mux_fd);
+ return (lifr.lifr_ppa);
}
-
/*
* If this is a physical interface then remove it.
* If it is a logical interface name use SIOCLIFREMOVEIF to
@@ -3965,6 +4068,7 @@ inetunplumb(char *arg, int64_t param)
uint64_t flags;
boolean_t changed_arp_muxid = _B_FALSE;
int save_errno;
+ boolean_t v6 = (afp->af_af == AF_INET6);
strptr = strchr(name, ':');
if (strptr != NULL || strcmp(name, LOOPBACK_IF) == 0) {
@@ -3986,7 +4090,7 @@ inetunplumb(char *arg, int64_t param)
* We used /dev/udp or udp6 to set up the mux. So we have to use
* the same now for PUNLINK also.
*/
- if (afp->af_af == AF_INET6)
+ if (v6)
udp_dev_name = UDP6_DEV_NAME;
else
udp_dev_name = UDP_DEV_NAME;
@@ -4002,6 +4106,50 @@ inetunplumb(char *arg, int64_t param)
Perror0_exit("unplumb: SIOCGLIFFLAGS");
}
flags = lifr.lifr_flags;
+
+ if (flags & IFF_IPMP) {
+ lifgroupinfo_t lifgr;
+ ifaddrlistx_t *ifaddrs, *ifaddrp;
+
+ /*
+ * The kernel will fail the I_PUNLINK if the group still has
+ * members, but check now to provide a better error message.
+ */
+ if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) == -1)
+ Perror0_exit("unplumb: SIOCGLIFGROUPNAME");
+
+ (void) strlcpy(lifgr.gi_grname, lifr.lifr_groupname,
+ LIFGRNAMSIZ);
+ if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == -1)
+ Perror0_exit("unplumb: SIOCGLIFGROUPINFO");
+
+ if ((v6 && lifgr.gi_nv6 != 0) || (!v6 && lifgr.gi_nv4 != 0)) {
+ (void) fprintf(stderr, "ifconfig: %s: cannot unplumb:"
+ " IPMP group is not empty\n", name);
+ exit(1);
+ }
+
+ /*
+ * The kernel will fail the I_PUNLINK if the IPMP interface
+ * has administratively up addresses; bring 'em down.
+ */
+ if (ifaddrlistx(name, IFF_UP|IFF_DUPLICATE, 0, &ifaddrs) == -1)
+ Perror2_exit(name, "cannot get address list");
+
+ ifaddrp = ifaddrs;
+ for (; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) {
+ if (((ifaddrp->ia_flags & IFF_IPV6) && !v6) ||
+ (!(ifaddrp->ia_flags & IFF_IPV6) && v6))
+ continue;
+
+ if (!ifaddr_down(ifaddrp)) {
+ Perror2_exit(ifaddrp->ia_name,
+ "cannot bring down");
+ }
+ }
+ ifaddrlistx_free(ifaddrs);
+ }
+
if (ioctl(muxid_fd, SIOCGLIFMUXID, (caddr_t)&lifr) < 0) {
Perror0_exit("unplumb: SIOCGLIFMUXID");
}
@@ -4098,12 +4246,6 @@ inetplumb(char *arg, int64_t param)
Perror2_exit("plumb: SIOCLIFADDIF", name);
}
}
- /*
- * IP can create the new logical interface on a different
- * physical interface in the same IPMP group. Take the new
- * interface into account for further operations.
- */
- (void) strncpy(name, lifr.lifr_name, sizeof (name));
return (0);
}
@@ -4131,10 +4273,229 @@ inetplumb(char *arg, int64_t param)
if (debug)
(void) printf("inetplumb: %s af %d\n", name, afp->af_af);
- plumb_one_device(afp->af_af);
+ (void) ifplumb(name, name, _B_FALSE, afp->af_af);
+ return (0);
+}
+
+/*
+ * Create an IPMP interface named by the global `name' for the address
+ * family in the global `afp' (presumably the handler for the `ipmp'
+ * subcommand listed in usage() -- confirm against the command table).
+ * A name containing ':' is handed to inetplumb() instead.  `arg' and `param'
+ * are only forwarded to inetplumb().  Exits with status 1 if the chosen
+ * operation returns -1; otherwise returns 0.
+ */
+/* ARGSUSED */
+static int
+inetipmp(char *arg, int64_t param)
+{
+ int retval;
+
+ /*
+ * Treat e.g. "ifconfig ipmp0:2 ipmp" as "ifconfig ipmp0:2 plumb".
+ * Otherwise, try to create the requested IPMP interface.
+ */
+ if (strchr(name, ':') != NULL)
+ retval = inetplumb(arg, param);
+ else
+ retval = create_ipmp(name, afp->af_af, name, _B_FALSE);
+
+ /*
+ * We'd return -1, but foreachinterface() doesn't propagate the error
+ * into the exit status, so we're forced to explicitly exit().
+ */
+ if (retval == -1)
+ exit(1);
return (0);
}
+/*
+ * Plumb an IPMP interface for address family `af' and make sure it belongs
+ * to group `grname'.  A non-NULL `ifname' names the interface explicitly;
+ * with a NULL `ifname' the interface is named ipmpN, where N is the lowest
+ * available integer.  `implicit' means the group is being created as a
+ * side-effect of placing an underlying interface in a group.  in.mpathd is
+ * started the first time this function gets that far.  Returns 0 on success
+ * and -1 on failure.
+ */
+static int
+create_ipmp(const char *grname, int af, const char *ifname, boolean_t implicit)
+{
+	static int	ipmp_daemon_started;
+	boolean_t	named = (ifname != NULL);
+	int		ppa;
+
+	if (debug) {
+		(void) printf("create_ipmp: ifname %s grname %s af %d\n",
+		    named ? ifname : "NULL", grname, af);
+	}
+
+	ppa = named ? ifplumb(IPMPSTUB, ifname, _B_FALSE, af) :
+	    ifplumb(IPMPSTUB, "ipmp", _B_TRUE, af);
+	if (ppa == -1) {
+		Perror2(grname, "cannot create IPMP interface");
+		return (-1);
+	}
+
+	/*
+	 * Load the name of the interface we just plumbed into the global
+	 * lifr; the ioctls below all operate on it.
+	 */
+	if (named)
+		(void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ);
+	else
+		(void) snprintf(lifr.lifr_name, LIFNAMSIZ, "ipmp%d", ppa);
+
+	/*
+	 * For backward-compatibility, implicitly-created IPv6 IPMP
+	 * interfaces always get their link-local address brought up.
+	 * Failures here are deliberately ignored.
+	 */
+	if (implicit && af == AF_INET6 &&
+	    ioctl(s6, SIOCGLIFFLAGS, &lifr) == 0) {
+		lifr.lifr_flags |= IFF_UP;
+		(void) ioctl(s6, SIOCSLIFFLAGS, &lifr);
+	}
+
+	/*
+	 * A SIOCSLIFGROUPNAME is only issued when the requested group name
+	 * differs from the interface name.
+	 */
+	if (strcmp(lifr.lifr_name, grname) != 0) {
+		(void) strlcpy(lifr.lifr_groupname, grname, LIFGRNAMSIZ);
+		if (ioctl(s, SIOCSLIFGROUPNAME, &lifr) == -1) {
+			Perror0("SIOCSLIFGROUPNAME");
+			return (-1);
+		}
+	}
+
+	/*
+	 * Make sure in.mpathd is running, but only try once per process.
+	 */
+	if (ipmp_daemon_started++ == 0)
+		start_ipmp_daemon();
+
+	return (0);
+}
+
+/*
+ * If `ifname' is plumbed and in an IPMP group on the address family other
+ * than `af', make sure a matching IPMP group interface exists for `af',
+ * creating it with create_ipmp() when necessary.  Returns 0 when there is
+ * nothing to do (including when the group lookups fail); otherwise returns
+ * the result of create_ipmp().
+ */
+static int
+create_ipmp_peer(int af, const char *ifname)
+{
+	lifgroupinfo_t	grinfo;
+	int		otherfd;
+
+	assert(af == AF_INET || af == AF_INET6);
+
+	/*
+	 * All queries go through the socket of the "other" address family.
+	 */
+	if (af == AF_INET)
+		otherfd = s6;
+	else
+		otherfd = s4;
+
+	(void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ);
+	if (ioctl(otherfd, SIOCGLIFGROUPNAME, &lifr) != 0)
+		return (0);
+
+	(void) strlcpy(grinfo.gi_grname, lifr.lifr_groupname, LIFGRNAMSIZ);
+	if (ioctl(otherfd, SIOCGLIFGROUPINFO, &grinfo) != 0)
+		return (0);
+
+	/*
+	 * Nothing to do if `ifname' *is* the IPMP group interface ...
+	 */
+	if (strcmp(grinfo.gi_grifname, ifname) == 0)
+		return (0);
+
+	/*
+	 * ... or if the group is already configured for `af'.
+	 */
+	if (af == AF_INET && grinfo.gi_v4)
+		return (0);
+	if (af == AF_INET6 && grinfo.gi_v6)
+		return (0);
+
+	return (create_ipmp(grinfo.gi_grname, af, grinfo.gi_grifname,
+	    _B_TRUE));
+}
+
+/*
+ * Start in.mpathd if it's not already running.
+ *
+ * The daemon is first pinged through a temporary libipmp handle.  If the
+ * handle cannot be created, a message is printed and no attempt is made to
+ * start the daemon.  If the ping succeeds there is nothing to do.  Any other
+ * ping result -- IPMP_ENOMPATHD, or an unexpected error (which is reported
+ * first) -- leads to a fork() and an exec of MPATHD_PATH in the child.  A
+ * fork() failure is fatal; the parent does not wait for the child.
+ */
+static void
+start_ipmp_daemon(void)
+{
+	int		retval;
+	ipmp_handle_t	ipmp_handle;
+
+	/*
+	 * Ping in.mpathd to see if it's running already.
+	 */
+	if ((retval = ipmp_open(&ipmp_handle)) != IPMP_SUCCESS) {
+		(void) fprintf(stderr, "ifconfig: cannot create IPMP handle: "
+		    "%s\n", ipmp_errmsg(retval));
+		return;
+	}
+
+	retval = ipmp_ping_daemon(ipmp_handle);
+	ipmp_close(ipmp_handle);
+
+	switch (retval) {
+	case IPMP_ENOMPATHD:
+		break;
+	case IPMP_SUCCESS:
+		return;
+	default:
+		/* Report the error, but still try to start the daemon. */
+		(void) fprintf(stderr, "ifconfig: cannot ping in.mpathd: %s\n",
+		    ipmp_errmsg(retval));
+		break;
+	}
+
+	/*
+	 * Start in.mpathd.  Note that in.mpathd will handle multiple
+	 * incarnations (ipmp_ping_daemon() is just an optimization) so we
+	 * don't need to worry about racing with another ifconfig process.
+	 */
+	switch (fork()) {
+	case -1:
+		Perror0_exit("start_ipmp_daemon: fork");
+		/* NOTREACHED */
+	case 0:
+		/*
+		 * execl() is variadic, so the terminating null pointer must
+		 * be passed with pointer type; a bare NULL may expand to a
+		 * plain integer constant.
+		 */
+		(void) execl(MPATHD_PATH, MPATHD_PATH, (char *)NULL);
+		_exit(1);
+		/* NOTREACHED */
+	default:
+		break;
+	}
+}
+
+/*
+ * Set (`up' true) or clear (`up' false) IFF_UP on the address named by
+ * `ifaddrp'.  The current flags are re-read from the kernel rather than
+ * taken from ia_flags, since the mutable bits there may be stale; only the
+ * address-family bit of ia_flags is used, to pick the socket.  Returns
+ * _B_TRUE if every ioctl succeeded.
+ */
+static boolean_t
+ifaddr_op(ifaddrlistx_t *ifaddrp, boolean_t up)
+{
+	struct lifreq	req;		/* private request, not the global */
+	int		sock;
+
+	if (ifaddrp->ia_flags & IFF_IPV4)
+		sock = s4;
+	else
+		sock = s6;
+
+	(void) memset(&req, 0, sizeof (req));
+	(void) strlcpy(req.lifr_name, ifaddrp->ia_name, LIFNAMSIZ);
+	if (ioctl(sock, SIOCGLIFFLAGS, &req) == -1)
+		return (_B_FALSE);
+
+	if (up) {
+		req.lifr_flags |= IFF_UP;
+		return (ioctl(sock, SIOCSLIFFLAGS, &req) == 0);
+	}
+
+	/*
+	 * Bringing down an IFF_DUPLICATE address is done by fetching the
+	 * address and setting it again.  This will cause IP to clear
+	 * IFF_DUPLICATE and stop the automatic recovery timer.
+	 */
+	if (req.lifr_flags & IFF_DUPLICATE) {
+		if (ioctl(sock, SIOCGLIFADDR, &req) == -1)
+			return (_B_FALSE);
+		return (ioctl(sock, SIOCSLIFADDR, &req) != -1);
+	}
+
+	req.lifr_flags &= ~IFF_UP;
+	return (ioctl(sock, SIOCSLIFFLAGS, &req) == 0);
+}
+
+/*
+ * Bring the address named by `ifaddrp' up; see ifaddr_op().
+ */
+static boolean_t
+ifaddr_up(ifaddrlistx_t *ifaddrp)
+{
+	boolean_t ok = ifaddr_op(ifaddrp, _B_TRUE);
+
+	return (ok);
+}
+
+/*
+ * Bring the address named by `ifaddrp' down; see ifaddr_op().
+ */
+static boolean_t
+ifaddr_down(ifaddrlistx_t *ifaddrp)
+{
+	boolean_t ok = ifaddr_op(ifaddrp, _B_FALSE);
+
+	return (ok);
+}
+
void
Perror0(const char *cmd)
{
@@ -4404,14 +4765,14 @@ print_flags(uint64_t flags)
}
static void
-print_config_flags(uint64_t flags)
+print_config_flags(int af, uint64_t flags)
{
- int cnt, i;
+ if_config_cmd_t *cmdp;
- cnt = sizeof (if_config_cmd_tbl) / sizeof (if_config_cmd_t);
- for (i = 0; i < cnt; i++) {
- if (flags & if_config_cmd_tbl[i].iff_flag) {
- (void) printf("%s ", if_config_cmd_tbl[i].iff_name);
+ for (cmdp = if_config_cmd_tbl; cmdp->iff_flag != 0; cmdp++) {
+ if ((flags & cmdp->iff_flag) &&
+ (cmdp->iff_af == AF_UNSPEC || cmdp->iff_af == af)) {
+ (void) printf("%s ", cmdp->iff_name);
}
}
}
@@ -4454,7 +4815,18 @@ in_getmask(struct sockaddr_in *saddr, boolean_t addr_set)
}
static int
-strioctl(int s, int cmd, char *buf, int buflen)
+lifnum(const char *ifname)
+{
+ const char *cp;
+
+ if ((cp = strchr(ifname, ':')) == NULL)
+ return (0);
+ else
+ return (atoi(cp + 1));
+}
+
+static int
+strioctl(int s, int cmd, void *buf, int buflen)
{
struct strioctl ioc;
@@ -4681,6 +5053,7 @@ usage(void)
"\t[ modlist ]\n"
"\t[ modinsert <module_name@position> ]\n"
"\t[ modremove <module_name@position> ]\n"
+ "\t[ ipmp ]\n"
"\t[ group <groupname>] | [ group \"\"]\n"
"\t[ deprecated | -deprecated ]\n"
"\t[ standby | -standby ]\n"
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h
index 0ac600001f..f11f4d0a94 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -11,8 +11,6 @@
#ifndef _IFCONFIG_H
#define _IFCONFIG_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -39,7 +37,6 @@ extern void Perrdlpi_exit(const char *, const char *, int);
extern int doifrevarp(const char *, struct sockaddr_in *);
-extern int dlpi_set_address(const char *, uchar_t *, uint_t);
extern void dlpi_print_address(const char *);
#ifdef __cplusplus
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c
index 725c8b24c3..aba4794942 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c
@@ -19,14 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "defs.h"
#include "ifconfig.h"
#include <sys/types.h>
@@ -88,6 +86,7 @@ doifrevarp(const char *linkname, struct sockaddr_in *laddr)
/* don't try to revarp if we know it won't work */
if ((lifr.lifr_flags & IFF_LOOPBACK) ||
(lifr.lifr_flags & IFF_NOARP) ||
+ (lifr.lifr_flags & IFF_IPMP) ||
(lifr.lifr_flags & IFF_POINTOPOINT)) {
(void) close(s);
return (0);
@@ -326,28 +325,6 @@ rarp_recv(dlpi_handle_t dh, struct arphdr *ans, size_t msglen,
return (DLPI_ETIMEDOUT);
}
-int
-dlpi_set_address(const char *linkname, uchar_t *physaddr, uint_t physaddrlen)
-{
- int retval;
- dlpi_handle_t dh;
-
- if ((retval = dlpi_open(linkname, &dh, 0)) != DLPI_SUCCESS) {
- Perrdlpi("dlpi_open failed", linkname, retval);
- return (-1);
- }
-
- if ((retval = dlpi_set_physaddr(dh, DL_CURR_PHYS_ADDR, physaddr,
- physaddrlen)) != DLPI_SUCCESS) {
- Perrdlpi("dlpi_set_physaddr failed", linkname, retval);
- dlpi_close(dh);
- return (-1);
- }
-
- dlpi_close(dh);
- return (0);
-}
-
void
dlpi_print_address(const char *linkname)
{
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h
index 900b5841ed..5cca3ecb2e 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h
+++ b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* Copyright (c) 1983, 1988, 1993
@@ -414,16 +414,9 @@ struct interface {
(IS_REMOTE|IS_PASSIVE))
/*
- * Is an IP interface up? Because of the way IPMP uses deprecated
- * interfaces, we need to check more than the IFF_UP and IFF_RUNNING
- * interface flags here. Basically, we do not want to use IFF_DEPRECATED
- * interfaces unless they are also IFF_STANDBY and not IFF_INACTIVE.
+ * Is an IP interface up?
*/
-#define IFF_GOOD (IFF_UP|IFF_RUNNING)
-#define IS_IFF_UP(f) \
- ((((f) & (IFF_GOOD|IFF_DEPRECATED)) == IFF_GOOD) || \
- (((f) & (IFF_GOOD|IFF_INACTIVE|IFF_STANDBY)) == \
- (IFF_GOOD|IFF_STANDBY)))
+#define IS_IFF_UP(f) (((f) & (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING))
/*
* This defines interfaces that we should not use for advertising or
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c
index 79ae02e703..a3a26ac2cb 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* Copyright (c) 1983, 1988, 1993
@@ -36,8 +36,6 @@
* $FreeBSD: src/sbin/routed/trace.c,v 1.6 2000/08/11 08:24:38 sheldonh Exp $
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "defs.h"
#include "pathnames.h"
#include <signal.h>
@@ -566,6 +564,7 @@ static struct bits if_bits[] = {
{ IFF_TEMPORARY, 0, "TEMPORARY" },
{ IFF_FIXEDMTU, 0, "FIXEDMTU" },
{ IFF_VIRTUAL, 0, "VIRTUAL"},
+ { IFF_IPMP, 0, "IPMP"},
{ 0, 0, NULL}
};
@@ -898,8 +897,8 @@ trace_upslot(struct rt_entry *rt,
print_rts(rts, 0, 0,
rts->rts_gate != new->rts_gate,
rts->rts_tag != new->rts_tag,
- rts != rt->rt_spares || AGE_RT(rt->rt_state,
- rts->rts_origin, rt->rt_ifp));
+ rts != rt->rt_spares ||
+ AGE_RT(rt->rt_state, rts->rts_origin, rt->rt_ifp));
(void) fprintf(ftrace, "\n %19s%-16s ", "",
(new->rts_gate != rts->rts_gate ?
@@ -1173,10 +1172,9 @@ trace_rip(const char *dir1, const char *dir2,
if (NA->a_type == RIP_AUTH_PW &&
n == msg->rip_nets) {
(void) fprintf(ftrace, "\tPassword"
- " Authentication:"
- " \"%s\"\n",
+ " Authentication: \"%s\"\n",
qstring(NA->au.au_pw,
- RIP_AUTH_PW_LEN));
+ RIP_AUTH_PW_LEN));
continue;
}
@@ -1186,13 +1184,12 @@ trace_rip(const char *dir1, const char *dir2,
"\tMD5 Auth"
" pkt_len=%d KeyID=%u"
" auth_len=%d"
- " seqno=%#lx"
- " rsvd=%#x,%#x\n",
+ " seqno=%#x"
+ " rsvd=%#hx,%#hx\n",
ntohs(NA->au.a_md5.md5_pkt_len),
NA->au.a_md5.md5_keyid,
NA->au.a_md5.md5_auth_len,
- (unsigned long)ntohl(NA->au.a_md5.
- md5_seqno),
+ ntohl(NA->au.a_md5.md5_seqno),
ntohs(NA->au.a_md5.rsvd[0]),
ntohs(NA->au.a_md5.rsvd[1]));
continue;
@@ -1217,14 +1214,12 @@ trace_rip(const char *dir1, const char *dir2,
inet_ntoa(tmp_mask));
} else if (msg->rip_vers == RIPv1) {
(void) fprintf(ftrace, "\t%-18s ",
- addrname(n->n_dst,
- ntohl(n->n_mask),
- n->n_mask == 0 ? 2 : 1));
+ addrname(n->n_dst, ntohl(n->n_mask),
+ n->n_mask == 0 ? 2 : 1));
} else {
(void) fprintf(ftrace, "\t%-18s ",
- addrname(n->n_dst,
- ntohl(n->n_mask),
- n->n_mask == 0 ? 2 : 0));
+ addrname(n->n_dst, ntohl(n->n_mask),
+ n->n_mask == 0 ? 2 : 0));
}
(void) fprintf(ftrace, "metric=%-2lu ",
(unsigned long)ntohl(n->n_metric));
@@ -1242,8 +1237,8 @@ trace_rip(const char *dir1, const char *dir2,
break;
case RIPCMD_TRACEON:
- (void) fprintf(ftrace, "\tfile=\"%.*s\"\n", size-4,
- msg->rip_tracefile);
+ (void) fprintf(ftrace, "\tfile=\"%.*s\"\n", size - 4,
+ msg->rip_tracefile);
break;
case RIPCMD_TRACEOFF:
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/Makefile
new file mode 100644
index 0000000000..a256cf5f49
--- /dev/null
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/Makefile
@@ -0,0 +1,48 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+# These are set before Makefile.cmd is included (presumably because it
+# consumes them -- confirm against Makefile.cmd).
+PROG = ipmpstat
+ROOTFS_PROG = $(PROG)
+# One link path under $(ROOTUSRSBIN) for each name in $(PROG).
+ROOTUSRSBINLINKS = $(PROG:%=$(ROOTUSRSBIN)/%)
+
+include $(SRC)/cmd/Makefile.cmd
+
+C99MODE = $(C99_ENABLE)
+LDLIBS += -lipmp -lsocket -lsysevent -lnvpair
+# $(PROG).xcl is passed via -x (presumably a message exclusion list for
+# the message-extraction tool -- confirm).
+XGETFLAGS += -a -x $(PROG).xcl
+
+.KEEP_STATE:
+
+all: $(PROG)
+
+# Installs $(ROOTSBINPROG) plus the $(ROOTUSRSBIN) link(s) that point at it.
+install: all $(ROOTSBINPROG) $(ROOTUSRSBINLINKS)
+
+# No recipe is given for clean in this file.
+clean:
+
+lint: lint_PROG
+
+# Replace any existing link with a relative symlink to ../../sbin/<name>.
+# The leading `-' makes make ignore a failure of this recipe line.
+$(ROOTUSRSBINLINKS):
+ -$(RM) $@; $(SYMLINK) ../../sbin/$(@F) $@
+
+include $(SRC)/cmd/Makefile.targ
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.c b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.c
new file mode 100644
index 0000000000..4620c34a24
--- /dev/null
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.c
@@ -0,0 +1,1498 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <alloca.h>
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <ipmp_admin.h>
+#include <ipmp_query.h>
+#include <libintl.h>
+#include <libnvpair.h>
+#include <libsysevent.h>
+#include <locale.h>
+#include <netdb.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/ipmp.h>
+#include <sys/sysmacros.h>
+#include <sys/termios.h>
+#include <sys/types.h>
+
+/*
+ * ipmpstat -- display IPMP subsystem status.
+ *
+ * This utility makes extensive use of libipmp and IPMP sysevents to gather
+ * and pretty-print the status of the IPMP subsystem. All output formats
+ * except for -p (probe) use libipmp to create a point-in-time snapshot of the
+ * IPMP subsystem (unless the test-special -L flag is used), and then output
+ * the contents of that snapshot in a user-specified manner. Because the
+ * output format and requested fields aren't known until run-time, three sets
+ * of function pointers and two core data structures are used. Specifically:
+ *
+ * * The ipmpstat_walker_t function pointers (walk_*) iterate through
+ * all instances of a given IPMP object (group, interface, or address).
+ * At most one ipmpstat_walker_t is used per ipmpstat invocation.
+ * Since target information is included with the interface information,
+ * both -i and -t use the interface walker (walk_if()).
+ *
+ * * The ipmpstat_sfunc_t function pointers (sfunc_*) obtain a given
+ * value for a given IPMP object. Each ipmpstat_sfunc_t is passed a
+ * buffer to write its result into, the buffer's size, and an
+ * ipmpstat_sfunc_arg_t state structure. The state structure consists
+ * of a pointer to the IPMP object to obtain information from
+ * (sa_data), and an open libipmp handle (sa_ih) which can be used to
+ * do additional libipmp queries, if necessary (e.g., because the
+ * object does not have all of the needed information).
+ *
+ * * The ipmpstat_field_t structure provides the list of supported fields
+ * for a given output format, along with output formatting information
+ * (e.g., field width), and a pointer to an ipmpstat_sfunc_t function
+ * that can obtain the value for a given IPMP object. For a given
+ * ipmpstat output format, there's a corresponding array of
+ * ipmpstat_field_t structures. Thus, one ipmpstat_field_t array is
+ * used per ipmpstat invocation.
+ *
+ * * The ipmpstat_ofmt_t provides an ordered list of the requested
+ * ipmpstat_field_t's (e.g., via -o) for a given ipmpstat invocation.
+ * It is built at runtime from the command-line arguments. This
+ * structure (and a given IPMP object) is used by ofmt_output() to
+ * output a single line of information about that IPMP object.
+ *
+ * * The ipmpstat_cbfunc_t function pointers (*_cbfunc) are called back
+ * by the walkers. They are used both internally to implement nested
+ * walks, and by the ipmpstat output logic to provide the glue between
+ * the IPMP object walkers and the ofmt_output() logic. Usually, a
+ * single line is output for each IPMP object, and thus ofmt_output()
+ * can be directly invoked (see info_output_cbfunc()). However, if
+ * multiple lines need to be output, then a more complex cbfunc is
+ * needed (see targinfo_output_cbfunc()). At most one cbfunc is used
+ * per ipmpstat invocation.
+ */
+
+/*
+ * Data type used by the sfunc callbacks to obtain the requested information
+ * from the agreed-upon object.
+ */
+typedef struct ipmpstat_sfunc_arg {
+ ipmp_handle_t sa_ih; /* open libipmp handle for extra queries */
+ void *sa_data; /* IPMP object to obtain the value from */
+} ipmpstat_sfunc_arg_t;
+
+/*
+ * An sfunc is passed the state structure, a buffer to write its result
+ * into, and the size of that buffer.
+ */
+typedef void ipmpstat_sfunc_t(ipmpstat_sfunc_arg_t *, char *, uint_t);
+
+/*
+ * Data type that describes how to output a field; used by ofmt_output*().
+ */
+typedef struct ipmpstat_field {
+ const char *f_name; /* field name */
+ uint_t f_width; /* output width */
+ ipmpstat_sfunc_t *f_sfunc; /* value->string function */
+} ipmpstat_field_t;
+
+/*
+ * Data type that specifies the output field order; used by ofmt_output*()
+ */
+typedef struct ipmpstat_ofmt {
+ const ipmpstat_field_t *o_field; /* current field info */
+ struct ipmpstat_ofmt *o_next; /* next field */
+} ipmpstat_ofmt_t;
+
+/*
+ * Function pointers used to iterate through IPMP objects.
+ */
+typedef void ipmpstat_cbfunc_t(ipmp_handle_t, void *, void *);
+typedef void ipmpstat_walker_t(ipmp_handle_t, ipmpstat_cbfunc_t *, void *);
+
+/*
+ * Data type used to implement nested walks.
+ */
+typedef struct ipmpstat_walkdata {
+ ipmpstat_cbfunc_t *iw_func; /* caller-specified callback */
+ void *iw_funcarg; /* caller-specified arg */
+} ipmpstat_walkdata_t;
+
+/*
+ * Data type used by enum2str() to map an enumerated value to a string.
+ */
+typedef struct ipmpstat_enum {
+ const char *e_name; /* string */
+ int e_val; /* value */
+} ipmpstat_enum_t;
+
+/*
+ * Data type used to pass state between probe_output() and probe_event().
+ */
+typedef struct ipmpstat_probe_state {
+ ipmp_handle_t ps_ih; /* open IPMP handle */
+ ipmpstat_ofmt_t *ps_ofmt; /* requested ofmt string */
+} ipmpstat_probe_state_t;
+
+/*
+ * Options that modify the output mode; more than one may be lit.
+ */
+typedef enum {
+ IPMPSTAT_OPT_NUMERIC = 0x1,
+ IPMPSTAT_OPT_PARSABLE = 0x2
+} ipmpstat_opt_t;
+
+/*
+ * Indices for the FLAGS field of the `-i' output format.
+ */
+enum {
+ IPMPSTAT_IFLAG_INDEX, IPMPSTAT_SFLAG_INDEX, IPMPSTAT_M4FLAG_INDEX,
+ IPMPSTAT_BFLAG_INDEX, IPMPSTAT_M6FLAG_INDEX, IPMPSTAT_DFLAG_INDEX,
+ IPMPSTAT_HFLAG_INDEX, IPMPSTAT_NUM_FLAGS
+};
+
+#define IPMPSTAT_NCOL 80
+#define NS2FLOATMS(ns) ((float)(ns) / (NANOSEC / MILLISEC))
+#define MS2FLOATSEC(ms) ((float)(ms) / 1000)
+
+static const char *progname;
+static hrtime_t probe_output_start;
+static struct winsize winsize;
+static ipmpstat_opt_t opt;
+static ipmpstat_enum_t addr_state[], group_state[], if_state[], if_link[];
+static ipmpstat_enum_t if_probe[], targ_mode[];
+static ipmpstat_field_t addr_fields[], group_fields[], if_fields[];
+static ipmpstat_field_t probe_fields[], targ_fields[];
+static ipmpstat_cbfunc_t walk_addr_cbfunc, walk_if_cbfunc;
+static ipmpstat_cbfunc_t info_output_cbfunc, targinfo_output_cbfunc;
+static ipmpstat_walker_t walk_addr, walk_if, walk_group;
+
+static int probe_event(sysevent_t *, void *);
+static void probe_output(ipmp_handle_t, ipmpstat_ofmt_t *);
+static ipmpstat_field_t *field_find(ipmpstat_field_t *, const char *);
+static ipmpstat_ofmt_t *ofmt_create(const char *, ipmpstat_field_t []);
+static void ofmt_output(const ipmpstat_ofmt_t *, ipmp_handle_t, void *);
+static void ofmt_destroy(ipmpstat_ofmt_t *);
+static void enum2str(const ipmpstat_enum_t *, int, char *, uint_t);
+static void sockaddr2str(const struct sockaddr_storage *, char *, uint_t);
+static void sighandler(int);
+static void usage(void);
+static void die(const char *, ...);
+static void die_ipmperr(int, const char *, ...);
+static void warn(const char *, ...);
+static void warn_ipmperr(int, const char *, ...);
+
+/*
+ * Parse the command line, requiring exactly one output format (-a, -g, -i,
+ * -p or -t).  For -p, probe output is produced by probe_output(); for the
+ * other formats, the IPMP state is snapshotted (or, with -L, used live) and
+ * the walker and callback chosen for the format are run over it.
+ */
+int
+main(int argc, char **argv)
+{
+ int c;
+ int err;
+ const char *ofields = NULL;
+ ipmp_handle_t ih;
+ ipmp_qcontext_t qcontext = IPMP_QCONTEXT_SNAP;
+ ipmpstat_ofmt_t *ofmt;
+ ipmpstat_field_t *fields = NULL;
+ ipmpstat_cbfunc_t *cbfunc;
+ ipmpstat_walker_t *walker;
+
+ if ((progname = strrchr(argv[0], '/')) == NULL)
+ progname = argv[0];
+ else
+ progname++;
+
+ (void) setlocale(LC_ALL, "");
+ (void) textdomain(TEXT_DOMAIN);
+
+ while ((c = getopt(argc, argv, "nLPo:agipt")) != EOF) {
+ /* `fields' is only set by a format option, so non-NULL means one was seen */
+ if (fields != NULL && strchr("agipt", c) != NULL)
+ die("only one output format may be specified\n");
+
+ switch (c) {
+ case 'n':
+ opt |= IPMPSTAT_OPT_NUMERIC;
+ break;
+ case 'L':
+ /* Undocumented option: for testing use ONLY */
+ qcontext = IPMP_QCONTEXT_LIVE;
+ break;
+ case 'P':
+ opt |= IPMPSTAT_OPT_PARSABLE;
+ break;
+ case 'o':
+ ofields = optarg;
+ break;
+ case 'a':
+ walker = walk_addr;
+ cbfunc = info_output_cbfunc;
+ fields = addr_fields;
+ break;
+ case 'g':
+ walker = walk_group;
+ cbfunc = info_output_cbfunc;
+ fields = group_fields;
+ break;
+ case 'i':
+ walker = walk_if;
+ cbfunc = info_output_cbfunc;
+ fields = if_fields;
+ break;
+ case 'p':
+ /* walker/cbfunc stay unset: -p is handled by probe_output() */
+ fields = probe_fields;
+ break;
+ case 't':
+ walker = walk_if;
+ cbfunc = targinfo_output_cbfunc;
+ fields = targ_fields;
+ break;
+ default:
+ usage();
+ break;
+ }
+ }
+
+ if (argc > optind || fields == NULL)
+ usage();
+
+ if (opt & IPMPSTAT_OPT_PARSABLE) {
+ if (ofields == NULL) {
+ die("output field list (-o) required in parsable "
+ "output mode\n");
+ } else if (strcasecmp(ofields, "all") == 0) {
+ die("\"all\" not allowed in parsable output mode\n");
+ }
+ }
+
+ /*
+ * Obtain the window size and monitor changes to the size. This data
+ * is used to redisplay the output headers when necessary.
+ */
+ (void) sigset(SIGWINCH, sighandler);
+ sighandler(SIGWINCH);
+
+ if ((err = ipmp_open(&ih)) != IPMP_SUCCESS)
+ die_ipmperr(err, "cannot create IPMP handle");
+
+ if (ipmp_ping_daemon(ih) != IPMP_SUCCESS)
+ die("cannot contact in.mpathd(1M) -- is IPMP in use?\n");
+
+ /*
+ * Create the ofmt linked list that will eventually be passed to
+ * ofmt_output() to output the fields.
+ */
+ ofmt = ofmt_create(ofields, fields);
+
+ /*
+ * If we've been asked to display probes, then call the probe output
+ * function. Otherwise, snapshot IPMP state (or use live state) and
+ * invoke the specified walker with the specified callback function.
+ */
+ if (fields == probe_fields) {
+ probe_output(ih, ofmt);
+ } else {
+ if ((err = ipmp_setqcontext(ih, qcontext)) != IPMP_SUCCESS) {
+ if (qcontext == IPMP_QCONTEXT_SNAP)
+ die_ipmperr(err, "cannot snapshot IPMP state");
+ else
+ die_ipmperr(err, "cannot use live IPMP state");
+ }
+ (*walker)(ih, cbfunc, ofmt);
+ }
+
+ ofmt_destroy(ofmt);
+ ipmp_close(ih);
+
+ return (EXIT_SUCCESS);
+}
+
+/*
+ * Iterate over every IPMP group on the system, calling `cbfunc' with `ih',
+ * the group's ipmp_groupinfo_t pointer, and `arg'.  A group whose info
+ * cannot be retrieved is reported with a warning and skipped.  The group
+ * info is freed after the callback returns.
+ */
+static void
+walk_group(ipmp_handle_t ih, ipmpstat_cbfunc_t *cbfunc, void *arg)
+{
+	ipmp_grouplist_t	*groups;
+	ipmp_groupinfo_t	*info;
+	uint_t			idx;
+	int			rc;
+
+	rc = ipmp_getgrouplist(ih, &groups);
+	if (rc != IPMP_SUCCESS)
+		die_ipmperr(rc, "cannot get IPMP group list");
+
+	for (idx = 0; idx < groups->gl_ngroup; idx++) {
+		rc = ipmp_getgroupinfo(ih, groups->gl_groups[idx], &info);
+		if (rc == IPMP_SUCCESS) {
+			cbfunc(ih, info, arg);
+			ipmp_freegroupinfo(info);
+		} else {
+			warn_ipmperr(rc, "cannot get info for group `%s'",
+			    groups->gl_groups[idx]);
+		}
+	}
+
+	ipmp_freegrouplist(groups);
+}
+
+/*
+ * Iterate over every IPMP interface on the system, calling `cbfunc' with
+ * `ih', the interface's ipmp_ifinfo_t pointer, and `arg'.  Implemented as a
+ * nested walk: walk_group() visits each group and walk_if_cbfunc() visits
+ * the interfaces in it.
+ */
+static void
+walk_if(ipmp_handle_t ih, ipmpstat_cbfunc_t *cbfunc, void *arg)
+{
+	ipmpstat_walkdata_t nested;
+
+	nested.iw_func = cbfunc;
+	nested.iw_funcarg = arg;
+
+	walk_group(ih, walk_if_cbfunc, &nested);
+}
+
+/*
+ * Iterate over every IPMP data address on the system, calling `cbfunc' with
+ * `ih', the address's ipmp_addrinfo_t pointer, and `arg'.  Implemented as a
+ * nested walk: walk_group() visits each group and walk_addr_cbfunc() visits
+ * the addresses in it.
+ */
+static void
+walk_addr(ipmp_handle_t ih, ipmpstat_cbfunc_t *cbfunc, void *arg)
+{
+	ipmpstat_walkdata_t nested;
+
+	nested.iw_func = cbfunc;
+	nested.iw_funcarg = arg;
+
+	walk_group(ih, walk_addr_cbfunc, &nested);
+}
+
+/*
+ * Per-group callback used by walk_if().  `infop' is the group's
+ * ipmp_groupinfo_t and `arg' is the ipmpstat_walkdata_t carrying the
+ * caller's callback and argument.  The caller's callback is invoked once
+ * for each interface in the group whose information can be retrieved;
+ * other interfaces are reported with a warning and skipped.
+ */
+static void
+walk_if_cbfunc(ipmp_handle_t ih, void *infop, void *arg)
+{
+	ipmpstat_walkdata_t	*wdp = arg;
+	ipmp_groupinfo_t	*gip = infop;
+	ipmp_iflist_t		*ifs = gip->gr_iflistp;
+	ipmp_ifinfo_t		*ifp;
+	uint_t			n;
+	int			rc;
+
+	for (n = 0; n < ifs->il_nif; n++) {
+		rc = ipmp_getifinfo(ih, ifs->il_ifs[n], &ifp);
+		if (rc == IPMP_SUCCESS) {
+			(*wdp->iw_func)(ih, ifp, wdp->iw_funcarg);
+			ipmp_freeifinfo(ifp);
+		} else {
+			warn_ipmperr(rc, "cannot get info for interface `%s'",
+			    ifs->il_ifs[n]);
+		}
+	}
+}
+
+/*
+ * Nested walker callback function for walk_addr().  `infop' is the group's
+ * ipmp_groupinfo_t and `arg' is the ipmpstat_walkdata_t carrying the
+ * caller's callback and argument.  The caller's callback is invoked once
+ * for each data address in the group whose information can be retrieved;
+ * other addresses are reported with a warning and skipped.
+ */
+static void
+walk_addr_cbfunc(ipmp_handle_t ih, void *infop, void *arg)
+{
+	int err;
+	uint_t i;
+	ipmp_groupinfo_t *grinfop = infop;
+	ipmp_addrinfo_t *adinfop;
+	ipmp_addrlist_t *adlistp = grinfop->gr_adlistp;
+	ipmpstat_walkdata_t *iwp = arg;
+	char addr[NI_MAXHOST];
+	struct sockaddr_storage *addrp;
+
+	for (i = 0; i < adlistp->al_naddr; i++) {
+		addrp = &adlistp->al_addrs[i];
+		err = ipmp_getaddrinfo(ih, grinfop->gr_name, addrp, &adinfop);
+		if (err != IPMP_SUCCESS) {
+			/*
+			 * sockaddr2str() may produce a host name rather than
+			 * a numeric address, so the buffer is sized for a
+			 * host name.  It is also pre-seeded so that the
+			 * warning never prints an uninitialized buffer should
+			 * the conversion store nothing.
+			 */
+			(void) strlcpy(addr, "?", sizeof (addr));
+			sockaddr2str(addrp, addr, sizeof (addr));
+			warn_ipmperr(err, "cannot get info for `%s'", addr);
+			continue;
+		}
+		(*iwp->iw_func)(ih, adinfop, iwp->iw_funcarg);
+		ipmp_freeaddrinfo(adinfop);
+	}
+}
+
+/*
+ * Shared failure path for the probe sfuncs: stores "?" in `buf' and warns
+ * that the nvlist member named `nvname' could not be retrieved.
+ */
+static void
+sfunc_nvwarn(const char *nvname, char *buf, uint_t bufsize)
+{
+	(void) strlcpy(buf, "?", bufsize);
+	warn("cannot retrieve %s\n", nvname);
+}
+
+/*
+ * Address sfunc: stores the string form of the address (ad_addr) held by
+ * the ipmp_addrinfo_t in `arg' into `buf'.
+ */
+static void
+sfunc_addr_address(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_addrinfo_t *aip = arg->sa_data;
+
+	sockaddr2str(&aip->ad_addr, buf, bufsize);
+}
+
+/*
+ * Address sfunc: stores the IPMP interface name (gr_ifname) of the group
+ * the address belongs to into `buf'.  If the group's information cannot be
+ * retrieved, a warning is issued and "?" is stored instead.
+ */
+static void
+sfunc_addr_group(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_addrinfo_t		*aip = arg->sa_data;
+	ipmp_groupinfo_t	*gip;
+	int			rc;
+
+	rc = ipmp_getgroupinfo(arg->sa_ih, aip->ad_group, &gip);
+	if (rc == IPMP_SUCCESS) {
+		(void) strlcpy(buf, gip->gr_ifname, bufsize);
+		ipmp_freegroupinfo(gip);
+		return;
+	}
+
+	warn_ipmperr(rc, "cannot get info for group `%s'", aip->ad_group);
+	(void) strlcpy(buf, "?", bufsize);
+}
+
+/*
+ * Address sfunc: stores the name of the address's state (ad_state), as
+ * mapped by the addr_state table, into `buf'.
+ */
+static void
+sfunc_addr_state(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_addrinfo_t *aip = arg->sa_data;
+
+	enum2str(addr_state, aip->ad_state, buf, bufsize);
+}
+
+/*
+ * Address sfunc: stores the address's binding (ad_binding) into `buf'.
+ */
+static void
+sfunc_addr_inbound(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_addrinfo_t *aip = arg->sa_data;
+
+	(void) strlcpy(buf, aip->ad_binding, bufsize);
+}
+
+/*
+ * Address sfunc: appends to `buf' a space-separated list of the active
+ * interfaces in the address's group.  Nothing is stored if the address is
+ * down or has no binding.  If the group's information cannot be retrieved,
+ * a warning is issued and "?" is stored.  Interfaces whose information
+ * cannot be retrieved are warned about and left out of the list.
+ *
+ * The list is built with strlcat(), so `buf' must hold a valid (normally
+ * empty) string on entry.
+ */
+static void
+sfunc_addr_outbound(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_addrinfo_t		*aip = arg->sa_data;
+	ipmp_groupinfo_t	*gip;
+	ipmp_iflist_t		*ifs;
+	ipmp_ifinfo_t		*ifp;
+	uint_t			n, nlisted = 0;
+	int			rc;
+
+	/*
+	 * A down address carries no traffic, and without an inbound
+	 * interface for the address there can't be any outbound traffic
+	 * either.
+	 */
+	if (aip->ad_state == IPMP_ADDR_DOWN || aip->ad_binding[0] == '\0')
+		return;
+
+	/*
+	 * Any active interface in the group may be used by the address, so
+	 * collect all of them.
+	 */
+	rc = ipmp_getgroupinfo(arg->sa_ih, aip->ad_group, &gip);
+	if (rc != IPMP_SUCCESS) {
+		warn_ipmperr(rc, "cannot get info for group `%s'",
+		    aip->ad_group);
+		(void) strlcpy(buf, "?", bufsize);
+		return;
+	}
+
+	ifs = gip->gr_iflistp;
+	for (n = 0; n < ifs->il_nif; n++) {
+		rc = ipmp_getifinfo(arg->sa_ih, ifs->il_ifs[n], &ifp);
+		if (rc != IPMP_SUCCESS) {
+			warn_ipmperr(rc, "cannot get info for interface `%s'",
+			    ifs->il_ifs[n]);
+			continue;
+		}
+
+		if ((ifp->if_flags & IPMP_IFFLAG_ACTIVE) != 0) {
+			if (nlisted > 0)
+				(void) strlcat(buf, " ", bufsize);
+			(void) strlcat(buf, ifp->if_name, bufsize);
+			nlisted++;
+		}
+		ipmp_freeifinfo(ifp);
+	}
+	ipmp_freegroupinfo(gip);
+}
+
+/*
+ * Group sfunc: stores the group's name (gr_name) into `buf'.
+ */
+static void
+sfunc_group_name(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_groupinfo_t *gip = arg->sa_data;
+
+	(void) strlcpy(buf, gip->gr_name, bufsize);
+}
+
+/*
+ * Group sfunc: stores the group's IPMP interface name (gr_ifname) into
+ * `buf'.
+ */
+static void
+sfunc_group_ifname(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_groupinfo_t *gip = arg->sa_data;
+
+	(void) strlcpy(buf, gip->gr_ifname, bufsize);
+}
+
+/*
+ * Group sfunc: stores the name of the group's state (gr_state), as mapped
+ * by the group_state table, into `buf'.
+ */
+static void
+sfunc_group_state(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_groupinfo_t *gip = arg->sa_data;
+
+	enum2str(group_state, gip->gr_state, buf, bufsize);
+}
+
+/*
+ * Group sfunc: stores the group's gr_fdt value, converted with
+ * MS2FLOATSEC() and formatted as seconds with two decimal places, into
+ * `buf'.  A value of zero leaves `buf' untouched.
+ */
+static void
+sfunc_group_fdt(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_groupinfo_t *gip = arg->sa_data;
+
+	if (gip->gr_fdt != 0) {
+		(void) snprintf(buf, bufsize, "%.2fs",
+		    MS2FLOATSEC(gip->gr_fdt));
+	}
+}
+
+/*
+ * Group sfunc: stores a summary of the group's interfaces into `buf'.
+ * Active interfaces are listed first; interfaces flagged inactive follow
+ * inside parentheses, and all remaining interfaces follow inside square
+ * brackets.  Names within each section, and the sections themselves, are
+ * separated by single spaces.  Interfaces whose information cannot be
+ * retrieved are warned about and omitted.
+ */
+static void
+sfunc_group_interfaces(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_groupinfo_t	*gip = arg->sa_data;
+	ipmp_iflist_t		*ifs = gip->gr_iflistp;
+	ipmp_ifinfo_t		*ifp;
+	char			*active, *inactive, *unusable, *list;
+	uint_t			nactive = 0, ninactive = 0, nunusable = 0;
+	uint_t			n, *countp;
+	int			rc;
+
+	/* One scratch list per section, each as large as the output. */
+	active = alloca(bufsize);
+	inactive = alloca(bufsize);
+	unusable = alloca(bufsize);
+	active[0] = inactive[0] = unusable[0] = '\0';
+
+	for (n = 0; n < ifs->il_nif; n++) {
+		rc = ipmp_getifinfo(arg->sa_ih, ifs->il_ifs[n], &ifp);
+		if (rc != IPMP_SUCCESS) {
+			warn_ipmperr(rc, "cannot get info for interface `%s'",
+			    ifs->il_ifs[n]);
+			continue;
+		}
+
+		/* Choose the section (and its counter) for this interface. */
+		if (ifp->if_flags & IPMP_IFFLAG_ACTIVE) {
+			list = active;
+			countp = &nactive;
+		} else if (ifp->if_flags & IPMP_IFFLAG_INACTIVE) {
+			list = inactive;
+			countp = &ninactive;
+		} else {
+			list = unusable;
+			countp = &nunusable;
+		}
+
+		if ((*countp)++ != 0)
+			(void) strlcat(list, " ", bufsize);
+		(void) strlcat(list, ifp->if_name, bufsize);
+
+		ipmp_freeifinfo(ifp);
+	}
+
+	/* Assemble: active (inactive) [unusable] */
+	(void) strlcpy(buf, active, bufsize);
+
+	if (ninactive > 0) {
+		if (nactive != 0)
+			(void) strlcat(buf, " ", bufsize);
+
+		(void) strlcat(buf, "(", bufsize);
+		(void) strlcat(buf, inactive, bufsize);
+		(void) strlcat(buf, ")", bufsize);
+	}
+
+	if (nunusable > 0) {
+		if (nactive + ninactive != 0)
+			(void) strlcat(buf, " ", bufsize);
+
+		(void) strlcat(buf, "[", bufsize);
+		(void) strlcat(buf, unusable, bufsize);
+		(void) strlcat(buf, "]", bufsize);
+	}
+}
+
+/*
+ * Interface sfunc: stores the interface's name (if_name) into `buf'.
+ */
+static void
+sfunc_if_name(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_ifinfo_t *ifp = arg->sa_data;
+
+	(void) strlcpy(buf, ifp->if_name, bufsize);
+}
+
+/*
+ * Interface sfunc: stores "yes" into `buf' if the interface has
+ * IPMP_IFFLAG_ACTIVE set in if_flags, and "no" otherwise.
+ */
+static void
+sfunc_if_active(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_ifinfo_t	*ifp = arg->sa_data;
+	const char	*answer;
+
+	answer = (ifp->if_flags & IPMP_IFFLAG_ACTIVE) ? "yes" : "no";
+	(void) strlcpy(buf, answer, bufsize);
+}
+
+/*
+ * Interface sfunc: stores the IPMP interface name (gr_ifname) of the
+ * interface's group into `buf'.  If the group's information cannot be
+ * retrieved, a warning is issued and "?" is stored instead.
+ */
+static void
+sfunc_if_group(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_ifinfo_t		*ifp = arg->sa_data;
+	ipmp_groupinfo_t	*gip;
+	int			rc;
+
+	rc = ipmp_getgroupinfo(arg->sa_ih, ifp->if_group, &gip);
+	if (rc == IPMP_SUCCESS) {
+		(void) strlcpy(buf, gip->gr_ifname, bufsize);
+		ipmp_freegroupinfo(gip);
+		return;
+	}
+
+	warn_ipmperr(rc, "cannot get info for group `%s'", ifp->if_group);
+	(void) strlcpy(buf, "?", bufsize);
+}
+
+/*
+ * Interface sfunc: stores a fixed-width string of IPMPSTAT_NUM_FLAGS flag
+ * characters into `buf'.  Every position starts out as '-'; a position is
+ * replaced by its flag letter when the corresponding condition holds:
+ *
+ *	s	if_type is IPMP_IF_STANDBY
+ *	i	IPMP_IFFLAG_INACTIVE is set
+ *	d	IPMP_IFFLAG_DOWN is set
+ *	h	IPMP_IFFLAG_HWADDRDUP is set
+ *	m	the interface is the group's gr_m4ifname
+ *	M	the interface is the group's gr_m6ifname
+ *	b	the interface is the group's gr_bcifname
+ *
+ * If the group's information cannot be retrieved, a warning is issued and
+ * the last three flags are left as '-'.
+ */
+static void
+sfunc_if_flags(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_ifinfo_t		*ifp = arg->sa_data;
+	ipmp_groupinfo_t	*gip;
+	int			rc;
+
+	assert(bufsize > IPMPSTAT_NUM_FLAGS);
+
+	(void) memset(buf, '-', IPMPSTAT_NUM_FLAGS);
+	buf[IPMPSTAT_NUM_FLAGS] = '\0';
+
+	/* Flags derived from the interface itself. */
+	if (ifp->if_type == IPMP_IF_STANDBY)
+		buf[IPMPSTAT_SFLAG_INDEX] = 's';
+	if ((ifp->if_flags & IPMP_IFFLAG_INACTIVE) != 0)
+		buf[IPMPSTAT_IFLAG_INDEX] = 'i';
+	if ((ifp->if_flags & IPMP_IFFLAG_DOWN) != 0)
+		buf[IPMPSTAT_DFLAG_INDEX] = 'd';
+	if ((ifp->if_flags & IPMP_IFFLAG_HWADDRDUP) != 0)
+		buf[IPMPSTAT_HFLAG_INDEX] = 'h';
+
+	/* Flags derived from the interface's role within its group. */
+	rc = ipmp_getgroupinfo(arg->sa_ih, ifp->if_group, &gip);
+	if (rc != IPMP_SUCCESS) {
+		warn_ipmperr(rc, "cannot get broadcast/multicast info for "
+		    "group `%s'", ifp->if_group);
+		return;
+	}
+
+	if (strcmp(gip->gr_m4ifname, ifp->if_name) == 0)
+		buf[IPMPSTAT_M4FLAG_INDEX] = 'm';
+	if (strcmp(gip->gr_m6ifname, ifp->if_name) == 0)
+		buf[IPMPSTAT_M6FLAG_INDEX] = 'M';
+	if (strcmp(gip->gr_bcifname, ifp->if_name) == 0)
+		buf[IPMPSTAT_BFLAG_INDEX] = 'b';
+
+	ipmp_freegroupinfo(gip);
+}
+
+/*
+ * Interface sfunc: stores the name of the interface's link state
+ * (if_linkstate), as mapped by the if_link table, into `buf'.
+ */
+static void
+sfunc_if_link(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_ifinfo_t *ifp = arg->sa_data;
+
+	enum2str(if_link, ifp->if_linkstate, buf, bufsize);
+}
+
+/*
+ * Interface sfunc: stores the name of the interface's probe state
+ * (if_probestate), as mapped by the if_probe table, into `buf'.
+ */
+static void
+sfunc_if_probe(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_ifinfo_t *ifp = arg->sa_data;
+
+	enum2str(if_probe, ifp->if_probestate, buf, bufsize);
+}
+
+/*
+ * Interface sfunc: stores the name of the interface's state (if_state), as
+ * mapped by the if_state table, into `buf'.
+ */
+static void
+sfunc_if_state(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_ifinfo_t *ifp = arg->sa_data;
+
+	enum2str(if_state, ifp->if_state, buf, bufsize);
+}
+
+/*
+ * Probe sfunc: stores the event's IPMP_PROBE_ID value, in decimal, into
+ * `buf'.  If the nvlist lacks that member, warns and stores "?".
+ */
+static void
+sfunc_probe_id(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	nvlist_t	*event = arg->sa_data;
+	uint32_t	id;
+
+	if (nvlist_lookup_uint32(event, IPMP_PROBE_ID, &id) == 0)
+		(void) snprintf(buf, bufsize, "%u", id);
+	else
+		sfunc_nvwarn("IPMP_PROBE_ID", buf, bufsize);
+}
+
+/*
+ * Probe sfunc: stores the event's IPMP_IF_NAME string into `buf'.  If the
+ * nvlist lacks that member, warns and stores "?".
+ */
+static void
+sfunc_probe_ifname(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	nvlist_t	*event = arg->sa_data;
+	char		*name;
+
+	if (nvlist_lookup_string(event, IPMP_IF_NAME, &name) == 0)
+		(void) strlcpy(buf, name, bufsize);
+	else
+		sfunc_nvwarn("IPMP_IF_NAME", buf, bufsize);
+}
+
+/*
+ * Probe sfunc: stores, in seconds with two decimal places, the difference
+ * between the event's IPMP_PROBE_START_TIME and `probe_output_start' (the
+ * gethrtime() value recorded by probe_output()).  If the nvlist lacks the
+ * start time, warns and stores "?".
+ */
+static void
+sfunc_probe_time(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	nvlist_t	*event = arg->sa_data;
+	hrtime_t	began;
+
+	if (nvlist_lookup_hrtime(event, IPMP_PROBE_START_TIME, &began) != 0) {
+		sfunc_nvwarn("IPMP_PROBE_START_TIME", buf, bufsize);
+		return;
+	}
+
+	(void) snprintf(buf, bufsize, "%.2fs",
+	    (float)(began - probe_output_start) / NANOSEC);
+}
+
+/*
+ * Probe sfunc: stores the string form of the event's IPMP_PROBE_TARGET,
+ * a byte array interpreted as a struct sockaddr_storage, into `buf'.  If
+ * the nvlist lacks that member, warns and stores "?".
+ */
+static void
+sfunc_probe_target(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	nvlist_t		*event = arg->sa_data;
+	struct sockaddr_storage	*ssp;
+	uint_t			nbytes;
+
+	if (nvlist_lookup_byte_array(event, IPMP_PROBE_TARGET,
+	    (uchar_t **)&ssp, &nbytes) != 0) {
+		sfunc_nvwarn("IPMP_PROBE_TARGET", buf, bufsize);
+		return;
+	}
+
+	sockaddr2str(ssp, buf, bufsize);
+}
+
+/*
+ * Probe sfunc: for a probe in state IPMP_PROBE_ACKED, stores the interval
+ * between IPMP_PROBE_START_TIME and IPMP_PROBE_ACKPROC_TIME, in
+ * milliseconds with two decimal places, into `buf'.  For any other state
+ * `buf' is left untouched.  If a needed nvlist member is missing, warns
+ * and stores "?".
+ */
+static void
+sfunc_probe_rtt(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	nvlist_t	*event = arg->sa_data;
+	hrtime_t	began, acked;
+	uint32_t	pstate;
+
+	if (nvlist_lookup_uint32(event, IPMP_PROBE_STATE, &pstate) != 0) {
+		sfunc_nvwarn("IPMP_PROBE_STATE", buf, bufsize);
+		return;
+	}
+
+	/* Only acknowledged probes have a round-trip time to report. */
+	if (pstate != IPMP_PROBE_ACKED)
+		return;
+
+	if (nvlist_lookup_hrtime(event, IPMP_PROBE_START_TIME, &began) != 0) {
+		sfunc_nvwarn("IPMP_PROBE_START_TIME", buf, bufsize);
+		return;
+	}
+
+	if (nvlist_lookup_hrtime(event, IPMP_PROBE_ACKPROC_TIME,
+	    &acked) != 0) {
+		sfunc_nvwarn("IPMP_PROBE_ACKPROC_TIME", buf, bufsize);
+		return;
+	}
+
+	(void) snprintf(buf, bufsize, "%.2fms", NS2FLOATMS(acked - began));
+}
+
+/*
+ * Probe sfunc: for a probe in state IPMP_PROBE_ACKED, stores the interval
+ * between IPMP_PROBE_SENT_TIME and IPMP_PROBE_ACKRECV_TIME, in
+ * milliseconds with two decimal places, into `buf'.  For any other state
+ * `buf' is left untouched.  If a needed nvlist member is missing, warns
+ * and stores "?".
+ */
+static void
+sfunc_probe_netrtt(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	nvlist_t	*event = arg->sa_data;
+	hrtime_t	sent_at, recv_at;
+	uint32_t	pstate;
+
+	if (nvlist_lookup_uint32(event, IPMP_PROBE_STATE, &pstate) != 0) {
+		sfunc_nvwarn("IPMP_PROBE_STATE", buf, bufsize);
+		return;
+	}
+
+	/* Only acknowledged probes have a round-trip time to report. */
+	if (pstate != IPMP_PROBE_ACKED)
+		return;
+
+	if (nvlist_lookup_hrtime(event, IPMP_PROBE_SENT_TIME, &sent_at) != 0) {
+		sfunc_nvwarn("IPMP_PROBE_SENT_TIME", buf, bufsize);
+		return;
+	}
+
+	if (nvlist_lookup_hrtime(event, IPMP_PROBE_ACKRECV_TIME,
+	    &recv_at) != 0) {
+		sfunc_nvwarn("IPMP_PROBE_ACKRECV_TIME", buf, bufsize);
+		return;
+	}
+
+	(void) snprintf(buf, bufsize, "%.2fms", NS2FLOATMS(recv_at - sent_at));
+}
+
+/*
+ * Probe sfunc: stores the event's IPMP_PROBE_TARGET_RTTAVG value, in
+ * milliseconds with two decimal places, into `buf'.  A value of zero
+ * leaves `buf' untouched.  If the nvlist lacks that member, warns and
+ * stores "?".
+ */
+static void
+sfunc_probe_rttavg(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	nvlist_t	*event = arg->sa_data;
+	int64_t		avg;
+
+	if (nvlist_lookup_int64(event, IPMP_PROBE_TARGET_RTTAVG, &avg) != 0) {
+		sfunc_nvwarn("IPMP_PROBE_TARGET_RTTAVG", buf, bufsize);
+		return;
+	}
+
+	if (avg == 0)
+		return;
+
+	(void) snprintf(buf, bufsize, "%.2fms", NS2FLOATMS(avg));
+}
+
+/*
+ * Probe sfunc: stores the event's IPMP_PROBE_TARGET_RTTDEV value, in
+ * milliseconds with two decimal places, into `buf'.  A value of zero
+ * leaves `buf' untouched.  If the nvlist lacks that member, warns and
+ * stores "?".
+ */
+static void
+sfunc_probe_rttdev(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	nvlist_t	*event = arg->sa_data;
+	int64_t		dev;
+
+	if (nvlist_lookup_int64(event, IPMP_PROBE_TARGET_RTTDEV, &dev) != 0) {
+		sfunc_nvwarn("IPMP_PROBE_TARGET_RTTDEV", buf, bufsize);
+		return;
+	}
+
+	if (dev == 0)
+		return;
+
+	(void) snprintf(buf, bufsize, "%.2fms", NS2FLOATMS(dev));
+}
+
+/*
+ * Interface walker callback used by probe_output(): increments the uint_t
+ * counter that `arg' points to for each interface whose probe state is
+ * anything other than IPMP_PROBE_DISABLED.
+ */
+/* ARGSUSED */
+static void
+probe_enabled_cbfunc(ipmp_handle_t ih, void *infop, void *arg)
+{
+	ipmp_ifinfo_t	*ifp = infop;
+	uint_t		*countp = arg;
+
+	if (ifp->if_probestate == IPMP_PROBE_DISABLED)
+		return;
+
+	++*countp;
+}
+
+/*
+ * Outputs probe events as they arrive.  After verifying that at least one
+ * interface has probing enabled, binds to the IPMP sysevent channel,
+ * subscribes probe_event() to class EC_IPMP, and then blocks in pause()
+ * forever; all output is produced from probe_event().  Any setup failure
+ * is fatal.  This function does not return.
+ */
+static void
+probe_output(ipmp_handle_t ih, ipmpstat_ofmt_t *ofmt)
+{
+	ipmpstat_probe_state_t	ps = { ih, ofmt };
+	evchan_t		*chan;
+	char			subid[MAX_SUBID_LEN];
+	uint_t			nprobing = 0;
+
+	/*
+	 * There is nothing to watch unless some interface is using
+	 * probe-based failure detection, so bail out right away if none is.
+	 */
+	walk_if(ih, probe_enabled_cbfunc, &nprobing);
+	if (nprobing == 0)
+		die("probe-based failure detection is disabled\n");
+
+	probe_output_start = gethrtime();
+
+	/*
+	 * Unfortunately, until 4791900 is fixed, only privileged processes
+	 * can bind and thus receive sysevents.  The result is stored in
+	 * errno so that die() can report the reason.
+	 */
+	errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &chan, EVCH_CREAT);
+	if (errno == EPERM)
+		die("insufficient privileges for -p\n");
+	else if (errno != 0)
+		die("sysevent_evc_bind to channel %s failed", IPMP_EVENT_CHAN);
+
+	/*
+	 * sysevent_evc_subscribe() requires a unique subscriber id, so
+	 * build one from our pid and program name.
+	 */
+	(void) snprintf(subid, sizeof (subid), "%d-%s", getpid(), progname);
+
+	errno = sysevent_evc_subscribe(chan, subid, EC_IPMP, probe_event,
+	    &ps, 0);
+	if (errno != 0)
+		die("sysevent_evc_subscribe for class %s failed", EC_IPMP);
+
+	for (;;)
+		(void) pause();
+}
+
+/*
+ * Sysevent handler subscribed by probe_output().  `arg' is the
+ * ipmpstat_probe_state_t holding the IPMP handle and output format.
+ * Events whose subclass is not ESC_IPMP_PROBE_STATE are ignored.  An event
+ * that lacks an attribute list, a version, or a probe state, or that
+ * carries an unsupported version, is dropped with a warning.  Otherwise
+ * one row is output for probes in state IPMP_PROBE_ACKED or
+ * IPMP_PROBE_LOST.  Always returns 0.
+ */
+static int
+probe_event(sysevent_t *ev, void *arg)
+{
+	ipmpstat_probe_state_t	*psp = arg;
+	nvlist_t		*attrs;
+	uint32_t		vers;
+	uint32_t		pstate;
+
+	if (strcmp(sysevent_get_subclass_name(ev), ESC_IPMP_PROBE_STATE) != 0)
+		return (0);
+
+	if (sysevent_get_attr_list(ev, &attrs) != 0) {
+		warn("sysevent_get_attr_list failed; dropping event");
+		return (0);
+	}
+
+	/* Validate the event step by step; the first failure drops it. */
+	if (nvlist_lookup_uint32(attrs, IPMP_EVENT_VERSION, &vers) != 0) {
+		warn("dropped event with no IPMP_EVENT_VERSION\n");
+	} else if (vers != IPMP_EVENT_CUR_VERSION) {
+		warn("dropped event with unsupported IPMP_EVENT_VERSION %d\n",
+		    vers);
+	} else if (nvlist_lookup_uint32(attrs, IPMP_PROBE_STATE,
+	    &pstate) != 0) {
+		warn("dropped event with no IPMP_PROBE_STATE\n");
+	} else if (pstate == IPMP_PROBE_ACKED || pstate == IPMP_PROBE_LOST) {
+		ofmt_output(psp->ps_ofmt, psp->ps_ih, attrs);
+	}
+
+	nvlist_free(attrs);
+	return (0);
+}
+
+/*
+ * Target sfunc: stores the name (it_name) held by the ipmp_targinfo_t in
+ * `arg' into `buf'.
+ */
+static void
+sfunc_targ_ifname(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_targinfo_t *tip = arg->sa_data;
+
+	(void) strlcpy(buf, tip->it_name, bufsize);
+}
+
+/*
+ * Target sfunc: stores the name of the target mode (it_targmode), as
+ * mapped by the targ_mode table, into `buf'.
+ */
+static void
+sfunc_targ_mode(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_targinfo_t *tip = arg->sa_data;
+
+	enum2str(targ_mode, tip->it_targmode, buf, bufsize);
+}
+
+/*
+ * Target sfunc: stores the string form of the test address (it_testaddr)
+ * into `buf'.  When the target mode is IPMP_TARG_DISABLED, `buf' is left
+ * untouched.
+ */
+static void
+sfunc_targ_testaddr(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	ipmp_targinfo_t *tip = arg->sa_data;
+
+	if (tip->it_targmode == IPMP_TARG_DISABLED)
+		return;
+
+	sockaddr2str(&tip->it_testaddr, buf, bufsize);
+}
+
+/*
+ * Target sfunc: appends to `buf' the string form of each address in the
+ * target list (it_targlistp), separated by single spaces.
+ *
+ * The list is built with strlcat(), so `buf' must hold a valid (normally
+ * empty) string on entry.
+ */
+static void
+sfunc_targ_targets(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize)
+{
+	uint_t i;
+	char *targname = alloca(bufsize);
+	ipmp_targinfo_t *targinfop = arg->sa_data;
+	ipmp_addrlist_t *targlistp = targinfop->it_targlistp;
+
+	for (i = 0; i < targlistp->al_naddr; i++) {
+		/*
+		 * alloca() memory is uninitialized and sockaddr2str() does
+		 * not report failure, so start from an empty string to
+		 * guarantee strlcat() below always reads a terminated one.
+		 */
+		targname[0] = '\0';
+		sockaddr2str(&targlistp->al_addrs[i], targname, bufsize);
+		(void) strlcat(buf, targname, bufsize);
+		if ((i + 1) < targlistp->al_naddr)
+			(void) strlcat(buf, " ", bufsize);
+	}
+}
+
+/*
+ * Walker callback that outputs one row for `infop'.  `arg' is the output
+ * format list to use.
+ */
+static void
+info_output_cbfunc(ipmp_handle_t ih, void *infop, void *arg)
+{
+	const ipmpstat_ofmt_t *ofmt = arg;
+
+	ofmt_output(ofmt, ih, infop);
+}
+
+/*
+ * Interface walker callback that outputs probe target rows for the
+ * interface `infop'.  `arg' is the output format list to use.
+ */
+static void
+targinfo_output_cbfunc(ipmp_handle_t ih, void *infop, void *arg)
+{
+	ipmp_ifinfo_t		*ifp = infop;
+	ipmp_if_targmode_t	mode4 = ifp->if_targinfo4.it_targmode;
+	ipmp_if_targmode_t	mode6 = ifp->if_targinfo6.it_targmode;
+
+	/*
+	 * Normally only one of IPv4 and IPv6 probing is enabled, although
+	 * the admin may turn on both.  With just one enabled, the other is
+	 * left out so as not to encourage enabling both.  With neither
+	 * enabled, the IPv4 row is still printed so that the admin can see
+	 * a MODE of "disabled".
+	 */
+	if (mode4 != IPMP_TARG_DISABLED || mode6 == IPMP_TARG_DISABLED)
+		ofmt_output(arg, ih, &ifp->if_targinfo4);
+
+	if (mode6 != IPMP_TARG_DISABLED)
+		ofmt_output(arg, ih, &ifp->if_targinfo6);
+}
+
+/*
+ * Allocates a zeroed ipmpstat_ofmt_t for `fieldp' and appends it to the
+ * singly-linked list tracked by `*headp' (first element) and `*tailp'
+ * (last element).  Dies if memory cannot be allocated.
+ */
+static void
+ofmt_append(ipmpstat_ofmt_t **headp, ipmpstat_ofmt_t **tailp,
+    ipmpstat_field_t *fieldp)
+{
+	ipmpstat_ofmt_t *ofmt;
+
+	if ((ofmt = calloc(1, sizeof (*ofmt))) == NULL)
+		die("cannot allocate output format list");
+
+	ofmt->o_field = fieldp;
+	if (*headp == NULL)
+		*headp = ofmt;
+	else
+		(*tailp)->o_next = ofmt;
+	*tailp = ofmt;
+}
+
+/*
+ * Creates an ipmpstat_ofmt_t field list from the comma-separated list of
+ * user-specified fields passed via `ofields'.  The table of known fields
+ * (and their attributes) is passed via `fields'.
+ *
+ * If `ofields' is NULL, the list holds the leading fields of `fields' whose
+ * combined width does not exceed IPMPSTAT_NCOL.  If `ofields' is "all"
+ * (case-insensitive), the list holds every field in `fields'.  Otherwise,
+ * each named field is looked up: an unknown name is fatal in parsable mode
+ * and is otherwise warned about and ignored, and it is fatal if no valid
+ * field remains.
+ *
+ * Returns the head of the list, which the caller releases with
+ * ofmt_destroy().
+ */
+static ipmpstat_ofmt_t *
+ofmt_create(const char *ofields, ipmpstat_field_t fields[])
+{
+	char *token, *lasts, *ofields_dup;
+	const char *fieldname;
+	ipmpstat_ofmt_t *ofmt_head = NULL, *ofmt_tail = NULL;
+	ipmpstat_field_t *fieldp;
+	uint_t cols = 0;
+
+	/*
+	 * If "-o" was omitted or "-o all" was specified, build a list of
+	 * field names.  If "-o" was omitted, stop building the list when
+	 * we run out of columns.
+	 */
+	if (ofields == NULL || strcasecmp(ofields, "all") == 0) {
+		for (fieldp = fields; fieldp->f_name != NULL; fieldp++) {
+			cols += fieldp->f_width;
+			if (ofields == NULL && cols > IPMPSTAT_NCOL)
+				break;
+
+			ofmt_append(&ofmt_head, &ofmt_tail, fieldp);
+		}
+		return (ofmt_head);
+	}
+
+	/* strtok_r() modifies its input, so tokenize a private copy. */
+	if ((ofields_dup = strdup(ofields)) == NULL)
+		die("cannot allocate output format list");
+
+	token = ofields_dup;
+	while ((fieldname = strtok_r(token, ",", &lasts)) != NULL) {
+		token = NULL;
+
+		if ((fieldp = field_find(fields, fieldname)) == NULL) {
+			/*
+			 * Since machine parsers are unlikely to be able to
+			 * gracefully handle missing fields, die if we're in
+			 * parsable mode.  Otherwise, just print a warning.
+			 */
+			if (opt & IPMPSTAT_OPT_PARSABLE)
+				die("unknown output field `%s'\n", fieldname);
+
+			warn("ignoring unknown output field `%s'\n", fieldname);
+			continue;
+		}
+
+		ofmt_append(&ofmt_head, &ofmt_tail, fieldp);
+	}
+
+	free(ofields_dup);
+	if (ofmt_head == NULL)
+		die("no valid output fields specified\n");
+
+	return (ofmt_head);
+}
+
+/*
+ * Frees every element of the field list that starts at `ofmt'.  Passing
+ * NULL is a no-op.
+ */
+static void
+ofmt_destroy(ipmpstat_ofmt_t *ofmt)
+{
+	ipmpstat_ofmt_t *next;
+
+	while (ofmt != NULL) {
+		/* Fetch the successor before the element is released. */
+		next = ofmt->o_next;
+		free(ofmt);
+		ofmt = next;
+	}
+}
+
+/*
+ * Prints one header line containing the name of each field in `ofmt'.
+ * Every name but the last is left-justified and padded to its field's
+ * width; the last is printed without padding.
+ */
+static void
+ofmt_output_header(const ipmpstat_ofmt_t *ofmt)
+{
+	const ipmpstat_field_t *fp;
+
+	while (ofmt != NULL) {
+		fp = ofmt->o_field;
+
+		if (ofmt->o_next != NULL)
+			(void) printf("%-*s", fp->f_width, fp->f_name);
+		else
+			(void) printf("%s", fp->f_name);
+
+		ofmt = ofmt->o_next;
+	}
+	(void) printf("\n");
+}
+
+/*
+ * Outputs one row of values for the fields named by `ofmt'. The values to
+ * output are obtained through the `ofmt' function pointers, which are
+ * indirectly passed the `ih' and `arg' structures for state; see the block
+ * comment at the start of this file for details.
+ *
+ * In parsable mode, values are separated by ':' and no header is printed;
+ * when more than one field is being output, any ':' or '\' inside a value
+ * is preceded by a '\'. Otherwise, values are laid out in columns, an
+ * empty value is shown as "--", and a header is printed at the start of
+ * each screenful of rows. Standard output is flushed after every row.
+ */
+static void
+ofmt_output(const ipmpstat_ofmt_t *ofmt, ipmp_handle_t ih, void *arg)
+{
+ int i;
+ char buf[1024];
+ boolean_t escsep;
+ static int nrow;
+ const char *value;
+ uint_t width, valwidth;
+ uint_t compress, overflow = 0;
+ const ipmpstat_field_t *fieldp;
+ ipmpstat_sfunc_arg_t sfunc_arg;
+
+ /*
+ * For each screenful of data, display the header.
+ * `nrow' is static, so it counts lines across calls; it is bumped a
+ * second time below to account for the header line itself.
+ */
+ if ((nrow++ % winsize.ws_row) == 0 && !(opt & IPMPSTAT_OPT_PARSABLE)) {
+ ofmt_output_header(ofmt);
+ nrow++;
+ }
+
+ /*
+ * Check if we'll be displaying multiple fields per line, and thus
+ * need to escape the field separator.
+ */
+ escsep = (ofmt != NULL && ofmt->o_next != NULL);
+
+ for (; ofmt != NULL; ofmt = ofmt->o_next) {
+ fieldp = ofmt->o_field;
+
+ sfunc_arg.sa_ih = ih;
+ sfunc_arg.sa_data = arg;
+
+ /*
+ * Start from an empty string: an sfunc may return without
+ * storing anything, or may build its value with strlcat().
+ */
+ buf[0] = '\0';
+ (*fieldp->f_sfunc)(&sfunc_arg, buf, sizeof (buf));
+
+ if (opt & IPMPSTAT_OPT_PARSABLE) {
+ for (i = 0; buf[i] != '\0'; i++) {
+ if (escsep && (buf[i] == ':' || buf[i] == '\\'))
+ (void) putchar('\\');
+ (void) putchar(buf[i]);
+ }
+ if (ofmt->o_next != NULL)
+ (void) putchar(':');
+ } else {
+ value = (buf[0] == '\0') ? "--" : buf;
+
+ /*
+ * To avoid needless line-wraps, for the last field,
+ * don't include any trailing whitespace.
+ */
+ if (ofmt->o_next == NULL) {
+ (void) printf("%s", value);
+ continue;
+ }
+
+ /*
+ * For other fields, grow the width as necessary to
+ * ensure the value completely fits. However, if
+ * there's unused whitespace in subsequent fields,
+ * then "compress" that whitespace to attempt to get
+ * the columns to line up again.
+ */
+ width = fieldp->f_width;
+ valwidth = strlen(value);
+
+ /*
+ * The value, plus any overflow carried from earlier
+ * columns, does not fit: print it followed by one space
+ * and record how far past the column boundary we now
+ * are. All operands are unsigned, so the sum below is
+ * still correct (modulo 2^32) when valwidth < width.
+ */
+ if (valwidth + overflow >= width) {
+ overflow += valwidth - width + 1;
+ (void) printf("%s ", value);
+ continue;
+ }
+
+ /*
+ * Here valwidth + overflow < width, so this column's
+ * spare padding exceeds the carried overflow: MIN()
+ * yields `overflow' and the columns line up again.
+ */
+ if (overflow > 0) {
+ compress = MIN(overflow, width - valwidth);
+ overflow -= compress;
+ width -= compress;
+ }
+ (void) printf("%-*s", width, value);
+ }
+ }
+ (void) printf("\n");
+
+ /*
+ * In case stdout has been redirected to e.g. a pipe, flush stdout so
+ * that commands can act on our output immediately.
+ */
+ (void) fflush(stdout);
+}
+
+/*
+ * Looks up `fieldname' (case-insensitively) in the `fields' array, which
+ * must be terminated by an entry whose f_name is NULL.  Returns a pointer
+ * to the matching entry, or NULL if there is none.
+ */
+static ipmpstat_field_t *
+field_find(ipmpstat_field_t *fields, const char *fieldname)
+{
+	ipmpstat_field_t *fp = fields;
+
+	while (fp->f_name != NULL) {
+		if (strcasecmp(fp->f_name, fieldname) == 0)
+			return (fp);
+		fp++;
+	}
+	return (NULL);
+}
+
+/*
+ * Maps `enumval' to its name using the `enums' table (terminated by an
+ * entry whose e_name is NULL) and stores at most `bufsize' bytes of that
+ * name into `buf'.  If the table has no entry for `enumval', the value is
+ * stored in decimal between angle brackets instead.
+ */
+static void
+enum2str(const ipmpstat_enum_t *enums, int enumval, char *buf, uint_t bufsize)
+{
+	const ipmpstat_enum_t *ep = enums;
+
+	/* Advance to the matching entry, or to the terminator. */
+	while (ep->e_name != NULL && ep->e_val != enumval)
+		ep++;
+
+	if (ep->e_name != NULL)
+		(void) strlcpy(buf, ep->e_name, bufsize);
+	else
+		(void) snprintf(buf, bufsize, "<%d>", enumval);
+}
+
+/*
+ * Stores the stringified value of the sockaddr_storage pointed to by `ssp'
+ * into at most `bufsize' bytes of `buf'.  The address is converted with
+ * getnameinfo() using NI_NOFQDN, plus NI_NUMERICHOST when
+ * IPMPSTAT_OPT_NUMERIC is set in `opt'.  If the address family is neither
+ * AF_INET nor AF_INET6, or if the conversion fails, "?" is stored instead.
+ */
+static void
+sockaddr2str(const struct sockaddr_storage *ssp, char *buf, uint_t bufsize)
+{
+	int flags = NI_NOFQDN;
+	socklen_t socklen;
+	struct sockaddr *sp = (struct sockaddr *)ssp;
+
+	/*
+	 * Sadly, getnameinfo() does not allow the socklen to be oversized for
+	 * a given family -- so we must determine the exact size to pass to it.
+	 */
+	switch (ssp->ss_family) {
+	case AF_INET:
+		socklen = sizeof (struct sockaddr_in);
+		break;
+	case AF_INET6:
+		socklen = sizeof (struct sockaddr_in6);
+		break;
+	default:
+		(void) strlcpy(buf, "?", bufsize);
+		return;
+	}
+
+	if (opt & IPMPSTAT_OPT_NUMERIC)
+		flags |= NI_NUMERICHOST;
+
+	/*
+	 * A failed getnameinfo() is not guaranteed to have stored a usable
+	 * string (e.g. when the name does not fit in `buf'), and some callers
+	 * pass uninitialized buffers, so always leave a valid string behind.
+	 */
+	if (getnameinfo(sp, socklen, buf, bufsize, NULL, 0, flags) != 0)
+		(void) strlcpy(buf, "?", bufsize);
+}
+
+/*
+ * SIGWINCH handler: refreshes the global `winsize' from file descriptor 1.
+ * If the size cannot be obtained, or either dimension is reported as zero,
+ * an 80-column by 24-row window is assumed.
+ */
+static void
+sighandler(int sig)
+{
+	assert(sig == SIGWINCH);
+
+	if (ioctl(1, TIOCGWINSZ, &winsize) != -1 &&
+	    winsize.ws_col != 0 && winsize.ws_row != 0)
+		return;
+
+	winsize.ws_col = 80;
+	winsize.ws_row = 24;
+}
+
+/*
+ * Prints the (localized) usage message to stderr and exits with
+ * EXIT_FAILURE.
+ */
+static void
+usage(void)
+{
+	const char *synopsis;
+
+	synopsis = gettext("[-n] [-o <field> [-P]] -a|-g|-i|-p|-t");
+	(void) fprintf(stderr, gettext("usage: %s %s\n"), progname, synopsis);
+
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Prints a localized warning, prefixed by the program name, to stderr.  If
+ * `format' contains no newline, the message is completed with ": ", the
+ * strerror() text for the errno value in effect on entry, and a newline.
+ */
+/* PRINTFLIKE1 */
+static void
+warn(const char *format, ...)
+{
+	int	saved_errno = errno;	/* captured before any library calls */
+	va_list	ap;
+
+	format = gettext(format);
+	(void) fprintf(stderr, gettext("%s: warning: "), progname);
+
+	va_start(ap, format);
+	(void) vfprintf(stderr, format, ap);
+	va_end(ap);
+
+	if (strchr(format, '\n') == NULL)
+		(void) fprintf(stderr, ": %s\n", strerror(saved_errno));
+}
+
+/*
+ * Prints a localized warning, prefixed by the program name, to stderr,
+ * followed by ": ", the ipmp_errmsg() text for `ipmperr', and a newline.
+ */
+/* PRINTFLIKE2 */
+static void
+warn_ipmperr(int ipmperr, const char *format, ...)
+{
+	va_list ap;
+
+	format = gettext(format);
+	(void) fprintf(stderr, gettext("%s: warning: "), progname);
+
+	va_start(ap, format);
+	(void) vfprintf(stderr, format, ap);
+	va_end(ap);
+
+	(void) fprintf(stderr, ": %s\n", ipmp_errmsg(ipmperr));
+}
+
+/*
+ * Prints a localized error message, prefixed by the program name, to
+ * stderr and exits with EXIT_FAILURE.  If `format' contains no newline,
+ * the message is completed with ": ", the strerror() text for the errno
+ * value in effect on entry, and a newline.
+ */
+/* PRINTFLIKE1 */
+static void
+die(const char *format, ...)
+{
+	int	saved_errno = errno;	/* captured before any library calls */
+	va_list	ap;
+
+	format = gettext(format);
+	(void) fprintf(stderr, "%s: ", progname);
+
+	va_start(ap, format);
+	(void) vfprintf(stderr, format, ap);
+	va_end(ap);
+
+	if (strchr(format, '\n') == NULL)
+		(void) fprintf(stderr, ": %s\n", strerror(saved_errno));
+
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Prints a localized error message, prefixed by the program name, to
+ * stderr, followed by ": ", the ipmp_errmsg() text for `ipmperr', and a
+ * newline; then exits with EXIT_FAILURE.
+ */
+/* PRINTFLIKE2 */
+static void
+die_ipmperr(int ipmperr, const char *format, ...)
+{
+	va_list ap;
+
+	format = gettext(format);
+	(void) fprintf(stderr, "%s: ", progname);
+
+	va_start(ap, format);
+	(void) vfprintf(stderr, format, ap);
+	va_end(ap);
+
+	(void) fprintf(stderr, ": %s\n", ipmp_errmsg(ipmperr));
+
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Output fields for address information. Each entry gives the field's
+ * name (also its column header), its column width, and the sfunc that
+ * produces its value; a NULL name terminates the table. Table order is
+ * the order ofmt_create() uses when no explicit field list is given.
+ */
+static ipmpstat_field_t addr_fields[] = {
+ { "ADDRESS", 26, sfunc_addr_address },
+ { "STATE", 7, sfunc_addr_state },
+ { "GROUP", 12, sfunc_addr_group },
+ { "INBOUND", 12, sfunc_addr_inbound },
+ { "OUTBOUND", 23, sfunc_addr_outbound },
+ { NULL, 0, NULL }
+};
+
+/*
+ * Output fields for group information: { name, column width, sfunc },
+ * terminated by a NULL name. Note that GROUP shows the group's IPMP
+ * interface name (gr_ifname), while GROUPNAME shows gr_name.
+ */
+static ipmpstat_field_t group_fields[] = {
+ { "GROUP", 12, sfunc_group_ifname },
+ { "GROUPNAME", 12, sfunc_group_name },
+ { "STATE", 10, sfunc_group_state },
+ { "FDT", 10, sfunc_group_fdt },
+ { "INTERFACES", 30, sfunc_group_interfaces },
+ { NULL, 0, NULL }
+};
+
+/*
+ * Output fields for interface information: { name, column width, sfunc },
+ * terminated by a NULL name.
+ */
+static ipmpstat_field_t if_fields[] = {
+ { "INTERFACE", 12, sfunc_if_name },
+ { "ACTIVE", 8, sfunc_if_active },
+ { "GROUP", 12, sfunc_if_group },
+ { "FLAGS", 10, sfunc_if_flags },
+ { "LINK", 10, sfunc_if_link },
+ { "PROBE", 10, sfunc_if_probe },
+ { "STATE", 10, sfunc_if_state },
+ { NULL, 0, NULL }
+};
+
+/*
+ * Output fields for probe events: { name, column width, sfunc },
+ * terminated by a NULL name. The sfuncs for these fields receive the
+ * event's nvlist rather than an ipmp_*info_t structure.
+ */
+static ipmpstat_field_t probe_fields[] = {
+ { "TIME", 10, sfunc_probe_time },
+ { "INTERFACE", 12, sfunc_probe_ifname },
+ { "PROBE", 7, sfunc_probe_id },
+ { "NETRTT", 10, sfunc_probe_netrtt },
+ { "RTT", 10, sfunc_probe_rtt },
+ { "RTTAVG", 10, sfunc_probe_rttavg },
+ { "TARGET", 20, sfunc_probe_target },
+ { "RTTDEV", 10, sfunc_probe_rttdev },
+ { NULL, 0, NULL }
+};
+
+/*
+ * Output fields for probe target information: { name, column width,
+ * sfunc }, terminated by a NULL name.
+ */
+static ipmpstat_field_t targ_fields[] = {
+ { "INTERFACE", 12, sfunc_targ_ifname },
+ { "MODE", 10, sfunc_targ_mode },
+ { "TESTADDR", 20, sfunc_targ_testaddr },
+ { "TARGETS", 38, sfunc_targ_targets },
+ { NULL, 0, NULL }
+};
+
+/*
+ * Names for address state values, as { name, value } pairs for enum2str();
+ * a NULL name terminates the table.
+ */
+static ipmpstat_enum_t addr_state[] = {
+ { "up", IPMP_ADDR_UP },
+ { "down", IPMP_ADDR_DOWN },
+ { NULL, 0 }
+};
+
+/*
+ * Names for group state values, as { name, value } pairs for enum2str();
+ * a NULL name terminates the table.
+ */
+static ipmpstat_enum_t group_state[] = {
+ { "ok", IPMP_GROUP_OK },
+ { "failed", IPMP_GROUP_FAILED },
+ { "degraded", IPMP_GROUP_DEGRADED },
+ { NULL, 0 }
+};
+
+/*
+ * Names for interface link state values, as { name, value } pairs for
+ * enum2str(); a NULL name terminates the table.
+ */
+static ipmpstat_enum_t if_link[] = {
+ { "up", IPMP_LINK_UP },
+ { "down", IPMP_LINK_DOWN },
+ { "unknown", IPMP_LINK_UNKNOWN },
+ { NULL, 0 }
+};
+
+/*
+ * Names for interface probe state values, as { name, value } pairs for
+ * enum2str(); a NULL name terminates the table.
+ */
+static ipmpstat_enum_t if_probe[] = {
+ { "ok", IPMP_PROBE_OK },
+ { "failed", IPMP_PROBE_FAILED },
+ { "unknown", IPMP_PROBE_UNKNOWN },
+ { "disabled", IPMP_PROBE_DISABLED },
+ { NULL, 0 }
+};
+
+/*
+ * Names for interface state values, as { name, value } pairs for
+ * enum2str(); a NULL name terminates the table.
+ */
+static ipmpstat_enum_t if_state[] = {
+ { "ok", IPMP_IF_OK },
+ { "failed", IPMP_IF_FAILED },
+ { "unknown", IPMP_IF_UNKNOWN },
+ { "offline", IPMP_IF_OFFLINE },
+ { NULL, 0 }
+};
+
+/*
+ * Names for probe target mode values, as { name, value } pairs for
+ * enum2str(); a NULL name terminates the table.
+ */
+static ipmpstat_enum_t targ_mode[] = {
+ { "disabled", IPMP_TARG_DISABLED },
+ { "routes", IPMP_TARG_ROUTES },
+ { "multicast", IPMP_TARG_MULTICAST },
+ { NULL, 0 }
+};
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.xcl b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.xcl
new file mode 100644
index 0000000000..e2398aaf64
--- /dev/null
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.xcl
@@ -0,0 +1,106 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+msgid " "
+msgid "%-*s"
+msgid "%.2fms"
+msgid "%.2fs"
+msgid "%d-%s"
+msgid "%s"
+msgid "%s "
+msgid "%s: "
+msgid "%u"
+msgid "("
+msgid ")"
+msgid ","
+msgid "--"
+msgid ": %s\n"
+msgid "?"
+msgid "["
+msgid "]"
+msgid "<%d>"
+msgid "\n"
+msgid "ACTIVE"
+msgid "ADDRESS"
+msgid "EC_ipmp"
+msgid "ESC_ipmp_probe_state"
+msgid "FDT"
+msgid "FLAGS"
+msgid "GROUP"
+msgid "GROUPNAME"
+msgid "INBOUND"
+msgid "INTERFACE"
+msgid "INTERFACES"
+msgid "IPMP_IF_NAME"
+msgid "IPMP_PROBE_ACKPROC_TIME"
+msgid "IPMP_PROBE_ACKRECV_TIME"
+msgid "IPMP_PROBE_ID"
+msgid "IPMP_PROBE_SENT_TIME"
+msgid "IPMP_PROBE_START_TIME"
+msgid "IPMP_PROBE_STATE"
+msgid "IPMP_PROBE_TARGET"
+msgid "IPMP_PROBE_TARGET_RTTAVG"
+msgid "IPMP_PROBE_TARGET_RTTDEV"
+msgid "LINK"
+msgid "MODE"
+msgid "NETRTT"
+msgid "OUTBOUND"
+msgid "PROBE"
+msgid "RTT"
+msgid "RTTAVG"
+msgid "RTTDEV"
+msgid "STATE"
+msgid "TARGET"
+msgid "TARGETS"
+msgid "TESTADDR"
+msgid "TIME"
+msgid "agipt"
+msgid "all"
+msgid "bufsize > IPMPSTAT_NUM_FLAGS"
+msgid "com.sun:ipmp:events"
+msgid "degraded"
+msgid "disabled"
+msgid "down"
+msgid "failed"
+msgid "ipmp_event_version"
+msgid "ipmp_if_name"
+msgid "ipmp_probe_ackproc_time"
+msgid "ipmp_probe_ackrecv_time"
+msgid "ipmp_probe_id"
+msgid "ipmp_probe_sent_time"
+msgid "ipmp_probe_start_time"
+msgid "ipmp_probe_state"
+msgid "ipmp_probe_target"
+msgid "ipmp_probe_target_rttavg"
+msgid "ipmp_probe_target_rttdev"
+msgid "ipmpstat.c"
+msgid "multicast"
+msgid "nLPo:agipt"
+msgid "no"
+msgid "offline"
+msgid "ok"
+msgid "routes"
+msgid "sig == SIGWINCH"
+msgid "unknown"
+msgid "up"
+msgid "yes"
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types b/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types
index bb15199492..e42bc626d8 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types
@@ -1,13 +1,12 @@
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -23,15 +22,12 @@
# CDDL HEADER END
#
-#pragma ident "%Z%%M% %I% %E% SMI"
-
fmt_version 1.0
mod_version 1.0
#PERM_CLASS default
filter name string
-filter if_groupname string
filter user user
filter projid int32
filter if_name ifname
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c
index 17891ffc78..2a4ff60d57 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c
@@ -18,7 +18,7 @@
*
* CDDL HEADER END
*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -37,8 +37,6 @@
* contributors.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <stdio.h>
#include <strings.h>
#include <errno.h>
@@ -243,7 +241,7 @@ main(int argc, char *argv[])
ushort_t udp_src_port6; /* used to identify replies */
uint_t flowinfo = 0;
uint_t class = 0;
- char tmp_buf[INET6_ADDRSTRLEN];
+ char abuf[INET6_ADDRSTRLEN];
int c;
int i;
boolean_t has_sys_ip_config;
@@ -671,24 +669,18 @@ main(int argc, char *argv[])
Printf("PING %s: %d data bytes\n", targethost, datalen);
} else {
if (ai_dst->ai_family == AF_INET) {
- Printf("PING %s (%s): %d data bytes\n",
- targethost,
- inet_ntop(AF_INET,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- &((struct sockaddr_in *)
- ai_dst->ai_addr)->sin_addr,
- tmp_buf, sizeof (tmp_buf)),
- datalen);
+ (void) inet_ntop(AF_INET,
+ &((struct sockaddr_in *)(void *)
+ ai_dst->ai_addr)->sin_addr,
+ abuf, sizeof (abuf));
} else {
- Printf("PING %s (%s): %d data bytes\n",
- targethost,
- inet_ntop(AF_INET6,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- &((struct sockaddr_in6 *)
- ai_dst->ai_addr)->sin6_addr,
- tmp_buf, sizeof (tmp_buf)),
- datalen);
+ (void) inet_ntop(AF_INET6,
+ &((struct sockaddr_in6 *)(void *)
+ ai_dst->ai_addr)->sin6_addr,
+ abuf, sizeof (abuf));
}
+ Printf("PING %s (%s): %d data bytes\n",
+ targethost, abuf, datalen);
}
}
@@ -1074,12 +1066,12 @@ select_all_src_addrs(union any_in_addr **src_addr_list, struct addrinfo *ai,
int num_dst = 1;
int i;
- if (probe_all)
- for (aip = ai; aip->ai_next != NULL;
- aip = aip->ai_next, num_dst++);
+ if (probe_all) {
+ for (aip = ai; aip->ai_next != NULL; aip = aip->ai_next)
+ num_dst++;
+ }
- list = (union any_in_addr *)
- calloc((size_t)num_dst, sizeof (union any_in_addr));
+ list = calloc((size_t)num_dst, sizeof (union any_in_addr));
if (list == NULL) {
Fprintf(stderr, "%s: calloc: %s\n", progname, strerror(errno));
exit(EXIT_FAILURE);
@@ -1472,7 +1464,7 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index,
int i;
/* pull out the interface list */
- num_ifs = ifaddrlist(&al, family, errbuf);
+ num_ifs = ifaddrlist(&al, family, LIFC_UNDER_IPMP, errbuf);
if (num_ifs == -1) {
Fprintf(stderr, "%s: %s\n", progname, errbuf);
exit(EXIT_FAILURE);
@@ -1699,8 +1691,8 @@ send_scheduled_probe()
} else {
Printf("no answer from %s(%s)\n", targethost,
inet_ntop(current_targetaddr->family,
- &current_targetaddr->dst_addr,
- tmp_buf, sizeof (tmp_buf)));
+ &current_targetaddr->dst_addr,
+ tmp_buf, sizeof (tmp_buf)));
}
}
/*
@@ -1736,9 +1728,8 @@ send_scheduled_probe()
* Each time we move to a new targetaddr, which has
* a different target IP address, we update this field.
*/
- current_targetaddr->starting_seq_num =
- use_udp ? dest_port :
- (ntransmitted % (MAX_ICMP_SEQ + 1));
+ current_targetaddr->starting_seq_num = use_udp ?
+ dest_port : (ntransmitted % (MAX_ICMP_SEQ + 1));
}
}
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c
index f062247997..e5b23fa126 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -139,7 +139,7 @@ check_device(dlpi_handle_t *dhp, char **devicep)
if (ioctl(s, SIOCGIFFLAGS, (char *)ifr) < 0)
pr_err("ioctl SIOCGIFFLAGS");
if ((ifr->ifr_flags &
- (IFF_VIRTUAL|IFF_LOOPBACK|IFF_UP|
+ (IFF_VIRTUAL|IFF_IPMP|IFF_UP|
IFF_RUNNING)) == (IFF_UP|IFF_RUNNING))
break;
}
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c b/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c
index adc6a932b0..cae75df60d 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -27,8 +27,6 @@
* @(#)$Header: traceroute.c,v 1.49 97/06/13 02:30:23 leres Exp $ (LBL)
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/param.h>
#include <sys/file.h>
#include <sys/ioctl.h>
@@ -707,7 +705,7 @@ get_hostinfo(char *host, int family, struct addrinfo **aipp)
struct addrinfo hints, *ai;
struct in6_addr addr6;
struct in_addr addr;
- char temp_buf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */
+ char abuf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */
int rc;
/*
@@ -720,11 +718,10 @@ get_hostinfo(char *host, int family, struct addrinfo **aipp)
IN6_V4MAPPED_TO_INADDR(&addr6, &addr);
/* convert it back to a string */
- (void) inet_ntop(AF_INET, (void *)&addr, temp_buf,
- sizeof (temp_buf));
+ (void) inet_ntop(AF_INET, &addr, abuf, sizeof (abuf));
/* now the host is an IPv4 address */
- (void) strcpy(host, temp_buf);
+ (void) strcpy(host, abuf);
/*
* If it's a mapped address, we convert it into IPv4
@@ -826,15 +823,19 @@ set_src_addr(struct pr_set *pr, struct ifaddrlist **alp)
struct sockaddr_in6 *sin6_from = (struct sockaddr_in6 *)pr->from;
struct addrinfo *aip;
char errbuf[ERRBUFSIZE];
- char temp_buf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */
+ char abuf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */
int num_ifs; /* all the interfaces */
int num_src_ifs; /* exclude loopback and down */
int i;
+ uint_t ifaddrflags = 0;
source = source_input;
+ if (device != NULL)
+ ifaddrflags |= LIFC_UNDER_IPMP;
+
/* get the interface address list */
- num_ifs = ifaddrlist(&al, pr->family, errbuf);
+ num_ifs = ifaddrlist(&al, pr->family, ifaddrflags, errbuf);
if (num_ifs < 0) {
Fprintf(stderr, "%s: ifaddrlist: %s\n", prog, errbuf);
exit(EXIT_FAILURE);
@@ -881,26 +882,20 @@ set_src_addr(struct pr_set *pr, struct ifaddrlist **alp)
if (pr->family == AF_INET)
ap = (union any_in_addr *)
/* LINTED E_BAD_PTR_CAST_ALIGN */
- &((struct sockaddr_in *)
- aip->ai_addr)->sin_addr;
+ &((struct sockaddr_in *)aip->ai_addr)->sin_addr;
else
ap = (union any_in_addr *)
/* LINTED E_BAD_PTR_CAST_ALIGN */
- &((struct sockaddr_in6 *)
- aip->ai_addr)->sin6_addr;
+ &((struct sockaddr_in6 *)aip->ai_addr)->sin6_addr;
/*
* LBNL bug fixed: used to accept any src address
*/
tmp2_al = find_ifaddr(al, num_ifs, ap, pr->family);
-
if (tmp2_al == NULL) {
- Fprintf(stderr,
- "%s: %s is not a local %s address\n",
- prog, inet_ntop(pr->family, ap,
- temp_buf, sizeof (temp_buf)),
- pr->name);
-
+ (void) inet_ntop(pr->family, ap, abuf, sizeof (abuf));
+ Fprintf(stderr, "%s: %s is not a local %s address\n",
+ prog, abuf, pr->name);
free(al);
freeaddrinfo(aip);
return (0);
@@ -928,13 +923,11 @@ set_src_addr(struct pr_set *pr, struct ifaddrlist **alp)
set_sin(pr->from, ap, pr->family);
if (aip->ai_next != NULL) {
- Fprintf(stderr,
- "%s: Warning: %s has multiple "
- "addresses; using %s\n",
- prog, source,
- inet_ntop(pr->family,
- (const void *)pr->from_sin_addr,
- temp_buf, sizeof (temp_buf)));
+ (void) inet_ntop(pr->family, pr->from_sin_addr,
+ abuf, sizeof (abuf));
+ Fprintf(stderr, "%s: Warning: %s has multiple "
+ "addresses; using %s\n", prog, source,
+ abuf);
}
} else { /* -i and -s used */
/*
@@ -1484,7 +1477,7 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr,
uchar_t code; /* icmp code */
int reply;
int seq = 0;
- char temp_buf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */
+ char abuf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */
int longjmp_return; /* return value from longjump */
struct ip *ip = (struct ip *)packet;
boolean_t got_there = _B_FALSE; /* we hit the destination */
@@ -1535,13 +1528,11 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr,
if (dev_name == NULL)
dev_name = "?";
+ (void) inet_ntop(pr->family, pr->from_sin_addr, abuf,
+ sizeof (abuf));
Fprintf(stderr,
"%s: Warning: Multiple interfaces found;"
- " using %s @ %s\n",
- prog, inet_ntop(pr->family,
- (const void *)pr->from_sin_addr,
- temp_buf, sizeof (temp_buf)),
- dev_name);
+ " using %s @ %s\n", prog, abuf, dev_name);
}
}
@@ -1558,8 +1549,7 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr,
Fprintf(stderr, "%s to %s", prog, hostname);
} else {
Fprintf(stderr, "%s to %s (%s)", prog, hostname,
- inet_ntop(pr->family, (const void *)ip_addr, temp_buf,
- sizeof (temp_buf)));
+ inet_ntop(pr->family, ip_addr, abuf, sizeof (abuf)));
}
if (source)
@@ -1700,9 +1690,8 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr,
}
if (pr->family == AF_INET6) {
- intp =
- (int *)find_ancillary_data(&in_msg,
- IPPROTO_IPV6, IPV6_HOPLIMIT);
+ intp = find_ancillary_data(&in_msg,
+ IPPROTO_IPV6, IPV6_HOPLIMIT);
if (intp == NULL) {
Fprintf(stderr,
"%s: can't find "
@@ -2188,10 +2177,11 @@ static void
usage(void)
{
Fprintf(stderr, "Usage: %s [-adFIlnSvx] [-A address_family] "
-"[-c traffic_class] \n"
-"\t[-f first_hop] [-g gateway [-g gateway ...]| -r] [-i iface]\n"
-"\t[-L flow_label] [-m max_hop] [-P pause_sec] [-p port] [-Q max_timeout]\n"
-"\t[-q nqueries] [-s src_addr] [-t tos] [-w wait_time] host [packetlen]\n",
- prog);
+ "[-c traffic_class]\n"
+ "\t[-f first_hop] [-g gateway [-g gateway ...]| -r] [-i iface]\n"
+ "\t[-L flow_label] [-m max_hop] [-P pause_sec] [-p port] "
+ "[-Q max_timeout]\n"
+ "\t[-q nqueries] [-s src_addr] [-t tos] [-w wait_time] host "
+ "[packetlen]\n", prog);
exit(EXIT_FAILURE);
}
diff --git a/usr/src/cmd/devfsadm/misc_link.c b/usr/src/cmd/devfsadm/misc_link.c
index c72be6be37..44756c3e98 100644
--- a/usr/src/cmd/devfsadm/misc_link.c
+++ b/usr/src/cmd/devfsadm/misc_link.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -104,7 +104,7 @@ static devfsadm_create_t misc_cbt[] = {
"(^ip$)|(^tcp$)|(^udp$)|(^icmp$)|(^sctp$)|"
"(^ip6$)|(^tcp6$)|(^udp6$)|(^icmp6$)|(^sctp6$)|"
"(^rts$)|(^arp$)|(^ipsecah$)|(^ipsecesp$)|(^keysock$)|(^spdsock$)|"
- "(^nca$)|(^rds$)|(^sdp$)|(^ipnet$)",
+ "(^nca$)|(^rds$)|(^sdp$)|(^ipnet$)|(^dlpistub$)",
TYPE_EXACT | DRV_RE, ILEVEL_1, minor_name
},
{ "pseudo", "ddi_pseudo",
diff --git a/usr/src/cmd/mdb/common/modules/ip/ip.c b/usr/src/cmd/mdb/common/modules/ip/ip.c
index f2dadd5261..f064b58d83 100644
--- a/usr/src/cmd/mdb/common/modules/ip/ip.c
+++ b/usr/src/cmd/mdb/common/modules/ip/ip.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stropts.h>
#include <sys/stream.h>
@@ -524,8 +522,7 @@ ire_format(uintptr_t addr, const void *ire_arg, void *ire_cb_arg)
static const mdb_bitmask_t mmasks[] = {
{ "CONDEMNED", IRE_MARK_CONDEMNED, IRE_MARK_CONDEMNED },
- { "NORECV", IRE_MARK_NORECV, IRE_MARK_NORECV },
- { "HIDDEN", IRE_MARK_HIDDEN, IRE_MARK_HIDDEN },
+ { "TESTHIDDEN", IRE_MARK_TESTHIDDEN, IRE_MARK_TESTHIDDEN },
{ "NOADD", IRE_MARK_NOADD, IRE_MARK_NOADD },
{ "TEMPORARY", IRE_MARK_TEMPORARY, IRE_MARK_TEMPORARY },
{ "USESRC", IRE_MARK_USESRC_CHECK, IRE_MARK_USESRC_CHECK },
diff --git a/usr/src/cmd/rcm_daemon/Makefile.com b/usr/src/cmd/rcm_daemon/Makefile.com
index 365371c45c..dbe3c1f1d1 100644
--- a/usr/src/cmd/rcm_daemon/Makefile.com
+++ b/usr/src/cmd/rcm_daemon/Makefile.com
@@ -19,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -124,7 +124,7 @@ SUNW_network_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm
SUNW_vlan_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm
SUNW_vnic_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm
SUNW_aggr_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm
-SUNW_ip_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil -ldladm
+SUNW_ip_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil -ldladm -lipmp
SUNW_ip_anon_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil
LDLIBS += -lgen -lelf -lrcm -lnvpair -ldevinfo -lnsl -lsocket
diff --git a/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c b/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c
index be9a31f952..6e1fe1bf39 100644
--- a/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c
+++ b/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* RCM module to prevent plumbed IP addresses from being removed.
*/
@@ -177,7 +175,7 @@ ip_anon_register(rcm_handle_t *hdl)
if (_cladm(CL_INITIALIZE, CL_GET_BOOTFLAG, &bootflags) != 0) {
rcm_log_message(RCM_ERROR,
- gettext("unable to check cluster status\n"));
+ gettext("unable to check cluster status\n"));
(void) mutex_unlock(&ip_list_lock);
return (RCM_FAILURE);
}
@@ -199,7 +197,7 @@ ip_anon_register(rcm_handle_t *hdl)
else {
if ((exclude_addrs.cladm_netaddrs_array =
malloc(sizeof (cladm_netaddr_entry_t) *
- (num_exclude_addrs))) == NULL) {
+ (num_exclude_addrs))) == NULL) {
rcm_log_message(RCM_ERROR,
gettext("out of memory\n"));
(void) mutex_unlock(&ip_list_lock);
@@ -274,7 +272,7 @@ ip_anon_register(rcm_handle_t *hdl)
rcm_log_message(RCM_DEBUG,
"ip_anon: obtaining list of IPv4 addresses.\n");
- num_ifs = ifaddrlist(&al, AF_INET, errbuf);
+ num_ifs = ifaddrlist(&al, AF_INET, LIFC_UNDER_IPMP, errbuf);
if (num_ifs == -1) {
rcm_log_message(RCM_ERROR,
gettext("cannot get IPv4 address list errno=%d (%s)\n"),
@@ -286,7 +284,7 @@ ip_anon_register(rcm_handle_t *hdl)
rcm_log_message(RCM_DEBUG,
"ip_anon: obtaining list of IPv6 addresses.\n");
- num_ifs6 = ifaddrlist(&al6, AF_INET6, errbuf);
+ num_ifs6 = ifaddrlist(&al6, AF_INET6, LIFC_UNDER_IPMP, errbuf);
if (num_ifs6 == -1) {
rcm_log_message(RCM_ERROR,
gettext("cannot get IPv6 address list errno=%d (%s)\n"),
@@ -392,7 +390,7 @@ ip_anon_register(rcm_handle_t *hdl)
* currently know about it.
*/
if (!(tentry->flags & IP_FLAG_CL) &&
- !(tentry->flags & IP_FLAG_REG)) {
+ !(tentry->flags & IP_FLAG_REG)) {
tentry->flags |= IP_FLAG_REG;
rcm_log_message(RCM_DEBUG,
"ip_anon: registering interest in %s\n",
diff --git a/usr/src/cmd/rcm_daemon/common/ip_rcm.c b/usr/src/cmd/rcm_daemon/common/ip_rcm.c
index f62b3dfc19..24be0cafeb 100644
--- a/usr/src/cmd/rcm_daemon/common/ip_rcm.c
+++ b/usr/src/cmd/rcm_daemon/common/ip_rcm.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -38,23 +38,22 @@
#include <errno.h>
#include <fcntl.h>
#include <sys/types.h>
+#include <sys/wait.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <netinet/in.h>
-#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <stropts.h>
#include <strings.h>
-#include <libdevinfo.h>
-#include <sys/systeminfo.h>
-#include <netdb.h>
+#include <sys/sysmacros.h>
#include <inet/ip.h>
#include <libinetutil.h>
#include <libdllink.h>
+#include <libgen.h>
+#include <ipmp_admin.h>
-#include <ipmp_mpathd.h>
#include "rcm_module.h"
/*
@@ -75,42 +74,19 @@
#define RCM_LINK_RESOURCE_MAX (13 + LINKID_STR_WIDTH)
#define RCM_STR_SUNW_IP "SUNW_ip/" /* IP address export prefix */
-#define RCM_SIZE_SUNW_IP 9 /* strlen("SUNW_ip/") + 1 */
-/* ifconfig(1M) */
-#define USR_SBIN_IFCONFIG "/usr/sbin/ifconfig" /* ifconfig command */
-#define CFGFILE_FMT_IPV4 "/etc/hostname." /* IPV4 config file */
-#define CFGFILE_FMT_IPV6 "/etc/hostname6." /* IPV6 config file */
+#define SBIN_IFCONFIG "/sbin/ifconfig" /* ifconfig command */
+#define SBIN_IFPARSE "/sbin/ifparse" /* ifparse command */
+#define DHCPFILE_FMT "/etc/dhcp.%s" /* DHCP config file */
+#define CFGFILE_FMT_IPV4 "/etc/hostname.%s" /* IPV4 config file */
+#define CFGFILE_FMT_IPV6 "/etc/hostname6.%s" /* IPV6 config file */
#define CFG_CMDS_STD " netmask + broadcast + up" /* Normal config string */
-#define CONFIG_AF_INET 0x1 /* Post-configure IPv4 */
-#define CONFIG_AF_INET6 0x2 /* Post-configure IPv6 */
-#define MAXLINE 1024 /* Max. line length */
-#define MAXARGS 512 /* Max. args in ifconfig cmd */
-
-/* Physical interface flags mask */
-#define RCM_PIF_FLAGS (IFF_OFFLINE | IFF_INACTIVE | IFF_FAILED | \
- IFF_STANDBY)
+#define CFG_DHCP_CMD "dhcp wait 0" /* command to start DHCP */
/* Some useful macros */
-#ifndef MAX
-#define MAX(a, b) (((a) > (b))?(a):(b))
-#endif /* MAX */
-
-#ifndef ISSPACE
#define ISSPACE(c) ((c) == ' ' || (c) == '\t')
-#endif
-
-#ifndef ISEOL
#define ISEOL(c) ((c) == '\n' || (c) == '\r' || (c) == '\0')
-#endif
-
-#ifndef STREQ
#define STREQ(a, b) (*(a) == *(b) && strcmp((a), (b)) == 0)
-#endif
-
-#ifndef ADDSPACE
-#define ADDSPACE(a) ((void) strcat((a), " "))
-#endif
/* Interface Cache state flags */
#define CACHE_IF_STALE 0x1 /* stale cached data */
@@ -125,48 +101,20 @@
/* RCM IPMP Module specific property definitions */
#define RCM_IPMP_MIN_REDUNDANCY 1 /* default min. redundancy */
-/* in.mpathd(1M) specifics */
-#define MPATHD_MAX_RETRIES 5 /* Max. offline retries */
-
/* Stream module operations */
#define MOD_INSERT 0 /* Insert a mid-stream module */
#define MOD_REMOVE 1 /* Remove a mid-stream module */
#define MOD_CHECK 2 /* Check mid-stream module safety */
/*
- * in.mpathd(1M) message passing formats
- */
-typedef struct mpathd_cmd {
- uint32_t cmd_command; /* message command */
- char cmd_ifname[LIFNAMSIZ]; /* this interface name */
- char cmd_movetoif[LIFNAMSIZ]; /* move to interface */
- uint32_t cmd_min_red; /* min. redundancy */
-/* Message passing values for MI_SETOINDEX */
-#define from_lifname cmd_ifname /* current logical interface */
-#define to_pifname cmd_movetoif /* new physical interface */
-#define addr_family cmd_min_red /* address family */
-} mpathd_cmd_t;
-
-/* This is needed since mpathd checks message size for offline */
-typedef struct mpathd_unoffline {
- uint32_t cmd_command; /* offline / undo offline */
- char cmd_ifname[LIFNAMSIZ]; /* this interface name */
-} mpathd_unoffline_t;
-
-typedef struct mpathd_response {
- uint32_t resp_sys_errno; /* system errno */
- uint32_t resp_mpathd_err; /* mpathd error information */
-} mpathd_response_t;
-
-/*
* IP module data types
*/
/* Physical interface representation */
typedef struct ip_pif {
- char pi_ifname[LIFNAMSIZ+1]; /* interface name */
- char pi_grpname[LIFNAMSIZ+1]; /* IPMP group name */
- struct ip_lif *pi_lifs; /* ptr to logical interfaces */
+ char pi_ifname[LIFNAMSIZ]; /* interface name */
+ char pi_grname[LIFGRNAMSIZ]; /* IPMP group name */
+ struct ip_lif *pi_lifs; /* ptr to logical interfaces */
} ip_pif_t;
/* Logical interface representation */
@@ -239,7 +187,7 @@ static void free_node(ip_cache_t *);
static void cache_insert(ip_cache_t *);
static char *ip_usage(ip_cache_t *);
static int update_pif(rcm_handle_t *, int, int, struct lifreq *);
-static int ip_ipmp_offline(ip_cache_t *, ip_cache_t *);
+static int ip_ipmp_offline(ip_cache_t *);
static int ip_ipmp_undo_offline(ip_cache_t *);
static int if_cfginfo(ip_cache_t *, uint_t);
static int if_unplumb(ip_cache_t *);
@@ -247,9 +195,6 @@ static int if_replumb(ip_cache_t *);
static void ip_log_err(ip_cache_t *, char **, char *);
static char *get_link_resource(const char *);
static void clr_cfg_state(ip_pif_t *);
-static uint64_t if_get_flags(ip_pif_t *);
-static int mpathd_send_cmd(mpathd_cmd_t *);
-static int connect_to_mpathd(int);
static int modop(char *, char *, int, char);
static int get_modlist(char *, ip_lif_t *);
static int ip_domux2fd(int *, int *, int *, struct lifreq *);
@@ -262,15 +207,13 @@ static char **ip_get_addrlist(ip_cache_t *);
static void ip_free_addrlist(char **);
static void ip_consumer_notify(rcm_handle_t *, datalink_id_t, char **,
uint_t, rcm_info_t **);
+static boolean_t ip_addrstr(ip_lif_t *, char *, size_t);
static int if_configure(datalink_id_t);
-static int isgrouped(char *);
-static int if_ipmp_config(char *, int, int);
-static int if_mpathd_configure(char *, char *, int, int);
-static char *get_mpathd_dest(char *, int);
-static int if_getcount(int);
-static void tokenize(char *, char **, char *, int *);
-
+static boolean_t isgrouped(const char *);
+static int if_config_inst(const char *, FILE *, int, boolean_t);
+static uint_t ntok(const char *cp);
+static boolean_t ifconfig(const char *, const char *, const char *, boolean_t);
/* Module-Private data */
static struct rcm_mod_ops ip_ops =
@@ -429,9 +372,9 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
{
ip_cache_t *node;
ip_pif_t *pif;
- int detachable = 0;
- int nofailover = 0;
- int ipmp = 0;
+ boolean_t detachable = B_FALSE;
+ boolean_t ipmp;
+ int retval;
rcm_log_message(RCM_TRACE1, "IP: offline(%s)\n", rsrc);
@@ -455,25 +398,17 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
pif = node->ip_pif;
/* Establish default detachability criteria */
- if (flags & RCM_FORCE) {
- detachable++;
- }
+ if (flags & RCM_FORCE)
+ detachable = B_TRUE;
- /* Check if the interface is an IPMP grouped interface */
- if (strcmp(pif->pi_grpname, "")) {
- ipmp++;
- }
-
- if (if_get_flags(pif) & IFF_NOFAILOVER) {
- nofailover++;
- }
+ /* Check if the interface is under IPMP */
+ ipmp = (pif->pi_grname[0] != '\0');
/*
- * Even if the interface is not in an IPMP group, it's possible that
- * it's still okay to offline it as long as there are higher-level
- * failover mechanisms for the addresses it owns (e.g., clustering).
- * In this case, ip_offlinelist() will return RCM_SUCCESS, and we
- * charge on.
+ * Even if the interface is not under IPMP, it's possible that it's
+ * still okay to offline it as long as there are higher-level failover
+ * mechanisms for the addresses it owns (e.g., clustering). In this
+ * case, ip_offlinelist() will return RCM_SUCCESS, and we charge on.
*/
if (!ipmp && !detachable) {
/* Inform consumers of IP addresses being offlined */
@@ -489,17 +424,6 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
}
}
- /*
- * Cannot remove an IPMP interface if IFF_NOFAILOVER is set.
- */
- if (ipmp && nofailover) {
- /* Interface is part of an IPMP group, and cannot failover */
- ip_log_err(node, errorp, "Failover disabled");
- errno = EBUSY;
- (void) mutex_unlock(&cache_lock);
- return (RCM_FAILURE);
- }
-
/* Check if it's a query */
if (flags & RCM_QUERY) {
rcm_log_message(RCM_TRACE1, "IP: offline query success(%s)\n",
@@ -534,38 +458,32 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
}
/*
- * This an IPMP interface that can be failed over.
- * Request in.mpathd(1M) to failover the physical interface.
+ * This is an IPMP interface that can be offlined.
+ * Request in.mpathd(1M) to offline the physical interface.
*/
+ if ((retval = ip_ipmp_offline(node)) != IPMP_SUCCESS)
+ ip_log_err(node, errorp, "in.mpathd offline failed");
- /* Failover to "any", let mpathd determine best failover candidate */
- if (ip_ipmp_offline(node, NULL) < 0) {
- ip_log_err(node, errorp, "in.mpathd failover failed");
+ if (retval == IPMP_EMINRED && !detachable) {
/*
- * Odds are that in.mpathd(1M) could not offline the device
- * because it was the last interface in the group. However,
- * it's possible that it's still okay to offline it as long as
- * there are higher-level failover mechanisms for the
- * addresses it owns (e.g., clustering). In this case,
- * ip_offlinelist() will return RCM_SUCCESS, and we charge on.
- *
- * TODO: change ip_ipmp_offline() to return the actual failure
- * from in.mpathd so that we can verify that it did indeed
- * fail with IPMP_EMINRED.
+ * in.mpathd(1M) could not offline the device because it was
+ * the last interface in the group. However, it's possible
+ * that it's still okay to offline it as long as there are
+ * higher-level failover mechanisms for the addresses it owns
+ * (e.g., clustering). In this case, ip_offlinelist() will
+ * return RCM_SUCCESS, and we charge on.
*/
- if (!detachable) {
- /* Inform consumers of IP addresses being offlined */
- if (ip_offlinelist(hd, node, errorp, flags,
- depend_info) == RCM_SUCCESS) {
- rcm_log_message(RCM_DEBUG,
- "IP: consumers agree on detach");
- } else {
- ip_log_err(node, errorp,
- "Device consumers prohibit offline");
- (void) mutex_unlock(&cache_lock);
- errno = EBUSY;
- return (RCM_FAILURE);
- }
+ /* Inform consumers of IP addresses being offlined */
+ if (ip_offlinelist(hd, node, errorp, flags,
+ depend_info) == RCM_SUCCESS) {
+ rcm_log_message(RCM_DEBUG,
+ "IP: consumers agree on detach");
+ } else {
+ ip_log_err(node, errorp,
+ "Device consumers prohibit offline");
+ (void) mutex_unlock(&cache_lock);
+ errno = EBUSY;
+ return (RCM_FAILURE);
}
}
@@ -574,8 +492,8 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
_("IP: Unplumb failed (%s)\n"),
pif->pi_ifname);
- /* Request mpathd to undo the offline */
- if (ip_ipmp_undo_offline(node) < 0) {
+ /* Request in.mpathd to undo the offline */
+ if (ip_ipmp_undo_offline(node) != IPMP_SUCCESS) {
ip_log_err(node, errorp, "Undo offline failed");
(void) mutex_unlock(&cache_lock);
return (RCM_FAILURE);
@@ -862,18 +780,16 @@ static char *
ip_usage(ip_cache_t *node)
{
ip_lif_t *lif;
- int numifs;
- char *buf;
- char *linkidstr;
+ uint_t numup;
+ char *sep, *buf, *linkidstr;
datalink_id_t linkid;
- const char *fmt;
- char *sep;
+ const char *msg;
char link[MAXLINKNAMELEN];
char addrstr[INET6_ADDRSTRLEN];
char errmsg[DLADM_STRSIZE];
dladm_status_t status;
- int offline = 0;
- size_t bufsz;
+ boolean_t offline, ipmp;
+ size_t bufsz = 0;
rcm_log_message(RCM_TRACE2, "IP: usage(%s)\n", node->ip_resource);
@@ -904,76 +820,53 @@ ip_usage(ip_cache_t *node)
/* TRANSLATION_NOTE: separator used between IP addresses */
sep = _(", ");
- numifs = 0;
- for (lif = node->ip_pif->pi_lifs; lif != NULL; lif = lif->li_next) {
- if (lif->li_ifflags & IFF_UP) {
- numifs++;
- }
- }
+ numup = 0;
+ for (lif = node->ip_pif->pi_lifs; lif != NULL; lif = lif->li_next)
+ if (lif->li_ifflags & IFF_UP)
+ numup++;
- if (node->ip_cachestate & CACHE_IF_OFFLINED) {
- offline++;
- }
+ ipmp = (node->ip_pif->pi_grname[0] != '\0');
+ offline = ((node->ip_cachestate & CACHE_IF_OFFLINED) != 0);
- if (!offline && numifs) {
- fmt = _("%1$s hosts IP addresses: ");
- } else if (offline) {
- fmt = _("%1$s offlined");
+ if (offline) {
+ msg = _("offlined");
+ } else if (numup == 0) {
+ msg = _("plumbed but down");
} else {
- fmt = _("%1$s plumbed but down");
+ if (ipmp) {
+ msg = _("providing connectivity for IPMP group ");
+ bufsz += LIFGRNAMSIZ;
+ } else {
+ msg = _("hosts IP addresses: ");
+ bufsz += (numup * (INET6_ADDRSTRLEN + strlen(sep)));
+ }
}
- /* space for addresses and separators, plus message */
- bufsz = ((numifs * (INET6_ADDRSTRLEN + strlen(sep))) +
- strlen(fmt) + strlen(link) + 1);
+ bufsz += strlen(link) + strlen(msg) + 3; /* ": " separator + NUL */
if ((buf = malloc(bufsz)) == NULL) {
rcm_log_message(RCM_ERROR,
_("IP: usage(%s) malloc failure(%s)\n"),
node->ip_resource, strerror(errno));
return (NULL);
}
- bzero(buf, bufsz);
- (void) sprintf(buf, fmt, link);
-
- if (offline || (numifs == 0)) { /* Nothing else to do */
- rcm_log_message(RCM_TRACE2, "IP: usage (%s) info = %s\n",
- node->ip_resource, buf);
-
- return (buf);
- }
-
- for (lif = node->ip_pif->pi_lifs; lif != NULL; lif = lif->li_next) {
+ (void) snprintf(buf, bufsz, "%s: %s", link, msg);
- void *addr;
- int af;
-
- if (!(lif->li_ifflags & IFF_UP)) {
- /* ignore interfaces not up */
- continue;
- }
- af = lif->li_addr.family;
- if (af == AF_INET6) {
- addr = &lif->li_addr.ip6.sin6_addr;
- } else if (af == AF_INET) {
- addr = &lif->li_addr.ip4.sin_addr;
+ if (!offline && numup > 0) {
+ if (ipmp) {
+ (void) strlcat(buf, node->ip_pif->pi_grname, bufsz);
} else {
- rcm_log_message(RCM_DEBUG,
- "IP: unknown addr family %d, assuming AF_INET\n",
- af);
- af = AF_INET;
- addr = &lif->li_addr.ip4.sin_addr;
- }
- if (inet_ntop(af, addr, addrstr, INET6_ADDRSTRLEN) == NULL) {
- rcm_log_message(RCM_ERROR,
- _("IP: inet_ntop: %s\n"), strerror(errno));
- continue;
- }
- rcm_log_message(RCM_DEBUG, "IP addr := %s\n", addrstr);
+ lif = node->ip_pif->pi_lifs;
+ for (; lif != NULL; lif = lif->li_next) {
+ if (!(lif->li_ifflags & IFF_UP))
+ continue;
+
+ if (!ip_addrstr(lif, addrstr, sizeof (addrstr)))
+ continue;
- (void) strcat(buf, addrstr);
- numifs--;
- if (numifs > 0) {
- (void) strcat(buf, ", ");
+ (void) strlcat(buf, addrstr, bufsz);
+ if (--numup > 0)
+ (void) strlcat(buf, sep, bufsz);
+ }
}
}
@@ -983,6 +876,32 @@ ip_usage(ip_cache_t *node)
return (buf);
}
+/*
+ * Format the address of logical interface `lif' as a string in `addrstr',
+ * a buffer of `addrsize' bytes. An address family other than AF_INET6 or
+ * AF_INET is logged at debug level and then treated as AF_INET. Returns
+ * B_TRUE on success, or B_FALSE (after logging an error) if inet_ntop()
+ * fails.
+ */
+static boolean_t
+ip_addrstr(ip_lif_t *lif, char *addrstr, size_t addrsize)
+{
+ int af = lif->li_addr.family;
+ void *addr;
+
+ if (af == AF_INET6) {
+ addr = &lif->li_addr.ip6.sin6_addr;
+ } else if (af == AF_INET) {
+ addr = &lif->li_addr.ip4.sin_addr;
+ } else {
+ rcm_log_message(RCM_DEBUG,
+ "IP: unknown addr family %d, assuming AF_INET\n", af);
+ af = AF_INET;
+ addr = &lif->li_addr.ip4.sin_addr;
+ }
+ if (inet_ntop(af, addr, addrstr, addrsize) == NULL) {
+ rcm_log_message(RCM_ERROR,
+ _("IP: inet_ntop: %s\n"), strerror(errno));
+ return (B_FALSE);
+ }
+
+ rcm_log_message(RCM_DEBUG, "IP addr := %s\n", addrstr);
+ return (B_TRUE);
+}
+
/*
* Cache management routines, all cache management functions should be
* be called with cache_lock held.
@@ -1121,11 +1040,13 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr)
ifnumber = ifspec.ifsp_lun;
/* Get the interface flags */
- (void) strcpy(lifreq.lifr_name, lifr->lifr_name);
+ (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ);
if (ioctl(sock, SIOCGLIFFLAGS, (char *)&lifreq) < 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: SIOCGLIFFLAGS(%s): %s\n"),
- pif.pi_ifname, strerror(errno));
+ if (errno != ENXIO) {
+ rcm_log_message(RCM_ERROR,
+ _("IP: SIOCGLIFFLAGS(%s): %s\n"),
+ lifreq.lifr_name, strerror(errno));
+ }
return (-1);
}
(void) memcpy(&ifflags, &lifreq.lifr_flags, sizeof (ifflags));
@@ -1135,12 +1056,13 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr)
* - IFF_VIRTUAL: e.g., loopback and vni
* - IFF_POINTOPOINT: e.g., sppp and ip.tun
* - !IFF_MULTICAST: e.g., ip.6to4tun
+ * - IFF_IPMP: IPMP meta-interfaces
*
* Note: The !IFF_MULTICAST check can be removed once iptun is
* implemented as a datalink.
*/
if (!(ifflags & IFF_MULTICAST) ||
- (ifflags & (IFF_POINTOPOINT | IFF_VIRTUAL))) {
+ (ifflags & (IFF_POINTOPOINT | IFF_VIRTUAL | IFF_IPMP))) {
rcm_log_message(RCM_TRACE3, "IP: if ignored (%s)\n",
pif.pi_ifname);
return (0);
@@ -1148,23 +1070,26 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr)
/* Get the interface group name for this interface */
if (ioctl(sock, SIOCGLIFGROUPNAME, (char *)&lifreq) < 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: SIOCGLIFGROUPNAME(%s): %s\n"),
- lifreq.lifr_name, strerror(errno));
+ if (errno != ENXIO) {
+ rcm_log_message(RCM_ERROR,
+ _("IP: SIOCGLIFGROUPNAME(%s): %s\n"),
+ lifreq.lifr_name, strerror(errno));
+ }
return (-1);
}
/* copy the group name */
- (void) memcpy(&pif.pi_grpname, &lifreq.lifr_groupname,
- sizeof (pif.pi_grpname));
- pif.pi_grpname[sizeof (pif.pi_grpname) - 1] = '\0';
+ (void) strlcpy(pif.pi_grname, lifreq.lifr_groupname,
+ sizeof (pif.pi_grname));
/* Get the interface address for this interface */
if (ioctl(sock, SIOCGLIFADDR, (char *)&lifreq) < 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: SIOCGLIFADDR(%s): %s\n"),
- lifreq.lifr_name, strerror(errno));
+ if (errno != ENXIO) { /* ENXIO is not logged but still fails */
+ rcm_log_message(RCM_ERROR,
+ _("IP: SIOCGLIFADDR(%s): %s\n"),
+ lifreq.lifr_name, strerror(errno));
+ }
return (-1);
}
(void) memcpy(&ifaddr, &lifreq.lifr_addr, sizeof (ifaddr));
@@ -1241,9 +1166,9 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr)
sizeof (pif.pi_ifname));
}
- /* save pif properties */
- (void) memcpy(&probepif->pi_grpname, &pif.pi_grpname,
- sizeof (pif.pi_grpname));
+ /* save the group name */
+ (void) strlcpy(probepif->pi_grname, pif.pi_grname,
+ sizeof (pif.pi_grname));
/* add lif, if this is a lif and it is not in cache */
if (!lif_listed) {
@@ -1304,7 +1229,7 @@ update_ipifs(rcm_handle_t *hd, int af)
}
lifn.lifn_family = af;
- lifn.lifn_flags = 0;
+ lifn.lifn_flags = LIFC_UNDER_IPMP;
if (ioctl(sock, SIOCGLIFNUM, (char *)&lifn) < 0) {
rcm_log_message(RCM_ERROR,
_("IP: SIOCLGIFNUM failed: %s\n"),
@@ -1321,7 +1246,7 @@ update_ipifs(rcm_handle_t *hd, int af)
}
lifc.lifc_family = af;
- lifc.lifc_flags = 0;
+ lifc.lifc_flags = LIFC_UNDER_IPMP;
lifc.lifc_len = sizeof (struct lifreq) * lifn.lifn_count;
lifc.lifc_buf = buf;
@@ -1480,39 +1405,33 @@ static void
ip_log_err(ip_cache_t *node, char **errorp, char *errmsg)
{
char *ifname = NULL;
- int len;
+ int size;
const char *errfmt;
- char *error;
+ char *error = NULL;
if ((node != NULL) && (node->ip_pif != NULL) &&
(node->ip_pif->pi_ifname != NULL)) {
ifname = node->ip_pif->pi_ifname;
}
- if (errorp != NULL)
- *errorp = NULL;
-
if (ifname == NULL) {
rcm_log_message(RCM_ERROR, _("IP: %s\n"), errmsg);
errfmt = _("IP: %s");
- len = strlen(errfmt) + strlen(errmsg) + 1;
- if (error = (char *)calloc(1, len)) {
- (void) sprintf(error, errfmt, errmsg);
- }
+ size = strlen(errfmt) + strlen(errmsg) + 1;
+ if (errorp != NULL && (error = malloc(size)) != NULL)
+ (void) snprintf(error, size, errfmt, errmsg);
} else {
rcm_log_message(RCM_ERROR, _("IP: %s(%s)\n"), errmsg, ifname);
errfmt = _("IP: %s(%s)");
- len = strlen(errfmt) + strlen(errmsg) + strlen(ifname) + 1;
- if (error = (char *)calloc(1, len)) {
- (void) sprintf(error, errfmt, errmsg, ifname);
- }
+ size = strlen(errfmt) + strlen(errmsg) + strlen(ifname) + 1;
+ if (errorp != NULL && (error = malloc(size)) != NULL)
+ (void) snprintf(error, size, errfmt, errmsg, ifname);
}
if (errorp != NULL)
*errorp = error;
}
-
/*
* if_cfginfo() - Save off the config info for all interfaces
*/
@@ -1538,7 +1457,7 @@ if_cfginfo(ip_cache_t *node, uint_t force)
rcm_log_message(RCM_ERROR,
_("IP: get modlist error (%s) %s\n"),
pif->pi_ifname, strerror(errno));
- (void) clr_cfg_state(pif);
+ clr_cfg_state(pif);
return (-1);
}
@@ -1551,7 +1470,7 @@ if_cfginfo(ip_cache_t *node, uint_t force)
rcm_log_message(RCM_ERROR,
_("IP: module %s@%d\n"),
lif->li_modules[i], i);
- (void) clr_cfg_state(pif);
+ clr_cfg_state(pif);
return (-1);
}
}
@@ -1595,11 +1514,11 @@ if_cfginfo(ip_cache_t *node, uint_t force)
/* Save reconfiguration information */
if (lif->li_ifflags & IFF_IPV4) {
(void) snprintf(syscmd, sizeof (syscmd),
- "%s %s:%d configinfo\n", USR_SBIN_IFCONFIG,
+ "%s %s:%d configinfo\n", SBIN_IFCONFIG,
pif->pi_ifname, lif->li_ifnum);
} else if (lif->li_ifflags & IFF_IPV6) {
(void) snprintf(syscmd, sizeof (syscmd),
- "%s %s:%d inet6 configinfo\n", USR_SBIN_IFCONFIG,
+ "%s %s:%d inet6 configinfo\n", SBIN_IFCONFIG,
pif->pi_ifname, lif->li_ifnum);
}
rcm_log_message(RCM_TRACE2, "IP: %s\n", syscmd);
@@ -1609,7 +1528,7 @@ if_cfginfo(ip_cache_t *node, uint_t force)
rcm_log_message(RCM_ERROR,
_("IP: ifconfig configinfo error (%s:%d) %s\n"),
pif->pi_ifname, lif->li_ifnum, strerror(errno));
- (void) clr_cfg_state(pif);
+ clr_cfg_state(pif);
return (-1);
}
bzero(buf, MAX_RECONFIG_SIZE);
@@ -1619,20 +1538,18 @@ if_cfginfo(ip_cache_t *node, uint_t force)
_("IP: ifconfig configinfo error (%s:%d) %s\n"),
pif->pi_ifname, lif->li_ifnum, strerror(errno));
(void) pclose(fp);
- (void) clr_cfg_state(pif);
+ clr_cfg_state(pif);
return (-1);
}
(void) pclose(fp);
- lif->li_reconfig = malloc(strlen(buf)+1);
- if (lif->li_reconfig == NULL) {
+ if ((lif->li_reconfig = strdup(buf)) == NULL) {
rcm_log_message(RCM_ERROR,
_("IP: malloc error (%s) %s\n"),
pif->pi_ifname, strerror(errno));
- (void) clr_cfg_state(pif);
+ clr_cfg_state(pif);
return (-1);
}
- (void) strcpy(lif->li_reconfig, buf);
rcm_log_message(RCM_DEBUG,
"IP: if_cfginfo: reconfig string(%s:%d) = %s\n",
pif->pi_ifname, lif->li_ifnum, lif->li_reconfig);
@@ -1654,57 +1571,37 @@ static int
if_unplumb(ip_cache_t *node)
{
ip_lif_t *lif;
- ip_pif_t *pif;
- int ipv4 = 0, ipv6 = 0;
- char syscmd[MAX_RECONFIG_SIZE + LIFNAMSIZ];
+ ip_pif_t *pif = node->ip_pif;
+ boolean_t ipv4 = B_FALSE;
+ boolean_t ipv6 = B_FALSE;
rcm_log_message(RCM_TRACE2, "IP: if_unplumb(%s)\n", node->ip_resource);
- pif = node->ip_pif;
- lif = pif->pi_lifs;
-
- while (lif != NULL) {
+ for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) {
if (lif->li_ifflags & IFF_IPV4) {
- ipv4++;
+ ipv4 = B_TRUE;
} else if (lif->li_ifflags & IFF_IPV6) {
- ipv6++;
+ ipv6 = B_TRUE;
} else {
/* Unlikely case */
rcm_log_message(RCM_DEBUG,
"IP: Unplumb ignored (%s:%d)\n",
pif->pi_ifname, lif->li_ifnum);
- lif = lif->li_next;
- continue;
}
- lif = lif->li_next;
}
- /* Unplumb the physical interface */
- if (ipv4) {
- rcm_log_message(RCM_TRACE2,
- "IP: if_unplumb: ifconfig %s unplumb\n", pif->pi_ifname);
- (void) snprintf(syscmd, sizeof (syscmd), "%s %s unplumb\n",
- USR_SBIN_IFCONFIG, pif->pi_ifname);
- if (rcm_exec_cmd(syscmd) != 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: Cannot unplumb (%s) %s\n"),
- pif->pi_ifname, strerror(errno));
- return (-1);
- }
+ if (ipv4 && !ifconfig(pif->pi_ifname, "inet", "unplumb", B_FALSE)) {
+ rcm_log_message(RCM_ERROR, _("IP: Cannot unplumb (%s) %s\n"),
+ pif->pi_ifname, strerror(errno));
+ return (-1);
}
- if (ipv6) {
- rcm_log_message(RCM_TRACE2,
- "IP: if_unplumb: ifconfig %s inet6 unplumb\n",
- pif->pi_ifname);
- (void) snprintf(syscmd, sizeof (syscmd),
- "%s %s inet6 unplumb\n", USR_SBIN_IFCONFIG, pif->pi_ifname);
- if (rcm_exec_cmd(syscmd) != 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: Cannot unplumb (%s) %s\n"),
- pif->pi_ifname, strerror(errno));
- return (-1);
- }
+
+ if (ipv6 && !ifconfig(pif->pi_ifname, "inet6", "unplumb", B_FALSE)) {
+ rcm_log_message(RCM_ERROR, _("IP: Cannot unplumb (%s) %s\n"),
+ pif->pi_ifname, strerror(errno));
+ return (-1);
}
+
rcm_log_message(RCM_TRACE2, "IP: if_unplumb(%s) success\n",
node->ip_resource);
@@ -1723,8 +1620,11 @@ if_replumb(ip_cache_t *node)
ip_lif_t *lif;
ip_pif_t *pif;
int i;
- char syscmd[LIFNAMSIZ+MAXPATHLEN]; /* must be big enough */
- int max_ipv4 = 0, max_ipv6 = 0;
+ boolean_t success, ipmp;
+ const char *fstr;
+ char lifname[LIFNAMSIZ];
+ char buf[MAX_RECONFIG_SIZE];
+ int max_lifnum = 0;
rcm_log_message(RCM_TRACE2, "IP: if_replumb(%s)\n", node->ip_resource);
@@ -1738,100 +1638,103 @@ if_replumb(ip_cache_t *node)
*/
pif = node->ip_pif;
- lif = pif->pi_lifs;
+ ipmp = (node->ip_pif->pi_grname[0] != '\0');
/*
* Make a first pass to plumb in physical interfaces and get a count
* of the max logical interfaces
*/
- while (lif != NULL) {
+ for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) {
+ max_lifnum = MAX(lif->li_ifnum, max_lifnum);
if (lif->li_ifflags & IFF_IPV4) {
- if (lif->li_ifnum > max_ipv4) {
- max_ipv4 = lif->li_ifnum;
- }
+ fstr = "inet";
} else if (lif->li_ifflags & IFF_IPV6) {
- if (lif->li_ifnum > max_ipv6) {
- max_ipv6 = lif->li_ifnum;
- }
+ fstr = "inet6";
} else {
/* Unlikely case */
rcm_log_message(RCM_DEBUG,
"IP: Re-plumb ignored (%s:%d)\n",
pif->pi_ifname, lif->li_ifnum);
- lif = lif->li_next;
continue;
}
- if (lif->li_ifnum == 0) { /* physical interface instance */
- if ((lif->li_ifflags & IFF_NOFAILOVER) ||
- (strcmp(pif->pi_grpname, "") == 0)) {
- (void) snprintf(syscmd, sizeof (syscmd),
- "%s %s\n", USR_SBIN_IFCONFIG,
- lif->li_reconfig);
- } else if (lif->li_ifflags & IFF_IPV4) {
- (void) snprintf(syscmd, sizeof (syscmd),
- "%s %s inet plumb group %s\n",
- USR_SBIN_IFCONFIG,
- pif->pi_ifname, pif->pi_grpname);
- } else if (lif->li_ifflags & IFF_IPV6) {
- (void) snprintf(syscmd, sizeof (syscmd),
- "%s %s inet6 plumb group %s\n",
- USR_SBIN_IFCONFIG,
- pif->pi_ifname, pif->pi_grpname);
- }
+ /* ignore logical interface instances */
+ if (lif->li_ifnum != 0)
+ continue;
+
+ if ((lif->li_ifflags & IFF_NOFAILOVER) || !ipmp) {
+ success = ifconfig("", "", lif->li_reconfig, B_FALSE);
+ } else {
+ (void) snprintf(buf, sizeof (buf), "plumb group %s",
+ pif->pi_grname);
+ success = ifconfig(pif->pi_ifname, fstr, buf, B_FALSE);
+ }
+
+ if (!success) {
+ rcm_log_message(RCM_ERROR,
+ _("IP: Cannot plumb (%s) %s\n"), pif->pi_ifname,
+ strerror(errno));
+ return (-1);
+ }
+
+ /*
+ * Restart DHCP if necessary.
+ */
+ if ((lif->li_ifflags & IFF_DHCPRUNNING) &&
+ !ifconfig(pif->pi_ifname, fstr, CFG_DHCP_CMD, B_FALSE)) {
+ rcm_log_message(RCM_ERROR, _("IP: Cannot start DHCP "
+ "(%s) %s\n"), pif->pi_ifname, strerror(errno));
+ return (-1);
+ }
+ rcm_log_message(RCM_TRACE2,
+ "IP: if_replumb: Modcnt = %d\n", lif->li_modcnt);
+ /* modinsert modules in order, ignore driver(last) */
+ for (i = 0; i < (lif->li_modcnt - 1); i++) {
rcm_log_message(RCM_TRACE2,
- "IP: if_replumb: %s\n", syscmd);
- if (rcm_exec_cmd(syscmd) != 0) {
+ "IP: modinsert: Pos = %d Mod = %s\n",
+ i, lif->li_modules[i]);
+ if (modop(pif->pi_ifname, lif->li_modules[i], i,
+ MOD_INSERT) == -1) {
rcm_log_message(RCM_ERROR,
- _("IP: Cannot plumb (%s) %s\n"),
- pif->pi_ifname, strerror(errno));
+ _("IP: modinsert error(%s)\n"),
+ pif->pi_ifname);
return (-1);
}
-
- rcm_log_message(RCM_TRACE2,
- "IP: if_replumb: Modcnt = %d\n", lif->li_modcnt);
- /* modinsert modules in order, ignore driver(last) */
- for (i = 0; i < (lif->li_modcnt - 1); i++) {
- rcm_log_message(RCM_TRACE2,
- "IP: modinsert: Pos = %d Mod = %s\n",
- i, lif->li_modules[i]);
- if (modop(pif->pi_ifname, lif->li_modules[i], i,
- MOD_INSERT) == -1) {
- rcm_log_message(RCM_ERROR,
- _("IP: modinsert error(%s)\n"),
- pif->pi_ifname);
- return (-1);
- }
- }
}
-
- lif = lif->li_next;
}
/* Now, add all the logical interfaces in the correct order */
- for (i = 1; i <= MAX(max_ipv6, max_ipv4); i++) {
+ for (i = 1; i <= max_lifnum; i++) {
+ (void) snprintf(lifname, LIFNAMSIZ, "%s:%d", pif->pi_ifname, i);
+
/* reset lif through every iteration */
- lif = pif->pi_lifs;
- while (lif != NULL) {
- if (((lif->li_ifflags & IFF_NOFAILOVER) ||
- (strcmp(pif->pi_grpname, "") == 0)) &&
- (lif->li_ifnum == i)) {
- /* Plumb in the logical interface */
- (void) snprintf(syscmd, sizeof (syscmd),
- "%s %s\n", USR_SBIN_IFCONFIG,
- lif->li_reconfig);
- rcm_log_message(RCM_TRACE2,
- "IP: if_replumb: %s\n", syscmd);
- if (rcm_exec_cmd(syscmd) != 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: Cannot addif (%s:%d) "
- "%s\n"),
- pif->pi_ifname, i, strerror(errno));
- return (-1);
- }
+ for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) {
+ /*
+ * Process entries in order. If the interface is
+ * using IPMP, only process test addresses.
+ */
+ if (lif->li_ifnum != i ||
+ (ipmp && !(lif->li_ifflags & IFF_NOFAILOVER)))
+ continue;
+
+ if (!ifconfig("", "", lif->li_reconfig, B_FALSE)) {
+ rcm_log_message(RCM_ERROR,
+ _("IP: Cannot addif (%s) %s\n"), lifname,
+ strerror(errno));
+ return (-1);
+ }
+
+ /*
+ * Restart DHCP if necessary.
+ */
+ if ((lif->li_ifflags & IFF_DHCPRUNNING) &&
+ !ifconfig(lifname, fstr, CFG_DHCP_CMD, B_FALSE)) {
+ rcm_log_message(RCM_ERROR,
+ _("IP: Cannot start DHCP (%s) %s\n"),
+ lifname, strerror(errno));
+ return (-1);
}
- lif = lif->li_next;
}
}
@@ -1865,71 +1768,64 @@ clr_cfg_state(ip_pif_t *pif)
}
/*
- * ip_ipmp_offline() - Failover from if_from to if_to using a
- * minimum redudancy of min_red. This uses IPMPs
- * "offline" mechanism to achieve the failover.
+ * Attempt to offline ip_cache_t `node'; returns an IPMP error code.
*/
static int
-ip_ipmp_offline(ip_cache_t *if_from, ip_cache_t *if_to)
+ip_ipmp_offline(ip_cache_t *node)
{
- mpathd_cmd_t mpdcmd;
-
- if ((if_from == NULL) || (if_from->ip_pif == NULL) ||
- (if_from->ip_pif->pi_ifname == NULL)) {
- return (-1);
- }
+ int retval;
+ ipmp_handle_t handle;
rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_offline\n");
- mpdcmd.cmd_command = MI_OFFLINE;
- (void) strcpy(mpdcmd.cmd_ifname, if_from->ip_pif->pi_ifname);
-
- if ((if_to != NULL) && (if_to->ip_pif != NULL) &&
- (if_to->ip_pif->pi_ifname != NULL)) {
- rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_offline (%s)->(%s)\n",
- if_from->ip_pif->pi_ifname, if_to->ip_pif->pi_ifname);
- (void) strncpy(mpdcmd.cmd_movetoif, if_to->ip_pif->pi_ifname,
- sizeof (mpdcmd.cmd_movetoif));
- mpdcmd.cmd_movetoif[sizeof (mpdcmd.cmd_movetoif) - 1] = '\0';
- } else {
- rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_offline (%s)->(any)\n",
- if_from->ip_pif->pi_ifname);
- (void) strcpy(mpdcmd.cmd_movetoif, ""); /* signifies any */
+ if ((retval = ipmp_open(&handle)) != IPMP_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("IP: cannot create ipmp handle: %s\n"),
+ ipmp_errmsg(retval));
+ return (retval);
}
- mpdcmd.cmd_min_red = if_from->ip_ifred;
- if (mpathd_send_cmd(&mpdcmd) < 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: mpathd offline error: %s\n"),
- strerror(errno));
- return (-1);
+ retval = ipmp_offline(handle, node->ip_pif->pi_ifname, node->ip_ifred);
+ if (retval != IPMP_SUCCESS) {
+ rcm_log_message(RCM_ERROR, _("IP: ipmp_offline error: %s\n"),
+ ipmp_errmsg(retval));
+ } else {
+ rcm_log_message(RCM_TRACE1, "IP: ipmp_offline success\n");
}
- rcm_log_message(RCM_TRACE1, "IP: ipmp offline success\n");
- return (0);
+ ipmp_close(handle);
+ return (retval);
}
/*
- * ip_ipmp_undo_offline() - Undo prior offline of the interface.
- * This uses IPMPs "undo offline" feature.
+ * Attempt to undo the offline ip_cache_t `node'; returns an IPMP error code.
*/
static int
ip_ipmp_undo_offline(ip_cache_t *node)
{
- mpathd_cmd_t mpdcmd;
+ int retval;
+ ipmp_handle_t handle;
- mpdcmd.cmd_command = MI_UNDO_OFFLINE;
- (void) strcpy(mpdcmd.cmd_ifname, node->ip_pif->pi_ifname);
+ rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_undo_offline\n");
- if (mpathd_send_cmd(&mpdcmd) < 0) {
+ if ((retval = ipmp_open(&handle)) != IPMP_SUCCESS) {
rcm_log_message(RCM_ERROR,
- _("IP: mpathd error: %s\n"),
- strerror(errno));
- return (-1);
+ _("IP: cannot create ipmp handle: %s\n"),
+ ipmp_errmsg(retval));
+ return (retval);
}
- rcm_log_message(RCM_TRACE1, "IP: ipmp undo offline success\n");
- return (0);
+ retval = ipmp_undo_offline(handle, node->ip_pif->pi_ifname);
+ if (retval != IPMP_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("IP: ipmp_undo_offline error: %s\n"),
+ ipmp_errmsg(retval));
+ } else {
+ rcm_log_message(RCM_TRACE1, "IP: ipmp_undo_offline success\n");
+ }
+
+ ipmp_close(handle);
+ return (retval);
}
/*
@@ -1946,10 +1842,9 @@ get_link_resource(const char *link)
char *resource;
dladm_status_t status;
- if ((status = dladm_name2info(dld_handle, link, &linkid, &flags, NULL,
- NULL)) != DLADM_STATUS_OK) {
+ status = dladm_name2info(dld_handle, link, &linkid, &flags, NULL, NULL);
+ if (status != DLADM_STATUS_OK)
goto fail;
- }
if (!(flags & DLADM_OPT_ACTIVE)) {
status = DLADM_STATUS_FAILED;
@@ -1976,243 +1871,6 @@ fail:
}
/*
- * if_get_flags() - Return the cached physical interface flags
- * Call with cache_lock held
- */
-static uint64_t
-if_get_flags(ip_pif_t *pif)
-{
- ip_lif_t *lif;
-
- for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) {
- if (lif->li_ifnum == 0) {
- return (lif->li_ifflags & RCM_PIF_FLAGS);
- }
- }
- return (0);
-}
-
-/*
- * mpathd_send_cmd() - Sends the command to in.mpathd.
- */
-static int
-mpathd_send_cmd(mpathd_cmd_t *mpd)
-{
- mpathd_unoffline_t mpc;
- struct mpathd_response mpr;
- int i;
- int s;
-
- rcm_log_message(RCM_TRACE1, "IP: mpathd_send_cmd \n");
-
- for (i = 0; i < MPATHD_MAX_RETRIES; i++) {
- s = connect_to_mpathd(AF_INET);
- if (s == -1) {
- s = connect_to_mpathd(AF_INET6);
- if (s == -1) {
- rcm_log_message(RCM_ERROR,
- _("IP: Cannot talk to mpathd\n"));
- return (-1);
- }
- }
- switch (mpd->cmd_command) {
- case MI_OFFLINE :
- rcm_log_message(RCM_TRACE1, "IP: MI_OFFLINE: "
- "(%s)->(%s) redundancy = %d\n", mpd->cmd_ifname,
- mpd->cmd_movetoif, mpd->cmd_min_red);
-
- if (write(s, mpd, sizeof (mpathd_cmd_t)) !=
- sizeof (mpathd_cmd_t)) {
- rcm_log_message(RCM_ERROR,
- _("IP: mpathd write: %s\n"),
- strerror(errno));
- (void) close(s);
- return (-1);
- }
- break;
-
- case MI_SETOINDEX :
- rcm_log_message(RCM_TRACE1, "IP: MI_SETOINDEX: "
- "(%s)->(%s) family = %d\n", mpd->from_lifname,
- mpd->to_pifname, mpd->addr_family);
-
- if (write(s, mpd, sizeof (mpathd_cmd_t)) !=
- sizeof (mpathd_cmd_t)) {
- rcm_log_message(RCM_ERROR,
- _("IP: mpathd write: %s\n"),
- strerror(errno));
- (void) close(s);
- return (-1);
- }
- break;
-
- case MI_UNDO_OFFLINE:
- /* mpathd checks for exact size of the message */
- mpc.cmd_command = mpd->cmd_command;
- (void) strcpy(mpc.cmd_ifname, mpd->cmd_ifname);
-
- rcm_log_message(RCM_TRACE1, "IP: MI_UNDO_OFFLINE: "
- "(%s)\n", mpd->cmd_ifname);
-
- if (write(s, &mpc, sizeof (mpathd_unoffline_t)) !=
- sizeof (mpathd_unoffline_t)) {
- rcm_log_message(RCM_ERROR,
- _("IP: mpathd write: %s\n"),
- strerror(errno));
- (void) close(s);
- return (-1);
- }
- break;
- default :
- rcm_log_message(RCM_ERROR,
- _("IP: unsupported mpathd command\n"));
- (void) close(s);
- return (-1);
- }
-
- bzero(&mpr, sizeof (struct mpathd_response));
- /* Read the result from mpathd */
- if (read(s, &mpr, sizeof (struct mpathd_response)) !=
- sizeof (struct mpathd_response)) {
- rcm_log_message(RCM_ERROR,
- _("IP: mpathd read : %s\n"), strerror(errno));
- (void) close(s);
- return (-1);
- }
-
- (void) close(s);
- if (mpr.resp_mpathd_err == 0) {
- rcm_log_message(RCM_TRACE1,
- "IP: mpathd_send_cmd success\n");
- return (0); /* Successful */
- }
-
- if (mpr.resp_mpathd_err == MPATHD_SYS_ERROR) {
- if (mpr.resp_sys_errno == EAGAIN) {
- (void) sleep(1);
- rcm_log_message(RCM_DEBUG,
- "IP: mpathd retrying\n");
- continue; /* Retry */
- }
- errno = mpr.resp_sys_errno;
- rcm_log_message(RCM_WARNING,
- _("IP: mpathd_send_cmd error: %s\n"),
- strerror(errno));
- } else if (mpr.resp_mpathd_err == MPATHD_MIN_RED_ERROR) {
- errno = EIO;
- rcm_log_message(RCM_ERROR, _("IP: in.mpathd(1M): "
- "Minimum redundancy not met\n"));
- } else {
- rcm_log_message(RCM_ERROR,
- _("IP: mpathd_send_cmd error\n"));
- }
- /* retry */
- }
-
- rcm_log_message(RCM_ERROR,
- _("IP: mpathd_send_cmd failed %d retries\n"), MPATHD_MAX_RETRIES);
- return (-1);
-}
-
-/*
- * Returns -1 on failure. Returns the socket file descriptor on
- * success.
- */
-static int
-connect_to_mpathd(int family)
-{
- int s;
- struct sockaddr_storage ss;
- struct sockaddr_in *sin = (struct sockaddr_in *)&ss;
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss;
- struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
- int addrlen;
- int ret;
- int on;
-
- rcm_log_message(RCM_TRACE1, "IP: connect_to_mpathd\n");
-
- s = socket(family, SOCK_STREAM, 0);
- if (s < 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: mpathd socket: %s\n"), strerror(errno));
- return (-1);
- }
- bzero((char *)&ss, sizeof (ss));
- ss.ss_family = family;
- /*
- * Need to bind to a privelged port. For non-root, this
- * will fail. in.mpathd verifies that only commands coming
- * from priveleged ports succeed so that the ordinary user
- * can't issue offline commands.
- */
- on = 1;
- if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on,
- sizeof (on)) < 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: mpathd setsockopt: TCP_ANONPRIVBIND: %s\n"),
- strerror(errno));
- return (-1);
- }
- switch (family) {
- case AF_INET:
- sin->sin_port = 0;
- sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
- addrlen = sizeof (struct sockaddr_in);
- break;
- case AF_INET6:
- sin6->sin6_port = 0;
- sin6->sin6_addr = loopback_addr;
- addrlen = sizeof (struct sockaddr_in6);
- break;
- }
- ret = bind(s, (struct sockaddr *)&ss, addrlen);
- if (ret != 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: mpathd bind: %s\n"), strerror(errno));
- return (-1);
- }
- switch (family) {
- case AF_INET:
- sin->sin_port = htons(MPATHD_PORT);
- break;
- case AF_INET6:
- sin6->sin6_port = htons(MPATHD_PORT);
- break;
- }
- ret = connect(s, (struct sockaddr *)&ss, addrlen);
- if (ret != 0) {
- if (errno == ECONNREFUSED) {
- /* in.mpathd is not running, start it */
- if (rcm_exec_cmd(MPATHD_PATH) == -1) {
- rcm_log_message(RCM_ERROR,
- _("IP: mpathd exec: %s\n"),
- strerror(errno));
- return (-1);
- }
- ret = connect(s, (struct sockaddr *)&ss, addrlen);
- }
- if (ret != 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: mpathd connect: %s\n"), strerror(errno));
- return (-1);
- }
- }
- on = 0;
- if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on,
- sizeof (on)) < 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: mpathd setsockopt TCP_ANONPRIVBIND: %s\n"),
- strerror(errno));
- return (-1);
- }
-
- rcm_log_message(RCM_TRACE1, "IP: connect_to_mpathd success\n");
-
- return (s);
-}
-
-/*
* modop() - Remove/insert a module
*/
static int
@@ -2239,12 +1897,10 @@ modop(char *name, char *arg, int pos, char op)
if (op == MOD_REMOVE) {
(void) snprintf(syscmd, sizeof (syscmd),
- "%s %s modremove %s@%d\n", USR_SBIN_IFCONFIG, name, arg,
- pos);
+ "%s %s modremove %s@%d\n", SBIN_IFCONFIG, name, arg, pos);
} else if (op == MOD_INSERT) {
(void) snprintf(syscmd, sizeof (syscmd),
- "%s %s modinsert %s@%d\n", USR_SBIN_IFCONFIG, name, arg,
- pos);
+ "%s %s modinsert %s@%d\n", SBIN_IFCONFIG, name, arg, pos);
} else {
rcm_log_message(RCM_ERROR,
_("IP: modop(%s): unknown operation\n"), name);
@@ -2277,11 +1933,11 @@ get_modlist(char *name, ip_lif_t *lif)
int i;
int num_mods;
struct lifreq lifr;
- struct str_list strlist;
+ struct str_list strlist = { 0 };
rcm_log_message(RCM_TRACE1, "IP: getmodlist(%s)\n", name);
- (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
+ (void) strlcpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
lifr.lifr_flags = lif->li_ifflags;
if (ip_domux2fd(&mux_fd, &muxid_fd, &fd, &lifr) < 0) {
rcm_log_message(RCM_ERROR, _("IP: ip_domux2fd(%s)\n"), name);
@@ -2292,39 +1948,34 @@ get_modlist(char *name, ip_lif_t *lif)
rcm_log_message(RCM_ERROR,
_("IP: get_modlist(%s): I_LIST(%s) \n"),
name, strerror(errno));
- (void) ip_plink(mux_fd, muxid_fd, fd, &lifr);
- return (-1);
+ goto fail;
}
strlist.sl_nmods = num_mods;
strlist.sl_modlist = malloc(sizeof (struct str_mlist) * num_mods);
-
if (strlist.sl_modlist == NULL) {
rcm_log_message(RCM_ERROR, _("IP: get_modlist(%s): %s\n"),
name, strerror(errno));
- (void) ip_plink(mux_fd, muxid_fd, fd, &lifr);
- return (-1);
+ goto fail;
}
if (ioctl(fd, I_LIST, (caddr_t)&strlist) < 0) {
rcm_log_message(RCM_ERROR,
_("IP: get_modlist(%s): I_LIST error: %s\n"),
name, strerror(errno));
- (void) ip_plink(mux_fd, muxid_fd, fd, &lifr);
- return (-1);
+ goto fail;
}
for (i = 0; i < strlist.sl_nmods; i++) {
- lif->li_modules[i] =
- malloc(strlen(strlist.sl_modlist[i].l_name)+1);
+ lif->li_modules[i] = strdup(strlist.sl_modlist[i].l_name);
if (lif->li_modules[i] == NULL) {
rcm_log_message(RCM_ERROR,
_("IP: get_modlist(%s): %s\n"),
name, strerror(errno));
- (void) ip_plink(mux_fd, muxid_fd, fd, &lifr);
- return (-1);
+ while (i > 0)
+ free(lif->li_modules[--i]);
+ goto fail;
}
- (void) strcpy(lif->li_modules[i], strlist.sl_modlist[i].l_name);
}
lif->li_modcnt = strlist.sl_nmods;
@@ -2332,6 +1983,10 @@ get_modlist(char *name, ip_lif_t *lif)
rcm_log_message(RCM_TRACE1, "IP: getmodlist(%s) success\n", name);
return (ip_plink(mux_fd, muxid_fd, fd, &lifr));
+fail:
+ free(strlist.sl_modlist);
+ (void) ip_plink(mux_fd, muxid_fd, fd, &lifr);
+ return (-1);
}
/*
@@ -2436,6 +2091,7 @@ ip_plink(int mux_fd, int muxid_fd, int fd, struct lifreq *lifr)
*
* Notify online to IP address consumers.
*/
+/*ARGSUSED*/
static int
ip_onlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags,
rcm_info_t **depend_info)
@@ -2464,6 +2120,7 @@ ip_onlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags,
*
* Offline IP address consumers.
*/
+/*ARGSUSED*/
static int
ip_offlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags,
rcm_info_t **depend_info)
@@ -2494,9 +2151,9 @@ ip_offlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags,
}
/*
- * ip_get_addrlist() - Compile list of IP addresses hosted on this NIC (node)
- * This routine malloc() required memeory for the list
- * Returns list on success, NULL if failed
+ * ip_get_addrlist() - Get the list of IP addresses on this interface (node);
+ * This routine malloc()s required memory for the list.
+ * Returns the list on success, NULL on failure.
* Call with cache_lock held.
*/
static char **
@@ -2504,11 +2161,9 @@ ip_get_addrlist(ip_cache_t *node)
{
ip_lif_t *lif;
char **addrlist = NULL;
- int numifs;
+ int i, numifs;
+ size_t addrlistsize;
char addrstr[INET6_ADDRSTRLEN];
- void *addr;
- int af;
- int i;
rcm_log_message(RCM_TRACE2, "IP: ip_get_addrlist(%s)\n",
node->ip_resource);
@@ -2532,35 +2187,21 @@ ip_get_addrlist(ip_cache_t *node)
for (lif = node->ip_pif->pi_lifs, i = 0; lif != NULL;
lif = lif->li_next, i++) {
- af = lif->li_addr.family;
- if (af == AF_INET6) {
- addr = &lif->li_addr.ip6.sin6_addr;
- } else if (af == AF_INET) {
- addr = &lif->li_addr.ip4.sin_addr;
- } else {
- rcm_log_message(RCM_DEBUG,
- "IP: unknown addr family %d, assuming AF_INET\n",
- af);
- af = AF_INET;
- addr = &lif->li_addr.ip4.sin_addr;
- }
- if (inet_ntop(af, addr, addrstr, INET6_ADDRSTRLEN) == NULL) {
- rcm_log_message(RCM_ERROR,
- _("IP: inet_ntop: %s\n"), strerror(errno));
+ if (!ip_addrstr(lif, addrstr, sizeof (addrstr))) {
ip_free_addrlist(addrlist);
return (NULL);
}
- if ((addrlist[i] = malloc(strlen(addrstr) + RCM_SIZE_SUNW_IP))
- == NULL) {
+ addrlistsize = strlen(addrstr) + sizeof (RCM_STR_SUNW_IP);
+ if ((addrlist[i] = malloc(addrlistsize)) == NULL) {
rcm_log_message(RCM_ERROR,
_("IP: ip_get_addrlist(%s) malloc failure(%s)\n"),
node->ip_resource, strerror(errno));
ip_free_addrlist(addrlist);
return (NULL);
}
- (void) strcpy(addrlist[i], RCM_STR_SUNW_IP); /* SUNW_ip/ */
- (void) strcat(addrlist[i], addrstr); /* SUNW_ip/<address> */
+ (void) snprintf(addrlist[i], addrlistsize, "%s%s",
+ RCM_STR_SUNW_IP, addrstr);
rcm_log_message(RCM_DEBUG, "Anon Address: %s\n", addrlist[i]);
}
@@ -2611,16 +2252,13 @@ ip_consumer_notify(rcm_handle_t *hd, datalink_id_t linkid, char **errorp,
return;
}
/*
- * Inform anonymous consumers about IP addresses being
- * onlined
+ * Inform anonymous consumers about IP addresses being onlined.
*/
(void) ip_onlinelist(hd, node, errorp, flags, depend_info);
(void) mutex_unlock(&cache_lock);
rcm_log_message(RCM_TRACE2, "IP: ip_consumer_notify success\n");
- return;
-
}
/*
@@ -2632,20 +2270,18 @@ if_configure(datalink_id_t linkid)
char ifinst[MAXLINKNAMELEN];
char cfgfile[MAXPATHLEN];
char cached_name[RCM_LINK_RESOURCE_MAX];
- struct stat statbuf;
+ FILE *hostfp, *host6fp;
ip_cache_t *node;
- int af = 0;
- int ipmp = 0;
+ boolean_t ipmp = B_FALSE;
assert(linkid != DATALINK_INVALID_LINKID);
-
rcm_log_message(RCM_TRACE1, _("IP: if_configure(%u)\n"), linkid);
/* Check for the interface in the cache */
(void) snprintf(cached_name, sizeof (cached_name), "%s/%u",
RCM_LINK_PREFIX, linkid);
- /* Check if the interface is new or was previously offlined */
+ /* Check if the interface is new or was not previously offlined */
(void) mutex_lock(&cache_lock);
if (((node = cache_lookup(NULL, cached_name, CACHE_REFRESH)) != NULL) &&
(!(node->ip_cachestate & CACHE_IF_OFFLINED))) {
@@ -2663,76 +2299,69 @@ if_configure(datalink_id_t linkid)
return (-1);
}
- /* Scan IPv4 configuration first */
- (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV4, ifinst);
- cfgfile[MAXPATHLEN - 1] = '\0';
-
+ /*
+ * Scan the IPv4 and IPv6 hostname files to see if (a) they exist
+ * and (b) if either one places the interface into an IPMP group.
+ */
+ (void) snprintf(cfgfile, MAXPATHLEN, CFGFILE_FMT_IPV4, ifinst);
rcm_log_message(RCM_TRACE1, "IP: Scanning %s\n", cfgfile);
- if (stat(cfgfile, &statbuf) == 0) {
- af |= CONFIG_AF_INET;
- if (isgrouped(cfgfile)) {
- ipmp++;
- }
+ if ((hostfp = fopen(cfgfile, "r")) != NULL) {
+ if (isgrouped(cfgfile))
+ ipmp = B_TRUE;
}
- /* Scan IPv6 configuration details */
- (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV6, ifinst);
- cfgfile[MAXPATHLEN - 1] = '\0';
+ (void) snprintf(cfgfile, MAXPATHLEN, CFGFILE_FMT_IPV6, ifinst);
rcm_log_message(RCM_TRACE1, "IP: Scanning %s\n", cfgfile);
- if (stat(cfgfile, &statbuf) == 0) {
- af |= CONFIG_AF_INET6;
- if ((ipmp == 0) && isgrouped(cfgfile)) {
- ipmp++;
- }
+ if ((host6fp = fopen(cfgfile, "r")) != NULL) {
+ if (!ipmp && isgrouped(cfgfile))
+ ipmp = B_TRUE;
}
- if (af & CONFIG_AF_INET) {
- if (if_ipmp_config(ifinst, CONFIG_AF_INET, ipmp) == -1) {
- rcm_log_message(RCM_ERROR,
- _("IP: IPv4 Post-attach failed (%s)\n"), ifinst);
- return (-1);
- }
+ /*
+ * Configure the interface according to its hostname files.
+ */
+ if (hostfp != NULL &&
+ if_config_inst(ifinst, hostfp, AF_INET, ipmp) == -1) {
+ rcm_log_message(RCM_ERROR,
+ _("IP: IPv4 Post-attach failed (%s)\n"), ifinst);
+ goto fail;
}
- if (af & CONFIG_AF_INET6) {
- if (if_ipmp_config(ifinst, CONFIG_AF_INET6, ipmp) == -1) {
- rcm_log_message(RCM_ERROR,
- _("IP: IPv6 Post-attach failed(%s)\n"), ifinst);
- return (-1);
- }
+ if (host6fp != NULL &&
+ if_config_inst(ifinst, host6fp, AF_INET6, ipmp) == -1) {
+ rcm_log_message(RCM_ERROR,
+ _("IP: IPv6 Post-attach failed (%s)\n"), ifinst);
+ goto fail;
}
+ (void) fclose(hostfp);
+ (void) fclose(host6fp);
rcm_log_message(RCM_TRACE1, "IP: if_configure(%s) success\n", ifinst);
-
return (0);
-
+fail:
+ (void) fclose(hostfp);
+ (void) fclose(host6fp);
+ return (-1);
}
/*
- * isgrouped() - Scans the given config file to see if this is a grouped
- * interface
- * Returns non-zero if true; 0 if false
+ * isgrouped() - Scans the given config file to see if this interface is
+ * using IPMP. Returns B_TRUE or B_FALSE.
*/
-static int
-isgrouped(char *cfgfile)
+static boolean_t
+isgrouped(const char *cfgfile)
{
FILE *fp;
struct stat statb;
- char *buf = NULL;
- char *tokens[MAXARGS]; /* token pointers */
- char tspace[MAXLINE]; /* token space */
- int ntok;
- int group = 0;
-
- if (cfgfile == NULL)
- return (0);
+ char *nlp, *line, *token, *lasts, *buf;
+ boolean_t grouped = B_FALSE;
rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s)\n", cfgfile);
if (stat(cfgfile, &statb) != 0) {
rcm_log_message(RCM_TRACE1,
_("IP: No config file(%s)\n"), cfgfile);
- return (0);
+ return (B_FALSE);
}
/*
@@ -2744,609 +2373,284 @@ isgrouped(char *cfgfile)
if (statb.st_size <= 1) {
rcm_log_message(RCM_TRACE1,
_("IP: Empty config file(%s)\n"), cfgfile);
- return (0);
+ return (B_FALSE);
}
if ((fp = fopen(cfgfile, "r")) == NULL) {
rcm_log_message(RCM_ERROR,
_("IP: Cannot open configuration file(%s): %s\n"), cfgfile,
strerror(errno));
- return (0);
+ return (B_FALSE);
}
- if ((buf = calloc(1, statb.st_size)) == NULL) {
+ if ((buf = malloc(statb.st_size)) == NULL) {
rcm_log_message(RCM_ERROR,
- _("IP: calloc failure(%s): %s\n"), cfgfile,
+ _("IP: malloc failure(%s): %s\n"), cfgfile,
strerror(errno));
- (void) fclose(fp);
- return (0);
+ goto out;
}
while (fgets(buf, statb.st_size, fp) != NULL) {
- if (*buf == '\0')
- continue;
-
- tokenize(buf, tokens, tspace, &ntok);
- while (ntok) {
- if (STREQ("group", tokens[ntok - 1])) {
- if (tokens[ntok] != NULL) {
- group++;
- }
+ if ((nlp = strrchr(buf, '\n')) != NULL)
+ *nlp = '\0';
+
+ line = buf;
+ while ((token = strtok_r(line, " \t", &lasts)) != NULL) {
+ line = NULL;
+ if (STREQ("group", token) &&
+ strtok_r(NULL, " \t", &lasts) != NULL) {
+ grouped = B_TRUE;
+ goto out;
}
- ntok--;
}
}
-
+out:
free(buf);
-
(void) fclose(fp);
- if (group <= 0) {
- rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s) non-grouped\n",
- cfgfile);
- return (0);
- } else {
- rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s) grouped\n",
- cfgfile);
- return (1);
- }
-}
+ rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s): %d\n", cfgfile,
+ grouped);
+ return (grouped);
+}
/*
- * if_ipmp_config() - Configure an interface instance as specified by the
+ * if_config_inst() - Configure an interface instance as specified by the
* address family af and if it is grouped (ipmp).
*/
static int
-if_ipmp_config(char *ifinst, int af, int ipmp)
+if_config_inst(const char *ifinst, FILE *hfp, int af, boolean_t ipmp)
{
- char cfgfile[MAXPATHLEN]; /* configuration file */
- FILE *fp;
+ FILE *ifparsefp;
struct stat statb;
- char *buf;
- char *tokens[MAXARGS]; /* list of config attributes */
- char tspace[MAXLINE]; /* token space */
- char syscmd[MAX_RECONFIG_SIZE + MAXPATHLEN + 1];
- char grpcmd[MAX_RECONFIG_SIZE + MAXPATHLEN + 1];
- char fstr[8]; /* address family string inet or inet6 */
- int nofailover = 0;
- int newattach = 0;
- int cmdvalid = 0;
- int ntok;
- int n;
- int stdif = 0;
-
- if (ifinst == NULL)
- return (0);
+ char *buf = NULL;
+ char *ifparsebuf = NULL;
+ uint_t ifparsebufsize;
+ const char *fstr; /* address family string */
+ boolean_t stdif = B_FALSE;
- rcm_log_message(RCM_TRACE1, "IP: if_ipmp_config(%s) ipmp = %d\n",
+ rcm_log_message(RCM_TRACE1, "IP: if_config_inst(%s) ipmp = %d\n",
ifinst, ipmp);
- if (af & CONFIG_AF_INET) {
- (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV4,
- ifinst);
- (void) strcpy(fstr, "inet");
- } else if (af & CONFIG_AF_INET6) {
- (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV6,
- ifinst);
- (void) strcpy(fstr, "inet6");
- } else {
- return (0); /* nothing to do */
- }
-
- cfgfile[MAXPATHLEN - 1] = '\0';
- grpcmd[0] = '\0';
-
- if (stat(cfgfile, &statb) != 0) {
- rcm_log_message(RCM_TRACE1,
- "IP: No config file(%s)\n", ifinst);
- return (0);
+ if (fstat(fileno(hfp), &statb) != 0) {
+ rcm_log_message(RCM_ERROR,
+ _("IP: Cannot fstat file(%s)\n"), ifinst);
+ goto fail;
}
- /* Config file exists, plumb in the physical interface */
- if (af & CONFIG_AF_INET6) {
- if (if_getcount(AF_INET6) == 0) {
- /*
- * Configure software loopback driver if this is the
- * first IPv6 interface plumbed
- */
- newattach++;
- (void) snprintf(syscmd, sizeof (syscmd),
- "%s lo0 %s plumb ::1 up", USR_SBIN_IFCONFIG, fstr);
- if (rcm_exec_cmd(syscmd) != 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: Cannot plumb (%s) %s\n"),
- ifinst, strerror(errno));
- return (-1);
- }
- }
- (void) snprintf(syscmd, sizeof (syscmd), "%s %s %s plumb up",
- USR_SBIN_IFCONFIG, ifinst, fstr);
- } else {
- (void) snprintf(syscmd, sizeof (syscmd), "%s %s %s plumb ",
- USR_SBIN_IFCONFIG, ifinst, fstr);
- if (if_getcount(AF_INET) == 0) {
- newattach++;
- }
+ switch (af) {
+ case AF_INET:
+ fstr = "inet";
+ break;
+ case AF_INET6:
+ fstr = "inet6";
+ break;
+ default:
+ assert(0);
}
- rcm_log_message(RCM_TRACE1, "IP: Exec: %s\n", syscmd);
- if (rcm_exec_cmd(syscmd) != 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: Cannot plumb (%s) %s\n"), ifinst, strerror(errno));
- return (-1);
- }
+ /*
+ * The hostname file exists; plumb the physical interface.
+ */
+ if (!ifconfig(ifinst, fstr, "plumb", B_FALSE))
+ goto fail;
- /* Check if config file is empty, if so, nothing else to do */
- if (statb.st_size == 0) {
+ /* Skip static configuration if the hostname file is empty */
+ if (statb.st_size <= 1) {
rcm_log_message(RCM_TRACE1,
- "IP: Zero size config file(%s)\n", ifinst);
- return (0);
+ _("IP: Zero size hostname file(%s)\n"), ifinst);
+ goto configured;
}
- if ((fp = fopen(cfgfile, "r")) == NULL) {
+ if (fseek(hfp, 0, SEEK_SET) == -1) {
rcm_log_message(RCM_ERROR,
- _("IP: Open error(%s): %s\n"), cfgfile, strerror(errno));
- return (-1);
+ _("IP: Cannot rewind hostname file(%s): %s\n"), ifinst,
+ strerror(errno));
+ goto fail;
}
+ /*
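+ * Size both buffers for the worst case, in which the entire hostname
+ * file is a single line. This is crude, but hostname files are small,
+ * so it should suffice.
+ * NOTE(review): fgets() stores at most size - 1 bytes, so buf (exactly
+ * st_size bytes) cannot hold a line that spans the whole file in one
+ * read -- confirm whether st_size + 1 was intended.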
+ */
if ((buf = calloc(1, statb.st_size)) == NULL) {
rcm_log_message(RCM_ERROR,
_("IP: calloc(%s): %s\n"), ifinst, strerror(errno));
- (void) fclose(fp);
- return (-1);
+ goto fail;
}
- /* a single line with one token implies a classical if */
- if (fgets(buf, statb.st_size, fp) != NULL) {
- tokenize(buf, tokens, tspace, &ntok);
- if (ntok == 1) {
- rcm_log_message(RCM_TRACE1, "IP: Standard interface\n");
- stdif++;
- }
- }
- if (fseek(fp, 0L, SEEK_SET) == -1) {
- rcm_log_message(RCM_ERROR, _("IP: fseek: %s\n"),
- strerror(errno));
- return (-1);
+ ifparsebufsize = statb.st_size + sizeof (SBIN_IFPARSE " -s inet6 ");
+ if ((ifparsebuf = calloc(1, ifparsebufsize)) == NULL) {
+ rcm_log_message(RCM_ERROR,
+ _("IP: calloc(%s): %s\n"), ifinst, strerror(errno));
+ goto fail;
}
/*
- * Process the config command
- * This loop also handles multiple logical interfaces that may
- * be configured on a single line
+ * For IPv4, determine whether the hostname file consists of a single
+ * line. We need to handle these specially since they should
+ * automatically be suffixed with "netmask + broadcast + up".
*/
- while (fgets(buf, statb.st_size, fp) != NULL) {
- nofailover = 0;
- cmdvalid = 0;
+ if (af == AF_INET &&
+ fgets(buf, statb.st_size, hfp) != NULL &&
+ fgets(buf, statb.st_size, hfp) == NULL) {
+ rcm_log_message(RCM_TRACE1, "IP: one-line hostname file\n");
+ stdif = B_TRUE;
+ }
- if (*buf == '\0')
- continue;
+ if (fseek(hfp, 0L, SEEK_SET) == -1) {
+ rcm_log_message(RCM_ERROR,
+ _("IP: Cannot rewind hostname file(%s): %s\n"), ifinst,
+ strerror(errno));
+ goto fail;
+ }
- tokenize(buf, tokens, tspace, &ntok);
- if (ntok <= 0)
+ /*
+ * Loop through the file one line at a time and feed it to ifconfig.
+ * If the interface is using IPMP, then we use /sbin/ifparse -s to
+ * weed out all of the data addresses, since those are already on the
+ * IPMP meta-interface.
+ */
+ while (fgets(buf, statb.st_size, hfp) != NULL) {
+ if (ntok(buf) == 0)
continue;
- /* Reset the config command */
- (void) snprintf(syscmd, sizeof (syscmd), "%s %s %s ",
- USR_SBIN_IFCONFIG, ifinst, fstr);
-
- /* No parsing if this is first interface of its kind */
- if (newattach) {
- (void) strcat(syscmd, buf);
- /* Classic if */
- if ((af & CONFIG_AF_INET) && (stdif == 1)) {
- (void) strcat(syscmd, CFG_CMDS_STD);
- }
- rcm_log_message(RCM_TRACE1, "IP: New: %s\n", syscmd);
- if (rcm_exec_cmd(syscmd) != 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: Error: %s (%s): %s\n"),
- syscmd, ifinst, strerror(errno));
- }
+ if (!ipmp) {
+ (void) ifconfig(ifinst, fstr, buf, stdif);
continue;
}
- /* Parse the tokens to determine nature of the interface */
- for (n = 0; n < ntok; n++) {
- /* Handle pathological failover cases */
- if (STREQ("-failover", tokens[n]))
- nofailover++;
- if (STREQ("failover", tokens[n]))
- nofailover--;
-
- /* group attribute requires special processing */
- if (STREQ("group", tokens[n])) {
- if (tokens[n + 1] != NULL) {
- (void) snprintf(grpcmd, sizeof (grpcmd),
- "%s %s %s %s %s", USR_SBIN_IFCONFIG,
- ifinst, fstr,
- tokens[n], tokens[n + 1]);
- n++; /* skip next token */
- continue;
- }
- }
-
- /* Execute buffered command ? */
- if (STREQ("set", tokens[n]) ||
- STREQ("addif", tokens[n]) ||
- STREQ("removeif", tokens[n]) ||
- (n == (ntok -1))) {
-
- /* config command complete ? */
- if (n == (ntok -1)) {
- ADDSPACE(syscmd);
- (void) strcat(syscmd, tokens[n]);
- cmdvalid++;
- }
-
- if (!cmdvalid) {
- ADDSPACE(syscmd);
- (void) strcat(syscmd, tokens[n]);
- cmdvalid++;
- continue;
- }
- /* Classic if ? */
- if ((af & CONFIG_AF_INET) && (stdif == 1)) {
- (void) strcat(syscmd, CFG_CMDS_STD);
- }
-
- if (nofailover > 0) {
- rcm_log_message(RCM_TRACE1,
- "IP: Interim exec: %s\n", syscmd);
- if (rcm_exec_cmd(syscmd) != 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: %s fail(%s): %s\n"),
- syscmd, ifinst,
- strerror(errno));
- }
- } else {
- /* Have mpathd configure the address */
- if (if_mpathd_configure(syscmd, ifinst,
- af, ipmp) != 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: %s fail(%s): %s\n"),
- syscmd, ifinst,
- strerror(errno));
- }
- }
-
- /* Reset config command */
- (void) snprintf(syscmd, sizeof (syscmd),
- "%s %s %s ", USR_SBIN_IFCONFIG, ifinst,
- fstr);
- nofailover = 0;
- cmdvalid = 0;
- }
- /*
- * Note: No explicit command validation is required
- * since ifconfig to does it for us
- */
- ADDSPACE(syscmd);
- (void) strcat(syscmd, tokens[n]);
- cmdvalid++;
- }
- }
-
- free(buf);
- (void) fclose(fp);
-
- /*
- * The group name needs to be set after all the test/nofailover
- * addresses have been configured. Otherwise, if IPMP detects that the
- * interface is failed, the addresses will be moved to a working
- * interface before the '-failover' flag can be set.
- */
- if (grpcmd[0] != '\0') {
- rcm_log_message(RCM_TRACE1, "IP: set group name: %s\n", grpcmd);
- if (rcm_exec_cmd(grpcmd) != 0) {
- rcm_log_message(RCM_ERROR, _("IP: %s fail(%s): %s\n"),
- grpcmd, ifinst, strerror(errno));
+ (void) snprintf(ifparsebuf, ifparsebufsize, SBIN_IFPARSE
+ " -s %s %s", fstr, buf);
+ if ((ifparsefp = popen(ifparsebuf, "r")) == NULL) {
+ rcm_log_message(RCM_ERROR,
+ _("IP: cannot configure %s: popen \"%s\" "
+ "failed: %s\n"), ifinst, buf, strerror(errno));
+ goto fail;
}
- }
- rcm_log_message(RCM_TRACE1, "IP: if_ipmp_config(%s) success\n", ifinst);
-
- return (0);
-}
-
-/*
- * if_mpathd_configure() - Determine configuration disposition of the interface
- */
-static int
-if_mpathd_configure(char *syscmd, char *ifinst, int af, int ipmp)
-{
- char *tokens[MAXARGS];
- char tspace[MAXLINE];
- int ntok;
- char *addr;
- char *from_lifname;
- mpathd_cmd_t mpdcmd;
- int n;
-
- rcm_log_message(RCM_TRACE1, "IP: if_mpathd_configure(%s): %s\n",
- ifinst, syscmd);
-
- tokenize(syscmd, tokens, tspace, &ntok);
- if (ntok <= 0)
- return (0);
-
- addr = tokens[3]; /* by default, third token is valid address */
- for (n = 0; n < ntok; n++) {
- if (STREQ("set", tokens[n]) ||
- STREQ("addif", tokens[n])) {
- addr = tokens[n+1];
- if (addr == NULL) { /* invalid format */
- return (-1);
- } else
- break;
+ while (fgets(buf, statb.st_size, ifparsefp) != NULL) {
+ if (ntok(buf) > 0)
+ (void) ifconfig(ifinst, fstr, buf, stdif);
}
- }
- /* Check std. commands or no failed over address */
- if (STREQ("removeif", addr) || STREQ("group", addr) ||
- ((from_lifname = get_mpathd_dest(addr, af)) == NULL)) {
- rcm_log_message(RCM_TRACE1,
- "IP: No failed-over host, exec %s\n", syscmd);
- if (rcm_exec_cmd(syscmd) != 0) {
+ if (pclose(ifparsefp) == -1) {
rcm_log_message(RCM_ERROR,
- _("IP: %s failed(%s): %s\n"),
- syscmd, ifinst, strerror(errno));
- return (-1);
+ _("IP: cannot configure %s: pclose \"%s\" "
+ "failed: %s\n"), ifinst, buf, strerror(errno));
+ goto fail;
}
- return (0);
- }
-
- /* Check for non-IPMP failover scenarios */
- if ((ipmp <= 0) && (from_lifname != NULL)) {
- /* Address already hosted on another NIC, return */
- rcm_log_message(RCM_TRACE1,
- "IP: Non-IPMP failed-over host(%s): %s\n",
- ifinst, addr);
- return (0);
}
+configured:
/*
- * Valid failed-over host; have mpathd set the original index
+ * Bring up the interface (it may already be up)
+ *
+ * Technically, since the boot scripts only unconditionally bring up
+ * IPv6 interfaces, we should only unconditionally bring up IPv6 here.
+ * However, if we don't bring up IPv4, and a legacy IPMP configuration
+ * without test addresses is being used, we will never bring the
+ * interface up even though we would've at boot. One fix is to check
+ * if the IPv4 hostname file contains data addresses that we would've
+ * brought up, but there's no simple way to do that. Given that it's
+ * rare to have persistent IP configuration for an interface that
+ * leaves it down, we cheap out and always bring it up for IPMP.
*/
- mpdcmd.cmd_command = MI_SETOINDEX;
- (void) strcpy(mpdcmd.from_lifname, from_lifname);
- (void) strcpy(mpdcmd.to_pifname, ifinst);
- if (af & CONFIG_AF_INET6) {
- mpdcmd.addr_family = AF_INET6;
- } else {
- mpdcmd.addr_family = AF_INET;
- }
-
- /* Send command to in.mpathd(1M) */
- rcm_log_message(RCM_TRACE1,
- "IP: Attempting setoindex from (%s) to (%s) ....\n",
- from_lifname, ifinst);
-
- if (mpathd_send_cmd(&mpdcmd) < 0) {
- rcm_log_message(RCM_TRACE1,
- "IP: mpathd set original index unsuccessful: %s\n",
- strerror(errno));
- return (-1);
- }
-
- rcm_log_message(RCM_TRACE1,
- "IP: setoindex success (%s) to (%s)\n",
- from_lifname, ifinst);
-
- return (0);
-}
-
-/*
- * get_mpathd_dest() - Return current destination for lif; caller is
- * responsible to free memory allocated for address
- */
-static char *
-get_mpathd_dest(char *addr, int family)
-{
- int sock;
- char *buf;
- struct lifnum lifn;
- struct lifconf lifc;
- struct lifreq *lifrp;
- sa_family_t af = AF_INET; /* IPv4 by default */
- int i;
- struct lifreq lifreq;
- struct sockaddr_in *sin;
- struct sockaddr_in6 *sin6;
- struct hostent *hp;
- char *ifname = NULL;
- char *prefix = NULL;
- char addrstr[INET6_ADDRSTRLEN];
- char ifaddr[INET6_ADDRSTRLEN];
- int err;
-
- if (addr == NULL) {
- return (NULL);
- }
-
- rcm_log_message(RCM_TRACE2, "IP: get_mpathd_dest(%s)\n", addr);
-
- if (family & CONFIG_AF_INET6) {
- af = AF_INET6;
- } else {
- af = AF_INET;
- }
-
- if ((sock = socket(af, SOCK_DGRAM, 0)) == -1) {
- rcm_log_message(RCM_ERROR,
- _("IP: failure opening %s socket: %s\n"),
- af == AF_INET6 ? "IPv6" : "IPv4", strerror(errno));
- return (NULL);
- }
-
- lifn.lifn_family = af;
- lifn.lifn_flags = 0;
- if (ioctl(sock, SIOCGLIFNUM, (char *)&lifn) < 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: SIOCLGIFNUM failed: %s\n"),
- strerror(errno));
- (void) close(sock);
- return (NULL);
- }
-
- if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) {
- rcm_log_message(RCM_ERROR, _("IP: calloc: %s\n"),
- strerror(errno));
- (void) close(sock);
- return (NULL);
- }
-
- lifc.lifc_family = af;
- lifc.lifc_flags = 0;
- lifc.lifc_len = sizeof (struct lifreq) * lifn.lifn_count;
- lifc.lifc_buf = buf;
-
- if (ioctl(sock, SIOCGLIFCONF, (char *)&lifc) < 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: SIOCGLIFCONF failed: %s\n"),
- strerror(errno));
- free(buf);
- (void) close(sock);
- return (NULL);
- }
+ if ((af == AF_INET6 || ipmp) && !ifconfig(ifinst, fstr, "up", B_FALSE))
+ goto fail;
- /* Filter out prefix address from netmask */
- (void) strcpy(ifaddr, addr);
- if ((prefix = strchr(ifaddr, '/')) != NULL) {
- *prefix = '\0'; /* We care about the address part only */
- }
+ /*
+ * For IPv4, if a DHCP configuration file exists, have DHCP configure
+ * the interface. As with the boot scripts, this is done after the
+ * hostname files are processed so that configuration in those files
+ * (such as IPMP group names) will be applied first.
+ */
+ if (af == AF_INET) {
+ char dhcpfile[MAXPATHLEN];
+ char *dhcpbuf;
+ off_t i, dhcpsize;
- /* Check for aliases */
- hp = getipnodebyname(ifaddr, af, AI_DEFAULT, &err);
- if (hp) {
- if (inet_ntop(af, (void *)hp->h_addr_list[0],
- ifaddr, sizeof (ifaddr)) == NULL) {
- /* Restore original address and use it */
- (void) strcpy(ifaddr, addr);
- if ((prefix = strchr(ifaddr, '/')) != NULL) {
- *prefix = '\0';
- }
- }
- freehostent(hp);
- }
- rcm_log_message(RCM_TRACE2, "IP: ifaddr(%s) = %s\n", addr, ifaddr);
+ (void) snprintf(dhcpfile, MAXPATHLEN, DHCPFILE_FMT, ifinst);
+ if (stat(dhcpfile, &statb) == -1)
+ goto out;
- /* now search the interfaces */
- lifrp = lifc.lifc_req;
- for (i = 0; i < lifn.lifn_count; i++, lifrp++) {
- (void) strcpy(lifreq.lifr_name, lifrp->lifr_name);
- /* Get the interface address for this interface */
- if (ioctl(sock, SIOCGLIFADDR, (char *)&lifreq) < 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: SIOCGLIFADDR: %s\n"), strerror(errno));
- free(buf);
- (void) close(sock);
- return (NULL);
- }
-
- if (af == AF_INET6) {
- sin6 = (struct sockaddr_in6 *)&lifreq.lifr_addr;
- if (inet_ntop(AF_INET6, (void *)&sin6->sin6_addr,
- addrstr, sizeof (addrstr)) == NULL) {
- continue;
- }
- } else {
- sin = (struct sockaddr_in *)&lifreq.lifr_addr;
- if (inet_ntop(AF_INET, (void *)&sin->sin_addr,
- addrstr, sizeof (addrstr)) == NULL) {
- continue;
- }
+ if ((dhcpbuf = copylist(dhcpfile, &dhcpsize)) == NULL) {
+ rcm_log_message(RCM_ERROR, _("IP: cannot read "
+ "(%s): %s\n"), dhcpfile, strerror(errno));
+ goto fail;
}
- if (STREQ(addrstr, ifaddr)) {
- /* Allocate memory to hold interface name */
- if ((ifname = (char *)malloc(LIFNAMSIZ)) == NULL) {
- rcm_log_message(RCM_ERROR,
- _("IP: malloc: %s\n"), strerror(errno));
- free(buf);
- (void) close(sock);
- return (NULL);
- }
-
- /* Copy the interface name */
- /*
- * (void) memcpy(ifname, lifrp->lifr_name,
- * sizeof (ifname));
- * ifname[sizeof (ifname) - 1] = '\0';
- */
- (void) strcpy(ifname, lifrp->lifr_name);
- break;
+ /*
+ * The copylist() API converts \n's to \0's, but we want them
+ * to be spaces.
+ */
+ if (dhcpsize > 0) {
+ for (i = 0; i < dhcpsize; i++)
+ if (dhcpbuf[i] == '\0')
+ dhcpbuf[i] = ' ';
+ dhcpbuf[dhcpsize - 1] = '\0';
}
+ (void) ifconfig(ifinst, CFG_DHCP_CMD, dhcpbuf, B_FALSE);
+ free(dhcpbuf);
}
-
- (void) close(sock);
+out:
+ free(ifparsebuf);
free(buf);
-
- if (ifname == NULL)
- rcm_log_message(RCM_TRACE2, "IP: get_mpathd_dest(%s): none\n",
- addr);
- else
- rcm_log_message(RCM_TRACE2, "IP: get_mpathd_dest(%s): %s\n",
- addr, ifname);
-
- return (ifname);
-}
-
-static int
-if_getcount(int af)
-{
- int sock;
- struct lifnum lifn;
-
- rcm_log_message(RCM_TRACE1, "IP: if_getcount\n");
-
- if ((sock = socket(af, SOCK_DGRAM, 0)) == -1) {
- rcm_log_message(RCM_ERROR,
- _("IP: failure opening %s socket: %s\n"),
- af == AF_INET6 ? "IPv6" : "IPv4", strerror(errno));
- return (-1);
- }
-
- lifn.lifn_family = af;
- lifn.lifn_flags = 0;
- if (ioctl(sock, SIOCGLIFNUM, (char *)&lifn) < 0) {
- rcm_log_message(RCM_ERROR,
- _("IP: SIOCLGIFNUM failed: %s\n"),
- strerror(errno));
- (void) close(sock);
- return (-1);
- }
- (void) close(sock);
-
- rcm_log_message(RCM_TRACE1, "IP: if_getcount success: %d\n",
- lifn.lifn_count);
-
- return (lifn.lifn_count);
+ rcm_log_message(RCM_TRACE1, "IP: if_config_inst(%s) success\n", ifinst);
+ return (0);
+fail:
+ free(ifparsebuf);
+ free(buf);
+ rcm_log_message(RCM_ERROR, "IP: if_config_inst(%s) failure\n", ifinst);
+ return (-1);
}
/*
- * tokenize() - turn a command line into tokens; caller is responsible to
- * provide enough memory to hold all tokens
+ * ntok() - count the number of tokens in the provided buffer.
*/
-static void
-tokenize(char *line, char **tokens, char *tspace, int *ntok)
+static uint_t
+ntok(const char *cp)
{
- char *cp;
- char *sp;
+ uint_t ntok = 0;
- sp = tspace;
- cp = line;
- for (*ntok = 0; *ntok < MAXARGS; (*ntok)++) {
- tokens[*ntok] = sp;
+ for (;;) {
while (ISSPACE(*cp))
cp++;
+
if (ISEOL(*cp))
break;
+
do {
- *sp++ = *cp++;
+ cp++;
} while (!ISSPACE(*cp) && !ISEOL(*cp));
- *sp++ = '\0';
+ ntok++;
+ }
+ return (ntok);
+}
+
+static boolean_t
+ifconfig(const char *ifinst, const char *fstr, const char *buf, boolean_t stdif)
+{
+ char syscmd[MAX_RECONFIG_SIZE + MAXPATHLEN + 1];
+ int status;
+
+ (void) snprintf(syscmd, sizeof (syscmd), SBIN_IFCONFIG " %s %s %s",
+ ifinst, fstr, buf);
+
+ if (stdif)
+ (void) strlcat(syscmd, CFG_CMDS_STD, sizeof (syscmd));
+
+ rcm_log_message(RCM_TRACE1, "IP: Exec: %s\n", syscmd);
+ if ((status = rcm_exec_cmd(syscmd)) != 0) {
+ if (WIFEXITED(status)) {
+ rcm_log_message(RCM_ERROR, _("IP: \"%s\" failed with "
+ "exit status %d\n"), syscmd, WEXITSTATUS(status));
+ } else {
+ rcm_log_message(RCM_ERROR, _("IP: Error: %s: %s\n"),
+ syscmd, strerror(errno));
+ }
+ return (B_FALSE);
}
+ return (B_TRUE);
}
diff --git a/usr/src/cmd/svc/milestone/net-init b/usr/src/cmd/svc/milestone/net-init
index 26b295dce9..7f0804af67 100644
--- a/usr/src/cmd/svc/milestone/net-init
+++ b/usr/src/cmd/svc/milestone/net-init
@@ -20,11 +20,9 @@
# CDDL HEADER END
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
# This is the second phase of TCP/IP configuration. The first part is
# run by the svc:/network/physical service and includes configuring the
# interfaces and setting the machine's hostname. The svc:/network/initial
@@ -52,10 +50,11 @@ if [ -f /etc/inet/ipaddrsel.conf ]; then
fi
#
-# Now that /usr is mounted, see if in.mpathd needs to be started by firing it
-# up in "adopt" mode; if there are no interfaces it needs to manage, it will
-# automatically exit. Note that it may already be running if we're not
-# executing as part of system boot.
+# If explicit IPMP groups are being used, in.mpathd will already be started.
+# However, if TRACK_INTERFACES_ONLY_WITH_GROUPS=no and no explicit IPMP
+# groups have been configured, then it still needs to be started. So, fire
+# it up in "adopt" mode; if there are no interfaces it needs to manage, it
+# will automatically exit.
#
/usr/bin/pgrep -x -u 0 -z `smf_zonename` in.mpathd >/dev/null 2>&1 || \
/usr/lib/inet/in.mpathd -a
diff --git a/usr/src/cmd/svc/milestone/net-loopback b/usr/src/cmd/svc/milestone/net-loopback
index 3bd5a0f525..d07afd4ada 100644
--- a/usr/src/cmd/svc/milestone/net-loopback
+++ b/usr/src/cmd/svc/milestone/net-loopback
@@ -20,10 +20,9 @@
# CDDL HEADER END
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
. /lib/svc/share/smf_include.sh
@@ -36,14 +35,6 @@
smf_configure_ip || exit $SMF_EXIT_OK
#
-# Cause ifconfig to not automatically start in.mpathd when IPMP groups are
-# configured. This is not strictly necessary but makes it so that in.mpathd
-# will always be started explicitly from /lib/svc/method/net-init (the
-# svc:/network/initial service), when we're sure that /usr is mounted.
-#
-SUNW_NO_MPATHD=; export SUNW_NO_MPATHD
-
-#
# Before any interfaces are configured, we need to set the system
# default IP forwarding behavior. This will be the setting for
# interfaces that don't modify the per-interface setting with the
diff --git a/usr/src/cmd/svc/milestone/net-physical b/usr/src/cmd/svc/milestone/net-physical
index 8530806768..bc74c2a206 100644
--- a/usr/src/cmd/svc/milestone/net-physical
+++ b/usr/src/cmd/svc/milestone/net-physical
@@ -20,7 +20,7 @@
# CDDL HEADER END
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T.
@@ -38,22 +38,9 @@
#
smf_configure_ip || exit $SMF_EXIT_OK
-# Print warnings to console
-warn_failed_ifs() {
- echo "Failed to $1 interface(s): $2" >/dev/msglog
-}
-
# Make sure that the libraries essential to this stage of booting can be found.
LD_LIBRARY_PATH=/lib; export LD_LIBRARY_PATH
-#
-# Cause ifconfig to not automatically start in.mpathd when IPMP groups are
-# configured. This is not strictly necessary but makes it so that in.mpathd
-# will always be started explicitly from /etc/init.d/inetinit, when we're
-# sure that /usr is mounted.
-#
-SUNW_NO_MPATHD=; export SUNW_NO_MPATHD
-
smf_netstrategy
if smf_is_globalzone; then
@@ -127,13 +114,18 @@ if [ "$interface_names" != "/etc/hostname.*[0-9]" ]; then
IFS="$ORIGIFS"
while [ $# -ge 2 ]; do
shift
- if [ $# -gt 1 -a "$2" != "/etc/hostname" ]; then
- while [ $# -gt 1 -a "$1" != "/etc/hostname" ]; do
- shift
- done
- else
- inet_list="$inet_list $1"
+ intf_name=$1
+ while [ $# -gt 1 -a "$2" != "/etc/hostname" ]; do
+ intf_name="$intf_name.$2"
shift
+ done
+ shift
+
+ read one rest < /etc/hostname.$intf_name
+ if [ "$one" = ipmp ]; then
+ ipmp_list="$ipmp_list $intf_name"
+ else
+ inet_list="$inet_list $intf_name"
fi
done
fi
@@ -151,17 +143,38 @@ if [ "$interface_names" != "/etc/hostname6.*[0-9]" ]; then
IFS="$ORIGIFS"
while [ $# -ge 2 ]; do
shift
- if [ $# -gt 1 -a "$2" != "/etc/hostname6" ]; then
- while [ $# -gt 1 -a "$1" != "/etc/hostname6" ]; do
- shift
- done
- else
- inet6_list="$inet6_list $1"
+ intf_name=$1
+ while [ $# -gt 1 -a "$2" != "/etc/hostname6" ]; do
+ intf_name="$intf_name.$2"
shift
+ done
+ shift
+
+ read one rest < /etc/hostname6.$intf_name
+ if [ "$one" = ipmp ]; then
+ ipmp6_list="$ipmp6_list $intf_name"
+ else
+ inet6_list="$inet6_list $intf_name"
fi
done
fi
+#
+# Create all of the IPv4 IPMP interfaces.
+#
+if [ -n "$ipmp_list" ]; then
+ set -- $ipmp_list
+ while [ $# -gt 0 ]; do
+ if /sbin/ifconfig $1 ipmp; then
+ ipmp_created="$ipmp_created $1"
+ else
+ ipmp_failed="$ipmp_failed $1"
+ fi
+ shift
+ done
+ [ -n "$ipmp_failed" ] && warn_failed_ifs "create IPv4 IPMP" \
+ "$ipmp_failed"
+fi
#
# Step through the IPv4 interface list and try to plumb every interface.
@@ -178,7 +191,7 @@ if [ -n "$inet_list" ]; then
fi
shift
done
- [ -n "$inet_failed" ] && warn_failed_ifs "plumb IPv4" $inet_failed
+ [ -n "$inet_failed" ] && warn_failed_ifs "plumb IPv4" "$inet_failed"
fi
# Run autoconf to connect to a WLAN if the interface is a wireless one
@@ -209,7 +222,24 @@ if [ -n "$inet6_list" ]; then
fi
shift
done
- [ -n "$inet6_failed" ] && warn_failed_ifs "plumb IPv6" $inet6_failed
+ [ -n "$inet6_failed" ] && warn_failed_ifs "plumb IPv6" "$inet6_failed"
+fi
+
+#
+# Create all of the IPv6 IPMP interfaces.
+#
+if [ -n "$ipmp6_list" ]; then
+ set -- $ipmp6_list
+ while [ $# -gt 0 ]; do
+ if /sbin/ifconfig $1 inet6 ipmp; then
+ ipmp6_created="$ipmp6_created $1"
+ else
+ ipmp6_failed="$ipmp6_failed $1"
+ fi
+ shift
+ done
+ [ -n "$ipmp6_failed" ] && warn_failed_ifs "create IPv6 IPMP" \
+ "$ipmp6_failed"
fi
if smf_is_globalzone; then
@@ -224,49 +254,24 @@ if smf_is_globalzone; then
fi
#
-# Process the /etc/hostname.* files of plumbed IPv4 interfaces. If an
-# /etc/hostname file is not present or is empty, the ifconfig auto-dhcp
-# / auto-revarp command will attempt to set the address, later.
+# Process the /etc/hostname[6].* files for IPMP interfaces. Processing these
+# before non-IPMP interfaces avoids accidental implicit IPMP group creation.
+#
+[ -n "$ipmp_created" ] && if_configure inet "IPMP" $ipmp_created
+[ -n "$ipmp6_created" ] && if_configure inet6 "IPMP" $ipmp6_created
+
#
-# If /etc/hostname.lo0 exists the loop below will do additional
-# configuration of lo0.
+# Process the /etc/hostname[6].* files for non-IPMP interfaces.
#
-if [ -n "$inet_plumbed" ]; then
- i4s_fail=
- echo "configuring IPv4 interfaces:\c"
- set -- $inet_plumbed
- while [ $# -gt 0 ]; do
- inet_process_hostname /sbin/ifconfig $1 inet \
- </etc/hostname.$1 >/dev/null
- [ $? != 0 ] && i4s_fail="$i4s_fail $1"
- echo " $1\c"
- shift
- done
- echo "."
- [ -n "$i4s_fail" ] && warn_failed_ifs "configure IPv4" $i4s_fail
-fi
+[ -n "$inet_plumbed" ] && if_configure inet "" $inet_plumbed
+[ -n "$inet6_plumbed" ] && if_configure inet6 "" $inet6_plumbed
#
-# Process the /etc/hostname6.* files of plumbed IPv6 interfaces. After
-# processing the hostname6 file, bring the interface up. If
-# /etc/hostname6.lo0 exists the loop below will do additional
-# configuration of lo0.
+# For the IPv4 and IPv6 interfaces that failed to plumb, find (or create)
+# IPMP meta-interfaces to host their data addresses.
#
-if [ -n "$inet6_plumbed" ]; then
- i6_fail=
- echo "configuring IPv6 interfaces:\c"
- set -- $inet6_plumbed
- while [ $# -gt 0 ]; do
- inet6_process_hostname /sbin/ifconfig $1 inet6 \
- </etc/hostname6.$1 >/dev/null &&
- /sbin/ifconfig $1 inet6 up
- [ $? != 0 ] && i6_fail="$i6_fail $1"
- echo " $1\c"
- shift
- done
- echo "."
- [ -n "$i6_fail" ] && warn_failed_ifs "configure IPv6" $i6_fail
-fi
+[ -n "$inet_failed" ] && move_addresses inet
+[ -n "$inet6_failed" ] && move_addresses inet6
# Run DHCP if requested. Skip boot-configured interface.
interface_names="`echo /etc/dhcp.*[0-9] 2>/dev/null`"
@@ -326,7 +331,7 @@ if [ "$interface_names" != '/etc/dhcp.*[0-9]' ]; then
done
IFS="$ORIGIFS"
unset ORIGIFS
- [ -n "$i4d_fail" ] && warn_failed_ifs "configure IPv4 DHCP" $i4d_fail
+ [ -n "$i4d_fail" ] && warn_failed_ifs "configure IPv4 DHCP" "$i4d_fail"
fi
# In order to avoid bringing up the interfaces that have
@@ -338,14 +343,6 @@ if [ "$_INIT_NET_STRATEGY" = "rarp" -o -z "$hostname" ]; then
fi
#
-# Process IPv4 and IPv6 interfaces that failed to plumb. Find an
-# alternative interface to host the addresses.
-#
-[ -n "$inet_failed" ] && move_addresses inet
-
-[ -n "$inet6_failed" ] && move_addresses inet6
-
-#
# If the /etc/defaultrouter file exists, process it now so that the next
# stage of booting will have access to NFS.
#
diff --git a/usr/src/cmd/svc/shell/net_include.sh b/usr/src/cmd/svc/shell/net_include.sh
index 51c87a40a8..71dc6a8256 100644
--- a/usr/src/cmd/svc/shell/net_include.sh
+++ b/usr/src/cmd/svc/shell/net_include.sh
@@ -20,13 +20,18 @@
# CDDL HEADER END
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T.
# All rights reserved.
#
+# Print warnings to console
+warn_failed_ifs() {
+ echo "Failed to $1 interface(s):$2" >/dev/msglog
+}
+
#
# shcat file
# Simulates cat in sh so it doesn't need to be on the root filesystem.
@@ -41,20 +46,28 @@ shcat() {
}
#
-# Inet_list, list of IPv4 interfaces.
-# Inet_plumbed, list of plumbed IPv4 interfaces.
-# Inet_failed, list of IPv4 interfaces that failed to plumb.
-# Inet6_list, list of IPv6 interfaces.
-# Inet6_plumbed, list of plumbed IPv6 interfaces.
-# Inet6_failed, list of IPv6 interfaces that failed to plumb.
+# inet_list list of IPv4 interfaces.
+# inet6_list list of IPv6 interfaces.
+# ipmp_list list of IPMP IPv4 interfaces.
+# ipmp6_list list of IPMP IPv6 interfaces.
+# inet_plumbed list of plumbed IPv4 interfaces.
+# inet6_plumbed list of plumbed IPv6 interfaces.
+# ipmp_created list of created IPMP IPv4 interfaces.
+# ipmp6_created list of created IPMP IPv6 interfaces.
+# inet_failed list of IPv4 interfaces that failed to plumb.
+# inet6_failed list of IPv6 interfaces that failed to plumb.
+# ipmp_failed list of IPMP IPv4 interfaces that failed to be created.
+# ipmp6_failed list of IPMP IPv6 interfaces that failed to be created.
#
unset inet_list inet_plumbed inet_failed \
- inet6_list inet6_plumbed inet6_failed
+ inet6_list inet6_plumbed inet6_failed \
+ ipmp_list ipmp_created ipmp_failed \
+ ipmp6_list ipmp6_created ipmp6_failed
+
#
# get_physical interface
#
-# Return physical interface corresponding to the given logical
-# interface.
+# Return physical interface corresponding to the given interface.
#
get_physical()
{
@@ -70,7 +83,7 @@ get_physical()
# get_logical interface
#
# Return logical interface number. Zero will be returned
-# if there is no explicit logical device number.
+# if there is no explicit logical number.
#
get_logical()
{
@@ -89,19 +102,18 @@ get_logical()
#
# if_comp if1 if2
#
-# Compare Interfaces. Do the physical interface names and logical interface
+# Compare interfaces. Do the physical interface names and logical interface
# numbers match?
#
if_comp()
{
- [ "`get_physical $1`" = "`get_physical $2`" ] && \
- [ `get_logical $1` -eq `get_logical $2` ]
+ physical_comp $1 $2 && [ `get_logical $1` -eq `get_logical $2` ]
}
-
+
#
# physical_comp if1 if2
#
-# Do the two devices share a physical interface?
+# Do the two interfaces share a physical interface?
#
physical_comp()
{
@@ -129,19 +141,110 @@ in_list()
}
#
-# get_group_from_hostname interface type
+# get_inactive_ifname groupname
+#
+# Return the name of an inactive interface in `groupname', if one exists.
+#
+get_inactive_ifname()
+{
+ ORIGIFS="$IFS"
+ /sbin/ipmpstat -gP -o groupname,interfaces |
+ while IFS=: read groupname ifnames; do
+ #
+ # Skip other IPMP groups.
+ #
+ [ "$groupname" != "$1" ] && continue
+
+ #
+ # Standby interfaces are always enclosed in ()'s, so look
+ # for the first interface name starting with a "(", and
+ # strip those off.
+ #
+ IFS=" "
+ for ifname in $ifnames; do
+ case "$ifname" in
+ '('*) IFS="()"
+ echo $ifname
+ IFS="$ORIGIFS"
+ return
+ ;;
+ *) ;;
+ esac
+ done
+ done
+ IFS="$ORIGIFS"
+}
+
+#
+# get_groupifname groupname
+#
+# Return the IPMP meta-interface name for the group, if it exists.
+#
+get_groupifname()
+{
+ /sbin/ipmpstat -gP -o groupname,group | while IFS=: read name ifname; do
+ if [ "$name" = "$1" ]; then
+ echo "$ifname"
+ return
+ fi
+ done
+}
+
+#
+# create_ipmp ifname groupname type
+#
+# Helper function for create_groupifname() that returns zero if it's able
+# to create an IPMP interface of the specified type and place it in the
+# specified group, or non-zero otherwise.
+#
+create_ipmp()
+{
+ /sbin/ifconfig $1 >/dev/null 2>&1 && return 1
+ /sbin/ifconfig $1 inet6 >/dev/null 2>&1 && return 1
+ /sbin/ifconfig $1 $3 ipmp group $2 2>/dev/null
+}
+
+#
+# create_groupifname groupname type
+#
+# Create an IPMP meta-interface name for the group. We only use this
+# function if all of the interfaces in the group failed at boot and there
+# were no /etc/hostname[6].<if> files for the IPMP meta-interface.
+#
+create_groupifname()
+{
+ #
+ # This is a horrible way to count from 0 to 999, but in sh and
+ # without necessarily having /usr mounted, what else can we do?
+ #
+ for a in "" 1 2 3 4 5 6 7 8 9; do
+ for b in 0 1 2 3 4 5 6 7 8 9; do
+ for c in 0 1 2 3 4 5 6 7 8 9; do
+ # strip leading zeroes
+ [ "$a" = "" ] && [ "$b" = 0 ] && b=""
+ if create_ipmp ipmp$a$b$c $1 $2; then
+ echo ipmp$a$b$c
+ return
+ fi
+ done
+ done
+ done
+}
+
+#
+# get_hostname_ipmpinfo interface type
#
-# Return all group settings from hostname file for a given interface.
+# Return all requested IPMP keywords from hostname file for a given interface.
#
# Example:
-# get_group_from_hostname hme0 inet
+# get_hostname_ipmpinfo hme0 inet keyword [ keyword ... ]
#
-get_group_from_hostname()
+get_hostname_ipmpinfo()
{
case "$2" in
- inet) file=/etc/hostname.$1
+ inet) file=/etc/hostname.$1
;;
- inet6) file=/etc/hostname6.$1
+ inet6) file=/etc/hostname6.$1
;;
*)
return
@@ -150,16 +253,21 @@ get_group_from_hostname()
[ -r "$file" ] || return
+ type=$2
+ shift 2
+
#
- # Read through the hostname file looking for group settings
- # There may be several group settings in the file. It is up
- # to the caller to pick the right one (i.e. the last one).
+ # Read through the hostname file looking for the specified
+ # keywords. Since there may be several keywords that cancel
+ # each other out, the caller must post-process as appropriate.
#
while read line; do
[ -z "$line" ] && continue
- /sbin/ifparse -s "$2" $line
- done < "$file" | while read one two three; do
- [ "$one" = "group" ] && echo "$two"
+ /sbin/ifparse -s "$type" $line
+ done < "$file" | while read one two; do
+ for keyword in "$@"; do
+ [ "$one" = "$keyword" ] && echo "$one $two"
+ done
done
}
@@ -174,7 +282,6 @@ get_group_from_hostname()
get_group_for_type()
{
physical=`get_physical $1`
-
type=$2
group=""
@@ -183,184 +290,77 @@ get_group_for_type()
# the reason for the second while loop.
#
shift 2
- while [ $# -gt 0 ]; do
- if if_comp "$physical" $1; then
- get_group_from_hostname $1 $type
+ for ifname in "$@"; do
+ if if_comp "$physical" $ifname; then
+ get_hostname_ipmpinfo $ifname $type group
fi
- shift
done | while :; do
- read next || {
+ read keyword grname || {
echo "$group"
break
}
- group="$next"
+ group="$grname"
done
}
#
-# get_group interface [ configured | failed ]
-#
-# If there is both an inet and inet6 version of an interface, the group
-# could be set in either set of hostname files.
-#
-# Inet6 is configured after inet, so if the group is set in both
-# sets of hostname files, the inet6 file wins.
-#
-# The "configured" argument should be used to get the group for
-# an interface that has been plumbed into the stack and configured. Use
-# the "failed" argument to get the group for an interface that failed to
-# plumb.
-#
-get_group()
-{
- group=""
-
- case "$2" in
- configured)
- group=`get_group_for_type $1 inet6 $inet6_plumbed`
- ;;
- failed)
- group=`get_group_for_type $1 inet6 $inet6_list`
- ;;
- *)
- return
- ;;
- esac
-
- if [ -z "$group" ]; then
- if [ "$2" = configured ]; then
- group=`get_group_for_type $1 inet $inet_plumbed`
- else
- group=`get_group_for_type $1 inet $inet_list`
- fi
- fi
-
- echo $group
-}
-
-#
-# get_standby_from_hostname interface type
-#
-# Return any "standby" or "-standby" flags in the hostname file.
-#
-# Example:
-# get_standby_from_hostname hme0 inet6
-#
-#
-get_standby_from_hostname()
-{
- case "$2" in
- inet) file=/etc/hostname.$1
- ;;
- inet6) file=/etc/hostname6.$1
- ;;
- *)
- return
- ;;
- esac
-
- [ -r "$file" ] || return
-
- #
- # There may be several instances of the "standby" and
- # "-standby" flags in the hostname file. It is up to
- # the caller to pick the correct one.
- #
- while read line; do
- [ -z "$line" ] && continue
- /sbin/ifparse -s "$2" $line
- done < "$file" | while read one two; do
- [ "$one" = "standby" ] || [ "$one" = "-standby" ] \
- && echo "$one"
- done
-}
-
-#
-# get_standby_for_type interface type plumbed_list
+# get_standby_for_type interface type list
#
# Look through the set of hostname files associated with the same physical
-# interface as "interface", and determine whether they would configure
-# the interface as a standby interface.
+# interface as "interface", and print the standby value ("standby",
+# "-standby", or nothing). Only hostname files associated with the
+# physical interface or logical interface zero can set this flag.
#
get_standby_for_type()
{
-
physical=`get_physical $1`
type=$2
- final=""
-
#
- # The last "standby" or "-standby" flag is the one that counts,
- # which is the reason for the second while loop.
+ # The last setting of "standby" or "-standby" is the one that
+ # counts, which is the reason for the second while loop.
#
shift 2
- while [ $# -gt 0 ]; do
- if [ "`get_physical $1`" = "$physical" ]; then
- get_standby_from_hostname $1 $type
+ for ifname in "$@"; do
+ if if_comp "$physical" $ifname; then
+ get_hostname_ipmpinfo $ifname $type standby -standby
fi
- shift
done | while :; do
- read next || {
- echo "$final"
+ read keyword || {
+ echo "$iftype"
break
}
- final="$next"
+ iftype="$keyword"
done
}
#
-# is_standby interface
+# get_group interface
#
-# Determine whether a configured interface is a standby interface.
-#
-# Both the inet and inet6 hostname file sets must be checked.
-# If "standby" or "-standby" is set in the inet6 hostname file set,
-# don't bother looking at the inet set.
+# If there is both an inet and inet6 version of an interface, the group
+# could be set in either set of hostname files. Since inet6 is configured
+# after inet, if there's a setting in both files, inet6 wins.
#
-is_standby()
+get_group()
{
- standby=`get_standby_for_type $1 inet6 $inet6_plumbed`
-
- if [ -z "$standby" ]; then
- standby=`get_standby_for_type $1 inet $inet_plumbed`
- fi
-
- # The return value is the value of the following test.
- [ "$standby" = "standby" ]
+ group=`get_group_for_type $1 inet6 $inet6_list`
+ [ -z "$group" ] && group=`get_group_for_type $1 inet $inet_list`
+ echo $group
}
#
-# get_alternate interface plumbed_list
-#
-# Look for a plumbed interface in the same group as "interface".
-# A standby interface is preferred over a non-standby interface.
+# is_standby interface
#
-# Example:
-# get_alternate hme0 $inet_plumbed
+# If there is both an inet and inet6 version of an interface, the
+# "standby" or "-standby" flag could be set in either set of hostname
+# files. Since inet6 is configured after inet, if there's a setting in
+# both files, inet6 wins.
#
-get_alternate()
+is_standby()
{
- mygroup=`get_group $1 failed`
- [ -z "$mygroup" ] && return
-
- maybe=""
-
- shift
- while [ $# -gt 0 ]; do
- group=`get_group $1 configured`
- if [ "$group" = "$mygroup" ]; then
- if is_standby $1; then
- get_physical $1
- return
- else
- [ -z "$maybe" ] && maybe=$1
- fi
- fi
- shift
- done
-
- get_physical $maybe
+ standby=`get_standby_for_type $1 inet6 $inet6_list`
+ [ -z "$standby" ] && standby=`get_standby_for_type $1 inet $inet_list`
+ [ "$standby" = "standby" ]
}
#
@@ -394,7 +394,7 @@ doDHCPhostname()
#
# If there is only one line in a hostname file we assume it contains
# the old style address which results in the interface being brought up
-# and the netmask and broadcast address being set.
+# and the netmask and broadcast address being set ($inet_oneline_epilogue).
#
# If there are multiple lines we assume the file contains a list of
# commands to the processor with neither the implied bringing up of the
@@ -403,6 +403,8 @@ doDHCPhostname()
# Return non-zero if any command fails so that the caller may alert
# users to errors in the configuration.
#
+inet_oneline_epilogue="netmask + broadcast + up"
+
inet_process_hostname()
{
if doDHCPhostname $2; then
@@ -418,7 +420,7 @@ inet_process_hostname()
ifcmds=""
retval=0
- while read line; do
+ while read one rest; do
if [ -n "$ifcmds" ]; then
#
# This handles the first N-1
@@ -427,7 +429,14 @@ inet_process_hostname()
$* $ifcmds || retval=$?
multiple_lines=true
fi
- ifcmds="$line"
+
+ #
+ # Strip out the "ipmp" keyword if it's the
+ # first token, since it's used to control
+ # interface creation, not configuration.
+ #
+ [ "$one" = ipmp ] && one=
+ ifcmds="$one $rest"
done
#
@@ -437,8 +446,8 @@ inet_process_hostname()
#
[ -z "$ifcmds" ] && return $retval
if [ $multiple_lines = false ]; then
- # The traditional single-line hostname file.
- ifcmds="$ifcmds netmask + broadcast + up"
+ # The traditional one-line hostname file.
+ ifcmds="$ifcmds $inet_oneline_epilogue"
fi
#
@@ -470,7 +479,13 @@ inet_process_hostname()
inet6_process_hostname()
{
retval=0
- while read ifcmds; do
+ while read one rest; do
+ #
+ # See comment in inet_process_hostname for details.
+ #
+ [ "$one" = ipmp ] && one=
+ ifcmds="$one $rest"
+
if [ -n "$ifcmds" ]; then
$* $ifcmds || retval=$?
fi
@@ -479,10 +494,9 @@ inet6_process_hostname()
}
#
-# Process interfaces that failed to plumb. Find an alternative
-# interface to host the addresses. For IPv6, only static addresses
-# defined in hostname6 files are moved, autoconfigured addresses are
-# not moved.
+# Process interfaces that failed to plumb. Find the IPMP meta-interface
+# that should host the addresses. For IPv6, only static addresses defined
+# in hostname6 files are moved, autoconfigured addresses are not moved.
#
# Example:
# move_addresses inet6
@@ -491,35 +505,43 @@ move_addresses()
{
type="$1"
eval "failed=\"\$${type}_failed\""
- eval "plumbed=\"\$${type}_plumbed\""
eval "list=\"\$${type}_list\""
- process_hostname="${type}_process_hostname"
+ process_func="${type}_process_hostname"
processed=""
if [ "$type" = inet ]; then
- echo "moving addresses from failed IPv4 interfaces:\c"
+ typedesc="IPv4"
zaddr="0.0.0.0"
hostpfx="/etc/hostname"
else
- echo "moving addresses from failed IPv6 interfaces:\c"
+ typedesc="IPv6"
zaddr="::"
hostpfx="/etc/hostname6"
fi
- set -- $failed
- while [ $# -gt 0 ]; do
- in_list if_comp $1 $processed && { shift; continue; }
-
- alternate="`get_alternate $1 $plumbed`"
- if [ -z "$alternate" ]; then
- in_list physical_comp $1 $processed || {
- echo " $1 (couldn't move, no" \
- "alternative interface)\c"
- processed="$processed $1"
+ echo "Moving addresses from missing ${typedesc} interface(s):\c" \
+ >/dev/msglog
+
+ for ifname in $failed; do
+ in_list if_comp $ifname $processed && continue
+
+ group=`get_group $ifname`
+ if [ -z "$group" ]; then
+ in_list physical_comp $ifname $processed || {
+ echo " $ifname (not moved -- not" \
+ "in an IPMP group)\c" >/dev/msglog
+ processed="$processed $ifname"
}
- shift
continue
fi
+
+ #
+ # Look up the IPMP meta-interface name. If one doesn't exist,
+ # create it.
+ #
+ grifname=`get_groupifname $group`
+ [ -z "$grifname" ] && grifname=`create_groupifname $group $type`
+
#
# The hostname files are processed twice. In the first
# pass, we are looking for all commands that apply
@@ -528,7 +550,7 @@ move_addresses()
# whether the address represents a failover address
# or not until we've read all the files associated with the
# interface.
-
+ #
# In the first pass through the hostname files, all
# additional logical interface commands are removed.
# The remaining commands are concatenated together and
@@ -541,19 +563,18 @@ move_addresses()
# the embedded "set" command set the address later.
#
/sbin/ifparse -f $type `
- for item in $list; do
- if_comp $1 $item && \
- $process_hostname /sbin/ifparse \
- $type < $hostpfx.$item
- done | while read three four; do
- [ "$three" != addif ] && \
- echo "$three $four \c"
- done` | while read one two; do
- [ -z "$one" ] && continue
- line="addif $zaddr $one $two"
- /sbin/ifconfig $alternate $type \
- -standby $line >/dev/null
- done
+ for item in $list; do
+ if_comp $ifname $item && $process_func \
+ /sbin/ifparse $type < $hostpfx.$item
+ done | while read three four; do
+ [ "$three" != addif ] && echo "$three $four \c"
+ done` | while read one two; do
+ [ -z "$one" ] && continue
+ [ "$one $two" = "$inet_oneline_epilogue" ] && \
+ continue
+ line="addif $zaddr $one $two"
+ /sbin/ifconfig $grifname $type $line >/dev/null
+ done
#
# In the second pass, look for the "addif" commands
@@ -561,22 +582,75 @@ move_addresses()
# commands are not valid in logical interface hostname
# files.
#
- if [ "$1" = "`get_physical $1`" ]; then
- $process_hostname /sbin/ifparse -f $type \
- <$hostpfx.$1 | while read one two; do
- [ "$one" = addif ] && \
- /sbin/ifconfig $alternate $type -standby \
- addif $two >/dev/null
+ if [ "$ifname" = "`get_physical $ifname`" ]; then
+ $process_func /sbin/ifparse -f $type < $hostpfx.$ifname \
+ | while read one two; do
+ [ "$one" = addif ] && \
+ /sbin/ifconfig $grifname $type \
+ addif $two >/dev/null
done
fi
- in_list physical_comp $1 $processed || {
- echo " $1 (moved to $alternate)\c"
- processed="$processed $1"
+ #
+ # Check if this was an active interface in the group. If so,
+ # activate another IP interface (if possible)
+ #
+ is_standby $ifname || inactive=`get_inactive_ifname $group`
+ [ -n "$inactive" ] && /sbin/ifconfig $inactive $type -standby
+
+ in_list physical_comp $ifname $processed || {
+ processed="$processed $ifname"
+ echo " $ifname (moved to $grifname\c" > /dev/msglog
+ if [ -n "$inactive" ]; then
+ echo " and cleared 'standby' on\c" > /dev/msglog
+ echo " $inactive to compensate\c" > /dev/msglog
+ fi
+ echo ")\c" > /dev/msglog
}
+ inactive=""
+ done
+ echo "." >/dev/msglog
+}
+
+#
+# if_configure type class interface_list
+#
+# Configure all of the interfaces of type `type' (e.g., "inet6") in
+# `interface_list' according to their /etc/hostname[6].* files. `class'
+# describes the class of interface (e.g., "IPMP"), as a diagnostic aid.
+# For inet6 interfaces, the interface is also brought up.
+#
+if_configure()
+{
+ fail=
+ type=$1
+ class=$2
+ process_func=${type}_process_hostname
+ shift 2
+
+ if [ "$type" = inet ]; then
+ desc="IPv4"
+ hostpfx="/etc/hostname"
+ else
+ desc="IPv6"
+ hostpfx="/etc/hostname6"
+ fi
+ [ -n "$class" ] && desc="$class $desc"
+
+ echo "configuring $desc interfaces:\c"
+ while [ $# -gt 0 ]; do
+ $process_func /sbin/ifconfig $1 $type < $hostpfx.$1 >/dev/null
+ if [ $? != 0 ]; then
+ fail="$fail $1"
+ elif [ "$type" = inet6 ]; then
+ /sbin/ifconfig $1 inet6 up || fail="$fail $1"
+ fi
+ echo " $1\c"
shift
done
echo "."
+
+ [ -n "$fail" ] && warn_failed_ifs "configure $desc" "$fail"
}
#
diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c
index 46b2b5a958..dc90957dfa 100644
--- a/usr/src/cmd/truss/codes.c
+++ b/usr/src/cmd/truss/codes.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -615,13 +615,10 @@ const struct ioc {
{ (uint_t)SIOCSIPSECONFIG, "SIOCSIPSECONFIG", NULL },
{ (uint_t)SIOCDIPSECONFIG, "SIOCDIPSECONFIG", NULL },
{ (uint_t)SIOCLIPSECONFIG, "SIOCLIPSECONFIG", NULL },
- { (uint_t)SIOCLIFFAILOVER, "SIOCLIFFAILOVER", "lifreq" },
- { (uint_t)SIOCLIFFAILBACK, "SIOCLIFFAILBACK", "lifreq" },
- { (uint_t)SIOCSIPMPFAILBACK, "SIOCSIPMPFAILBACK", NULL },
+ { (uint_t)SIOCGLIFBINDING, "SIOCGLIFBINDING", "lifreq" },
{ (uint_t)SIOCSLIFGROUPNAME, "SIOCSLIFGROUPNAME", "lifreq" },
{ (uint_t)SIOCGLIFGROUPNAME, "SIOCGLIFGROUPNAME", "lifreq" },
- { (uint_t)SIOCGLIFOINDEX, "SIOCGLIFOINDEX", "lifreq" },
- { (uint_t)SIOCSLIFOINDEX, "SIOCSLIFOINDEX", "lifreq" },
+ { (uint_t)SIOCGLIFGROUPINFO, "SIOCGLIFGROUPINFO", "lifgroupinfo" },
{ (uint_t)SIOCGDSTINFO, "SIOCGDSTINFO", NULL },
{ (uint_t)SIOCGIP6ADDRPOLICY, "SIOCGIP6ADDRPOLICY", NULL },
{ (uint_t)SIOCSIP6ADDRPOLICY, "SIOCSIP6ADDRPOLICY", NULL },
diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c
index edc610559d..8165f64f99 100644
--- a/usr/src/cmd/truss/print.c
+++ b/usr/src/cmd/truss/print.c
@@ -19,16 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#define _SYSCALL32 /* make 32-bit compat headers visible */
#include <stdio.h>
@@ -73,6 +70,7 @@
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>
+#include <net/route.h>
#include <sys/utrap.h>
#include <sys/lgrp_user.h>
#include <sys/door.h>
@@ -1749,6 +1747,8 @@ prt_sol(private_t *pri, int raw, long val)
{
if (val == SOL_SOCKET) {
outstring(pri, "SOL_SOCKET");
+ } else if (val == SOL_ROUTE) {
+ outstring(pri, "SOL_ROUTE");
} else {
const struct protoent *p;
struct protoent res;
@@ -1826,6 +1826,18 @@ sol_optname(private_t *pri, long val)
#undef CBSIZE
}
+const char *
+route_optname(private_t *pri, long val)
+{
+ switch (val) {
+ case RT_AWARE:
+ return ("RT_AWARE");
+ default:
+ (void) snprintf(pri->code_buf, sizeof (pri->code_buf),
+ "0x%lx", val);
+ return (pri->code_buf);
+ }
+}
const char *
tcp_optname(private_t *pri, long val)
@@ -1918,6 +1930,8 @@ prt_son(private_t *pri, int raw, long val)
switch (pri->sys_args[1]) {
case SOL_SOCKET: outstring(pri, sol_optname(pri, val));
break;
+ case SOL_ROUTE: outstring(pri, route_optname(pri, val));
+ break;
case IPPROTO_TCP: outstring(pri, tcp_optname(pri, val));
break;
case IPPROTO_UDP: outstring(pri, udp_optname(pri, val));
diff --git a/usr/src/cmd/zoneadmd/vplat.c b/usr/src/cmd/zoneadmd/vplat.c
index 72b6ce5c76..fb8f540cb5 100644
--- a/usr/src/cmd/zoneadmd/vplat.c
+++ b/usr/src/cmd/zoneadmd/vplat.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -2397,6 +2397,7 @@ configure_one_interface(zlog_t *zlogp, zoneid_t zone_id,
*/
char buffer[INET6_ADDRSTRLEN];
void *addr;
+ const char *nomatch = "no matching subnet found in netmasks(4)";
if (af == AF_INET)
addr = &((struct sockaddr_in *)
@@ -2405,14 +2406,23 @@ configure_one_interface(zlog_t *zlogp, zoneid_t zone_id,
addr = &((struct sockaddr_in6 *)
(&lifr.lifr_addr))->sin6_addr;
- /* Find out what netmask interface is going to be using */
+ /*
+ * Find out what netmask the interface is going to be using.
+ * If we just brought up an IPMP data address on an underlying
+ * interface above, the address will have already migrated, so
+ * the SIOCGLIFNETMASK won't be able to find it (but we need
+ * to bring the address up to get the actual netmask). Just
+ * omit printing the actual netmask in this corner-case.
+ */
if (ioctl(s, SIOCGLIFNETMASK, (caddr_t)&lifr) < 0 ||
- inet_ntop(af, addr, buffer, sizeof (buffer)) == NULL)
- goto bad;
- zerror(zlogp, B_FALSE,
- "WARNING: %s: no matching subnet found in netmasks(4) for "
- "%s; using default of %s.",
- lifr.lifr_name, addrstr4, buffer);
+ inet_ntop(af, addr, buffer, sizeof (buffer)) == NULL) {
+ zerror(zlogp, B_FALSE, "WARNING: %s; using default.",
+ nomatch);
+ } else {
+ zerror(zlogp, B_FALSE,
+ "WARNING: %s: %s: %s; using default of %s.",
+ lifr.lifr_name, nomatch, addrstr4, buffer);
+ }
}
/*
diff --git a/usr/src/lib/brand/native/zone/platform.xml b/usr/src/lib/brand/native/zone/platform.xml
index f7030ba0a1..69e86cefd2 100644
--- a/usr/src/lib/brand/native/zone/platform.xml
+++ b/usr/src/lib/brand/native/zone/platform.xml
@@ -20,7 +20,7 @@
CDDL HEADER END
- Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ Copyright 2009 Sun Microsystems, Inc. All rights reserved.
Use is subject to license terms.
DO NOT EDIT THIS FILE.
@@ -97,6 +97,7 @@
<device match="ipf" ip-type="exclusive" />
<device match="ipl" ip-type="exclusive" />
<device match="iplookup" ip-type="exclusive" />
+ <device match="ipmpstub" ip-type="exclusive" />
<device match="ipnat" ip-type="exclusive" />
<device match="ipscan" ip-type="exclusive" />
<device match="ipsecah" ip-type="exclusive" />
diff --git a/usr/src/lib/brand/sn1/zone/platform.xml b/usr/src/lib/brand/sn1/zone/platform.xml
index 1659d8851c..b3bb0d7962 100644
--- a/usr/src/lib/brand/sn1/zone/platform.xml
+++ b/usr/src/lib/brand/sn1/zone/platform.xml
@@ -20,7 +20,7 @@
CDDL HEADER END
- Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ Copyright 2009 Sun Microsystems, Inc. All rights reserved.
Use is subject to license terms.
DO NOT EDIT THIS FILE.
@@ -101,6 +101,7 @@
<device match="ipf" ip-type="exclusive" />
<device match="ipl" ip-type="exclusive" />
<device match="iplookup" ip-type="exclusive" />
+ <device match="ipmpstub" ip-type="exclusive" />
<device match="ipnat" ip-type="exclusive" />
<device match="ipscan" ip-type="exclusive" />
<device match="ipsecah" ip-type="exclusive" />
diff --git a/usr/src/lib/libbsm/common/adt.c b/usr/src/lib/libbsm/common/adt.c
index 23f78b6247..d9947622d4 100644
--- a/usr/src/lib/libbsm/common/adt.c
+++ b/usr/src/lib/libbsm/common/adt.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -2137,7 +2137,7 @@ adt_get_local_address(int family, struct ifaddrlist *al)
int ifal_count;
int i;
- if ((ifal_count = ifaddrlist(&ifal, family, errbuf)) <= 0) {
+ if ((ifal_count = ifaddrlist(&ifal, family, 0, errbuf)) <= 0) {
int serrno = errno;
(void) snprintf(msg, sizeof (msg), "adt_get_local_address "
diff --git a/usr/src/lib/libdlpi/common/libdlpi.c b/usr/src/lib/libdlpi/common/libdlpi.c
index 14c4451081..d546807342 100644
--- a/usr/src/lib/libdlpi/common/libdlpi.c
+++ b/usr/src/lib/libdlpi/common/libdlpi.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1109,7 +1109,7 @@ i_dlpi_open(const char *provider, int *fd, uint_t flags, boolean_t style1)
/* open libdladm handle rather than taking it as input */
if (dladm_open(&handle) != DLADM_STATUS_OK)
- return (DLPI_FAILURE);
+ goto fallback;
if (dladm_dev2linkid(handle, device, &linkid) ==
DLADM_STATUS_OK) {
@@ -1400,7 +1400,7 @@ i_dlpi_strgetmsg(dlpi_impl_t *dip, int msec, dlpi_msg_t *dlreplyp,
void *databuf, size_t *datalenp, size_t *totdatalenp)
{
int retval;
- int flags = 0;
+ int flags;
int fd = dip->dli_fd;
struct strbuf ctl, data;
struct pollfd pfd;
@@ -1437,16 +1437,17 @@ i_dlpi_strgetmsg(dlpi_impl_t *dip, int msec, dlpi_msg_t *dlreplyp,
start = gethrtime() / (NANOSEC / MILLISEC);
switch (poll(&pfd, 1, msec)) {
- default:
- if (pfd.revents & POLLHUP)
- return (DL_SYSERR);
- break;
- case 0:
- return (DLPI_ETIMEDOUT);
- case -1:
+ default:
+ if (pfd.revents & POLLHUP)
return (DL_SYSERR);
+ break;
+ case 0:
+ return (DLPI_ETIMEDOUT);
+ case -1:
+ return (DL_SYSERR);
}
+ flags = 0;
if ((retval = getmsg(fd, &ctl, &data, &flags)) < 0)
return (DL_SYSERR);
diff --git a/usr/src/lib/libinetcfg/common/inetcfg.c b/usr/src/lib/libinetcfg/common/inetcfg.c
index 38beca5574..e1f09a881a 100644
--- a/usr/src/lib/libinetcfg/common/inetcfg.c
+++ b/usr/src/lib/libinetcfg/common/inetcfg.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -743,7 +741,8 @@ icfg_set_flags(icfg_handle_t handle, uint64_t flags)
struct lifreq lifr;
uint64_t oflags;
int ret;
- int rtsock;
+ int rtsock = -1;
+ int aware = RTAW_UNDER_IPMP;
(void) strlcpy(lifr.lifr_name, handle->ifh_interface.if_name,
sizeof (lifr.lifr_name));
@@ -757,10 +756,16 @@ icfg_set_flags(icfg_handle_t handle, uint64_t flags)
/*
* Any time flags are changed on an interface that has IFF_UP set,
* you'll get a routing socket message. We care about the status,
- * though, only when the new flags are marked "up."
+ * though, only when the new flags are marked "up." Since we may be
+ * changing an IPMP test address, we enable RTAW_UNDER_IPMP.
*/
- rtsock = (flags & IFF_UP) ?
- socket(PF_ROUTE, SOCK_RAW, ICFG_FAMILY(handle)) : -1;
+ if (flags & IFF_UP) {
+ rtsock = socket(PF_ROUTE, SOCK_RAW, ICFG_FAMILY(handle));
+ if (rtsock != -1) {
+ (void) setsockopt(rtsock, SOL_ROUTE, RT_AWARE, &aware,
+ sizeof (aware));
+ }
+ }
lifr.lifr_flags = flags;
if (ioctl(handle->ifh_sock, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) {
@@ -993,7 +998,8 @@ icfg_set_addr(icfg_handle_t handle, const struct sockaddr *addr,
struct lifreq lifr;
uint64_t flags;
int ret;
- int rtsock;
+ int rtsock = -1;
+ int aware = RTAW_UNDER_IPMP;
(void) memset(&lifr.lifr_addr, 0, sizeof (lifr.lifr_addr));
if ((ret = to_sockaddr_storage(ICFG_FAMILY(handle), addr, addrlen,
@@ -1002,15 +1008,19 @@ icfg_set_addr(icfg_handle_t handle, const struct sockaddr *addr,
}
/*
- * Need to do check on duplicate address detection results if the
- * interface is up.
+ * Need to check duplicate address detection results if the address is
+ * up. Since this may be an IPMP test address, enable RTAW_UNDER_IPMP.
*/
- if ((ret = icfg_get_flags(handle, &flags)) != ICFG_SUCCESS) {
+ if ((ret = icfg_get_flags(handle, &flags)) != ICFG_SUCCESS)
return (ret);
- }
- rtsock = (flags & IFF_UP) ?
- socket(PF_ROUTE, SOCK_RAW, ICFG_FAMILY(handle)) : -1;
+ if (flags & IFF_UP) {
+ rtsock = socket(PF_ROUTE, SOCK_RAW, ICFG_FAMILY(handle));
+ if (rtsock != -1) {
+ (void) setsockopt(rtsock, SOL_ROUTE, RT_AWARE, &aware,
+ sizeof (aware));
+ }
+ }
(void) strlcpy(lifr.lifr_name, handle->ifh_interface.if_name,
sizeof (lifr.lifr_name));
diff --git a/usr/src/lib/libinetutil/Makefile.com b/usr/src/lib/libinetutil/Makefile.com
index 810f24bd71..cd3a0d6e33 100644
--- a/usr/src/lib/libinetutil/Makefile.com
+++ b/usr/src/lib/libinetutil/Makefile.com
@@ -19,15 +19,13 @@
# CDDL HEADER END
#
#
-# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
-LIBRARY = libinetutil.a
-VERS = .1
-OBJECTS = octet.o inetutil4.o ifspec.o ifaddrlist.o eh.o tq.o
+LIBRARY = libinetutil.a
+VERS = .1
+OBJECTS = octet.o inetutil.o ifspec.o ifaddrlist.o ifaddrlistx.o eh.o tq.o
include ../../Makefile.lib
@@ -38,9 +36,9 @@ LIBS = $(DYNLIB) $(LINTLIB)
SRCDIR = ../common
COMDIR = $(SRC)/common/net/dhcp
-SRCS = $(COMDIR)/octet.c $(SRCDIR)/inetutil4.c \
+SRCS = $(COMDIR)/octet.c $(SRCDIR)/inetutil.c \
$(SRCDIR)/ifspec.c $(SRCDIR)/eh.c $(SRCDIR)/tq.c \
- $(SRCDIR)/ifaddrlist.c
+ $(SRCDIR)/ifaddrlist.c $(SRCDIR)/ifaddrlistx.c
$(LINTLIB):= SRCS = $(SRCDIR)/$(LINTSRC)
LDLIBS += -lsocket -lc
diff --git a/usr/src/lib/libinetutil/common/ifaddrlist.c b/usr/src/lib/libinetutil/common/ifaddrlist.c
index 383dc2afb0..fa67a0fc37 100644
--- a/usr/src/lib/libinetutil/common/ifaddrlist.c
+++ b/usr/src/lib/libinetutil/common/ifaddrlist.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -38,9 +38,6 @@
* @(#) $Header: ifaddrlist.c,v 1.2 97/04/22 13:31:05 leres Exp $ (LBL)
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <alloca.h>
#include <errno.h>
#include <libinetutil.h>
#include <stdio.h>
@@ -54,9 +51,9 @@
* See <libinetutil.h> for a description of the programming interface.
*/
int
-ifaddrlist(struct ifaddrlist **ipaddrp, int family, char *errbuf)
+ifaddrlist(struct ifaddrlist **ipaddrp, int family, uint_t flags, char *errbuf)
{
- struct ifaddrlist *ifaddrlist, *al;
+ struct ifaddrlist *ifaddrlist = NULL, *al = NULL;
struct sockaddr_in *sin;
struct sockaddr_in6 *sin6;
struct lifconf lifc;
@@ -64,31 +61,28 @@ ifaddrlist(struct ifaddrlist **ipaddrp, int family, char *errbuf)
struct lifreq *lifrp;
int i, count, nlifr;
int fd;
- const char *iocstr;
+ const char *opstr;
+ (void) memset(&lifc, 0, sizeof (lifc));
if (family != AF_INET && family != AF_INET6) {
(void) strlcpy(errbuf, "invalid address family", ERRBUFSIZE);
return (-1);
}
- fd = socket(family, SOCK_DGRAM, 0);
- if (fd == -1) {
- (void) snprintf(errbuf, ERRBUFSIZE, "socket: %s",
- strerror(errno));
- return (-1);
+ if ((fd = socket(family, SOCK_DGRAM, 0)) == -1) {
+ opstr = "socket";
+ goto fail;
}
/*
* Get the number of network interfaces of type `family'.
*/
lifn.lifn_family = family;
- lifn.lifn_flags = 0;
+ lifn.lifn_flags = flags;
again:
if (ioctl(fd, SIOCGLIFNUM, &lifn) == -1) {
- (void) snprintf(errbuf, ERRBUFSIZE, "SIOCGLIFNUM: %s",
- strerror(errno));
- (void) close(fd);
- return (-1);
+ opstr = "SIOCGLIFNUM";
+ goto fail;
}
/*
@@ -97,16 +91,17 @@ again:
*/
lifn.lifn_count += 4;
+ lifc.lifc_flags = flags;
lifc.lifc_family = family;
lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
- lifc.lifc_buf = alloca(lifc.lifc_len);
- lifc.lifc_flags = 0;
+ if ((lifc.lifc_buf = realloc(lifc.lifc_buf, lifc.lifc_len)) == NULL) {
+ opstr = "realloc";
+ goto fail;
+ }
if (ioctl(fd, SIOCGLIFCONF, &lifc) == -1) {
- (void) snprintf(errbuf, ERRBUFSIZE, "SIOCGLIFCONF: %s",
- strerror(errno));
- (void) close(fd);
- return (-1);
+ opstr = "SIOCGLIFCONF";
+ goto fail;
}
/*
@@ -121,12 +116,9 @@ again:
/*
* Allocate the address list to return.
*/
- ifaddrlist = calloc(nlifr, sizeof (struct ifaddrlist));
- if (ifaddrlist == NULL) {
- (void) snprintf(errbuf, ERRBUFSIZE, "calloc: %s",
- strerror(errno));
- (void) close(fd);
- return (-1);
+ if ((ifaddrlist = calloc(nlifr, sizeof (struct ifaddrlist))) == NULL) {
+ opstr = "calloc";
+ goto fail;
}
/*
@@ -142,7 +134,7 @@ again:
if (ioctl(fd, SIOCGLIFFLAGS, lifrp) == -1) {
if (errno == ENXIO)
continue;
- iocstr = "SIOCGLIFFLAGS";
+ opstr = "SIOCGLIFFLAGS";
goto fail;
}
al->flags = lifrp->lifr_flags;
@@ -150,7 +142,7 @@ again:
if (ioctl(fd, SIOCGLIFINDEX, lifrp) == -1) {
if (errno == ENXIO)
continue;
- iocstr = "SIOCGLIFINDEX";
+ opstr = "SIOCGLIFINDEX";
goto fail;
}
al->index = lifrp->lifr_index;
@@ -158,7 +150,7 @@ again:
if (ioctl(fd, SIOCGLIFADDR, lifrp) == -1) {
if (errno == ENXIO)
continue;
- iocstr = "SIOCGLIFADDR";
+ opstr = "SIOCGLIFADDR";
goto fail;
}
@@ -174,6 +166,7 @@ again:
}
(void) close(fd);
+ free(lifc.lifc_buf);
if (count == 0) {
free(ifaddrlist);
*ipaddrp = NULL;
@@ -183,9 +176,14 @@ again:
*ipaddrp = ifaddrlist;
return (count);
fail:
- (void) snprintf(errbuf, ERRBUFSIZE, "%s: %s: %s", iocstr, al->device,
- strerror(errno));
-
+ if (al == NULL) {
+ (void) snprintf(errbuf, ERRBUFSIZE, "%s: %s", opstr,
+ strerror(errno));
+ } else {
+ (void) snprintf(errbuf, ERRBUFSIZE, "%s: %s: %s", opstr,
+ al->device, strerror(errno));
+ }
+ free(lifc.lifc_buf);
free(ifaddrlist);
(void) close(fd);
return (-1);
diff --git a/usr/src/lib/libinetutil/common/ifaddrlistx.c b/usr/src/lib/libinetutil/common/ifaddrlistx.c
new file mode 100644
index 0000000000..ce85c5521f
--- /dev/null
+++ b/usr/src/lib/libinetutil/common/ifaddrlistx.c
@@ -0,0 +1,168 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <errno.h>
+#include <libinetutil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+
+/*
+ * Create a linked list of the addresses on physical interface `ifname' that
+ * have at least one of the flags in `set' set (no requirement if `set' is
+ * zero) and all of the flags in `clear' clear.
+ *
+ * On success, stores the head of the list (NULL if nothing matched) in
+ * `*ifaddrsp' and returns the number of items in the list; the caller
+ * releases the list with ifaddrlistx_free().  On failure, returns -1 with
+ * errno describing the failed operation and leaves `*ifaddrsp' untouched.
+ */
+int
+ifaddrlistx(const char *ifname, uint64_t set, uint64_t clear,
+    ifaddrlistx_t **ifaddrsp)
+{
+	struct lifconf	lifc;
+	struct lifnum	lifn;
+	struct lifreq	*lifrp;
+	ifaddrlistx_t	*ifaddrp, *ifaddrs = NULL;
+	int		i, nlifr, naddr = 0;
+	char		*cp;
+	void		*newbuf;
+	uint_t		flags;
+	int		s4 = -1, s6 = -1;
+	boolean_t	isv6;
+	int		save_errno;
+	struct sockaddr_storage addr;
+
+	/* Zeroing `lifc' guarantees lifc_buf starts out NULL for realloc(). */
+	(void) memset(&lifc, 0, sizeof (lifc));
+	flags = LIFC_NOXMIT | LIFC_ALLZONES | LIFC_TEMPORARY | LIFC_UNDER_IPMP;
+
+	/*
+	 * We need both IPv4 and IPv6 sockets to query both IPv4 and IPv6
+	 * interfaces below.
+	 */
+	if ((s4 = socket(AF_INET, SOCK_DGRAM, 0)) == -1 ||
+	    (s6 = socket(AF_INET6, SOCK_DGRAM, 0)) == -1) {
+		goto fail;
+	}
+
+	/*
+	 * Get the number of logical interfaces across all address families.
+	 */
+	lifn.lifn_family = AF_UNSPEC;
+	lifn.lifn_flags = flags;
+again:
+	if (ioctl(s4, SIOCGLIFNUM, &lifn) == -1)
+		goto fail;
+
+	/*
+	 * Pad the interface count to detect when additional interfaces have
+	 * been configured between SIOCGLIFNUM and SIOCGLIFCONF.
+	 */
+	lifn.lifn_count += 4;
+
+	lifc.lifc_flags = flags;
+	lifc.lifc_family = AF_UNSPEC;
+	lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
+
+	/*
+	 * Grow the buffer through a temporary: if realloc() fails on a retry
+	 * pass, the previous buffer is still owned by lifc.lifc_buf and gets
+	 * freed at `fail' instead of being leaked.
+	 */
+	if ((newbuf = realloc(lifc.lifc_buf, lifc.lifc_len)) == NULL)
+		goto fail;
+	lifc.lifc_buf = newbuf;
+
+	if (ioctl(s4, SIOCGLIFCONF, &lifc) == -1)
+		goto fail;
+
+	/*
+	 * If every lifr_req slot is taken, then additional interfaces must
+	 * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF.
+	 * Recalculate to make sure we didn't miss any interfaces.
+	 */
+	nlifr = lifc.lifc_len / sizeof (struct lifreq);
+	if (nlifr >= lifn.lifn_count)
+		goto again;
+
+	/*
+	 * Populate the ifaddrlistx by querying each matching interface.  If a
+	 * query ioctl returns ENXIO, then the interface must have been
+	 * removed after the SIOCGLIFCONF completed -- so we just ignore it.
+	 */
+	for (lifrp = lifc.lifc_req, i = 0; i < nlifr; i++, lifrp++) {
+		/*
+		 * Temporarily chop off any ":N" logical-interface suffix so
+		 * that only the physical interface name is compared.
+		 */
+		if ((cp = strchr(lifrp->lifr_name, ':')) != NULL)
+			*cp = '\0';
+
+		if (strcmp(lifrp->lifr_name, ifname) != 0)
+			continue;
+
+		if (cp != NULL)
+			*cp = ':';
+
+		/*
+		 * Save the address now: lifr_addr shares storage with
+		 * lifr_flags, so SIOCGLIFFLAGS overwrites it.
+		 */
+		addr = lifrp->lifr_addr;
+		isv6 = addr.ss_family == AF_INET6;
+		if (ioctl(isv6 ? s6 : s4, SIOCGLIFFLAGS, lifrp) == -1) {
+			if (errno == ENXIO)
+				continue;
+			goto fail;
+		}
+
+		/*
+		 * Skip the address if none of the requested `set' flags are
+		 * on, or if any of the `clear' flags are on.
+		 */
+		if ((set != 0 && (lifrp->lifr_flags & set) == 0) ||
+		    (lifrp->lifr_flags & clear) != 0)
+			continue;
+
+		/*
+		 * We've got a match; allocate a new record and push it onto
+		 * the front of the list.
+		 */
+		if ((ifaddrp = malloc(sizeof (ifaddrlistx_t))) == NULL)
+			goto fail;
+
+		(void) strlcpy(ifaddrp->ia_name, lifrp->lifr_name, LIFNAMSIZ);
+		ifaddrp->ia_flags = lifrp->lifr_flags;
+		ifaddrp->ia_addr = addr;
+		ifaddrp->ia_next = ifaddrs;
+		ifaddrs = ifaddrp;
+		naddr++;
+	}
+
+	(void) close(s4);
+	(void) close(s6);
+	free(lifc.lifc_buf);
+	*ifaddrsp = ifaddrs;
+	return (naddr);
+fail:
+	/* Preserve the errno of the failed operation across the cleanup. */
+	save_errno = errno;
+	if (s4 != -1)
+		(void) close(s4);
+	if (s6 != -1)
+		(void) close(s6);
+	free(lifc.lifc_buf);
+	ifaddrlistx_free(ifaddrs);
+	errno = save_errno;
+	return (-1);
+}
+
+/*
+ * Release every node of the ifaddrlistx_t chain headed by `ifaddrp'.
+ * Passing NULL is a no-op.
+ */
+void
+ifaddrlistx_free(ifaddrlistx_t *ifaddrp)
+{
+	while (ifaddrp != NULL) {
+		ifaddrlistx_t *doomed = ifaddrp;
+
+		/* Advance before freeing so the link is never read stale. */
+		ifaddrp = doomed->ia_next;
+		free(doomed);
+	}
+}
diff --git a/usr/src/lib/libinetutil/common/inetutil4.c b/usr/src/lib/libinetutil/common/inetutil.c
index ff5607e192..195d080b79 100644
--- a/usr/src/lib/libinetutil/common/inetutil4.c
+++ b/usr/src/lib/libinetutil/common/inetutil.c
@@ -18,13 +18,12 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <unistd.h>
#include <netinet/in.h>
#include <libinetutil.h>
@@ -32,7 +31,7 @@
extern int getnetmaskbyaddr(const struct in_addr, struct in_addr *);
/*
- * Generic internet (v4) functions.
+ * Internet utility functions.
*/
/*
@@ -67,3 +66,32 @@ get_netmask4(const struct in_addr *n_addrp, struct in_addr *s_addrp)
else
s_addrp->s_addr = IN_CLASSE_NET;
}
+
+/*
+ * Report whether `ssp1' and `ssp2' carry the same IP address.  Addresses of
+ * differing families never match; only the address portion of AF_INET and
+ * AF_INET6 sockaddrs is compared.  Returns B_TRUE on a match, B_FALSE
+ * otherwise.
+ */
+boolean_t
+sockaddrcmp(const struct sockaddr_storage *ssp1,
+    const struct sockaddr_storage *ssp2)
+{
+	if (ssp1->ss_family != ssp2->ss_family)
+		return (B_FALSE);
+
+	/* Same object (and thus same family): trivially equal. */
+	if (ssp1 == ssp2)
+		return (B_TRUE);
+
+	if (ssp1->ss_family == AF_INET) {
+		const struct sockaddr_in *sin1, *sin2;
+
+		sin1 = (const struct sockaddr_in *)ssp1;
+		sin2 = (const struct sockaddr_in *)ssp2;
+		return (sin1->sin_addr.s_addr == sin2->sin_addr.s_addr);
+	}
+
+	if (ssp1->ss_family == AF_INET6) {
+		const struct in6_addr *v6a, *v6b;
+
+		v6a = &((const struct sockaddr_in6 *)ssp1)->sin6_addr;
+		v6b = &((const struct sockaddr_in6 *)ssp2)->sin6_addr;
+		return (IN6_ARE_ADDR_EQUAL(v6a, v6b));
+	}
+
+	/* Unrecognized address family. */
+	return (B_FALSE);
+}
diff --git a/usr/src/lib/libinetutil/common/libinetutil.h b/usr/src/lib/libinetutil/common/libinetutil.h
index b21d54f56c..0bece07e07 100644
--- a/usr/src/lib/libinetutil/common/libinetutil.h
+++ b/usr/src/lib/libinetutil/common/libinetutil.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -21,15 +20,13 @@
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _LIBINETUTIL_H
#define _LIBINETUTIL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Contains SMI-private API for general Internet functionality
*/
@@ -59,11 +56,14 @@ typedef struct {
extern boolean_t ifparse_ifspec(const char *, ifspec_t *);
extern void get_netmask4(const struct in_addr *, struct in_addr *);
+extern boolean_t sockaddrcmp(const struct sockaddr_storage *,
+ const struct sockaddr_storage *);
/*
* Extended version of the classic BSD ifaddrlist() interface:
*
- * int ifaddrlist(struct ifaddrlist **addrlistp, int af, char *errbuf);
+ * int ifaddrlist(struct ifaddrlist **addrlistp, int af, uint_t flags,
+ * char *errbuf);
*
* * addrlistp: Upon success, ifaddrlist() sets *addrlistp to a
* dynamically-allocated array of addresses.
@@ -71,6 +71,9 @@ extern void get_netmask4(const struct in_addr *, struct in_addr *);
* * af: Either AF_INET to obtain IPv4 addresses, or AF_INET6 to
* obtain IPv6 addresses.
*
+ * * flags: LIFC_* flags that control the classes of interfaces that
+ * will be visible.
+ *
* * errbuf: A caller-supplied buffer of ERRBUFSIZE. Upon failure,
* provides the reason for the failure.
*
@@ -89,9 +92,43 @@ struct ifaddrlist {
uint64_t flags; /* interface flags */
};
-#define ERRBUFSIZE 128 /* expected size of third argument */
+#define ERRBUFSIZE 128 /* expected size of fourth argument */
+
+extern int ifaddrlist(struct ifaddrlist **, int, uint_t, char *);
-extern int ifaddrlist(struct ifaddrlist **, int, char *);
+/*
+ * Similar to ifaddrlist(), but returns a linked-list of addresses for a
+ * *specific* interface name, and allows specific address flags to be matched
+ * against. A linked list is used rather than an array so that information
+ * can grow over time without affecting binary compatibility. Also, leaves
+ * error-handling up to the caller. Returns the number of ifaddrlistx's
+ * chained through ifaddrp.
+ *
+ * int ifaddrlistx(const char *ifname, uint64_t set, uint64_t clear,
+ * ifaddrlistx_t **ifaddrp);
+ *
+ * * ifname: Interface name to match against.
+ *
+ * * set: One or more flags that must be set on the address for
+ * it to be returned.
+ *
+ * * clear: Flags that must be clear on the address for it to be
+ * returned.
+ *
+ * * ifaddrp: Upon success, ifaddrlistx() sets *ifaddrp to the head
+ * of a dynamically-allocated linked list of ifaddrlistx structures.
+ *
+ * Once done, the caller must free `ifaddrp' by calling ifaddrlistx_free().
+ */
+typedef struct ifaddrlistx {
+ struct ifaddrlistx *ia_next; /* next entry; NULL at end of list */
+ char ia_name[LIFNAMSIZ]; /* logical interface name */
+ uint64_t ia_flags; /* flags from SIOCGLIFFLAGS */
+ struct sockaddr_storage ia_addr; /* address from SIOCGLIFCONF */
+} ifaddrlistx_t;
+
+extern int ifaddrlistx(const char *, uint64_t, uint64_t, ifaddrlistx_t **);
+extern void ifaddrlistx_free(ifaddrlistx_t *);
/*
* Timer queues
diff --git a/usr/src/lib/libinetutil/common/mapfile-vers b/usr/src/lib/libinetutil/common/mapfile-vers
index 51c168fcc4..c9a7829fdb 100644
--- a/usr/src/lib/libinetutil/common/mapfile-vers
+++ b/usr/src/lib/libinetutil/common/mapfile-vers
@@ -19,17 +19,17 @@
# CDDL HEADER END
#
#
-# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
SUNWprivate_1.1 {
global:
get_netmask4;
hexascii_to_octet;
ifaddrlist;
+ ifaddrlistx;
+ ifaddrlistx_free;
ifparse_ifspec;
iu_adjust_timer;
iu_cancel_timer;
@@ -48,6 +48,7 @@ SUNWprivate_1.1 {
iu_tq_destroy;
iu_unregister_event;
octet_to_hexascii;
+ sockaddrcmp;
local:
*;
};
diff --git a/usr/src/lib/libipmp/Makefile b/usr/src/lib/libipmp/Makefile
index 188c49c073..5d52f304dc 100644
--- a/usr/src/lib/libipmp/Makefile
+++ b/usr/src/lib/libipmp/Makefile
@@ -19,15 +19,13 @@
# CDDL HEADER END
#
#
-# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
include $(SRC)/lib/Makefile.lib
-HDRS = ipmp.h ipmp_mpathd.h ipmp_query.h ipmp_query_impl.h
+HDRS = ipmp.h ipmp_admin.h ipmp_mpathd.h ipmp_query.h ipmp_query_impl.h
HDRDIR = common
SUBDIRS = $(MACH)
diff --git a/usr/src/lib/libipmp/Makefile.com b/usr/src/lib/libipmp/Makefile.com
index bea02659a8..d3065ae37c 100644
--- a/usr/src/lib/libipmp/Makefile.com
+++ b/usr/src/lib/libipmp/Makefile.com
@@ -19,20 +19,19 @@
# CDDL HEADER END
#
#
-# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
LIBRARY = libipmp.a
VERS = .1
-OBJECTS = ipmp_query.o ipmp_mpathd.o ipmp.o
+OBJECTS = ipmp_admin.o ipmp_query.o ipmp_mpathd.o ipmp.o
include ../../Makefile.lib
+include ../../Makefile.rootfs
LIBS = $(DYNLIB) $(LINTLIB)
-LDLIBS += -lsocket -lc
+LDLIBS += -linetutil -lsocket -lc
SRCDIR = ../common
$(LINTLIB):= SRCS = $(SRCDIR)/$(LINTSRC)
diff --git a/usr/src/lib/libipmp/common/ipmp.c b/usr/src/lib/libipmp/common/ipmp.c
index b9a7984889..cf9c3c7c3c 100644
--- a/usr/src/lib/libipmp/common/ipmp.c
+++ b/usr/src/lib/libipmp/common/ipmp.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* IPMP general interfaces (PSARC/2002/615).
*/
@@ -34,6 +31,8 @@
#include <stdlib.h>
#include <locale.h>
#include <unistd.h>
+#include <string.h>
+#include <errno.h>
#include "ipmp_impl.h"
@@ -92,13 +91,15 @@ static char *errmsgs[IPMP_NERR] = {
"operation failed", /* 1 IPMP_FAILURE */
"minimum failover redundancy not met", /* 2 IPMP_EMINRED */
"failback disabled", /* 3 IPMP_EFBDISABLED */
- "unable to completely fail back", /* 4 IPMP_EFBPARTIAL */
+ "unknown IPMP data address", /* 4 IPMP_EUNKADDR */
"invalid argument", /* 5 IPMP_EINVAL */
"out of memory", /* 6 IPMP_ENOMEM */
"cannot contact in.mpathd", /* 7 IPMP_ENOMPATHD */
"unknown IPMP group", /* 8 IPMP_EUNKGROUP */
"interface is not using IPMP", /* 9 IPMP_EUNKIF */
- "unable to communicate with in.mpathd" /* 10 IPMP_EPROTO */
+ "unable to communicate with in.mpathd", /* 10 IPMP_EPROTO */
+ "interface has duplicate hardware address"
+ /* 11 IPMP_EHWADDRDUP */
};
/*
@@ -110,5 +111,8 @@ ipmp_errmsg(int error)
if (error >= IPMP_NERR || error < 0)
return (dgettext(TEXT_DOMAIN, "<unknown error>"));
+ if (error == IPMP_FAILURE)
+ return (strerror(errno));
+
return (dgettext(TEXT_DOMAIN, errmsgs[error]));
}
diff --git a/usr/src/lib/libipmp/common/ipmp.h b/usr/src/lib/libipmp/common/ipmp.h
index 0112615a84..2ca0a9b2b9 100644
--- a/usr/src/lib/libipmp/common/ipmp.h
+++ b/usr/src/lib/libipmp/common/ipmp.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _IPMP_H
#define _IPMP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* General IPMP-related definitions and functions.
*
@@ -50,13 +47,14 @@ enum {
IPMP_FAILURE, /* operation failed (check errno) */
IPMP_EMINRED, /* minimum failover redundancy not met */
IPMP_EFBDISABLED, /* failback disabled */
- IPMP_EFBPARTIAL, /* unable to completely fail back */
+ IPMP_EUNKADDR, /* unknown IPMP data address */
IPMP_EINVAL, /* invalid argument */
IPMP_ENOMEM, /* out of memory */
IPMP_ENOMPATHD, /* cannot contact in.mpathd */
IPMP_EUNKGROUP, /* unknown IPMP group */
IPMP_EUNKIF, /* interface is not using IPMP */
IPMP_EPROTO, /* unable to communicate with in.mpathd */
+ IPMP_EHWADDRDUP, /* interface has duplicate hardware address */
IPMP_NERR /* number of error codes */
};
diff --git a/usr/src/lib/libipmp/common/ipmp_admin.c b/usr/src/lib/libipmp/common/ipmp_admin.c
new file mode 100644
index 0000000000..8a282f5286
--- /dev/null
+++ b/usr/src/lib/libipmp/common/ipmp_admin.c
@@ -0,0 +1,104 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * IPMP administrative interfaces (see PSARC/2007/272).
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+#include "ipmp_impl.h"
+#include "ipmp_mpathd.h"
+#include "ipmp_admin.h"
+
+/*
+ * Send the `reqsize'-byte request `req' to in.mpathd over a fresh connection
+ * and wait up to IPMP_REQTIMEOUT seconds for its mi_result_t reply.  Returns
+ * an IPMP error code; when a reply is received, errno is set to the reply's
+ * me_sys_error.  The connection is always closed before returning.
+ */
+static int
+ipmp_command(ipmp_handle_t handle, const void *req, uint_t reqsize)
+{
+	ipmp_state_t	*statep = (ipmp_state_t *)handle;
+	mi_result_t	reply;
+	struct timeval	deadline;
+	int		err;
+	int		rc;
+
+	/* Compute the absolute time by which the reply must arrive. */
+	if (gettimeofday(&deadline, NULL) == -1)
+		return (IPMP_FAILURE);
+	deadline.tv_sec += IPMP_REQTIMEOUT;
+
+	assert(statep->st_fd == -1);
+	if ((rc = ipmp_connect(&statep->st_fd)) != IPMP_SUCCESS)
+		return (rc);
+
+	rc = ipmp_write(statep->st_fd, req, reqsize);
+	if (rc == IPMP_SUCCESS) {
+		rc = ipmp_read(statep->st_fd, &reply, sizeof (reply),
+		    &deadline);
+	}
+	if (rc == IPMP_SUCCESS) {
+		/* Hand the daemon's verdict back to the caller. */
+		errno = reply.me_sys_error;
+		rc = reply.me_mpathd_error;
+	}
+
+	/* Tear down the connection without disturbing errno. */
+	err = errno;
+	(void) close(statep->st_fd);
+	statep->st_fd = -1;
+	errno = err;
+	return (rc);
+}
+
+/*
+ * Ask in.mpathd to offline interface `ifname', provided that at least
+ * `minred' usable interfaces would remain afterwards.  Returns an IPMP
+ * error code.
+ */
+int
+ipmp_offline(ipmp_handle_t handle, const char *ifname, uint_t minred)
+{
+	mi_offline_t mio;
+
+	/*
+	 * Zero the whole request first: strlcpy() does not pad, and the
+	 * entire structure is written to the daemon, so without this any
+	 * bytes after the name's terminator would be uninitialized stack.
+	 */
+	(void) memset(&mio, 0, sizeof (mio));
+	mio.mio_command = MI_OFFLINE;
+	mio.mio_min_redundancy = minred;
+	(void) strlcpy(mio.mio_ifname, ifname, LIFNAMSIZ);
+	return (ipmp_command(handle, &mio, sizeof (mio)));
+}
+
+/*
+ * Ask in.mpathd to undo a previous offline of interface `ifname'.  Returns
+ * an IPMP error code.
+ */
+int
+ipmp_undo_offline(ipmp_handle_t handle, const char *ifname)
+{
+	mi_undo_offline_t miu;
+
+	/*
+	 * Zero the whole request first: strlcpy() does not pad, and the
+	 * entire structure is written to the daemon, so without this any
+	 * bytes after the name's terminator would be uninitialized stack.
+	 */
+	(void) memset(&miu, 0, sizeof (miu));
+	miu.miu_command = MI_UNDO_OFFLINE;
+	(void) strlcpy(miu.miu_ifname, ifname, LIFNAMSIZ);
+	return (ipmp_command(handle, &miu, sizeof (miu)));
+}
+
+/*
+ * Send an MI_PING request to in.mpathd.  Returns an IPMP error code.
+ */
+int
+ipmp_ping_daemon(ipmp_handle_t handle)
+{
+	mi_ping_t ping;
+
+	ping.mip_command = MI_PING;
+
+	return (ipmp_command(handle, &ping, sizeof (ping)));
+}
diff --git a/usr/src/lib/libipmp/common/ipmp_admin.h b/usr/src/lib/libipmp/common/ipmp_admin.h
new file mode 100644
index 0000000000..fa0986f7fa
--- /dev/null
+++ b/usr/src/lib/libipmp/common/ipmp_admin.h
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _IPMP_ADMIN_H
+#define _IPMP_ADMIN_H
+
+#include <ipmp.h>
+#include <sys/types.h>
+
+/*
+ * IPMP administrative interfaces.
+ *
+ * These interfaces may only be used within ON or after signing a contract
+ * with ON. For documentation, refer to PSARC/2007/272.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int ipmp_offline(ipmp_handle_t, const char *, uint_t);
+extern int ipmp_undo_offline(ipmp_handle_t, const char *);
+extern int ipmp_ping_daemon(ipmp_handle_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _IPMP_ADMIN_H */
diff --git a/usr/src/lib/libipmp/common/ipmp_mpathd.c b/usr/src/lib/libipmp/common/ipmp_mpathd.c
index ee1d35de33..e24de71017 100644
--- a/usr/src/lib/libipmp/common/ipmp_mpathd.c
+++ b/usr/src/lib/libipmp/common/ipmp_mpathd.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -18,14 +17,11 @@
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
- */
-/*
- * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Low-level interfaces for communicating with in.mpathd(1M).
*
@@ -66,16 +62,16 @@ ipmp_connect(int *fdp)
return (IPMP_FAILURE);
/*
- * Enable TCP_ANONPRIVBIND so the kernel will choose our source port.
- * Since we're using loopback sockets, requiring use of privileged
- * source ports is sufficient for security.
+ * If we have sufficient privilege, enable TCP_ANONPRIVBIND so the
+ * kernel will choose a privileged source port (since in.mpathd only
+ * accepts requests on loopback, this is sufficient for security).
+ * If not, drive on since MI_QUERY and MI_PING commands are allowed
+ * from non-privileged ports.
*/
- if (setsockopt(fd, IPPROTO_TCP, TCP_ANONPRIVBIND, &on,
- sizeof (on)) == -1)
- goto fail;
+ (void) setsockopt(fd, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, sizeof (on));
/*
- * Bind to a privileged port chosen by the kernel.
+ * Bind to a port chosen by the kernel.
*/
(void) memset(&sin, 0, sizeof (struct sockaddr_in));
sin.sin_port = htons(0);
diff --git a/usr/src/lib/libipmp/common/ipmp_mpathd.h b/usr/src/lib/libipmp/common/ipmp_mpathd.h
index 61ae71b78f..7df3b4fd92 100644
--- a/usr/src/lib/libipmp/common/ipmp_mpathd.h
+++ b/usr/src/lib/libipmp/common/ipmp_mpathd.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -18,26 +17,17 @@
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
- */
-/*
- * Copyright 1999-2002 Sun Microsystems, Inc. All rights reserved.
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _IPMP_MPATHD_H
#define _IPMP_MPATHD_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Definitions for the messaging protocol between in.mpathd and libipmp.
- * This interface is loosely documented in PSARC/2000/306.
- *
- * PLEASE NOTE: Although this interface is officially consolidation-private,
- * we will be reclassifying it as project-private in the future, and
- * transitioning any existing consumers to use higher-level libipmp routines.
- *
- * Put another way: treat this as if it was project-private!
+ * This interface is project-private to the IPMP subsystem.
*/
#include <sys/types.h>
@@ -49,33 +39,41 @@ extern "C" {
#endif
#define MPATHD_PORT 5999
-#define MPATHD_PATH "/usr/lib/inet/in.mpathd"
+#define MPATHD_PATH "/lib/inet/in.mpathd"
/*
* Supported commands.
*/
enum {
- MI_PING = 0, /* sanity test */
+ MI_PING = 0, /* ping in.mpathd */
MI_OFFLINE = 1, /* offline the interface */
MI_UNDO_OFFLINE = 2, /* undo the offline */
- MI_SETOINDEX = 3, /* set original interface index */
- MI_QUERY = 4, /* query ipmp-related information */
+ MI_QUERY = 3, /* query ipmp-related information */
MI_NCMD /* total number of commands */
};
/*
* Types of information which can be requested and received (except for
- * IPMP_IFLIST, which can only be received).
+ * IPMP_IFLIST and IPMP_ADDRLIST, which can only be received).
*/
typedef enum {
IPMP_GROUPLIST = 1,
IPMP_GROUPINFO = 2,
IPMP_IFINFO = 3,
IPMP_IFLIST = 4,
- IPMP_SNAP = 5
+ IPMP_SNAP = 5,
+ IPMP_ADDRLIST = 6,
+ IPMP_ADDRINFO = 7
} ipmp_infotype_t;
/*
+ * Daemon ping request.
+ */
+typedef struct mi_ping {
+ uint32_t mip_command; /* command code: MI_PING */
+} mi_ping_t;
+
+/*
* Interface offline request; `mio_ifname' is the interface to offline;
* `mio_min_redundancy' is the minimum amount of usable interfaces after
* offline that must exist for the operation to succeed.
@@ -83,7 +81,6 @@ typedef enum {
typedef struct mi_offline {
uint32_t mio_command;
char mio_ifname[LIFNAMSIZ];
- char mio_move_to_if[LIFNAMSIZ]; /* currently unused */
uint32_t mio_min_redundancy;
} mi_offline_t;
@@ -97,24 +94,12 @@ typedef struct mi_undo_offline {
} mi_undo_offline_t;
/*
- * Set original interface index request: `mis_lifname' is the name of the
- * logical interface that is having its index reset; `mis_new_pifname' is the
- * name of the interface whose index will be associated with `mis_lifname';
- * `mis_iftype' is the interface type.
- */
-typedef struct mi_setoindex {
- uint32_t mis_command;
- char mis_lifname[LIFNAMSIZ];
- char mis_new_pifname[LIFNAMSIZ];
- uint32_t mis_iftype;
-} mi_setoindex_t;
-
-/*
* Retrieve IPMP-related information: `miq_inforeq' is the type of information
- * being request (see above for the list of types). If the request is for
- * either IPMP_GROUPINFO or IPMP_IFINFO, then either `miq_grname' or
- * `miq_ifname' should be set (respectively) to indicate the name of the
- * group or interface to retrieve the information for.
+ * being requested (see above for the list of types). If the request type is
+ * IPMP_GROUPINFO, then `miq_grname' indicates the group. If the request type
+ * is IPMP_IFINFO, then `miq_ifname' indicates the interface. If the request
+ * type is IPMP_ADDRINFO then `miq_grname' indicates the group and `miq_addr'
+ * indicates the address.
*/
typedef struct mi_query {
uint32_t miq_command;
@@ -123,6 +108,7 @@ typedef struct mi_query {
char miqu_ifname[LIFNAMSIZ];
char miqu_grname[LIFGRNAMSIZ];
} miq_infodata;
+ struct sockaddr_storage miq_addr;
} mi_query_t;
#define miq_ifname miq_infodata.miqu_ifname
#define miq_grname miq_infodata.miqu_grname
@@ -132,10 +118,10 @@ typedef struct mi_query {
* requirement for receiving any command.
*/
union mi_commands {
- uint32_t mi_command;
+ uint32_t mi_command;
+ mi_ping_t mi_pcmd;
mi_offline_t mi_ocmd;
mi_undo_offline_t mi_ucmd;
- mi_setoindex_t mi_scmd;
mi_query_t mi_qcmd;
};
@@ -147,18 +133,7 @@ typedef struct mi_result {
uint32_t me_mpathd_error; /* Mpathd error */
} mi_result_t;
-/*
- * Legacy values for me_mpathd_error; the daemon now returns the IPMP
- * error codes defined in <ipmp.h>, which are compatible with these error
- * codes. These will be removed in the future.
- */
-enum {
- MPATHD_SUCCESS = 0, /* operation succeeded */
- MPATHD_SYS_ERROR = 1, /* check me_sys_error for the errno */
- MPATHD_MIN_RED_ERROR = 2, /* minimum redundancy not met */
- MPATHD_FAILBACK_DISABLED = 3, /* failback administratively disabled */
- MPATHD_FAILBACK_PARTIAL = 4 /* unable to completely failback */
-};
+#define IPMP_REQTIMEOUT 5 /* seconds */
extern int ipmp_connect(int *);
extern int ipmp_read(int, void *, size_t, const struct timeval *);
diff --git a/usr/src/lib/libipmp/common/ipmp_query.c b/usr/src/lib/libipmp/common/ipmp_query.c
index 8a7dc7ee69..a0af2da578 100644
--- a/usr/src/lib/libipmp/common/ipmp_query.c
+++ b/usr/src/lib/libipmp/common/ipmp_query.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -18,20 +17,18 @@
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
- */
-/*
- * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
- * IPMP query interfaces (PSARC/2002/615).
+ * IPMP query interfaces (see PSARC/2002/615 and PSARC/2007/272).
*/
#include <assert.h>
#include <errno.h>
+#include <libinetutil.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
@@ -41,13 +38,19 @@
#include "ipmp_mpathd.h"
#include "ipmp_query_impl.h"
-#define IPMP_REQTIMEOUT 5 /* seconds */
-
static ipmp_ifinfo_t *ipmp_ifinfo_clone(ipmp_ifinfo_t *);
+static ipmp_addrinfo_t *ipmp_addrinfo_clone(ipmp_addrinfo_t *);
+static ipmp_addrlist_t *ipmp_addrlist_clone(ipmp_addrlist_t *);
static ipmp_grouplist_t *ipmp_grouplist_clone(ipmp_grouplist_t *);
static ipmp_groupinfo_t *ipmp_groupinfo_clone(ipmp_groupinfo_t *);
+static ipmp_iflist_t *ipmp_iflist_create(uint_t, char (*)[LIFNAMSIZ]);
+static void ipmp_freeiflist(ipmp_iflist_t *);
+static ipmp_addrlist_t *ipmp_addrlist_create(uint_t, struct sockaddr_storage *);
+static void ipmp_freeaddrlist(ipmp_addrlist_t *);
static ipmp_groupinfo_t *ipmp_snap_getgroupinfo(ipmp_snap_t *, const char *);
static ipmp_ifinfo_t *ipmp_snap_getifinfo(ipmp_snap_t *, const char *);
+static ipmp_addrinfo_t *ipmp_snap_getaddrinfo(ipmp_snap_t *, const char *,
+ struct sockaddr_storage *);
static int ipmp_snap_take(ipmp_state_t *, ipmp_snap_t **);
static boolean_t ipmp_checktlv(ipmp_infotype_t, size_t, void *);
static int ipmp_querydone(ipmp_state_t *, int);
@@ -62,7 +65,7 @@ static int ipmp_querydone(ipmp_state_t *, int);
*/
static int
ipmp_sendquery(ipmp_state_t *statep, ipmp_infotype_t type, const char *name,
- struct timeval *endtp)
+ const void *addr, struct timeval *endtp)
{
mi_query_t query;
mi_result_t result;
@@ -72,6 +75,11 @@ ipmp_sendquery(ipmp_state_t *statep, ipmp_infotype_t type, const char *name,
query.miq_inforeq = type;
switch (type) {
+ case IPMP_ADDRINFO:
+ (void) strlcpy(query.miq_grname, name, LIFGRNAMSIZ);
+ query.miq_addr = *(struct sockaddr_storage *)addr;
+ break;
+
case IPMP_GROUPINFO:
(void) strlcpy(query.miq_grname, name, LIFGRNAMSIZ);
break;
@@ -138,6 +146,61 @@ ipmp_readinfo(ipmp_state_t *statep, ipmp_infotype_t infotype, void **infop,
}
/*
+ * Using `statep', read in the remaining IPMP group information TLVs from
+ * in.mpathd into `grinfop' before the current time becomes `endtp'. Returns
+ * an IPMP error code. On failure, `grinfop' will have its original contents.
+ *
+ * Two TLVs are read, in order: an IPMP_IFLIST and then an IPMP_ADDRLIST.
+ * Both are held in locals and only stored into `grinfop' (as gr_iflistp and
+ * gr_adlistp) once both reads have succeeded, which is what leaves `grinfop'
+ * untouched on failure. If the second read fails, the interface list from
+ * the first read is freed before returning.
+ */
+static int
+ipmp_readgroupinfo_lists(ipmp_state_t *statep, ipmp_groupinfo_t *grinfop,
+ const struct timeval *endtp)
+{
+ int retval;
+ ipmp_iflist_t *iflistp;
+ ipmp_addrlist_t *adlistp;
+
+ retval = ipmp_readinfo(statep, IPMP_IFLIST, (void **)&iflistp, endtp);
+ if (retval != IPMP_SUCCESS)
+ return (retval);
+
+ retval = ipmp_readinfo(statep, IPMP_ADDRLIST, (void **)&adlistp, endtp);
+ if (retval != IPMP_SUCCESS) {
+ ipmp_freeiflist(iflistp); /* discard the list read above */
+ return (retval);
+ }
+
+ grinfop->gr_iflistp = iflistp;
+ grinfop->gr_adlistp = adlistp;
+ return (IPMP_SUCCESS);
+}
+
+/*
+ * Using `statep', read in the remaining IPMP interface information TLVs from
+ * in.mpathd into `ifinfop' before the current time becomes `endtp'. Returns
+ * an IPMP error code. On failure, `ifinfop' will have its original contents.
+ *
+ * Two IPMP_ADDRLIST TLVs are read back to back: the first becomes the target
+ * list of if_targinfo4 and the second the target list of if_targinfo6. Both
+ * are held in locals and only stored into `ifinfop' once both reads have
+ * succeeded. If the second read fails, the list from the first read is
+ * freed before returning.
+ */
+static int
+ipmp_readifinfo_lists(ipmp_state_t *statep, ipmp_ifinfo_t *ifinfop,
+ const struct timeval *endtp)
+{
+ int retval;
+ ipmp_addrlist_t *tlist4p, *tlist6p;
+
+ retval = ipmp_readinfo(statep, IPMP_ADDRLIST, (void **)&tlist4p, endtp);
+ if (retval != IPMP_SUCCESS)
+ return (retval);
+
+ retval = ipmp_readinfo(statep, IPMP_ADDRLIST, (void **)&tlist6p, endtp);
+ if (retval != IPMP_SUCCESS) {
+ ipmp_freeaddrlist(tlist4p); /* discard the list read above */
+ return (retval);
+ }
+
+ ifinfop->if_targinfo4.it_targlistp = tlist4p;
+ ifinfop->if_targinfo6.it_targlistp = tlist6p;
+ return (IPMP_SUCCESS);
+}
+
+/*
* Complete the query operation started in ipmp_sendquery(). The interface is
* designed to be easy to use in the `return' statement of a function, and
* thus returns the passed in `retval' and preserves `errno'.
@@ -169,7 +232,7 @@ ipmp_getgrouplist(ipmp_handle_t handle, ipmp_grouplist_t **grlistpp)
return (*grlistpp != NULL ? IPMP_SUCCESS : IPMP_ENOMEM);
}
- retval = ipmp_sendquery(statep, IPMP_GROUPLIST, NULL, &end);
+ retval = ipmp_sendquery(statep, IPMP_GROUPLIST, NULL, NULL, &end);
if (retval != IPMP_SUCCESS)
return (retval);
@@ -196,7 +259,6 @@ ipmp_getgroupinfo(ipmp_handle_t handle, const char *name,
ipmp_groupinfo_t **grinfopp)
{
ipmp_state_t *statep = handle;
- ipmp_iflist_t *iflistp;
int retval;
struct timeval end;
ipmp_groupinfo_t *grinfop;
@@ -210,7 +272,7 @@ ipmp_getgroupinfo(ipmp_handle_t handle, const char *name,
return (*grinfopp != NULL ? IPMP_SUCCESS : IPMP_ENOMEM);
}
- retval = ipmp_sendquery(statep, IPMP_GROUPINFO, name, &end);
+ retval = ipmp_sendquery(statep, IPMP_GROUPINFO, name, NULL, &end);
if (retval != IPMP_SUCCESS)
return (retval);
@@ -218,11 +280,9 @@ ipmp_getgroupinfo(ipmp_handle_t handle, const char *name,
if (retval != IPMP_SUCCESS)
return (ipmp_querydone(statep, retval));
- retval = ipmp_readinfo(statep, IPMP_IFLIST, (void **)&iflistp, &end);
+ retval = ipmp_readgroupinfo_lists(statep, *grinfopp, &end);
if (retval != IPMP_SUCCESS)
free(*grinfopp);
- else
- (*grinfopp)->gr_iflistp = iflistp;
return (ipmp_querydone(statep, retval));
}
@@ -233,7 +293,8 @@ ipmp_getgroupinfo(ipmp_handle_t handle, const char *name,
void
ipmp_freegroupinfo(ipmp_groupinfo_t *grinfop)
{
- free(grinfop->gr_iflistp);
+ ipmp_freeaddrlist(grinfop->gr_adlistp);
+ ipmp_freeiflist(grinfop->gr_iflistp);
free(grinfop);
}
@@ -259,11 +320,18 @@ ipmp_getifinfo(ipmp_handle_t handle, const char *name, ipmp_ifinfo_t **ifinfopp)
return (*ifinfopp != NULL ? IPMP_SUCCESS : IPMP_ENOMEM);
}
- retval = ipmp_sendquery(statep, IPMP_IFINFO, name, &end);
+ retval = ipmp_sendquery(statep, IPMP_IFINFO, name, NULL, &end);
if (retval != IPMP_SUCCESS)
return (retval);
retval = ipmp_readinfo(statep, IPMP_IFINFO, (void **)ifinfopp, &end);
+ if (retval != IPMP_SUCCESS)
+ return (ipmp_querydone(statep, retval));
+
+ retval = ipmp_readifinfo_lists(statep, *ifinfopp, &end);
+ if (retval != IPMP_SUCCESS)
+ free(*ifinfopp);
+
return (ipmp_querydone(statep, retval));
}
@@ -273,10 +341,52 @@ ipmp_getifinfo(ipmp_handle_t handle, const char *name, ipmp_ifinfo_t **ifinfopp)
void
ipmp_freeifinfo(ipmp_ifinfo_t *ifinfop)
{
+ ipmp_freeaddrlist(ifinfop->if_targinfo4.it_targlistp);
+ ipmp_freeaddrlist(ifinfop->if_targinfo6.it_targlistp);
free(ifinfop);
}
/*
+ * Using `handle', get the address information associated with address `addrp'
+ * on group `grname' and store the results in a dynamically allocated buffer
+ * pointed to by `*adinfopp'. Returns an IPMP error code.
+ *
+ * If the handle has a snapshot attached (st_snap is non-NULL), the address
+ * is looked up in that snapshot and a private copy of the entry is returned:
+ * IPMP_EUNKADDR if the snapshot has no matching entry, IPMP_ENOMEM if the
+ * copy cannot be allocated. Otherwise an IPMP_ADDRINFO query is issued with
+ * ipmp_sendquery(), the reply is read with ipmp_readinfo(), and the query is
+ * completed with ipmp_querydone(), which passes the read result through.
+ *
+ * The buffer returned in `*adinfopp' is released with ipmp_freeaddrinfo().
+ */
+int
+ipmp_getaddrinfo(ipmp_handle_t handle, const char *grname,
+ struct sockaddr_storage *addrp, ipmp_addrinfo_t **adinfopp)
+{
+ ipmp_state_t *statep = handle;
+ ipmp_addrinfo_t *adinfop;
+ int retval;
+ struct timeval end;
+
+ if (statep->st_snap != NULL) {
+ adinfop = ipmp_snap_getaddrinfo(statep->st_snap, grname, addrp);
+ if (adinfop == NULL)
+ return (IPMP_EUNKADDR);
+
+ *adinfopp = ipmp_addrinfo_clone(adinfop); /* caller gets own copy */
+ return (*adinfopp != NULL ? IPMP_SUCCESS : IPMP_ENOMEM);
+ }
+
+ retval = ipmp_sendquery(statep, IPMP_ADDRINFO, grname, addrp, &end);
+ if (retval != IPMP_SUCCESS)
+ return (retval);
+
+ retval = ipmp_readinfo(statep, IPMP_ADDRINFO, (void **)adinfopp, &end);
+ return (ipmp_querydone(statep, retval));
+}
+
+/*
+ * Free the address information pointed to by `adinfop'.
+ *
+ * A single free() is all that is done here: no members of `adinfop' are
+ * released separately.
+ */
+void
+ipmp_freeaddrinfo(ipmp_addrinfo_t *adinfop)
+{
+ free(adinfop);
+}
+
+/*
* Check if `buf' has a NUL byte in its first `bufsize' bytes.
*/
static boolean_t
@@ -300,12 +410,25 @@ ipmp_checktlv(ipmp_infotype_t type, size_t len, void *value)
ipmp_ifinfo_t *ifinfop;
ipmp_grouplist_t *grlistp;
ipmp_groupinfo_t *grinfop;
+ ipmp_addrlist_t *adlistp;
unsigned int i;
switch (type) {
+ case IPMP_ADDRINFO:
+ if (len != sizeof (ipmp_addrinfo_t))
+ return (B_FALSE);
+ break;
+
+ case IPMP_ADDRLIST:
+ adlistp = (ipmp_addrlist_t *)value;
+ if (len < IPMP_ADDRLIST_SIZE(0) ||
+ len < IPMP_ADDRLIST_SIZE(adlistp->al_naddr))
+ return (B_FALSE);
+ break;
+
case IPMP_IFLIST:
iflistp = (ipmp_iflist_t *)value;
- if (len < IPMP_IFLIST_MINSIZE ||
+ if (len < IPMP_IFLIST_SIZE(0) ||
len < IPMP_IFLIST_SIZE(iflistp->il_nif))
return (B_FALSE);
@@ -326,7 +449,7 @@ ipmp_checktlv(ipmp_infotype_t type, size_t len, void *value)
case IPMP_GROUPLIST:
grlistp = (ipmp_grouplist_t *)value;
- if (len < IPMP_GROUPLIST_MINSIZE ||
+ if (len < IPMP_GROUPLIST_SIZE(0) ||
len < IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup))
return (B_FALSE);
@@ -357,9 +480,8 @@ ipmp_checktlv(ipmp_infotype_t type, size_t len, void *value)
}
/*
- * Create a group list with signature `sig' containing `ngroup' groups named
- * by `groups'. Returns a pointer to the new group list on success, or NULL
- * on failure.
+ * Create a group list; arguments match ipmp_grouplist_t fields. Returns a
+ * pointer to the new group list on success, or NULL on failure.
*/
ipmp_grouplist_t *
ipmp_grouplist_create(uint64_t sig, unsigned int ngroup,
@@ -392,13 +514,80 @@ ipmp_grouplist_clone(ipmp_grouplist_t *grlistp)
}
/*
- * Create an interface information structure for interface `name' and
- * associate `group', `state' and `type' with it. Returns a pointer to the
- * interface information on success, or NULL on failure.
+ * Create target information; arguments match ipmp_targinfo_t fields. Returns
+ * a pointer to the new target info on success, or NULL on failure.
+ *
+ * `*testaddrp' and `targmode' are copied into the new structure, and the
+ * `ntarg' addresses in `targs' are copied into a newly allocated address
+ * list (it_targlistp). `name' is copied with strlcpy(), so at most
+ * LIFNAMSIZ - 1 characters of it are kept. `testaddrp' is dereferenced
+ * unconditionally and so must not be NULL.
+ *
+ * NULL is returned if either the structure or its address list cannot be
+ * allocated; in the latter case the partially built structure is freed.
+ */
+ipmp_targinfo_t *
+ipmp_targinfo_create(const char *name, struct sockaddr_storage *testaddrp,
+ ipmp_if_targmode_t targmode, uint_t ntarg, struct sockaddr_storage *targs)
+{
+ ipmp_targinfo_t *targinfop;
+
+ targinfop = malloc(sizeof (ipmp_targinfo_t));
+ if (targinfop == NULL)
+ return (NULL);
+
+ targinfop->it_testaddr = *testaddrp;
+ targinfop->it_targmode = targmode;
+ targinfop->it_targlistp = ipmp_addrlist_create(ntarg, targs);
+ if (targinfop->it_targlistp == NULL) {
+ ipmp_freetarginfo(targinfop); /* it_targlistp is NULL here */
+ return (NULL);
+ }
+ (void) strlcpy(targinfop->it_name, name, LIFNAMSIZ);
+
+ return (targinfop);
+}
+
+/*
+ * Free the target information pointed to by `targinfop'.
+ *
+ * The target address list (it_targlistp) is freed first, then the structure
+ * itself. `targinfop' is dereferenced and so must not be NULL, but its
+ * it_targlistp may be NULL (as it is when ipmp_targinfo_create() cleans up
+ * after a failed list allocation).
+ */
+void
+ipmp_freetarginfo(ipmp_targinfo_t *targinfop)
+{
+ free(targinfop->it_targlistp);
+ free(targinfop);
+}
+
+/*
+ * Create an interface list; arguments match ipmp_iflist_t fields. Returns a
+ * pointer to the new interface list on success, or NULL on failure.
+ *
+ * The list is a single allocation of IPMP_IFLIST_SIZE(nif) bytes. Each of
+ * the `nif' names in `ifs' is copied with strlcpy(), so at most
+ * LIFNAMSIZ - 1 characters of each name are kept. The only failure is the
+ * allocation itself.
+ */
+static ipmp_iflist_t *
+ipmp_iflist_create(uint_t nif, char (*ifs)[LIFNAMSIZ])
+{
+ unsigned int i;
+ ipmp_iflist_t *iflistp;
+
+ iflistp = malloc(IPMP_IFLIST_SIZE(nif));
+ if (iflistp == NULL)
+ return (NULL);
+
+ iflistp->il_nif = nif;
+ for (i = 0; i < nif; i++)
+ (void) strlcpy(iflistp->il_ifs[i], ifs[i], LIFNAMSIZ);
+
+ return (iflistp);
+}
+
+/*
+ * Free the interface list pointed to by `iflistp'.
+ *
+ * The pointer is handed straight to free(), so a NULL `iflistp' is
+ * harmless.
+ */
+static void
+ipmp_freeiflist(ipmp_iflist_t *iflistp)
+{
+ free(iflistp);
+}
+
+/*
+ * Create an interface; arguments match ipmp_ifinfo_t fields. Returns a
+ * pointer to the new interface on success, or NULL on failure.
*/
ipmp_ifinfo_t *
ipmp_ifinfo_create(const char *name, const char *group, ipmp_if_state_t state,
- ipmp_if_type_t type)
+ ipmp_if_type_t type, ipmp_if_linkstate_t linkstate,
+ ipmp_if_probestate_t probestate, ipmp_if_flags_t flags,
+ ipmp_targinfo_t *targinfo4p, ipmp_targinfo_t *targinfo6p)
{
ipmp_ifinfo_t *ifinfop;
@@ -408,8 +597,25 @@ ipmp_ifinfo_create(const char *name, const char *group, ipmp_if_state_t state,
(void) strlcpy(ifinfop->if_name, name, LIFNAMSIZ);
(void) strlcpy(ifinfop->if_group, group, LIFGRNAMSIZ);
- ifinfop->if_state = state;
- ifinfop->if_type = type;
+
+ ifinfop->if_state = state;
+ ifinfop->if_type = type;
+ ifinfop->if_linkstate = linkstate;
+ ifinfop->if_probestate = probestate;
+ ifinfop->if_flags = flags;
+ ifinfop->if_targinfo4 = *targinfo4p;
+ ifinfop->if_targinfo6 = *targinfo6p;
+
+ ifinfop->if_targinfo4.it_targlistp =
+ ipmp_addrlist_clone(targinfo4p->it_targlistp);
+ ifinfop->if_targinfo6.it_targlistp =
+ ipmp_addrlist_clone(targinfo6p->it_targlistp);
+
+ if (ifinfop->if_targinfo4.it_targlistp == NULL ||
+ ifinfop->if_targinfo6.it_targlistp == NULL) {
+ ipmp_freeifinfo(ifinfop);
+ return (NULL);
+ }
return (ifinfop);
}
@@ -422,40 +628,41 @@ ipmp_ifinfo_t *
ipmp_ifinfo_clone(ipmp_ifinfo_t *ifinfop)
{
return (ipmp_ifinfo_create(ifinfop->if_name, ifinfop->if_group,
- ifinfop->if_state, ifinfop->if_type));
+ ifinfop->if_state, ifinfop->if_type, ifinfop->if_linkstate,
+ ifinfop->if_probestate, ifinfop->if_flags, &ifinfop->if_targinfo4,
+ &ifinfop->if_targinfo6));
}
/*
- * Create a group named `name' with signature `sig', in state `state', and
- * with the `nif' interfaces named by `ifs' as members. Returns a pointer
+ * Create a group; arguments match ipmp_groupinfo_t fields. Returns a pointer
* to the new group on success, or NULL on failure.
*/
ipmp_groupinfo_t *
-ipmp_groupinfo_create(const char *name, uint64_t sig, ipmp_group_state_t state,
- unsigned int nif, char (*ifs)[LIFNAMSIZ])
+ipmp_groupinfo_create(const char *name, uint64_t sig, uint_t fdt,
+ ipmp_group_state_t state, uint_t nif, char (*ifs)[LIFNAMSIZ],
+ const char *grifname, const char *m4ifname, const char *m6ifname,
+ const char *bcifname, uint_t naddr, struct sockaddr_storage *addrs)
{
ipmp_groupinfo_t *grinfop;
- ipmp_iflist_t *iflistp;
- unsigned int i;
grinfop = malloc(sizeof (ipmp_groupinfo_t));
if (grinfop == NULL)
return (NULL);
- iflistp = malloc(IPMP_IFLIST_SIZE(nif));
- if (iflistp == NULL) {
- free(grinfop);
+ grinfop->gr_sig = sig;
+ grinfop->gr_fdt = fdt;
+ grinfop->gr_state = state;
+ grinfop->gr_iflistp = ipmp_iflist_create(nif, ifs);
+ grinfop->gr_adlistp = ipmp_addrlist_create(naddr, addrs);
+ if (grinfop->gr_iflistp == NULL || grinfop->gr_adlistp == NULL) {
+ ipmp_freegroupinfo(grinfop);
return (NULL);
}
-
- grinfop->gr_sig = sig;
- grinfop->gr_state = state;
- grinfop->gr_iflistp = iflistp;
(void) strlcpy(grinfop->gr_name, name, LIFGRNAMSIZ);
-
- iflistp->il_nif = nif;
- for (i = 0; i < nif; i++)
- (void) strlcpy(iflistp->il_ifs[i], ifs[i], LIFNAMSIZ);
+ (void) strlcpy(grinfop->gr_ifname, grifname, LIFNAMSIZ);
+ (void) strlcpy(grinfop->gr_m4ifname, m4ifname, LIFNAMSIZ);
+ (void) strlcpy(grinfop->gr_m6ifname, m6ifname, LIFNAMSIZ);
+ (void) strlcpy(grinfop->gr_bcifname, bcifname, LIFNAMSIZ);
return (grinfop);
}
@@ -467,9 +674,86 @@ ipmp_groupinfo_create(const char *name, uint64_t sig, ipmp_group_state_t state,
ipmp_groupinfo_t *
ipmp_groupinfo_clone(ipmp_groupinfo_t *grinfop)
{
+ ipmp_addrlist_t *adlistp = grinfop->gr_adlistp;
+
return (ipmp_groupinfo_create(grinfop->gr_name, grinfop->gr_sig,
- grinfop->gr_state, grinfop->gr_iflistp->il_nif,
- grinfop->gr_iflistp->il_ifs));
+ grinfop->gr_fdt, grinfop->gr_state, grinfop->gr_iflistp->il_nif,
+ grinfop->gr_iflistp->il_ifs, grinfop->gr_ifname,
+ grinfop->gr_m4ifname, grinfop->gr_m6ifname, grinfop->gr_bcifname,
+ adlistp->al_naddr, adlistp->al_addrs));
+}
+
+/*
+ * Create an address list; arguments match ipmp_addrlist_t fields. Returns
+ * a pointer to the new address list on success, or NULL on failure.
+ *
+ * The list is a single allocation of IPMP_ADDRLIST_SIZE(naddr) bytes into
+ * which the `naddr' addresses in `addrs' are copied by structure
+ * assignment. `addrs' is not read when `naddr' is zero. The only failure
+ * is the allocation itself.
+ */
+static ipmp_addrlist_t *
+ipmp_addrlist_create(uint_t naddr, struct sockaddr_storage *addrs)
+{
+ unsigned int i;
+ ipmp_addrlist_t *adlistp;
+
+ adlistp = malloc(IPMP_ADDRLIST_SIZE(naddr));
+ if (adlistp == NULL)
+ return (NULL);
+
+ adlistp->al_naddr = naddr;
+ for (i = 0; i < naddr; i++)
+ adlistp->al_addrs[i] = addrs[i];
+
+ return (adlistp);
+}
+
+/*
+ * Clone the address list named by `adlistp'. Returns a pointer to the clone
+ * on success, or NULL on failure.
+ *
+ * The clone is built by ipmp_addrlist_create() from the source list's
+ * count and address array. `adlistp' is dereferenced unconditionally and
+ * so must not be NULL.
+ */
+static ipmp_addrlist_t *
+ipmp_addrlist_clone(ipmp_addrlist_t *adlistp)
+{
+ return (ipmp_addrlist_create(adlistp->al_naddr, adlistp->al_addrs));
+}
+
+/*
+ * Free the address list pointed to by `adlistp'.
+ *
+ * The pointer is handed straight to free(), so a NULL `adlistp' is
+ * harmless.
+ */
+static void
+ipmp_freeaddrlist(ipmp_addrlist_t *adlistp)
+{
+ free(adlistp);
+}
+
+/*
+ * Create an address; arguments match ipmp_addrinfo_t fields. Returns a
+ * pointer to the new address on success, or NULL on failure.
+ *
+ * `*addrp' is copied by structure assignment, so `addrp' must not be NULL.
+ * `group' and `binding' are copied with strlcpy(), so at most
+ * LIFGRNAMSIZ - 1 and LIFNAMSIZ - 1 characters of them are kept,
+ * respectively. The only failure is the allocation itself.
+ */
+ipmp_addrinfo_t *
+ipmp_addrinfo_create(struct sockaddr_storage *addrp, ipmp_addr_state_t state,
+ const char *group, const char *binding)
+{
+ ipmp_addrinfo_t *adinfop;
+
+ adinfop = malloc(sizeof (ipmp_addrinfo_t));
+ if (adinfop == NULL)
+ return (NULL);
+
+ adinfop->ad_addr = *addrp;
+ adinfop->ad_state = state;
+ (void) strlcpy(adinfop->ad_group, group, LIFGRNAMSIZ);
+ (void) strlcpy(adinfop->ad_binding, binding, LIFNAMSIZ);
+
+ return (adinfop);
+}
+
+/*
+ * Clone the address information named by `adinfop'. Returns a pointer to
+ * the clone on success, or NULL on failure.
+ *
+ * The clone is built by ipmp_addrinfo_create() from the address, state,
+ * group and binding of `adinfop', which is dereferenced unconditionally
+ * and so must not be NULL.
+ */
+ipmp_addrinfo_t *
+ipmp_addrinfo_clone(ipmp_addrinfo_t *adinfop)
+{
+ return (ipmp_addrinfo_create(&adinfop->ad_addr, adinfop->ad_state,
+ adinfop->ad_group, adinfop->ad_binding));
}
/*
@@ -523,8 +807,10 @@ ipmp_snap_create(void)
snap->sn_grlistp = NULL;
snap->sn_grinfolistp = NULL;
snap->sn_ifinfolistp = NULL;
+ snap->sn_adinfolistp = NULL;
snap->sn_ngroup = 0;
snap->sn_nif = 0;
+ snap->sn_naddr = 0;
return (snap);
}
@@ -536,6 +822,7 @@ void
ipmp_snap_free(ipmp_snap_t *snap)
{
ipmp_ifinfolist_t *iflp, *ifnext;
+ ipmp_addrinfolist_t *adlp, *adnext;
ipmp_groupinfolist_t *grlp, *grnext;
ipmp_freegrouplist(snap->sn_grlistp);
@@ -552,6 +839,12 @@ ipmp_snap_free(ipmp_snap_t *snap)
free(iflp);
}
+ for (adlp = snap->sn_adinfolistp; adlp != NULL; adlp = adnext) {
+ adnext = adlp->adl_next;
+ ipmp_freeaddrinfo(adlp->adl_adinfop);
+ free(adlp);
+ }
+
free(snap);
}
@@ -612,6 +905,34 @@ ipmp_snap_addifinfo(ipmp_snap_t *snap, ipmp_ifinfo_t *ifinfop)
}
/*
+ * Add the address information in `adinfop' to the snapshot named by `snap'.
+ * Returns an IPMP error code.
+ *
+ * IPMP_EPROTO is returned if the snapshot already holds an entry for the
+ * same group and address, and IPMP_ENOMEM if the list node cannot be
+ * allocated. On success the new node is pushed onto the head of the
+ * snapshot's address list and sn_naddr is incremented. The node stores the
+ * `adinfop' pointer itself rather than a copy, and ipmp_snap_free() later
+ * frees it; on failure `adinfop' is neither stored nor freed here.
+ */
+int
+ipmp_snap_addaddrinfo(ipmp_snap_t *snap, ipmp_addrinfo_t *adinfop)
+{
+ ipmp_addrinfolist_t *adlp;
+
+ /*
+ * Any duplicate addresses should've already been weeded out by in.mpathd.
+ */
+ if (ipmp_snap_getaddrinfo(snap, adinfop->ad_group,
+ &adinfop->ad_addr) != NULL)
+ return (IPMP_EPROTO);
+
+ adlp = malloc(sizeof (ipmp_addrinfolist_t));
+ if (adlp == NULL)
+ return (IPMP_ENOMEM);
+
+ adlp->adl_adinfop = adinfop;
+ adlp->adl_next = snap->sn_adinfolistp; /* insert at list head */
+ snap->sn_adinfolistp = adlp;
+ snap->sn_naddr++;
+
+ return (IPMP_SUCCESS);
+}
+
+/*
* Retrieve the information for the group `name' in snapshot `snap'.
* Returns a pointer to the group information on success, or NULL on failure.
*/
@@ -647,6 +968,26 @@ ipmp_snap_getifinfo(ipmp_snap_t *snap, const char *name)
}
/*
+ * Retrieve the information for the address `addrp' on group `grname' in
+ * snapshot `snap'. Returns a pointer to the address information on success,
+ * or NULL on failure.
+ *
+ * The snapshot's address list is walked from the head and the first entry
+ * whose group name equals `grname' (by strcmp()) and whose address is
+ * accepted by sockaddrcmp() is returned. The pointer refers to the entry
+ * held by the snapshot, not to a copy.
+ *
+ * NOTE(review): a non-zero result from sockaddrcmp() is treated as a match
+ * here; confirm against its definition in libinetutil.
+ */
+static ipmp_addrinfo_t *
+ipmp_snap_getaddrinfo(ipmp_snap_t *snap, const char *grname,
+ struct sockaddr_storage *addrp)
+{
+ ipmp_addrinfolist_t *adlp;
+
+ for (adlp = snap->sn_adinfolistp; adlp != NULL; adlp = adlp->adl_next) {
+ if (strcmp(grname, adlp->adl_adinfop->ad_group) == 0 &&
+ sockaddrcmp(addrp, &adlp->adl_adinfop->ad_addr))
+ break;
+ }
+
+ return (adlp != NULL ? adlp->adl_adinfop : NULL); /* NULL: no match */
+}
+
+/*
* Using `statep', take a snapshot of the IPMP subsystem and if successful
* return it in a dynamically allocated snapshot pointed to by `*snapp'.
* Returns an IPMP error code.
@@ -656,7 +997,6 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp)
{
ipmp_snap_t *snap, *osnap;
ipmp_infotype_t type;
- ipmp_iflist_t *iflistp;
int retval;
size_t len;
void *infop;
@@ -666,7 +1006,7 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp)
if (snap == NULL)
return (IPMP_ENOMEM);
- retval = ipmp_sendquery(statep, IPMP_SNAP, NULL, &end);
+ retval = ipmp_sendquery(statep, IPMP_SNAP, NULL, NULL, &end);
if (retval != IPMP_SUCCESS) {
ipmp_snap_free(snap);
return (retval);
@@ -679,12 +1019,11 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp)
}
/*
- * Using the information in the passed `osnap' snapshot, build up our
- * own snapshot. If we receive more than one grouplist, or more than
- * the expected number of interfaces or groups, then bail out. Note
- * that there's only so much we can do to check that the information
- * sent by in.mpathd makes sense. We know there will always be at
- * least one TLV (IPMP_GROUPLIST).
+ * Using the information in the `osnap' snapshot, build up our own
+ * snapshot. We know there will always be at least one TLV (for
+ * IPMP_GROUPLIST). If we receive anything illogical (e.g., more than
+ * the expected number of interfaces), then bail out. However, to a
+ * large extent we have to trust the information sent by in.mpathd.
*/
do {
infop = NULL;
@@ -711,7 +1050,32 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp)
retval = IPMP_EPROTO;
break;
}
+
+ /*
+ * Read in V4 and V6 targlist TLVs that follow.
+ */
+ retval = ipmp_readifinfo_lists(statep, infop, &end);
+ if (retval != IPMP_SUCCESS)
+ break;
+
retval = ipmp_snap_addifinfo(snap, infop);
+ if (retval != IPMP_SUCCESS) {
+ ipmp_freeifinfo(infop);
+ infop = NULL;
+ }
+ break;
+
+ case IPMP_ADDRINFO:
+ if (snap->sn_naddr == osnap->sn_naddr) {
+ retval = IPMP_EPROTO;
+ break;
+ }
+
+ retval = ipmp_snap_addaddrinfo(snap, infop);
+ /*
+ * NOTE: since we didn't call ipmp_read*info_lists(),
+ * no need to use ipmp_freeaddrinfo() on failure.
+ */
break;
case IPMP_GROUPINFO:
@@ -721,18 +1085,17 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp)
}
/*
- * An IPMP_IFLIST TLV always follows the
- * IPMP_GROUPINFO TLV; read it in.
+ * Read in IPMP groupinfo list TLVs that follow.
*/
- retval = ipmp_readinfo(statep, IPMP_IFLIST,
- (void **)&iflistp, &end);
+ retval = ipmp_readgroupinfo_lists(statep, infop, &end);
if (retval != IPMP_SUCCESS)
break;
- ((ipmp_groupinfo_t *)infop)->gr_iflistp = iflistp;
retval = ipmp_snap_addgroupinfo(snap, infop);
- if (retval != IPMP_SUCCESS)
- free(iflistp);
+ if (retval != IPMP_SUCCESS) {
+ ipmp_freegroupinfo(infop);
+ infop = NULL;
+ }
break;
default:
@@ -747,7 +1110,8 @@ fail:
return (ipmp_querydone(statep, retval));
}
} while (snap->sn_grlistp == NULL || snap->sn_nif < osnap->sn_nif ||
- snap->sn_ngroup < osnap->sn_ngroup);
+ snap->sn_ngroup < osnap->sn_ngroup ||
+ snap->sn_naddr < osnap->sn_naddr);
free(osnap);
*snapp = snap;
diff --git a/usr/src/lib/libipmp/common/ipmp_query.h b/usr/src/lib/libipmp/common/ipmp_query.h
index d92554887a..160f561dd2 100644
--- a/usr/src/lib/libipmp/common/ipmp_query.h
+++ b/usr/src/lib/libipmp/common/ipmp_query.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -18,17 +17,14 @@
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
- */
-/*
- * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _IPMP_QUERY_H
#define _IPMP_QUERY_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/socket.h> /* needed by <net/if.h> */
#include <net/if.h> /* for LIF*NAMSIZ */
@@ -38,7 +34,7 @@
* IPMP query interfaces.
*
* These interfaces may only be used within ON or after signing a contract
- * with ON. For documentation, refer to PSARC/2002/615.
+ * with ON. For documentation, refer to PSARC/2002/615 and PSARC/2007/272.
*/
#ifdef __cplusplus
@@ -46,6 +42,43 @@ extern "C" {
#endif
/*
+ * Assorted enumerations used in the data types described below.
+ */
+typedef enum ipmp_if_probestate {
+ IPMP_PROBE_OK, /* probes detect no problems */
+ IPMP_PROBE_FAILED, /* probes detect failure */
+ IPMP_PROBE_UNKNOWN, /* probe detection unavailable */
+ IPMP_PROBE_DISABLED /* probe detection disabled */
+} ipmp_if_probestate_t;
+
+typedef enum ipmp_if_linkstate {
+ IPMP_LINK_UP, /* link detects up */
+ IPMP_LINK_DOWN, /* link detects down */
+ IPMP_LINK_UNKNOWN /* link detection unavailable */
+} ipmp_if_linkstate_t;
+
+typedef enum ipmp_if_flags {
+ IPMP_IFFLAG_INACTIVE = 0x1,
+ IPMP_IFFLAG_HWADDRDUP = 0x2,
+ IPMP_IFFLAG_ACTIVE = 0x4,
+ IPMP_IFFLAG_DOWN = 0x8
+} ipmp_if_flags_t;
+
+typedef enum ipmp_addr_state {
+ IPMP_ADDR_UP, /* address is up */
+ IPMP_ADDR_DOWN /* address is down */
+} ipmp_addr_state_t;
+
+typedef enum ipmp_if_targmode {
+ IPMP_TARG_DISABLED, /* use of targets is disabled */
+ IPMP_TARG_ROUTES, /* route-learned targets */
+ IPMP_TARG_MULTICAST /* multicast-learned targets */
+} ipmp_if_targmode_t;
+
+#define IPMP_LIST_SIZE(listtype, elsize, nel) \
+ ((sizeof (ipmp_ ## listtype ## _t) - (elsize)) + ((nel) * (elsize)))
+
+/*
* Data type describing a list of IPMP groups.
*/
typedef struct ipmp_grouplist {
@@ -54,8 +87,8 @@ typedef struct ipmp_grouplist {
char gl_groups[1][LIFGRNAMSIZ];
} ipmp_grouplist_t;
-#define IPMP_GROUPLIST_MINSIZE (sizeof (ipmp_grouplist_t) - LIFGRNAMSIZ)
-#define IPMP_GROUPLIST_SIZE(ngr) (IPMP_GROUPLIST_MINSIZE + (ngr) * LIFGRNAMSIZ)
+#define IPMP_GROUPLIST_SIZE(ngr) \
+ IPMP_LIST_SIZE(grouplist, LIFGRNAMSIZ, ngr)
/*
* Data type describing a list of interfaces.
@@ -65,8 +98,19 @@ typedef struct ipmp_iflist {
char il_ifs[1][LIFNAMSIZ];
} ipmp_iflist_t;
-#define IPMP_IFLIST_MINSIZE (sizeof (ipmp_iflist_t) - LIFNAMSIZ)
-#define IPMP_IFLIST_SIZE(nif) (IPMP_IFLIST_MINSIZE + (nif) * LIFNAMSIZ)
+#define IPMP_IFLIST_SIZE(nif) \
+ IPMP_LIST_SIZE(iflist, LIFNAMSIZ, nif)
+
+/*
+ * Data type describing a list of addresses.
+ *
+ * al_naddr is the number of entries in al_addrs. Although al_addrs is
+ * declared with a single element, a list is allocated with room for all of
+ * its entries; IPMP_ADDRLIST_SIZE(naddr) gives the size in bytes of a list
+ * holding `naddr' addresses.
+ */
+typedef struct ipmp_addrlist {
+ unsigned int al_naddr;
+ struct sockaddr_storage al_addrs[1];
+} ipmp_addrlist_t;
+
+#define IPMP_ADDRLIST_SIZE(naddr) \
+ IPMP_LIST_SIZE(addrlist, sizeof (struct sockaddr_storage), naddr)
/*
* Data type describing the state of an IPMP group.
@@ -76,18 +120,49 @@ typedef struct ipmp_groupinfo {
uint64_t gr_sig;
ipmp_group_state_t gr_state;
ipmp_iflist_t *gr_iflistp;
+ ipmp_addrlist_t *gr_adlistp;
+ char gr_ifname[LIFNAMSIZ];
+ char gr_m4ifname[LIFNAMSIZ];
+ char gr_m6ifname[LIFNAMSIZ];
+ char gr_bcifname[LIFNAMSIZ];
+ unsigned int gr_fdt;
} ipmp_groupinfo_t;
/*
+ * Data type describing IPMP target information for a particular interface.
+ */
+typedef struct ipmp_targinfo {
+ char it_name[LIFNAMSIZ];
+ struct sockaddr_storage it_testaddr;
+ ipmp_if_targmode_t it_targmode;
+ ipmp_addrlist_t *it_targlistp;
+} ipmp_targinfo_t;
+
+/*
* Data type describing the IPMP-related state of an interface.
*/
typedef struct ipmp_ifinfo {
- char if_name[LIFNAMSIZ];
- char if_group[LIFGRNAMSIZ];
- ipmp_if_state_t if_state;
- ipmp_if_type_t if_type;
+ char if_name[LIFNAMSIZ];
+ char if_group[LIFGRNAMSIZ];
+ ipmp_if_state_t if_state;
+ ipmp_if_type_t if_type;
+ ipmp_if_linkstate_t if_linkstate;
+ ipmp_if_probestate_t if_probestate;
+ ipmp_if_flags_t if_flags;
+ ipmp_targinfo_t if_targinfo4;
+ ipmp_targinfo_t if_targinfo6;
} ipmp_ifinfo_t;
+/*
+ * Data type describing an IPMP data address.
+ */
+typedef struct ipmp_addrinfo {
+ struct sockaddr_storage ad_addr;
+ ipmp_addr_state_t ad_state;
+ char ad_group[LIFGRNAMSIZ];
+ char ad_binding[LIFNAMSIZ];
+} ipmp_addrinfo_t;
+
typedef enum {
IPMP_QCONTEXT_LIVE,
IPMP_QCONTEXT_SNAP
@@ -100,6 +175,9 @@ extern int ipmp_getgroupinfo(ipmp_handle_t, const char *, ipmp_groupinfo_t **);
extern void ipmp_freegroupinfo(ipmp_groupinfo_t *);
extern int ipmp_getifinfo(ipmp_handle_t, const char *, ipmp_ifinfo_t **);
extern void ipmp_freeifinfo(ipmp_ifinfo_t *);
+extern int ipmp_getaddrinfo(ipmp_handle_t, const char *,
+ struct sockaddr_storage *, ipmp_addrinfo_t **);
+extern void ipmp_freeaddrinfo(ipmp_addrinfo_t *);
#ifdef __cplusplus
}
diff --git a/usr/src/lib/libipmp/common/ipmp_query_impl.h b/usr/src/lib/libipmp/common/ipmp_query_impl.h
index 03ecb5cd84..6ac5c3ca27 100644
--- a/usr/src/lib/libipmp/common/ipmp_query_impl.h
+++ b/usr/src/lib/libipmp/common/ipmp_query_impl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -18,17 +17,14 @@
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
- */
-/*
- * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _IPMP_QUERY_IMPL_H
#define _IPMP_QUERY_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <ipmp_query.h>
/*
@@ -58,14 +54,24 @@ typedef struct ipmp_ifinfolist {
} ipmp_ifinfolist_t;
/*
+ * List of ipmp_addrinfo_t structures.
+ *
+ * This is a singly linked list: adl_next points to the next node (NULL at
+ * the end of the list) and adl_adinfop to the address information carried
+ * by this node.
+ */
+typedef struct ipmp_addrinfolist {
+ struct ipmp_addrinfolist *adl_next;
+ ipmp_addrinfo_t *adl_adinfop;
+} ipmp_addrinfolist_t;
+
+/*
* Snapshot of IPMP state.
*/
typedef struct ipmp_snap {
ipmp_grouplist_t *sn_grlistp;
ipmp_groupinfolist_t *sn_grinfolistp;
ipmp_ifinfolist_t *sn_ifinfolistp;
+ ipmp_addrinfolist_t *sn_adinfolistp;
unsigned int sn_ngroup;
unsigned int sn_nif;
+ unsigned int sn_naddr;
} ipmp_snap_t;
/*
@@ -74,17 +80,28 @@ typedef struct ipmp_snap {
extern ipmp_snap_t *ipmp_snap_create(void);
extern void ipmp_snap_free(ipmp_snap_t *);
extern int ipmp_snap_addifinfo(ipmp_snap_t *, ipmp_ifinfo_t *);
+extern int ipmp_snap_addaddrinfo(ipmp_snap_t *, ipmp_addrinfo_t *);
extern int ipmp_snap_addgroupinfo(ipmp_snap_t *, ipmp_groupinfo_t *);
/*
- * IPMP structure creation routines.
+ * IPMP structure creation/destruction routines.
*/
extern ipmp_ifinfo_t *ipmp_ifinfo_create(const char *, const char *,
- ipmp_if_state_t, ipmp_if_type_t);
-extern ipmp_groupinfo_t *ipmp_groupinfo_create(const char *, uint64_t,
- ipmp_group_state_t, unsigned int, char (*)[LIFNAMSIZ]);
+ ipmp_if_state_t, ipmp_if_type_t, ipmp_if_linkstate_t, ipmp_if_probestate_t,
+ ipmp_if_flags_t, ipmp_targinfo_t *, ipmp_targinfo_t *);
+extern ipmp_groupinfo_t *ipmp_groupinfo_create(const char *, uint64_t, uint_t,
+ ipmp_group_state_t, uint_t, char (*)[LIFNAMSIZ], const char *,
+ const char *, const char *, const char *, uint_t,
+ struct sockaddr_storage *);
extern ipmp_grouplist_t *ipmp_grouplist_create(uint64_t, unsigned int,
char (*)[LIFGRNAMSIZ]);
+extern ipmp_addrinfo_t *ipmp_addrinfo_create(struct sockaddr_storage *,
+ ipmp_addr_state_t, const char *, const char *);
+extern ipmp_targinfo_t *ipmp_targinfo_create(const char *,
+ struct sockaddr_storage *, ipmp_if_targmode_t, uint_t,
+ struct sockaddr_storage *);
+extern void ipmp_freetarginfo(ipmp_targinfo_t *);
+
#ifdef __cplusplus
}
diff --git a/usr/src/lib/libipmp/common/llib-lipmp b/usr/src/lib/libipmp/common/llib-lipmp
index a16011745a..a22eec5d66 100644
--- a/usr/src/lib/libipmp/common/llib-lipmp
+++ b/usr/src/lib/libipmp/common/llib-lipmp
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,14 @@
* CDDL HEADER END
*/
/*
- * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/* LINTLIBRARY */
/* PROTOLIB1 */
#include <ipmp.h>
+#include <ipmp_admin.h>
#include <ipmp_mpathd.h>
#include <ipmp_query_impl.h>
diff --git a/usr/src/lib/libipmp/common/mapfile-vers b/usr/src/lib/libipmp/common/mapfile-vers
index a4052bfcd3..8c93248338 100644
--- a/usr/src/lib/libipmp/common/mapfile-vers
+++ b/usr/src/lib/libipmp/common/mapfile-vers
@@ -19,32 +19,39 @@
# CDDL HEADER END
#
#
-# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
SUNWprivate_1.1 {
global:
+ ipmp_addrinfo_create;
ipmp_close;
ipmp_errmsg;
+ ipmp_freeaddrinfo;
ipmp_freegroupinfo;
ipmp_freegrouplist;
ipmp_freeifinfo;
+ ipmp_freetarginfo;
+ ipmp_getaddrinfo;
ipmp_getgroupinfo;
ipmp_getgrouplist;
ipmp_getifinfo;
ipmp_groupinfo_create;
ipmp_grouplist_create;
ipmp_ifinfo_create;
+ ipmp_offline;
ipmp_open;
+ ipmp_ping_daemon;
ipmp_read;
ipmp_setqcontext;
+ ipmp_snap_addaddrinfo;
ipmp_snap_addgroupinfo;
ipmp_snap_addifinfo;
ipmp_snap_create;
ipmp_snap_free;
+ ipmp_targinfo_create;
+ ipmp_undo_offline;
ipmp_write;
ipmp_writetlv;
local:
diff --git a/usr/src/lib/libnsl/nss/netdir_inet_sundry.c b/usr/src/lib/libnsl/nss/netdir_inet_sundry.c
index 742e7408b2..4e9473a8cf 100644
--- a/usr/src/lib/libnsl/nss/netdir_inet_sundry.c
+++ b/usr/src/lib/libnsl/nss/netdir_inet_sundry.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -21,7 +20,7 @@
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* lib/libnsl/nss/netdir_inet_sundry.c
@@ -39,8 +38,6 @@
* Copied mostly from erstwhile lib/nametoaddr/tcpip/tcpip.c.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "mt.h"
#include <stdlib.h>
#include <stdio.h>
@@ -69,9 +66,6 @@
#include <syslog.h>
#include <values.h>
#include <limits.h>
-#ifdef DEBUG
-#include <stdio.h>
-#endif
#include <nss_dbdefs.h>
#include "nss.h"
@@ -151,8 +145,8 @@ __inet_taddr2uaddr(struct netconfig *tp, struct netbuf *addr)
/* LINTED pointer cast */
sa6 = (struct sockaddr_in6 *)(addr->buf);
myport = ntohs(sa6->sin6_port);
- if (inet_ntop(AF_INET6, (void *)sa6->sin6_addr.s6_addr,
- tmp, sizeof (tmp)) == 0) {
+ if (inet_ntop(AF_INET6, sa6->sin6_addr.s6_addr, tmp,
+ sizeof (tmp)) == NULL) {
_nderror = ND_BADARG;
return (NULL);
}
@@ -400,7 +394,7 @@ getifnum:
continue;
if_info[n_ifs].if_address =
- ((struct sockaddr_in *)&lifr->lifr_addr)->sin_addr;
+ ((struct sockaddr_in *)&lifr->lifr_addr)->sin_addr;
if (nss_ioctl(AF_INET, SIOCGLIFFLAGS, lifr) < 0)
continue;
@@ -413,7 +407,7 @@ getifnum:
continue;
if_info[n_ifs].if_netmask =
- ((struct sockaddr_in *)&lifr->lifr_addr)->sin_addr;
+ ((struct sockaddr_in *)&lifr->lifr_addr)->sin_addr;
n_ifs++;
}
free(buf);
@@ -528,21 +522,12 @@ get_best_match(struct in_addr addr)
if_addr = ntohl(ifn->if_address.s_addr); /* host order */
/*
- * Checking if the interface selected is FAILED or DEPRECATED.
- * In case IFF_FAILED or IFF_DEPRECATED flag for the interface
- * is set, we move on to the next interface in the list.
- * Refer IPMP(IP Multi Pathing) for more details.
- */
-
- if ((ifn->if_flags & (IFF_FAILED | IFF_DEPRECATED)) != 0)
- continue;
-
- /*
* set initial count to first bit set in netmask, with
* zero being the number of the least significant bit.
*/
- for (count = 0, mask = netmask; mask && ((mask & 1) == 0);
- count++, mask >>= 1);
+ count = 0;
+ for (mask = netmask; mask && ((mask & 1) == 0); mask >>= 1)
+ count++;
/*
* Set limit so that we don't try to match prefixes shorter
@@ -570,12 +555,6 @@ get_best_match(struct in_addr addr)
* (2) the best partial subnet match
* (3) the first non-loopback && non-PPP interface
* (4) the first non-loopback interface (PPP is OK)
- *
- * While checking for condition (3) and (4), we also look
- * if the interface we are returning is neither FAILED
- * nor DEPRECATED. In case there are no interface
- * available, which are neither FAILED nor DEPRECRATED,
- * we return 0.
*/
found = FALSE;
while (netmask && count < subnet_count) {
@@ -607,8 +586,7 @@ get_best_match(struct in_addr addr)
*/
if (bestmatch == NULL) {
for (ifn = if_info; ifn < (if_info + n_ifs); ifn++) {
- if ((ifn->if_flags & (IFF_LOOPBACK |
- IFF_FAILED | IFF_DEPRECATED)) == 0) {
+ if ((ifn->if_flags & IFF_LOOPBACK) == 0) {
bestmatch = ifn;
/*
@@ -619,10 +597,6 @@ get_best_match(struct in_addr addr)
* list...
*/
if ((ifn->if_flags & IFF_POINTOPOINT) == 0) {
-#ifdef DEBUG
- (void) printf("found !loopback && !non-PPP interface: %s\n",
- inet_ntoa(ifn->if_address));
-#endif
break;
}
}
@@ -701,9 +675,9 @@ select_server_addr(union any_in_addr *dst_addr, int family,
}
/* open a UDP socket */
- if ((tmp_fd = _so_socket(family, SOCK_DGRAM, 0,
- NULL, SOV_SOCKBSD)) < 0) {
- syslog(LOG_ERR, "selsect_server_addr:connect failed\n");
+ tmp_fd = _so_socket(family, SOCK_DGRAM, 0, NULL, SOV_SOCKBSD);
+ if (tmp_fd < 0) {
+ syslog(LOG_ERR, "select_server_addr: connect failed\n");
return (FALSE);
}
@@ -716,15 +690,16 @@ select_server_addr(union any_in_addr *dst_addr, int family,
* message, as it'll try to send the probe packet out and will
* receive ICMP unreachable.
*/
- if (family == AF_INET)
+ if (family == AF_INET) {
src_addr->addr.s_addr = INADDR_ANY;
- else
+ } else {
/*
* Since in6addr_any is not in the scope
* use the following hack
*/
(void) memset(src_addr->addr6.s6_addr,
- 0, sizeof (struct in6_addr));
+ 0, sizeof (struct in6_addr));
+ }
(void) close(tmp_fd);
free(sock);
return (FALSE);
@@ -732,7 +707,7 @@ select_server_addr(union any_in_addr *dst_addr, int family,
/* get the local sock info */
if (_so_getsockname(tmp_fd, sock, &sock_len, SOV_DEFAULT) < 0) {
- syslog(LOG_ERR, "selsect_server_addr:getsockname failed\n");
+ syslog(LOG_ERR, "select_server_addr: getsockname failed\n");
(void) close(tmp_fd);
free(sock);
return (FALSE);
@@ -799,11 +774,6 @@ inet_netdir_mergeaddr(struct netconfig *tp, char *ruaddr, char *uaddr)
clientaddr.s_addr = inet_addr(ruaddr);
-#ifdef DEBUG
- (void) printf("client's address is %s and %s\n",
- ruaddr, inet_ntoa(clientaddr));
-#endif
-
/* We know cp is not NULL due to the check above */
*cp = '.'; /* Put the dot back in the IP addr */
@@ -895,28 +865,22 @@ inet_netdir_mergeaddr(struct netconfig *tp, char *ruaddr, char *uaddr)
FALSE)
return (NULL);
server_addr.sin6_addr = out_addr.addr6;
+ } else {
+ (void) memcpy(&server_addr, &sa, sizeof (server_addr));
}
- else
- (void) memcpy(&server_addr, &sa,
- sizeof (struct sockaddr_in6));
-#ifdef DEBUG
- printf("%s\n", inet_ntop(af, out_addr.addr6.s6_addr,
- tmp, sizeof (tmp)));
-#endif
-
- if (inet_ntop(af, server_addr.sin6_addr.s6_addr,
- tmp, sizeof (tmp)) == NULL) {
+
+ if (inet_ntop(af, server_addr.sin6_addr.s6_addr, tmp,
+ sizeof (tmp)) == NULL) {
_nderror = ND_NOHOST;
return (NULL);
}
/* now extract the port info */
if ((dot = strrchr(uaddr, '.')) != 0) {
+ char *p = --dot;
- char *p;
-
- p = --dot;
- while (*p-- != '.');
+ while (*p-- != '.')
+ ;
p++;
(void) strcat(tmp + strlen(tmp), p);
_nderror = ND_OK;
@@ -1051,7 +1015,7 @@ bindresvport(struct netconfig *nconf, int fd, struct netbuf *addr)
* this, if the caller has set this option before calling
* bindresvport(), it will be unset. Better be safe...
*/
- *optval = 0;
+ *optval = 0;
resp.flags = 0;
resp.opt.buf = (char *)reqbuf;
resp.opt.maxlen = sizeof (reqbuf);
diff --git a/usr/src/lib/libsocket/inet/interface_id.c b/usr/src/lib/libsocket/inet/interface_id.c
index 2a512b025f..88854fe9da 100644
--- a/usr/src/lib/libsocket/inet/interface_id.c
+++ b/usr/src/lib/libsocket/inet/interface_id.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <stdio.h>
#include <ctype.h>
#include <string.h>
@@ -120,6 +117,9 @@ if_indextoname(uint32_t ifindex, char *ifname)
int numifs;
size_t bufsize;
boolean_t found;
+ uint_t flags;
+
+ flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES | LIFC_UNDER_IPMP;
/* A interface index of 0 is invalid */
if (ifindex == 0) {
@@ -137,14 +137,19 @@ if_indextoname(uint32_t ifindex, char *ifname)
/* Prepare to send a SIOCGLIFNUM request message */
lifn.lifn_family = AF_UNSPEC;
- lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
+ lifn.lifn_flags = flags;
if (ioctl(s, SIOCGLIFNUM, (char *)&lifn) < 0) {
int save_err = errno;
(void) close(s);
errno = save_err;
return (NULL);
}
- numifs = lifn.lifn_count;
+
+ /*
+ * NOTE: "+ 10" sleaze mitigates new IP interfaces showing up between
+ * the SIOCGLIFNUM and the SIOCGLIFCONF.
+ */
+ numifs = lifn.lifn_count + 10;
/*
* Provide enough buffer to obtain the interface
@@ -161,7 +166,7 @@ if_indextoname(uint32_t ifindex, char *ifname)
return (NULL);
}
lifc.lifc_family = AF_UNSPEC;
- lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
+ lifc.lifc_flags = flags;
lifc.lifc_len = bufsize;
lifc.lifc_buf = buf;
if (ioctl(s, SIOCGLIFCONF, (char *)&lifc) < 0) {
diff --git a/usr/src/lib/smbsrv/libsmbns/common/smbns_dyndns.c b/usr/src/lib/smbsrv/libsmbns/common/smbns_dyndns.c
index dabc2e0929..62ebedf522 100644
--- a/usr/src/lib/smbsrv/libsmbns/common/smbns_dyndns.c
+++ b/usr/src/lib/smbsrv/libsmbns/common/smbns_dyndns.c
@@ -1936,7 +1936,7 @@ dyndns_update_core(char *fqdn)
return (-1);
do {
- if (ni.ni_nic.nic_sysflags & (IFF_STANDBY | IFF_PRIVATE))
+ if (ni.ni_nic.nic_sysflags & IFF_PRIVATE)
continue;
addr.s_addr = ni.ni_nic.nic_ip;
@@ -2003,7 +2003,7 @@ dyndns_clear_rev_zone(char *fqdn)
return (-1);
do {
- if (ni.ni_nic.nic_sysflags & (IFF_STANDBY | IFF_PRIVATE))
+ if (ni.ni_nic.nic_sysflags & IFF_PRIVATE)
continue;
addr.s_addr = ni.ni_nic.nic_ip;
diff --git a/usr/src/pkgdefs/SUNWarc/prototype_com b/usr/src/pkgdefs/SUNWarc/prototype_com
index e9d6270d88..7e04f8b580 100644
--- a/usr/src/pkgdefs/SUNWarc/prototype_com
+++ b/usr/src/pkgdefs/SUNWarc/prototype_com
@@ -18,7 +18,7 @@
#
# CDDL HEADER END
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -122,8 +122,6 @@ s none usr/lib/llib-lintl=../../lib/llib-lintl
s none usr/lib/llib-lintl.ln=../../lib/llib-lintl.ln
f none usr/lib/llib-lipmi 644 root bin
f none usr/lib/llib-lipmi.ln 644 root bin
-f none usr/lib/llib-lipmp 644 root bin
-f none usr/lib/llib-lipmp.ln 644 root bin
f none usr/lib/llib-lipp 644 root bin
f none usr/lib/llib-lipp.ln 644 root bin
s none usr/lib/llib-lkstat=../../lib/llib-lkstat
diff --git a/usr/src/pkgdefs/SUNWarcr/prototype_com b/usr/src/pkgdefs/SUNWarcr/prototype_com
index 6095ff7fe5..852330d742 100644
--- a/usr/src/pkgdefs/SUNWarcr/prototype_com
+++ b/usr/src/pkgdefs/SUNWarcr/prototype_com
@@ -18,7 +18,7 @@
#
# CDDL HEADER END
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -80,6 +80,8 @@ f none lib/llib-lgen 644 root bin
f none lib/llib-lgen.ln 644 root bin
f none lib/llib-lintl 644 root bin
f none lib/llib-lintl.ln 644 root bin
+f none lib/llib-lipmp 644 root bin
+f none lib/llib-lipmp.ln 644 root bin
f none lib/llib-lkmf.ln 644 root bin
f none lib/llib-lkmfberder.ln 644 root bin
f none lib/llib-lkstat 644 root bin
diff --git a/usr/src/pkgdefs/SUNWckr/prototype_com b/usr/src/pkgdefs/SUNWckr/prototype_com
index ead3a7e5e8..989847d09d 100644
--- a/usr/src/pkgdefs/SUNWckr/prototype_com
+++ b/usr/src/pkgdefs/SUNWckr/prototype_com
@@ -20,7 +20,7 @@
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -81,6 +81,7 @@ f none kernel/drv/crypto.conf 644 root sys
f none kernel/drv/cryptoadm.conf 644 root sys
f none kernel/drv/devinfo.conf 644 root sys
f none kernel/drv/dld.conf 644 root sys
+f none kernel/drv/dlpistub.conf 644 root sys
f none kernel/drv/icmp.conf 644 root sys
f none kernel/drv/icmp6.conf 644 root sys
f none kernel/drv/ip.conf 644 root sys
@@ -123,7 +124,6 @@ f none kernel/drv/tcp6.conf 644 root sys
f none kernel/drv/tl.conf 644 root sys
f none kernel/drv/udp.conf 644 root sys
f none kernel/drv/udp6.conf 644 root sys
-f none kernel/drv/vni.conf 644 root sys
f none kernel/drv/vnic.conf 644 root sys
f none kernel/drv/wc.conf 644 root sys
d none kernel/exec 755 root sys
diff --git a/usr/src/pkgdefs/SUNWckr/prototype_i386 b/usr/src/pkgdefs/SUNWckr/prototype_i386
index 421d760621..e2972713c6 100644
--- a/usr/src/pkgdefs/SUNWckr/prototype_i386
+++ b/usr/src/pkgdefs/SUNWckr/prototype_i386
@@ -86,6 +86,7 @@ f none kernel/drv/crypto 755 root sys
f none kernel/drv/cryptoadm 755 root sys
f none kernel/drv/devinfo 755 root sys
f none kernel/drv/dld 755 root sys
+f none kernel/drv/dlpistub 755 root sys
f none kernel/drv/i8042 755 root sys
f none kernel/drv/icmp 755 root sys
f none kernel/drv/icmp6 755 root sys
@@ -152,7 +153,6 @@ f none kernel/drv/ucode.conf 644 root sys
f none kernel/drv/udp 755 root sys
f none kernel/drv/udp6 755 root sys
f none kernel/drv/vgatext 755 root sys
-f none kernel/drv/vni 755 root sys
f none kernel/drv/vnic 755 root sys
f none kernel/drv/wc 755 root sys
f none kernel/exec/elfexec 755 root sys
@@ -308,6 +308,7 @@ f none kernel/drv/amd64/crypto 755 root sys
f none kernel/drv/amd64/cryptoadm 755 root sys
f none kernel/drv/amd64/devinfo 755 root sys
f none kernel/drv/amd64/dld 755 root sys
+f none kernel/drv/amd64/dlpistub 755 root sys
f none kernel/drv/amd64/i8042 755 root sys
f none kernel/drv/amd64/icmp 755 root sys
f none kernel/drv/amd64/icmp6 755 root sys
@@ -366,7 +367,6 @@ f none kernel/drv/amd64/ucode 755 root sys
f none kernel/drv/amd64/udp 755 root sys
f none kernel/drv/amd64/udp6 755 root sys
f none kernel/drv/amd64/vgatext 755 root sys
-f none kernel/drv/amd64/vni 755 root sys
f none kernel/drv/amd64/vnic 755 root sys
f none kernel/drv/amd64/wc 755 root sys
d none kernel/exec/amd64 755 root sys
diff --git a/usr/src/pkgdefs/SUNWckr/prototype_sparc b/usr/src/pkgdefs/SUNWckr/prototype_sparc
index e81a86168e..a8f0b93be0 100644
--- a/usr/src/pkgdefs/SUNWckr/prototype_sparc
+++ b/usr/src/pkgdefs/SUNWckr/prototype_sparc
@@ -20,7 +20,7 @@
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# This required package information file contains a list of package contents.
@@ -82,6 +82,7 @@ f none kernel/drv/sparcv9/cryptoadm 755 root sys
f none kernel/drv/sparcv9/dad 755 root sys
f none kernel/drv/sparcv9/devinfo 755 root sys
f none kernel/drv/sparcv9/dld 755 root sys
+f none kernel/drv/sparcv9/dlpistub 755 root sys
f none kernel/drv/sparcv9/esp 755 root sys
f none kernel/drv/sparcv9/i8042 755 root sys
f none kernel/drv/sparcv9/icmp 755 root sys
@@ -137,7 +138,6 @@ f none kernel/drv/sparcv9/ttymux 755 root sys
f none kernel/drv/sparcv9/uata 755 root sys
f none kernel/drv/sparcv9/udp 755 root sys
f none kernel/drv/sparcv9/udp6 755 root sys
-f none kernel/drv/sparcv9/vni 755 root sys
f none kernel/drv/sparcv9/vnic 755 root sys
f none kernel/drv/sparcv9/wc 755 root sys
d none kernel/exec/sparcv9 755 root sys
diff --git a/usr/src/pkgdefs/SUNWcsd/postinstall b/usr/src/pkgdefs/SUNWcsd/postinstall
index b481a763ca..caa9bb3402 100644
--- a/usr/src/pkgdefs/SUNWcsd/postinstall
+++ b/usr/src/pkgdefs/SUNWcsd/postinstall
@@ -20,7 +20,7 @@
# CDDL HEADER END
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -48,6 +48,7 @@ prototype_com='
devices/pseudo/arp@0:arp dev/arp
devices/pseudo/clone@0:ibd dev/ibd
devices/pseudo/dld@0:ctl dev/dld
+devices/pseudo/dlpistub@0:ipmpstub dev/ipmpstub
devices/pseudo/icmp@0:icmp dev/icmp
devices/pseudo/icmp@0:icmp dev/rawip
devices/pseudo/icmp6@0:icmp6 dev/icmp6
diff --git a/usr/src/pkgdefs/SUNWcsl/prototype_com b/usr/src/pkgdefs/SUNWcsl/prototype_com
index a856560c5e..d5918f5883 100644
--- a/usr/src/pkgdefs/SUNWcsl/prototype_com
+++ b/usr/src/pkgdefs/SUNWcsl/prototype_com
@@ -18,7 +18,7 @@
#
# CDDL HEADER END
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# This required package information file contains a list of package contents.
@@ -144,8 +144,6 @@ s none usr/lib/libintl.so=../../lib/libintl.so.1
s none usr/lib/libintl.so.1=../../lib/libintl.so.1
f none usr/lib/libipmi.so.1 755 root bin
s none usr/lib/libipmi.so=./libipmi.so.1
-s none usr/lib/libipmp.so=./libipmp.so.1
-f none usr/lib/libipmp.so.1 755 root bin
s none usr/lib/libipp.so=./libipp.so.1
f none usr/lib/libipp.so.1 755 root bin
f none usr/lib/libipsecutil.so.1 755 root bin
diff --git a/usr/src/pkgdefs/SUNWcslr/prototype_com b/usr/src/pkgdefs/SUNWcslr/prototype_com
index ed7059250a..71ebaff013 100644
--- a/usr/src/pkgdefs/SUNWcslr/prototype_com
+++ b/usr/src/pkgdefs/SUNWcslr/prototype_com
@@ -18,7 +18,7 @@
#
# CDDL HEADER END
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# This required package information file contains a list of package contents.
@@ -95,6 +95,8 @@ f none lib/libinetcfg.so.1 755 root bin
f none lib/libinetutil.so.1 755 root bin
s none lib/libintl.so=libintl.so.1
f none lib/libintl.so.1 755 root bin
+s none lib/libipmp.so=./libipmp.so.1
+f none lib/libipmp.so.1 755 root bin
s none lib/libkmf.so=libkmf.so.1
f none lib/libkmf.so.1 755 root bin
s none lib/libkmfberder.so=libkmfberder.so.1
diff --git a/usr/src/pkgdefs/SUNWcsr/prototype_com b/usr/src/pkgdefs/SUNWcsr/prototype_com
index 02051a08ae..b60abe0f00 100644
--- a/usr/src/pkgdefs/SUNWcsr/prototype_com
+++ b/usr/src/pkgdefs/SUNWcsr/prototype_com
@@ -18,7 +18,7 @@
#
# CDDL HEADER END
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# This required package information file contains a list of package contents.
@@ -332,6 +332,7 @@ d none lib 755 root bin
d none lib/crypto 755 root bin
f none lib/crypto/kcfd 555 root bin
d none lib/inet 755 root bin
+f none lib/inet/in.mpathd 555 root bin
f none lib/inet/nwamd 555 root bin
d none lib/svc 0755 root bin
d none lib/svc/bin 0755 root bin
@@ -404,7 +405,8 @@ f none sbin/fiocompress 555 root bin
f none sbin/hostconfig 555 root bin
f none sbin/ifconfig 555 root bin
f none sbin/ifparse 555 root bin
-s none sbin/in.mpathd=../usr/lib/inet/in.mpathd
+s none sbin/in.mpathd=../lib/inet/in.mpathd
+f none sbin/ipmpstat 555 root bin
f none sbin/soconfig 555 root bin
f none sbin/init 555 root sys
s none sbin/jsh=sh
diff --git a/usr/src/pkgdefs/SUNWcsu/prototype_com b/usr/src/pkgdefs/SUNWcsu/prototype_com
index 6bb2772f1a..464da8254a 100644
--- a/usr/src/pkgdefs/SUNWcsu/prototype_com
+++ b/usr/src/pkgdefs/SUNWcsu/prototype_com
@@ -18,7 +18,7 @@
#
# CDDL HEADER END
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# This required package information file contains a list of package contents.
@@ -649,7 +649,7 @@ d none usr/lib/inet/dhcp 755 root bin
d none usr/lib/inet/dhcp/nsu 755 root bin
d none usr/lib/inet/dhcp/svc 755 root bin
f none usr/lib/inet/in.iked 555 root bin
-f none usr/lib/inet/in.mpathd 555 root bin
+s none usr/lib/inet/in.mpathd=../../../lib/inet/in.mpathd
f none usr/lib/inet/inetd 555 root bin
f none usr/lib/intrd 555 root bin
f none usr/lib/isaexec 555 root bin
@@ -865,6 +865,7 @@ s none usr/sbin/init=../../sbin/init
f none usr/sbin/install 555 root bin
f none usr/sbin/installboot 555 root sys
f none usr/sbin/ipaddrsel 555 root bin
+s none usr/sbin/ipmpstat=../../sbin/ipmpstat
f none usr/sbin/ipsecalgs 555 root bin
f none usr/sbin/ipsecconf 555 root bin
f none usr/sbin/ipseckey 555 root bin
diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com
index 45536bf13e..555f28921c 100644
--- a/usr/src/pkgdefs/SUNWhea/prototype_com
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com
@@ -20,7 +20,7 @@
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# This required package information file contains a list of package contents.
@@ -270,6 +270,7 @@ f none usr/include/inet/tcp_stack.h 644 root bin
f none usr/include/inet/wifi_ioctl.h 644 root bin
f none usr/include/inttypes.h 644 root bin
f none usr/include/ipmp.h 644 root bin
+f none usr/include/ipmp_admin.h 644 root bin
f none usr/include/ipmp_mpathd.h 644 root bin
f none usr/include/ipmp_query.h 644 root bin
d none usr/include/ipp 755 root bin
diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh
index 3ac332b45c..7fd4a7186b 100644
--- a/usr/src/tools/scripts/bfu.sh
+++ b/usr/src/tools/scripts/bfu.sh
@@ -21,7 +21,7 @@
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# Upgrade a machine from a cpio archive area in about 5 minutes.
@@ -8060,10 +8060,14 @@ mondo_loop() {
# The global zone needs to have its /dev/dld symlink created
# during install so that processes can access it early in boot
- # before devfsadm is run.
+ # before devfsadm is run. Likewise for /dev/ipmpstub.
if [ ! -L $rootprefix/dev/dld ]; then
ln -s ../devices/pseudo/dld@0:ctl $rootprefix/dev/dld
fi
+ if [ ! -L $rootprefix/dev/ipmpstub ]; then
+ ln -s ../devices/pseudo/dlpistub@0:ipmpstub \
+ $rootprefix/dev/ipmpstub
+ fi
fi
# Fix up audit permissions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 5fcd81b433..448a0d712d 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -485,7 +485,7 @@ IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \
sctp_ioc.o sctp_bind.o sctp_notify.o sctp_asconf.o \
sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o
-IP_OBJS += igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \
+IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \
ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \
ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \
@@ -1605,9 +1605,9 @@ IPF_OBJS += ip_fil_solaris.o fil.o solaris.o ip_state.o ip_frag.o ip_nat.o \
IBD_OBJS += ibd.o
-SDP_OBJS += sdpddi.o
+DLPISTUB_OBJS += dlpistub.o
-VNI_OBJS += vni.o
+SDP_OBJS += sdpddi.o
CTF_OBJS += ctf_create.o ctf_decl.o ctf_error.o ctf_hash.o ctf_labels.o \
ctf_lookup.o ctf_open.o ctf_types.o ctf_util.o ctf_subr.o ctf_mod.o
diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules
index 1cd82570c1..db550667da 100644
--- a/usr/src/uts/common/Makefile.rules
+++ b/usr/src/uts/common/Makefile.rules
@@ -20,7 +20,7 @@
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# uts/common/Makefile.rules
@@ -447,7 +447,7 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ip/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
-$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ipnet/%.c
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ipnet/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -489,7 +489,7 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/sockmods/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
-$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/vni/%.c
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/dlpistub/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -1572,7 +1572,7 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/arp/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ip/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
-$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipnet/%.c
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipnet/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipf/%.c
@@ -1599,10 +1599,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/tcp/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/nca/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
-$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/vni/%.c
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/dlpistub/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
-
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
diff --git a/usr/src/uts/common/inet/arp.h b/usr/src/uts/common/inet/arp.h
index 0bca52e9ae..4351c91666 100644
--- a/usr/src/uts/common/inet/arp.h
+++ b/usr/src/uts/common/inet/arp.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -28,6 +28,7 @@
#define _INET_ARP_H
#include <sys/types.h>
+#include <net/if.h>
#ifdef __cplusplus
extern "C" {
@@ -64,6 +65,8 @@ extern "C" {
*/
#define AR_ARP_CLOSING (AR_IOCTL + 16)
#define AR_ARP_EXTEND (AR_IOCTL + 17)
+#define AR_IPMP_ACTIVATE (AR_IOCTL + 18)
+#define AR_IPMP_DEACTIVATE (AR_IOCTL + 19)
/* Both ace_flags and area_flags; must also modify arp.c in mdb */
#define ACE_F_PERMANENT 0x0001
@@ -182,6 +185,14 @@ typedef struct ar_mapping_add_s {
/* the mask&proto_addr */
} arma_t;
+/* Structure used to notify ARP of changes to IPMP group topology */
+typedef struct ar_ipmp_event_s {
+ uint32_t arie_cmd;
+ uint32_t arie_name_offset;
+ uint32_t arie_name_length;
+ char arie_grifname[LIFNAMSIZ];
+} arie_t;
+
/* Structure used to notify clients of interesting conditions. */
typedef struct ar_client_notify_s {
uint32_t arcn_cmd;
diff --git a/usr/src/uts/common/inet/arp/arp.c b/usr/src/uts/common/inet/arp/arp.c
index 815dfd19d3..06c499ced9 100644
--- a/usr/src/uts/common/inet/arp/arp.c
+++ b/usr/src/uts/common/inet/arp/arp.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -85,6 +85,30 @@
* talking to a given peer, then it doesn't matter if we have the right mapping
* for that peer. It would be possible to send queries on aging entries that
* are active, but this isn't done.
+ *
+ * IPMP Notes
+ * ----------
+ *
+ * ARP is aware of IPMP. In particular, IP notifies ARP about all "active"
+ * (able to transmit data packets) interfaces in a given group via
+ * AR_IPMP_ACTIVATE and AR_IPMP_DEACTIVATE messages. These messages, combined
+ * with the "IPMP arl_t" that ARP creates over the IPMP DLPI stub driver,
+ * enable ARP to track all the arl_t's that are in the same group and thus
+ * ensure that ACEs are shared across each group and the arl_t that ARP
+ * chooses to transmit on for a given ACE is optimal.
+ *
+ * ARP relies on IP for hardware address updates. In particular, if the
+ * hardware address of an interface changes (DL_NOTE_PHYS_ADDR), then IP will
+ * bring the interface down and back up -- and as part of bringing it back
+ * up, will send messages to ARP that allow it to update the affected arl's
+ * with new hardware addresses.
+ *
+ * N.B.: One side-effect of this approach is that when an interface fails and
+ * then starts to repair, it will temporarily populate the ARP cache with
+ * addresses that are owned by it rather than the group's arl_t. To address
+ * this, we could add more messages (e.g., AR_IPMP_JOIN and AR_IPMP_LEAVE),
+ * but as the issue appears to be only cosmetic (redundant entries in the ARP
+ * cache during interface repair), we've kept things simple for now.
*/
/*
@@ -134,6 +158,12 @@ typedef struct {
#define ARH_FIXED_LEN 8
/*
+ * Owning arl for a new ACE: arl's arl_ipmp_arl if non-NULL, else arl itself.
+ */
+#define OWNING_ARL(arl) \
+ ((arl)->arl_ipmp_arl != NULL ? (arl)->arl_ipmp_arl : (arl))
+
+/*
* MAC-specific intelligence. Shouldn't be needed, but the DL_INFO_ACK
* doesn't quite do it for us.
*/
@@ -154,7 +184,7 @@ static int ar_ce_create(arl_t *arl, uint32_t proto, uchar_t *hw_addr,
uint32_t hw_addr_len, uchar_t *proto_addr,
uint32_t proto_addr_len, uchar_t *proto_mask,
uchar_t *proto_extract_mask, uint32_t hw_extract_start,
- uint32_t flags);
+ uchar_t *sender_addr, uint32_t flags);
static void ar_ce_delete(ace_t *ace);
static void ar_ce_delete_per_arl(ace_t *ace, void *arg);
static ace_t **ar_ce_hash(arp_stack_t *as, uint32_t proto,
@@ -167,6 +197,8 @@ static ace_t *ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp,
ace_t *matchfn());
static ace_t *ar_ce_lookup_mapping(arl_t *arl, uint32_t proto,
const uchar_t *proto_addr, uint32_t proto_addr_length);
+static ace_t *ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto,
+ uchar_t *proto_addr, uint32_t proto_addr_length);
static boolean_t ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr,
uint32_t hw_addr_length);
static void ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *),
@@ -187,6 +219,8 @@ static int ar_interface_up(queue_t *q, mblk_t *mp);
static int ar_interface_down(queue_t *q, mblk_t *mp);
static int ar_interface_on(queue_t *q, mblk_t *mp);
static int ar_interface_off(queue_t *q, mblk_t *mp);
+static int ar_ipmp_activate(queue_t *q, mblk_t *mp);
+static int ar_ipmp_deactivate(queue_t *q, mblk_t *mp);
static void ar_ll_cleanup_arl_queue(queue_t *q);
static void ar_ll_down(arl_t *arl);
static arl_t *ar_ll_lookup_by_name(arp_stack_t *as, const char *name);
@@ -208,7 +242,7 @@ static int ar_param_set(queue_t *q, mblk_t *mp, char *value,
static void ar_query_delete(ace_t *ace, void *ar);
static void ar_query_reply(ace_t *ace, int ret_val,
uchar_t *proto_addr, uint32_t proto_addr_len);
-static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace);
+static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace);
static void ar_rput(queue_t *q, mblk_t *mp_orig);
static void ar_rput_dlpi(queue_t *q, mblk_t *mp);
static void ar_set_address(ace_t *ace, uchar_t *addrpos,
@@ -344,6 +378,10 @@ static arct_t ar_cmd_tbl[] = {
ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_ON" },
{ ar_interface_off, AR_INTERFACE_OFF, sizeof (arc_t),
ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_OFF" },
+ { ar_ipmp_activate, AR_IPMP_ACTIVATE, sizeof (arie_t),
+ ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_ACTIVATE" },
+ { ar_ipmp_deactivate, AR_IPMP_DEACTIVATE, sizeof (arie_t),
+ ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_DEACTIVATE" },
{ ar_set_ppa, (uint32_t)IF_UNITSEL, sizeof (int),
ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "IF_UNITSEL" },
{ ar_nd_ioctl, ND_GET, 1,
@@ -358,6 +396,65 @@ static arct_t ar_cmd_tbl[] = {
};
/*
+ * Look up and return an arl appropriate for sending packets with either source
+ * hardware address `hw_addr' or source protocol address `ip_addr', in that
+ * order. If neither was specified or neither match, return any arl in the
+ * same group as `arl'.
+ */
+static arl_t *
+ar_ipmp_lookup_xmit_arl(arl_t *arl, uchar_t *hw_addr, uint_t hw_addrlen,
+ uchar_t *ip_addr)
+{
+ arlphy_t *ap;
+ ace_t *src_ace;
+ arl_t *xmit_arl = NULL;
+ arp_stack_t *as = ARL_TO_ARPSTACK(arl);
+
+ ASSERT(arl->arl_flags & ARL_F_IPMP);
+
+ if (hw_addr != NULL && hw_addrlen != 0) {
+ xmit_arl = as->as_arl_head;
+ for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next) {
+ /*
+ * There may be arls with the same HW address that are
+ * not in our IPMP group; we don't want those.
+ */
+ if (xmit_arl->arl_ipmp_arl != arl)
+ continue;
+
+ ap = xmit_arl->arl_phy;
+ if (ap != NULL && ap->ap_hw_addrlen == hw_addrlen &&
+ bcmp(ap->ap_hw_addr, hw_addr, hw_addrlen) == 0)
+ break;
+ }
+
+ DTRACE_PROBE4(xmit_arl_hwsrc, arl_t *, arl, arl_t *,
+ xmit_arl, uchar_t *, hw_addr, uint_t, hw_addrlen);
+ }
+
+ if (xmit_arl == NULL && ip_addr != NULL) {
+ src_ace = ar_ce_lookup_permanent(as, IP_ARP_PROTO_TYPE, ip_addr,
+ IP_ADDR_LEN);
+ if (src_ace != NULL)
+ xmit_arl = src_ace->ace_xmit_arl;
+
+ DTRACE_PROBE4(xmit_arl_ipsrc, arl_t *, arl, arl_t *,
+ xmit_arl, uchar_t *, ip_addr, uint_t, IP_ADDR_LEN);
+ }
+
+ if (xmit_arl == NULL) {
+ xmit_arl = as->as_arl_head;
+ for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next)
+ if (xmit_arl->arl_ipmp_arl == arl && xmit_arl != arl)
+ break;
+
+ DTRACE_PROBE2(xmit_arl_any, arl_t *, arl, arl_t *, xmit_arl);
+ }
+
+ return (xmit_arl);
+}
+
+/*
* ARP Cache Entry creation routine.
* Cache entries are allocated within timer messages and inserted into
* the global hash list based on protocol and protocol address.
@@ -365,7 +462,8 @@ static arct_t ar_cmd_tbl[] = {
static int
ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len,
uchar_t *proto_addr, uint_t proto_addr_len, uchar_t *proto_mask,
- uchar_t *proto_extract_mask, uint_t hw_extract_start, uint_t flags)
+ uchar_t *proto_extract_mask, uint_t hw_extract_start, uchar_t *sender_addr,
+ uint_t flags)
{
static ace_t ace_null;
ace_t *ace;
@@ -373,17 +471,35 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len,
uchar_t *dst;
mblk_t *mp;
arp_stack_t *as = ARL_TO_ARPSTACK(arl);
+ arl_t *xmit_arl;
arlphy_t *ap;
if ((flags & ~ACE_EXTERNAL_FLAGS_MASK) || arl == NULL)
return (EINVAL);
- if ((ap = arl->arl_phy) == NULL)
+ if (proto_addr == NULL || proto_addr_len == 0 ||
+ (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN))
return (EINVAL);
if (flags & ACE_F_MYADDR)
flags |= ACE_F_PUBLISH | ACE_F_AUTHORITY;
+ /*
+ * Latch a transmit arl for this ace.
+ */
+ if (arl->arl_flags & ARL_F_IPMP) {
+ ASSERT(proto == IP_ARP_PROTO_TYPE);
+ xmit_arl = ar_ipmp_lookup_xmit_arl(arl, hw_addr, hw_addr_len,
+ sender_addr);
+ } else {
+ xmit_arl = arl;
+ }
+
+ if (xmit_arl == NULL || xmit_arl->arl_phy == NULL)
+ return (EINVAL);
+
+ ap = xmit_arl->arl_phy;
+
if (!hw_addr && hw_addr_len == 0) {
if (flags == ACE_F_PERMANENT) { /* Not publish */
/* 224.0.0.0 to zero length address */
@@ -398,9 +514,6 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len,
flags |= ACE_F_RESOLVED;
}
- if (proto_addr == NULL || proto_addr_len == 0 ||
- (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN))
- return (EINVAL);
/* Handle hw_addr_len == 0 for DL_ENABMULTI_REQ etc. */
if (hw_addr_len != 0 && hw_addr == NULL)
return (EINVAL);
@@ -432,6 +545,7 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len,
ace->ace_proto = proto;
ace->ace_mp = mp;
ace->ace_arl = arl;
+ ace->ace_xmit_arl = xmit_arl;
dst = (uchar_t *)&ace[1];
@@ -510,12 +624,73 @@ ar_ce_delete(ace_t *ace)
static void
ar_ce_delete_per_arl(ace_t *ace, void *arl)
{
- if (ace->ace_arl == arl) {
+ if (ace->ace_arl == arl || ace->ace_xmit_arl == arl) {
ace->ace_flags &= ~ACE_F_PERMANENT;
ar_ce_delete(ace);
}
}
+/*
+ * ar_ce_walk routine used when deactivating an `arl' in a group. Deletes
+ * `ace' if it was using `arl_arg' as its output interface.
+ *
+ * ACEs owned by `arl_arg' itself (ace_arl == arl) are left alone, as are
+ * ACEs that transmit on some other arl. Only ACEs that transmit on
+ * `arl_arg' but are owned by a different arl (asserted to be the arl that
+ * `arl_arg' is grouped under) are deleted.
+ */
+static void
+ar_ce_ipmp_deactivate(ace_t *ace, void *arl_arg)
+{
+ arl_t *arl = arl_arg;
+
+ ASSERT(!(arl->arl_flags & ARL_F_IPMP));
+
+ if (ace->ace_arl == arl) {
+ ASSERT(ace->ace_xmit_arl == arl);
+ /*
+ * This ACE is tied to the arl leaving the group (e.g., an
+ * ACE_F_PERMANENT for a test address) and is not used by the
+ * group, so we can leave it be.
+ */
+ return;
+ }
+
+ if (ace->ace_xmit_arl != arl)
+ return;
+
+ ASSERT(ace->ace_arl == arl->arl_ipmp_arl);
+
+ /*
+ * IP should've already sent us messages asking us to move any
+ * ACE_F_MYADDR entries to another arl, but there are two exceptions:
+ *
+ * 1. The group was misconfigured with interfaces that have duplicate
+ * hardware addresses, but in.mpathd was unable to offline those
+ * duplicate interfaces.
+ *
+ * 2. The messages from IP were lost or never created (e.g. due to
+ * memory pressure).
+ *
+ * We handle the first case by just quietly deleting the ACE. Since
+ * the second case cannot be distinguished from a more serious bug in
+ * the IPMP framework, we ASSERT() that this can't happen on DEBUG
+ * systems, but quietly delete the ACE on production systems (the
+ * deleted ACE will render the IP address unreachable).
+ */
+ if (ace->ace_flags & ACE_F_MYADDR) {
+ arlphy_t *ap = arl->arl_phy; /* NOTE(review): not NULL-checked before the dereference below -- confirm it cannot be NULL here */
+ uint_t hw_addrlen = ap->ap_hw_addrlen;
+
+ ASSERT(hw_addrlen == ace->ace_hw_addr_length &&
+ bcmp(ap->ap_hw_addr, ace->ace_hw_addr, hw_addrlen) == 0);
+ }
+
+ /*
+ * NOTE: it's possible this arl got selected as the ace_xmit_arl when
+ * creating an ACE_F_PERMANENT ACE on behalf of an SIOCS*ARP ioctl for
+ * an IPMP IP interface. But it's still OK for us to delete such an
+ * ACE since ipmp_illgrp_refresh_arpent() will ask us to recreate it
+ * and we'll pick another arl then.
+ */
+ ar_ce_delete(ace);
+}
+
/* Cache entry hash routine, based on protocol and protocol address. */
static ace_t **
ar_ce_hash(arp_stack_t *as, uint32_t proto, const uchar_t *proto_addr,
@@ -559,7 +734,8 @@ ar_ce_lookup_entry(arl_t *arl, uint32_t proto, const uchar_t *proto_addr,
return (NULL);
ace = *ar_ce_hash(as, proto, proto_addr, proto_addr_length);
for (; ace; ace = ace->ace_next) {
- if (ace->ace_arl == arl &&
+ if ((ace->ace_arl == arl ||
+ ace->ace_arl == arl->arl_ipmp_arl) &&
ace->ace_proto_addr_length == proto_addr_length &&
ace->ace_proto == proto) {
int i1 = proto_addr_length;
@@ -632,13 +808,6 @@ ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, const uchar_t *proto_addr,
/*
* Look for a permanent entry for proto_addr across all interfaces.
- * This is used for sending ARP requests out. Requests may come from
- * IP on le0 with the source address of le1 and we need to send out
- * the request on le1 so that ARP does not think that somebody else
- * is using its PERMANENT address. If le0 and le1 are sitting on
- * the same wire, the same IP -> ethernet mapping might exist on
- * both the interfaces. But we should look for the permanent
- * mapping to avoid arp interpreting it as a duplicate.
*/
static ace_t *
ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr,
@@ -653,8 +822,8 @@ ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr,
if (ace->ace_proto_addr_length == proto_addr_length &&
ace->ace_proto == proto) {
int i1 = proto_addr_length;
- uchar_t *ace_addr = ace->ace_proto_addr;
- uchar_t *mask = ace->ace_proto_mask;
+ uchar_t *ace_addr = ace->ace_proto_addr;
+ uchar_t *mask = ace->ace_proto_mask;
/*
* Note that the ace_proto_mask is applied to the
@@ -703,12 +872,8 @@ ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length)
* 1. Resolution of unresolved entries and update of resolved entries.
* 2. Detection of nodes with our own IP address (duplicates).
*
- * This is complicated by ill groups. We don't currently have knowledge of ill
- * groups, so we can't distinguish between a packet that comes in on one of the
- * arls that's part of the group versus one that's on an unrelated arl. Thus,
- * we take a conservative approach. If the arls match, then we update resolved
- * and unresolved entries alike. If they don't match, then we update only
- * unresolved entries.
+ * If the resolving ARL is in the same group as a matching ACE's ARL, then
+ * update the ACE. Otherwise, make no updates.
*
* For all entries, we first check to see if this is a duplicate (probable
* loopback) message. If so, then just ignore it.
@@ -741,7 +906,7 @@ ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length)
static int
ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr,
- uint32_t hlen, const uchar_t *src_paddr, uint32_t plen)
+ uint32_t hlen, const uchar_t *src_paddr, uint32_t plen, arl_t **ace_arlp)
{
ace_t *ace;
ace_t *ace_next;
@@ -778,31 +943,35 @@ ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr,
if (i1 >= 0)
continue;
+ *ace_arlp = ace->ace_arl;
+
/*
- * If both IP addr and hardware address match what we already
- * have, then this is a broadcast packet emitted by one of our
- * interfaces, reflected by the switch and received on another
- * interface. We return AR_LOOPBACK.
+ * If the IP address is ours, and the hardware address matches
+ * one of our own arls, then this is a broadcast packet
+ * emitted by one of our interfaces, reflected by the switch
+ * and received on another interface. We return AR_LOOPBACK.
*/
- if ((ace->ace_flags & ACE_F_MYADDR) &&
- hlen == ace->ace_hw_addr_length &&
- bcmp(ace->ace_hw_addr, src_haddr,
- ace->ace_hw_addr_length) == 0) {
- return (AR_LOOPBACK);
+ if (ace->ace_flags & ACE_F_MYADDR) {
+ arl_t *hw_arl = as->as_arl_head;
+ arlphy_t *ap;
+
+ for (; hw_arl != NULL; hw_arl = hw_arl->arl_next) {
+ ap = hw_arl->arl_phy;
+ if (ap != NULL && ap->ap_hw_addrlen == hlen &&
+ bcmp(ap->ap_hw_addr, src_haddr, hlen) == 0)
+ return (AR_LOOPBACK);
+ }
}
/*
* If the entry is unverified, then we've just verified that
* someone else already owns this address, because this is a
* message with the same protocol address but different
- * hardware address. Conflicts received via an interface which
- * doesn't own the conflict address are not actioned. Multiple
- * interfaces on the same segment imply any conflict will also
- * be seen via the correct interface, so we can ignore anything
- * not matching the arl from the ace.
+ * hardware address. NOTE: the ace_xmit_arl check ensures we
+ * don't send duplicate AR_FAILEDs if arl is in an IPMP group.
*/
if ((ace->ace_flags & ACE_F_UNVERIFIED) &&
- arl == ace->ace_arl) {
+ arl == ace->ace_xmit_arl) {
ar_ce_delete(ace);
return (AR_FAILED);
}
@@ -814,30 +983,29 @@ ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr,
* that, if we're currently in initial announcement mode, we
* switch back to the lazier defense mode. Knowing that
* there's at least one duplicate out there, we ought not
- * blindly announce. Conflicts received via an interface which
- * doesn't own the conflict address are not actioned. Multiple
- * interfaces on the same segment imply the conflict will also
- * be seen via the correct interface, so we can ignore anything
- * not matching the arl from the ace.
+ * blindly announce. NOTE: the ace_xmit_arl check ensures we
+ * don't send duplicate AR_BOGONs if arl is in an IPMP group.
*/
if ((ace->ace_flags & ACE_F_AUTHORITY) &&
- arl == ace->ace_arl) {
+ arl == ace->ace_xmit_arl) {
ace->ace_xmit_count = 0;
return (AR_BOGON);
}
/*
- * Limit updating across other ills to unresolved
- * entries only. We don't want to inadvertently update
- * published entries.
+ * Only update this ACE if it's on the same network -- i.e.,
+ * it's for our ARL or another ARL in the same IPMP group.
*/
- if (ace->ace_arl == arl || !ACE_RESOLVED(ace)) {
+ if (ace->ace_arl == arl || ace->ace_arl == arl->arl_ipmp_arl) {
if (ar_ce_resolve(ace, src_haddr, hlen))
retv = AR_CHANGED;
else if (retv == AR_NOTFOUND)
retv = AR_MERGED;
}
}
+
+ if (retv == AR_NOTFOUND)
+ *ace_arlp = NULL;
return (retv);
}
@@ -917,7 +1085,7 @@ static void
ar_delete_notify(const ace_t *ace)
{
const arl_t *arl = ace->ace_arl;
- const arlphy_t *ap = arl->arl_phy;
+ const arlphy_t *ap = ace->ace_xmit_arl->arl_phy;
mblk_t *mp;
size_t len;
arh_t *arh;
@@ -945,7 +1113,7 @@ ar_close(queue_t *q)
{
ar_t *ar = (ar_t *)q->q_ptr;
char name[LIFNAMSIZ];
- arl_t *arl;
+ arl_t *arl, *xarl;
arl_t **arlp;
cred_t *cr;
arc_t *arc;
@@ -999,6 +1167,21 @@ ar_close(queue_t *q)
while (arl->arl_state != ARL_S_DOWN)
qwait(arl->arl_rq);
+ if (arl->arl_flags & ARL_F_IPMP) {
+ /*
+ * Though rude, someone could force the IPMP arl
+ * closed without removing the underlying interfaces.
+ * In that case, force the ARLs out of the group.
+ */
+ xarl = as->as_arl_head;
+ for (; xarl != NULL; xarl = xarl->arl_next) {
+ if (xarl->arl_ipmp_arl != arl || xarl == arl)
+ continue;
+ ar_ce_walk(as, ar_ce_ipmp_deactivate, xarl);
+ xarl->arl_ipmp_arl = NULL;
+ }
+ }
+
ar_ll_clear_defaults(arl);
/*
* If this is the control stream for an arl, delete anything
@@ -1417,9 +1600,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig)
area_t *area;
ace_t *ace;
uchar_t *hw_addr;
- uint32_t hw_addr_len;
+ uint32_t hw_addr_len;
uchar_t *proto_addr;
- uint32_t proto_addr_len;
+ uint32_t proto_addr_len;
uchar_t *proto_mask;
arl_t *arl;
mblk_t *mp = mp_orig;
@@ -1494,6 +1677,7 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig)
proto_mask,
NULL,
(uint32_t)0,
+ NULL,
aflags & ~ACE_F_MAPPING & ~ACE_F_UNVERIFIED & ~ACE_F_DEFEND);
if (err != 0) {
DTRACE_PROBE3(eadd_create_failed, arl_t *, arl, area_t *, area,
@@ -1502,7 +1686,13 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig)
}
if (aflags & ACE_F_PUBLISH) {
- arlphy_t *ap = arl->arl_phy;
+ arlphy_t *ap;
+
+ ace = ar_ce_lookup(arl, area->area_proto, proto_addr,
+ proto_addr_len);
+ ASSERT(ace != NULL);
+
+ ap = ace->ace_xmit_arl->arl_phy;
if (hw_addr == NULL || hw_addr_len == 0) {
hw_addr = ap->ap_hw_addr;
@@ -1519,10 +1709,6 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig)
ap->ap_hw_addrlen = hw_addr_len;
}
- ace = ar_ce_lookup(arl, area->area_proto, proto_addr,
- proto_addr_len);
- ASSERT(ace != NULL);
-
if (ace->ace_flags & ACE_F_FAST) {
ace->ace_xmit_count = as->as_fastprobe_count;
ace->ace_xmit_interval = as->as_fastprobe_delay;
@@ -1555,9 +1741,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig)
*/
DTRACE_PROBE2(eadd_probe, ace_t *, ace,
area_t *, area);
- ar_xmit(arl, ARP_REQUEST, area->area_proto,
- proto_addr_len, hw_addr, NULL, NULL,
- proto_addr, NULL, as);
+ ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
+ area->area_proto, proto_addr_len,
+ hw_addr, NULL, NULL, proto_addr, NULL, as);
ace->ace_xmit_count--;
ace->ace_xmit_interval =
(ace->ace_flags & ACE_F_FAST) ?
@@ -1573,9 +1759,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig)
} else {
DTRACE_PROBE2(eadd_announce, ace_t *, ace,
area_t *, area);
- ar_xmit(arl, ARP_REQUEST, area->area_proto,
- proto_addr_len, hw_addr, proto_addr,
- ap->ap_arp_addr, proto_addr, NULL, as);
+ ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
+ area->area_proto, proto_addr_len, hw_addr,
+ proto_addr, ap->ap_arp_addr, proto_addr, NULL, as);
ace->ace_last_bcast = ddi_get_lbolt();
/*
@@ -1583,9 +1769,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig)
* entry; we believe we're the authority for this
* entry. In that case, and if we're not just doing
* one-off defense of the address, we send more than
- * one copy, so that if this is an IPMP failover, we'll
- * still have a good chance of updating everyone even
- * when there's a packet loss or two.
+ * one copy, so we'll still have a good chance of
+ * updating everyone even when there's a packet loss
+ * or two.
*/
if ((aflags & ACE_F_AUTHORITY) &&
!(aflags & ACE_F_DEFEND) &&
@@ -1667,7 +1853,6 @@ static int
ar_entry_query(queue_t *q, mblk_t *mp_orig)
{
ace_t *ace;
- ace_t *src_ace = NULL;
areq_t *areq;
arl_t *arl;
int err;
@@ -1782,20 +1967,12 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig)
err = ENXIO;
goto err_ret;
}
- if (arl->arl_phy == NULL) {
- /* Can't get help if we don't know how. */
- DTRACE_PROBE2(query_no_phy, ace_t *, ace,
- areq_t *, areq);
- mpp[0] = NULL;
- mp->b_prev = NULL;
- err = ENXIO;
- goto err_ret;
- }
DTRACE_PROBE2(query_unresolved, ace_t, ace, areq_t *, areq);
} else {
/* No ace yet. Make one now. (This is the common case.) */
- if (areq->areq_xmit_count == 0 || arl->arl_phy == NULL) {
- DTRACE_PROBE2(query_phy, arl_t *, arl, areq_t *, areq);
+ if (areq->areq_xmit_count == 0) {
+ DTRACE_PROBE2(query_template, arl_t *, arl,
+ areq_t *, areq);
mp->b_prev = NULL;
err = ENXIO;
goto err_ret;
@@ -1814,9 +1991,9 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig)
err = EINVAL;
goto err_ret;
}
- err = ar_ce_create(arl, areq->areq_proto, NULL, 0,
+ err = ar_ce_create(OWNING_ARL(arl), areq->areq_proto, NULL, 0,
proto_addr, proto_addr_len, NULL,
- NULL, (uint32_t)0,
+ NULL, (uint32_t)0, sender_addr,
areq->areq_flags);
if (err != 0) {
DTRACE_PROBE3(query_create_failed, arl_t *, arl,
@@ -1835,49 +2012,13 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig)
goto err_ret;
}
ace->ace_query_mp = mp;
- /*
- * We don't have group information here. But if the sender
- * address belongs to a different arl, we might as well
- * search the other arl for a resolved ACE. If we find one,
- * we resolve it rather than sending out a ARP request.
- */
- src_ace = ar_ce_lookup_permanent(as, areq->areq_proto,
- sender_addr, areq->areq_sender_addr_length);
- if (src_ace == NULL) {
- DTRACE_PROBE3(query_source_missing, arl_t *, arl,
- areq_t *, areq, ace_t *, ace);
- ar_query_reply(ace, ENXIO, NULL, (uint32_t)0);
- /*
- * ar_query_reply has already freed the mp.
- * Return EINPROGRESS, so that caller won't attempt
- * to free the 'mp' again.
- */
- return (EINPROGRESS);
- }
- if (src_ace->ace_arl != ace->ace_arl) {
- ace_t *dst_ace;
-
- /*
- * Check for a resolved entry in the src_ace->ace_arl.
- */
- dst_ace = ar_ce_lookup_entry(src_ace->ace_arl,
- areq->areq_proto, proto_addr, proto_addr_len);
-
- if (dst_ace != NULL && ACE_RESOLVED(dst_ace)) {
- DTRACE_PROBE3(query_other_arl, arl_t *, arl,
- areq_t *, areq, ace_t *, dst_ace);
- (void) ar_ce_resolve(ace, dst_ace->ace_hw_addr,
- dst_ace->ace_hw_addr_length);
- return (EINPROGRESS);
- }
- }
}
- ms = ar_query_xmit(as, ace, src_ace);
+ ms = ar_query_xmit(as, ace);
if (ms == 0) {
/* Immediate reply requested. */
ar_query_reply(ace, ENXIO, NULL, (uint32_t)0);
} else {
- mi_timer(arl->arl_wq, ace->ace_mp, ms);
+ mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, ms);
}
return (EINPROGRESS);
err_ret:
@@ -2073,6 +2214,80 @@ done:
}
/*
+ * Given an arie_t `mp', find the arl_t's that it names and return them
+ * in `*arlp' and `*ipmp_arlp'. If they cannot be found, return B_FALSE.
+ *
+ * `*arlp' is looked up from the message with ar_ll_lookup_from_mp() and
+ * `*ipmp_arlp' by the name held in arie_grifname. arie_grifname is
+ * NUL-terminated in place first, so the message itself is modified.
+ *
+ * On failure the out-parameters are not both valid: if the first lookup
+ * fails `*arlp' is NULL and `*ipmp_arlp' is untouched; if the second fails
+ * `*arlp' is set and `*ipmp_arlp' is NULL. Callers should only use them
+ * when B_TRUE is returned.
+ *
+ * NOTE(review): the size of `mp' is not checked here before it is treated
+ * as an arie_t; presumably the caller has validated it -- confirm.
+ */
+static boolean_t
+ar_ipmp_lookup(arp_stack_t *as, mblk_t *mp, arl_t **arlp, arl_t **ipmp_arlp)
+{
+ arie_t *arie = (arie_t *)mp->b_rptr;
+
+ *arlp = ar_ll_lookup_from_mp(as, mp);
+ if (*arlp == NULL) {
+ DTRACE_PROBE1(ipmp_lookup_no_arl, mblk_t *, mp);
+ return (B_FALSE);
+ }
+
+ arie->arie_grifname[LIFNAMSIZ - 1] = '\0'; /* force termination before the by-name lookup */
+ *ipmp_arlp = ar_ll_lookup_by_name(as, arie->arie_grifname);
+ if (*ipmp_arlp == NULL) {
+ DTRACE_PROBE1(ipmp_lookup_no_ipmp_arl, mblk_t *, mp);
+ return (B_FALSE);
+ }
+
+ DTRACE_PROBE2(ipmp_lookup, arl_t *, *arlp, arl_t *, *ipmp_arlp);
+ return (B_TRUE);
+}
+
+/*
+ * Bind an arl_t to an IPMP group arl_t.
+ *
+ * Both arls are named by the arie_t in `mp' (see ar_ipmp_lookup()).
+ * Returns EINVAL if either arl cannot be found, EALREADY if the arl is
+ * already bound to a group (arl_ipmp_arl is non-NULL), and 0 once
+ * arl_ipmp_arl has been pointed at the group arl. No cache entries are
+ * touched here.
+ *
+ * NOTE(review): nothing here checks that the named group arl actually has
+ * ARL_F_IPMP set; presumably the sender guarantees that -- confirm.
+ */
+static int
+ar_ipmp_activate(queue_t *q, mblk_t *mp)
+{
+ arl_t *arl, *ipmp_arl;
+ arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
+
+ if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl))
+ return (EINVAL);
+
+ if (arl->arl_ipmp_arl != NULL) {
+ DTRACE_PROBE1(ipmp_activated_already, arl_t *, arl);
+ return (EALREADY);
+ }
+
+ DTRACE_PROBE2(ipmp_activate, arl_t *, arl, arl_t *, ipmp_arl);
+ arl->arl_ipmp_arl = ipmp_arl;
+ return (0);
+}
+
+/*
+ * Unbind an arl_t from an IPMP group arl_t and update the ace_t's so
+ * that it is no longer part of the group.
+ *
+ * Both arls are named by the arie_t in `mp' (see ar_ipmp_lookup()).
+ * Returns EINVAL if either arl cannot be found or if the arl is not
+ * currently bound to the named group arl; otherwise returns 0.
+ *
+ * The cache walk runs before arl_ipmp_arl is cleared, so that
+ * ar_ce_ipmp_deactivate() can still see which group the arl belonged to.
+ */
+static int
+ar_ipmp_deactivate(queue_t *q, mblk_t *mp)
+{
+ arl_t *arl, *ipmp_arl;
+ arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
+
+ if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl))
+ return (EINVAL);
+
+ if (ipmp_arl != arl->arl_ipmp_arl) {
+ DTRACE_PROBE2(ipmp_deactivate_notactive, arl_t *, arl, arl_t *,
+ ipmp_arl);
+ return (EINVAL);
+ }
+
+ DTRACE_PROBE2(ipmp_deactivate, arl_t *, arl, arl_t *,
+ arl->arl_ipmp_arl);
+ ar_ce_walk(as, ar_ce_ipmp_deactivate, arl); /* drop group ACEs that transmit on this arl */
+ arl->arl_ipmp_arl = NULL;
+ return (0);
+}
+
+/*
* Enable an interface to process ARP_REQUEST and ARP_RESPONSE messages.
*/
/* ARGSUSED */
@@ -2199,6 +2414,11 @@ ar_ll_init(arp_stack_t *as, ar_t *ar, mblk_t *mp)
if ((arl = (arl_t *)mi_zalloc(sizeof (arl_t))) == NULL)
return;
+ if (dlia->dl_mac_type == SUNW_DL_IPMP) {
+ arl->arl_flags |= ARL_F_IPMP;
+ arl->arl_ipmp_arl = arl;
+ }
+
arl->arl_provider_style = dlia->dl_provider_style;
arl->arl_rq = ar->ar_rq;
arl->arl_wq = ar->ar_wq;
@@ -2261,7 +2481,7 @@ ar_ll_set_defaults(arl_t *arl, mblk_t *mp)
dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr;
dl_unitdata_req_t *dlur;
uchar_t *up;
- arlphy_t *ap;
+ arlphy_t *ap;
ASSERT(arl != NULL);
@@ -2270,6 +2490,14 @@ ar_ll_set_defaults(arl_t *arl, mblk_t *mp)
*/
ar_ll_clear_defaults(arl);
+ if (arl->arl_flags & ARL_F_IPMP) {
+ /*
+ * If this is an IPMP arl_t, we have nothing to do,
+ * since we will never transmit or receive.
+ */
+ return;
+ }
+
ap = kmem_zalloc(sizeof (arlphy_t), KM_NOSLEEP);
if (ap == NULL)
goto bad;
@@ -2470,12 +2698,12 @@ ar_mapping_add(queue_t *q, mblk_t *mp_orig)
mblk_t *mp = mp_orig;
ace_t *ace;
uchar_t *hw_addr;
- uint32_t hw_addr_len;
+ uint32_t hw_addr_len;
uchar_t *proto_addr;
- uint32_t proto_addr_len;
+ uint32_t proto_addr_len;
uchar_t *proto_mask;
uchar_t *proto_extract_mask;
- uint32_t hw_extract_start;
+ uint32_t hw_extract_start;
arl_t *arl;
arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
@@ -2524,6 +2752,7 @@ ar_mapping_add(queue_t *q, mblk_t *mp_orig)
proto_mask,
proto_extract_mask,
hw_extract_start,
+ NULL,
arma->arma_flags | ACE_F_MAPPING));
}
@@ -2857,12 +3086,12 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr,
uint32_t proto_addr_len)
{
mblk_t *areq_mp;
- arl_t *arl = ace->ace_arl;
mblk_t *mp;
mblk_t *xmit_mp;
- arp_stack_t *as = ARL_TO_ARPSTACK(arl);
+ queue_t *arl_wq = ace->ace_arl->arl_wq;
+ arp_stack_t *as = ARL_TO_ARPSTACK(ace->ace_arl);
ip_stack_t *ipst = as->as_netstack->netstack_ip;
- arlphy_t *ap = arl->arl_phy;
+ arlphy_t *ap = ace->ace_xmit_arl->arl_phy;
/*
* On error or completion for a query, we need to shut down the timer.
@@ -2870,7 +3099,8 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr,
* Duplicate Address Detection, or it will never finish that phase.
*/
if (!(ace->ace_flags & (ACE_F_UNVERIFIED | ACE_F_AUTHORITY)))
- mi_timer(arl->arl_wq, ace->ace_mp, -1L);
+ mi_timer(arl_wq, ace->ace_mp, -1L);
+
/* Establish the return value appropriate. */
if (ret_val == 0) {
if (!ACE_RESOLVED(ace) || ap == NULL)
@@ -2973,25 +3203,24 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr,
*/
ar_ce_delete(ace);
} else {
- mi_timer(arl->arl_wq, ace->ace_mp,
- as->as_cleanup_interval);
+ mi_timer(arl_wq, ace->ace_mp, as->as_cleanup_interval);
}
}
}
/*
* Returns number of milliseconds after which we should either rexmit or abort.
- * Return of zero means we should abort. src_ace is the ace corresponding
- * to the source address in the areq sent by IP.
+ * Return of zero means we should abort.
*/
static clock_t
-ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace)
+ar_query_xmit(arp_stack_t *as, ace_t *ace)
{
areq_t *areq;
mblk_t *mp;
uchar_t *proto_addr;
uchar_t *sender_addr;
- arl_t *src_arl;
+ ace_t *src_ace;
+ arl_t *xmit_arl = ace->ace_xmit_arl;
mp = ace->ace_query_mp;
/*
@@ -3016,18 +3245,15 @@ ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace)
areq->areq_sender_addr_length);
/*
- * Get the source h/w address for the sender addr. With interface
- * groups, IP sends us source address belonging to a different
- * interface.
+ * Get the ace for the sender address, so that we can verify that
+ * we have one and that DAD has completed.
*/
+ src_ace = ar_ce_lookup(xmit_arl, areq->areq_proto, sender_addr,
+ areq->areq_sender_addr_length);
if (src_ace == NULL) {
- src_ace = ar_ce_lookup_permanent(as, areq->areq_proto,
- sender_addr, areq->areq_sender_addr_length);
- if (src_ace == NULL) {
- DTRACE_PROBE3(xmit_no_source, ace_t *, ace,
- areq_t *, areq, uchar_t *, sender_addr);
- return (0);
- }
+ DTRACE_PROBE3(xmit_no_source, ace_t *, ace, areq_t *, areq,
+ uchar_t *, sender_addr);
+ return (0);
}
/*
@@ -3044,18 +3270,12 @@ ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace)
return (areq->areq_xmit_interval);
}
- /*
- * Transmit on src_arl. We should transmit on src_arl. Otherwise
- * the switch will send back a copy on other interfaces of the
- * same group and as we could be using somebody else's source
- * address + hardware address, ARP will treat this as a bogon.
- */
- src_arl = src_ace->ace_arl;
DTRACE_PROBE3(xmit_send, ace_t *, ace, ace_t *, src_ace,
areq_t *, areq);
- ar_xmit(src_arl, ARP_REQUEST, areq->areq_proto,
- areq->areq_sender_addr_length, src_arl->arl_phy->ap_hw_addr,
- sender_addr, src_arl->arl_phy->ap_arp_addr, proto_addr, NULL, as);
+
+ ar_xmit(xmit_arl, ARP_REQUEST, areq->areq_proto,
+ areq->areq_sender_addr_length, xmit_arl->arl_phy->ap_hw_addr,
+ sender_addr, xmit_arl->arl_phy->ap_arp_addr, proto_addr, NULL, as);
src_ace->ace_last_bcast = ddi_get_lbolt();
return (areq->areq_xmit_interval);
}
@@ -3066,6 +3286,7 @@ ar_rput(queue_t *q, mblk_t *mp)
{
arh_t *arh;
arl_t *arl;
+ arl_t *client_arl;
ace_t *dst_ace;
uchar_t *dst_paddr;
int err;
@@ -3079,6 +3300,8 @@ ar_rput(queue_t *q, mblk_t *mp)
uchar_t *src_paddr;
uchar_t *dst_haddr;
boolean_t is_probe;
+ boolean_t is_unicast = B_FALSE;
+ dl_unitdata_ind_t *dlindp;
int i;
arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
@@ -3135,9 +3358,10 @@ ar_rput(queue_t *q, mblk_t *mp)
return;
case M_PCPROTO:
case M_PROTO:
+ dlindp = (dl_unitdata_ind_t *)mp->b_rptr;
if (MBLKL(mp) >= sizeof (dl_unitdata_ind_t) &&
- ((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive ==
- DL_UNITDATA_IND) {
+ dlindp->dl_primitive == DL_UNITDATA_IND) {
+ is_unicast = (dlindp->dl_group_address == 0);
arl = ((ar_t *)q->q_ptr)->ar_arl;
if (arl != NULL && arl->arl_phy != NULL) {
/* Real messages from the wire! */
@@ -3261,19 +3485,24 @@ ar_rput(queue_t *q, mblk_t *mp)
* RFC 826: first check if the <protocol, sender protocol address> is
* in the cache, if there is a sender protocol address. Note that this
* step also handles resolutions based on source.
+ *
+ * Note that IP expects that each notification it receives will be
+ * tied to the ill it received it on. Thus, we must talk to it over
+ * the arl tied to the resolved IP address (if any), hence client_arl.
*/
if (is_probe)
err = AR_NOTFOUND;
else
err = ar_ce_resolve_all(arl, proto, src_haddr, hlen, src_paddr,
- plen);
+ plen, &client_arl);
+
switch (err) {
case AR_BOGON:
- ar_client_notify(arl, mp1, AR_CN_BOGON);
+ ar_client_notify(client_arl, mp1, AR_CN_BOGON);
mp1 = NULL;
break;
case AR_FAILED:
- ar_client_notify(arl, mp1, AR_CN_FAILED);
+ ar_client_notify(client_arl, mp1, AR_CN_FAILED);
mp1 = NULL;
break;
case AR_LOOPBACK:
@@ -3293,7 +3522,9 @@ ar_rput(queue_t *q, mblk_t *mp)
* Now look up the destination address. By RFC 826, we ignore the
* packet at this step if the target isn't one of our addresses. This
* is true even if the target is something we're trying to resolve and
- * the packet is a response.
+ * the packet is a response. To avoid duplicate responses, we also
+ * ignore the packet if it was multicast/broadcast to an arl that's in
+ * an IPMP group but was not the designated xmit_arl for the ACE.
*
* Note that in order to do this correctly, we need to know when to
* notify IP of a change implied by the source address of the ARP
@@ -3304,6 +3535,7 @@ ar_rput(queue_t *q, mblk_t *mp)
*/
dst_ace = ar_ce_lookup_entry(arl, proto, dst_paddr, plen);
if (dst_ace == NULL || !ACE_RESOLVED(dst_ace) ||
+ (dst_ace->ace_xmit_arl != arl && !is_unicast) ||
!(dst_ace->ace_flags & ACE_F_PUBLISH)) {
/*
* Let the client know if the source mapping has changed, even
@@ -3311,7 +3543,7 @@ ar_rput(queue_t *q, mblk_t *mp)
* client.
*/
if (err == AR_CHANGED)
- ar_client_notify(arl, mp1, AR_CN_ANNOUNCE);
+ ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE);
else
freemsg(mp1);
freeb(mp);
@@ -3341,6 +3573,7 @@ ar_rput(queue_t *q, mblk_t *mp)
"arp_rput_end: q %p (%S)", q, "reflection");
return;
}
+
/*
* Conflicts seen via the wrong interface may be bogus.
* Multiple interfaces on the same segment imply any conflict
@@ -3378,12 +3611,21 @@ ar_rput(queue_t *q, mblk_t *mp)
* the src_paddr field before sending it to IP. The same is
* required for probes, where src_paddr will be INADDR_ANY.
*/
- if (is_probe || op == ARP_RESPONSE) {
+ if (is_probe) {
+ /*
+ * In this case, client_arl will be invalid (e.g.,
+ * since probes don't have a valid sender address).
+ * But dst_ace has the appropriate arl.
+ */
bcopy(dst_paddr, src_paddr, plen);
- ar_client_notify(arl, mp1, AR_CN_FAILED);
+ ar_client_notify(dst_ace->ace_arl, mp1, AR_CN_FAILED);
+ ar_ce_delete(dst_ace);
+ } else if (op == ARP_RESPONSE) {
+ bcopy(dst_paddr, src_paddr, plen);
+ ar_client_notify(client_arl, mp1, AR_CN_FAILED);
ar_ce_delete(dst_ace);
} else if (err == AR_CHANGED) {
- ar_client_notify(arl, mp1, AR_CN_ANNOUNCE);
+ ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE);
} else {
DTRACE_PROBE3(rput_request_unverified, arl_t *, arl,
arh_t *, arh, ace_t *, dst_ace);
@@ -3431,19 +3673,19 @@ ar_rput(queue_t *q, mblk_t *mp)
dst_ace->ace_hw_addr, dst_ace->ace_proto_addr,
src_haddr, src_paddr, dstaddr, as);
if (!is_probe && err == AR_NOTFOUND &&
- ar_ce_create(arl, proto, src_haddr, hlen, src_paddr, plen,
- NULL, NULL, 0, 0) == 0) {
+ ar_ce_create(OWNING_ARL(arl), proto, src_haddr, hlen,
+ src_paddr, plen, NULL, NULL, 0, NULL, 0) == 0) {
ace_t *ace;
ace = ar_ce_lookup(arl, proto, src_paddr, plen);
ASSERT(ace != NULL);
- mi_timer(arl->arl_wq, ace->ace_mp,
+ mi_timer(ace->ace_arl->arl_wq, ace->ace_mp,
as->as_cleanup_interval);
}
}
if (err == AR_CHANGED) {
freeb(mp);
- ar_client_notify(arl, mp1, AR_CN_ANNOUNCE);
+ ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "reqchange");
} else {
@@ -3459,7 +3701,7 @@ ar_ce_restart_dad(ace_t *ace, void *arl_arg)
arl_t *arl = arl_arg;
arp_stack_t *as = ARL_TO_ARPSTACK(arl);
- if ((ace->ace_arl == arl) &&
+ if ((ace->ace_xmit_arl == arl) &&
(ace->ace_flags & (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) ==
(ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) {
/*
@@ -4060,9 +4302,9 @@ ar_wput(queue_t *q, mblk_t *mp)
static boolean_t
arp_say_ready(ace_t *ace)
{
- mblk_t *mp;
+ mblk_t *mp;
arl_t *arl = ace->ace_arl;
- arlphy_t *ap = arl->arl_phy;
+ arlphy_t *ap = ace->ace_xmit_arl->arl_phy;
arh_t *arh;
uchar_t *cp;
@@ -4107,7 +4349,7 @@ ace_reschedule(ace_t *ace, void *arg)
ace_t **acemax;
ace_t *atemp;
- if (ace->ace_arl != art->art_arl)
+ if (ace->ace_xmit_arl != art->art_arl)
return;
/*
* Only published entries that are ready for announcement are eligible.
@@ -4179,7 +4421,6 @@ static void
ar_wsrv(queue_t *q)
{
ace_t *ace;
- arl_t *arl;
arlphy_t *ap;
mblk_t *mp;
clock_t ms;
@@ -4196,8 +4437,7 @@ ar_wsrv(queue_t *q)
ace = (ace_t *)mp->b_rptr;
if (ace->ace_flags & ACE_F_DYING)
continue;
- arl = ace->ace_arl;
- ap = arl->arl_phy;
+ ap = ace->ace_xmit_arl->arl_phy;
if (ace->ace_flags & ACE_F_UNVERIFIED) {
ASSERT(ace->ace_flags & ACE_F_PUBLISH);
ASSERT(ace->ace_query_mp == NULL);
@@ -4216,7 +4456,7 @@ ar_wsrv(queue_t *q)
DTRACE_PROBE1(timer_probe,
ace_t *, ace);
ace->ace_xmit_count--;
- ar_xmit(arl, ARP_REQUEST,
+ ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
ace->ace_proto,
ace->ace_proto_addr_length,
ace->ace_hw_addr, NULL, NULL,
@@ -4247,7 +4487,7 @@ ar_wsrv(queue_t *q)
now - ap->ap_defend_start >
SEC_TO_TICK(as->as_defend_period)) {
ap->ap_defend_start = now;
- arl_reschedule(arl);
+ arl_reschedule(ace->ace_xmit_arl);
}
/*
* Finish the job that we started in
@@ -4288,12 +4528,12 @@ ar_wsrv(queue_t *q)
DTRACE_PROBE1(timer_defend,
ace_t *, ace);
}
- ar_xmit(arl, ARP_REQUEST,
+ ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
ace->ace_proto,
ace->ace_proto_addr_length,
ace->ace_hw_addr,
ace->ace_proto_addr,
- ap->ap_arp_addr,
+ ace->ace_xmit_arl->arl_phy->ap_arp_addr,
ace->ace_proto_addr, NULL, as);
ace->ace_last_bcast = now;
if (ace->ace_xmit_count == 0)
@@ -4316,7 +4556,8 @@ ar_wsrv(queue_t *q)
ndp_lookup_ipaddr(*(ipaddr_t *)
ace->ace_proto_addr, as->as_netstack)) {
ace->ace_flags |= ACE_F_OLD;
- mi_timer(arl->arl_wq, ace->ace_mp,
+ mi_timer(ace->ace_arl->arl_wq,
+ ace->ace_mp,
as->as_cleanup_interval);
} else {
ar_delete_notify(ace);
@@ -4333,7 +4574,7 @@ ar_wsrv(queue_t *q)
* we complete the operation with a failure indication.
* Otherwise, we restart the timer.
*/
- ms = ar_query_xmit(as, ace, NULL);
+ ms = ar_query_xmit(as, ace);
if (ms == 0)
ar_query_reply(ace, ENXIO, NULL, (uint32_t)0);
else
@@ -4360,6 +4601,8 @@ ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, uint32_t plen,
mblk_t *mp;
arlphy_t *ap = arl->arl_phy;
+ ASSERT(!(arl->arl_flags & ARL_F_IPMP));
+
if (ap == NULL) {
DTRACE_PROBE1(xmit_no_arl_phy, arl_t *, arl);
return;
diff --git a/usr/src/uts/common/inet/arp_impl.h b/usr/src/uts/common/inet/arp_impl.h
index a2564d5602..f16fdc97a0 100644
--- a/usr/src/uts/common/inet/arp_impl.h
+++ b/usr/src/uts/common/inet/arp_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -67,6 +67,7 @@ typedef struct arl_s {
uint_t arl_closing : 1; /* stream is closing */
uint32_t arl_index; /* instance number */
struct arlphy_s *arl_phy; /* physical info, if any */
+ struct arl_s *arl_ipmp_arl; /* pointer to group arl_t */
} arl_t;
/*
@@ -75,7 +76,7 @@ typedef struct arl_s {
*/
#define ARL_TO_ARPSTACK(_arl) (((ar_t *)(_arl)->arl_rq->q_ptr)->ar_as)
-/* ARL physical info structure for a link level device */
+/* ARL physical info structure, one per physical link level device */
typedef struct arlphy_s {
uint32_t ap_arp_hw_type; /* hardware type */
uchar_t *ap_arp_addr; /* multicast address to use */
@@ -110,6 +111,7 @@ typedef struct ace_s {
clock_t ace_last_bcast; /* last broadcast Response */
clock_t ace_xmit_interval;
int ace_xmit_count;
+ arl_t *ace_xmit_arl; /* xmit on this arl */
} ace_t;
#define ARPHOOK_INTERESTED_PHYSICAL_IN(as) \
@@ -216,6 +218,7 @@ struct arp_stack {
typedef struct arp_stack arp_stack_t;
#define ARL_F_NOARP 0x01
+#define ARL_F_IPMP 0x02
#define ARL_S_DOWN 0x00
#define ARL_S_PENDING 0x01
diff --git a/usr/src/uts/common/inet/dlpistub/dlpistub.c b/usr/src/uts/common/inet/dlpistub/dlpistub.c
new file mode 100644
index 0000000000..961876ac47
--- /dev/null
+++ b/usr/src/uts/common/inet/dlpistub/dlpistub.c
@@ -0,0 +1,370 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * DLPI stub driver; currently supports VNI and IPMP stub devices.
+ */
+
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/dlpi.h>
+#include <sys/stat.h>
+#include <sys/strsun.h>
+#include <sys/stropts.h>
+#include <sys/types.h>
+#include <sys/id_space.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/modctl.h>
+#include <sys/mkdev.h>
+#include <sys/sdt.h>
+
+#include "dlpistub_impl.h"
+
+static id_space_t *ds_minors;
+static dev_info_t *ds_dip;
+
+/*
+ * DL_INFO_ACK template.
+ */
+static dl_info_ack_t ds_infoack = {
+ DL_INFO_ACK, /* dl_primitive */
+ 0, /* dl_max_sdu */
+ 0, /* dl_min_sdu */
+ 0, /* dl_addr_length */
+ 0, /* dl_mac_type */
+ 0, /* dl_reserved */
+ 0, /* dl_current_state */
+ 0, /* dl_sap_length */
+ DL_CLDLS, /* dl_service_mode */
+ 0, /* dl_qos_length */
+ 0, /* dl_qos_offset */
+ 0, /* dl_qos_range_length */
+ 0, /* dl_qos_range_offset */
+ DL_STYLE2, /* dl_provider_style */
+ 0, /* dl_addr_offset */
+ DL_VERSION_2, /* dl_version */
+ 0, /* dl_brdcst_addr_length */
+ 0, /* dl_brdcst_addr_offset */
+ 0 /* dl_growth */
+};
+
+static int
+ds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ if (ddi_create_minor_node(dip, "vni", S_IFCHR, DS_MINOR_VNI,
+ DDI_PSEUDO, 0) == DDI_FAILURE ||
+ ddi_create_minor_node(dip, "ipmpstub", S_IFCHR, DS_MINOR_IPMP,
+ DDI_PSEUDO, 0) == DDI_FAILURE) {
+ ddi_remove_minor_node(dip, NULL);
+ cmn_err(CE_NOTE, "ds_attach: cannot create minor nodes");
+ return (DDI_FAILURE);
+ }
+
+ ds_dip = dip;
+ ds_minors = id_space_create("ds_minors", DS_MINOR_START, MAXMIN32);
+ return (DDI_SUCCESS);
+}
+
+static int
+ds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ id_space_destroy(ds_minors);
+ ds_minors = NULL;
+ ASSERT(dip == ds_dip);
+ ddi_remove_minor_node(dip, NULL);
+ ds_dip = NULL;
+ return (DDI_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+ds_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+ int error = DDI_FAILURE;
+
+ switch (infocmd) {
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)0;
+ error = DDI_SUCCESS;
+ break;
+ case DDI_INFO_DEVT2DEVINFO:
+ if (ds_dip != NULL) {
+ *result = ds_dip;
+ error = DDI_SUCCESS;
+ }
+ break;
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+ds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
+{
+ int type;
+ dlpistub_t *dsp;
+
+ if (sflag == CLONEOPEN || sflag == MODOPEN)
+ return (EINVAL);
+
+ if (q->q_ptr != NULL)
+ return (0);
+
+ switch (getminor(*devp)) {
+ case DS_MINOR_VNI:
+ type = SUNW_DL_VNI;
+ break;
+ case DS_MINOR_IPMP:
+ type = SUNW_DL_IPMP;
+ break;
+ default:
+ return (ENXIO);
+ }
+
+ dsp = kmem_zalloc(sizeof (dlpistub_t), KM_SLEEP);
+ dsp->ds_type = type;
+ dsp->ds_minor = (minor_t)id_alloc(ds_minors);
+ dsp->ds_state = DL_UNATTACHED;
+ *devp = makedevice(getmajor(*devp), dsp->ds_minor);
+ q->q_ptr = WR(q)->q_ptr = dsp;
+ qprocson(q);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+ds_close(queue_t *q, int flag, cred_t *credp)
+{
+ dlpistub_t *dsp = q->q_ptr;
+
+ qprocsoff(q);
+ q->q_ptr = WR(q)->q_ptr = NULL;
+
+ id_free(ds_minors, dsp->ds_minor);
+ kmem_free(dsp, sizeof (dlpistub_t));
+
+ return (0);
+}
+
+static int
+ds_badprim(queue_t *q, mblk_t *mp, t_scalar_t prim)
+{
+ dlerrorack(q, mp, prim, DL_BADPRIM, 0);
+ return (0);
+}
+
+static int
+ds_outstate(queue_t *q, mblk_t *mp, t_scalar_t prim)
+{
+ dlerrorack(q, mp, prim, DL_OUTSTATE, 0);
+ return (0);
+}
+
+static int
+ds_wput(queue_t *q, mblk_t *mp)
+{
+ union DL_primitives *dlp;
+ dl_info_ack_t *dlip;
+ dlpistub_t *dsp = q->q_ptr;
+ t_scalar_t prim;
+
+ switch (DB_TYPE(mp)) {
+ case M_PROTO:
+ case M_PCPROTO:
+ if (MBLKL(mp) < sizeof (t_scalar_t)) {
+ dlerrorack(q, mp, DL_PRIM_INVAL, DL_UNSUPPORTED, 0);
+ return (0);
+ }
+
+ dlp = (void *)mp->b_rptr;
+ prim = dlp->dl_primitive;
+ switch (prim) {
+ case DL_ATTACH_REQ:
+ if (MBLKL(mp) < DL_ATTACH_REQ_SIZE)
+ return (ds_badprim(q, mp, prim));
+
+ if (dsp->ds_state != DL_UNATTACHED)
+ return (ds_outstate(q, mp, prim));
+
+ dsp->ds_state = DL_UNBOUND;
+ dlokack(q, mp, DL_ATTACH_REQ);
+ break;
+
+ case DL_BIND_REQ:
+ if (MBLKL(mp) < DL_BIND_REQ_SIZE)
+ return (ds_badprim(q, mp, prim));
+
+ if (dsp->ds_state != DL_UNBOUND)
+ return (ds_outstate(q, mp, prim));
+
+ dsp->ds_state = DL_IDLE;
+ dlbindack(q, mp, dlp->bind_req.dl_sap, NULL, 0, 0, 0);
+ break;
+
+ case DL_INFO_REQ:
+ if (MBLKL(mp) < DL_INFO_REQ_SIZE)
+ return (ds_badprim(q, mp, prim));
+
+ mp = mexchange(q, mp, sizeof (dl_info_ack_t),
+ M_PCPROTO, DL_INFO_ACK);
+ if (mp != NULL) {
+ dlip = (void *)mp->b_rptr;
+ *dlip = ds_infoack;
+ dlip->dl_mac_type = dsp->ds_type;
+ dlip->dl_current_state = dsp->ds_state;
+ qreply(q, mp);
+ }
+ break;
+
+ case DL_PHYS_ADDR_REQ:
+ if (MBLKL(mp) < DL_PHYS_ADDR_REQ_SIZE)
+ return (ds_badprim(q, mp, prim));
+
+ dlphysaddrack(q, mp, NULL, 0);
+ break;
+
+ case DL_UNBIND_REQ:
+ if (MBLKL(mp) < DL_UNBIND_REQ_SIZE)
+ return (ds_badprim(q, mp, prim));
+
+ if (dsp->ds_state != DL_IDLE)
+ return (ds_outstate(q, mp, prim));
+
+ dsp->ds_state = DL_UNBOUND;
+ dlokack(q, mp, DL_UNBIND_REQ);
+ break;
+
+ case DL_DETACH_REQ:
+ if (MBLKL(mp) < DL_DETACH_REQ_SIZE)
+ return (ds_badprim(q, mp, prim));
+
+ if (dsp->ds_state != DL_UNBOUND)
+ return (ds_outstate(q, mp, prim));
+
+ dsp->ds_state = DL_UNATTACHED;
+ dlokack(q, mp, DL_DETACH_REQ);
+ break;
+
+ case DL_UNITDATA_REQ:
+ DTRACE_PROBE2(dlpistub__data, dlpistub_t *, dsp,
+ mblk_t *, mp);
+ freemsg(mp);
+ break;
+
+ default:
+ dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
+ }
+ break;
+
+ case M_IOCTL:
+ miocnak(q, mp, 0, EINVAL);
+ break;
+
+ case M_FLUSH:
+ *mp->b_rptr &= ~FLUSHW;
+ if (*mp->b_rptr & FLUSHR)
+ qreply(q, mp);
+ else
+ freemsg(mp);
+ break;
+ default:
+ freemsg(mp);
+ break;
+ }
+
+ return (0);
+}
+
+static struct module_info ds_minfo = {
+ DS_IDNUM, /* mi_idnum */
+ "dlpistub", /* mi_idname */
+ 0, /* mi_minpsz */
+ INFPSZ, /* mi_maxpsz */
+ 0, /* mi_hiwat */
+ 0, /* mi_lowat */
+};
+
+static struct qinit ds_rinit = {
+ NULL, /* qi_putp */
+ NULL, /* qi_srvp */
+ ds_open, /* qi_qopen */
+ ds_close, /* qi_qclose */
+ NULL, /* qi_qadmin */
+ &ds_minfo, /* qi_minfo */
+};
+
+static struct qinit ds_winit = {
+ ds_wput, /* qi_putp */
+ NULL, /* qi_srvp */
+ NULL, /* qi_qopen */
+ NULL, /* qi_qclose */
+ NULL, /* qi_qadmin */
+ &ds_minfo, /* qi_minfo */
+};
+
+static struct streamtab ds_info = {
+ &ds_rinit, /* st_rdinit */
+ &ds_winit /* st_wrinit */
+};
+
+DDI_DEFINE_STREAM_OPS(ds_ops, nulldev, nulldev, ds_attach, ds_detach,
+ nodev, ds_devinfo, D_MP|D_MTPERMOD, &ds_info, ddi_quiesce_not_supported);
+
+static struct modldrv modldrv = {
+ &mod_driverops,
+ "DLPI stub driver",
+ &ds_ops,
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, &modldrv, NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_fini(void)
+{
+ return (mod_remove(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
diff --git a/usr/src/uts/common/inet/vni/vni.conf b/usr/src/uts/common/inet/dlpistub/dlpistub.conf
index d79915e01c..72264ca466 100644
--- a/usr/src/uts/common/inet/vni/vni.conf
+++ b/usr/src/uts/common/inet/dlpistub/dlpistub.conf
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -20,10 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-
-#ident "%Z%%M% %I% %E% SMI"
-#
-name="vni" parent="pseudo" instance=0;
+name="dlpistub" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h b/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h
new file mode 100644
index 0000000000..ece15320ee
--- /dev/null
+++ b/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h
@@ -0,0 +1,49 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _INET_DLPISTUB_IMPL_H
+#define _INET_DLPISTUB_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+
+typedef struct dlpistub {
+ int ds_type; /* DLPI MAC type */
+ t_uscalar_t ds_state; /* DLPI state */
+ minor_t ds_minor; /* corresponding minor */
+} dlpistub_t;
+
+#define DS_IDNUM 0x2a84
+
+enum { DS_MINOR_VNI = 1, DS_MINOR_IPMP, DS_MINOR_START };
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _INET_DLPISTUB_IMPL_H */
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index 323c8fd0de..41595280cb 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -56,6 +56,7 @@ extern "C" {
#include <net/route.h>
#include <sys/systm.h>
#include <sys/multidata.h>
+#include <sys/list.h>
#include <net/radix.h>
#include <sys/modhash.h>
@@ -565,15 +566,21 @@ typedef struct ipha_s {
#define IPH_ECN_ECT0 0x2 /* ECN-Capable Transport, ECT(0) */
#define IPH_ECN_CE 0x3 /* ECN-Congestion Experienced (CE) */
+struct ill_s;
+
+typedef boolean_t ip_v6intfid_func_t(struct ill_s *, in6_addr_t *);
+typedef boolean_t ip_v6mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *,
+ in6_addr_t *);
+typedef boolean_t ip_v4mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *,
+ ipaddr_t *);
+
/* IP Mac info structure */
typedef struct ip_m_s {
- t_uscalar_t ip_m_mac_type; /* From <sys/dlpi.h> */
- int ip_m_type; /* From <net/if_types.h> */
- boolean_t (*ip_m_v4mapinfo)(uint_t, uint8_t *, uint8_t *,
- uint32_t *, ipaddr_t *);
- boolean_t (*ip_m_v6mapinfo)(uint_t, uint8_t *, uint8_t *,
- uint32_t *, in6_addr_t *);
- boolean_t (*ip_m_v6intfid)(uint_t, uint8_t *, in6_addr_t *);
+ t_uscalar_t ip_m_mac_type; /* From <sys/dlpi.h> */
+ int ip_m_type; /* From <net/if_types.h> */
+ ip_v4mapinfo_func_t *ip_m_v4mapinfo;
+ ip_v6mapinfo_func_t *ip_m_v6mapinfo;
+ ip_v6intfid_func_t *ip_m_v6intfid;
} ip_m_t;
/*
@@ -583,18 +590,22 @@ typedef struct ip_m_s {
* layer multicast address range.
* b. map from IPv6 multicast address range (ff00::/8) to the link
* layer multicast address range.
- * c. derive the default IPv6 interface identifier from the link layer
- * address.
+ * c. derive the default IPv6 interface identifier from the interface.
+ * d. derive the default IPv6 destination interface identifier from
+ * the interface (point-to-point only).
*/
#define MEDIA_V4MINFO(ip_m, plen, bphys, maddr, hwxp, v4ptr) \
(((ip_m)->ip_m_v4mapinfo != NULL) && \
(*(ip_m)->ip_m_v4mapinfo)(plen, bphys, maddr, hwxp, v4ptr))
-#define MEDIA_V6INTFID(ip_m, plen, phys, v6ptr) \
- (((ip_m)->ip_m_v6intfid != NULL) && \
- (*(ip_m)->ip_m_v6intfid)(plen, phys, v6ptr))
#define MEDIA_V6MINFO(ip_m, plen, bphys, maddr, hwxp, v6ptr) \
(((ip_m)->ip_m_v6mapinfo != NULL) && \
(*(ip_m)->ip_m_v6mapinfo)(plen, bphys, maddr, hwxp, v6ptr))
+#define MEDIA_V6INTFID(ip_m, ill, v6ptr) \
+ (((ip_m)->ip_m_v6intfid != NULL) && \
+ (*(ip_m)->ip_m_v6intfid)(ill, v6ptr))
+#define MEDIA_V6DESTINTFID(ip_m, ill, v6ptr) \
+ (((ip_m)->ip_m_v6destintfid != NULL) && \
+ (*(ip_m)->ip_m_v6destintfid)(ill, v6ptr))
/* Router entry types */
#define IRE_BROADCAST 0x0001 /* Route entry for broadcast address */
@@ -621,18 +632,12 @@ typedef struct ip_m_s {
* the bucket should delete this IRE from this bucket.
*/
#define IRE_MARK_CONDEMNED 0x0001
+
/*
- * If a broadcast IRE is marked with IRE_MARK_NORECV, ip_rput will drop the
- * broadcast packets received on that interface. This is marked only
- * on broadcast ires. Employed by IPMP, where we have multiple NICs on the
- * same subnet receiving the same broadcast packet.
- */
-#define IRE_MARK_NORECV 0x0002
-/*
- * IRE_CACHE marked this way won't be returned by ire_cache_lookup. Need
- * to look specifically using MATCH_IRE_MARK_HIDDEN. Used by IPMP.
+ * An IRE with IRE_MARK_TESTHIDDEN is used by in.mpathd for test traffic. It
+ * can only be looked up by requesting MATCH_IRE_MARK_TESTHIDDEN.
*/
-#define IRE_MARK_HIDDEN 0x0004 /* Typically Used by in.mpathd */
+#define IRE_MARK_TESTHIDDEN 0x0004
/*
* An IRE with IRE_MARK_NOADD is created in ip_newroute_ipif when the outgoing
@@ -788,45 +793,18 @@ typedef struct mrec_s {
* ilm records the state of multicast memberships with the driver and is
* maintained per interface.
*
- * Notes :
- *
- * 1) There is no direct link between a given ilg and ilm. If the
- * application has joined a group G with ifindex I, we will have
- * an ilg with ilg_v6group and ilg_ill. There will be a corresponding
- * ilm with ilm_ill/ilm_v6addr recording the multicast membership.
- * To delete the membership,
- *
- * a) Search for ilg matching on G and I with ilg_v6group
- * and ilg_ill. Delete ilg_ill.
- * b) Search the corresponding ilm matching on G and I with
- * ilm_v6addr and ilm_ill. Delete ilm.
- *
- * In IPv4, the only difference is, we look using ipifs instead of
- * ills.
- *
- * 2) With IP multipathing, we want to keep receiving even after the
- * interface has failed. We do this by moving multicast memberships
- * to a new_ill within the group. This is achieved by sending
- * DL_DISABMULTI_REQS on ilg_ill/ilm_ill and sending DL_ENABMULTIREQS
- * on the new_ill and changing ilg_ill/ilm_ill to new_ill. But, we
- * need to be able to delete memberships which will still come down
- * with the ifindex of the old ill which is what the application
- * knows of. Thus we store the ilm_/ilg_orig_ifindex to keep track
- * of where we joined initially so that we can lookup even after we
- * moved the membership. It is also used for moving back the membership
- * when the old ill has been repaired. This is done by looking up for
- * ilms with ilm_orig_ifindex matching on the old ill's ifindex. Only
- * ilms actually move from old ill to new ill. ilgs don't move (just
- * the ilg_ill is changed when it moves) as it just records the state
- * of the application that has joined a group G where as ilm records
- * the state joined with the driver. Thus when we send DL_XXXMULTI_REQs
- * we also need to keep the ilm in the right ill.
- *
- * In IPv4, as ipifs move from old ill to new_ill, ilgs and ilms move
- * implicitly as we use only ipifs in IPv4. Thus, one can always lookup
- * a given ilm/ilg even after it fails without the support of
- * orig_ifindex. We move ilms still to record the driver state as
- * mentioned above.
+ * There is no direct link between a given ilg and ilm. If the
+ * application has joined a group G with ifindex I, we will have
+ * an ilg with ilg_v6group and ilg_ill. There will be a corresponding
+ * ilm with ilm_ill/ilm_v6addr recording the multicast membership.
+ * To delete the membership:
+ *
+ * a) Search for ilg matching on G and I with ilg_v6group
+ * and ilg_ill. Delete ilg.
+ * b) Search the corresponding ilm matching on G and I with
+ * ilm_v6addr and ilm_ill. Delete ilm.
+ *
+ * For IPv4 the only difference is that we look using ipifs, not ills.
*/
/*
@@ -839,7 +817,6 @@ typedef struct ilg_s {
in6_addr_t ilg_v6group;
struct ipif_s *ilg_ipif; /* Logical interface we are member on */
struct ill_s *ilg_ill; /* Used by IPv6 */
- int ilg_orig_ifindex; /* Interface originally joined on */
uint_t ilg_flags;
mcast_record_t ilg_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */
slist_t *ilg_filter;
@@ -866,9 +843,7 @@ typedef struct ilm_s {
struct ilm_s *ilm_next; /* Linked list for each ill */
uint_t ilm_state; /* state of the membership */
struct ill_s *ilm_ill; /* Back pointer to ill for IPv6 */
- int ilm_orig_ifindex; /* V6_MULTICAST_IF/ilm_ipif index */
uint_t ilm_flags;
- boolean_t ilm_is_new; /* new ilm */
boolean_t ilm_notify_driver; /* Need to notify the driver */
zoneid_t ilm_zoneid;
int ilm_no_ilg_cnt; /* number of joins w/ no ilg */
@@ -881,28 +856,11 @@ typedef struct ilm_s {
#define ilm_addr V4_PART_OF_V6(ilm_v6addr)
-/*
- * ilm_walker_cleanup needs to execute when the ilm_walker_cnt goes down to
- * zero. In addition it needs to block new walkers while it is unlinking ilm's
- * from the list. Thus simple atomics for the ill_ilm_walker_cnt don't suffice.
- */
-#define ILM_WALKER_HOLD(ill) { \
- mutex_enter(&(ill)->ill_lock); \
- ill->ill_ilm_walker_cnt++; \
- mutex_exit(&(ill)->ill_lock); \
-}
-
-/*
- * ilm_walker_cleanup releases ill_lock
- */
-#define ILM_WALKER_RELE(ill) { \
- mutex_enter(&(ill)->ill_lock); \
- (ill)->ill_ilm_walker_cnt--; \
- if ((ill)->ill_ilm_walker_cnt == 0 && (ill)->ill_ilm_cleanup_reqd) \
- ilm_walker_cleanup(ill); \
- else \
- mutex_exit(&(ill)->ill_lock); \
-}
+typedef struct ilm_walker {
+ struct ill_s *ilw_ill; /* associated ill */
+ struct ill_s *ilw_ipmp_ill; /* associated ipmp ill (if any) */
+ struct ill_s *ilw_walk_ill; /* current ill being walked */
+} ilm_walker_t;
/*
* Soft reference to an IPsec SA.
@@ -1047,11 +1005,8 @@ typedef struct conn_s conn_t;
* ipc_acking_unbind conn_acking_unbind
* ipc_pad_to_bit_31 conn_pad_to_bit_31
*
- * ipc_nofailover_ill conn_nofailover_ill
- *
* ipc_proto conn_proto
* ipc_incoming_ill conn_incoming_ill
- * ipc_outgoing_pill conn_outgoing_pill
* ipc_pending_ill conn_pending_ill
* ipc_unbind_mp conn_unbind_mp
* ipc_ilg conn_ilg
@@ -1061,8 +1016,6 @@ typedef struct conn_s conn_t;
* ipc_refcv conn_refcv
* ipc_multicast_ipif conn_multicast_ipif
* ipc_multicast_ill conn_multicast_ill
- * ipc_orig_bound_ifindex conn_orig_bound_ifindex
- * ipc_orig_multicast_ifindex conn_orig_multicast_ifindex
* ipc_drain_next conn_drain_next
* ipc_drain_prev conn_drain_prev
* ipc_idl conn_idl
@@ -1263,7 +1216,6 @@ typedef struct th_hash_s {
/* The following are ipif_state_flags */
#define IPIF_CONDEMNED 0x1 /* The ipif is being removed */
#define IPIF_CHANGING 0x2 /* A critcal ipif field is changing */
-#define IPIF_MOVING 0x8 /* The ipif is being moved */
#define IPIF_SET_LINKLOCAL 0x10 /* transient flag during bringup */
#define IPIF_ZERO_SOURCE 0x20 /* transient flag during bringup */
@@ -1273,7 +1225,6 @@ typedef struct ipif_s {
struct ill_s *ipif_ill; /* Back pointer to our ill */
int ipif_id; /* Logical unit number */
uint_t ipif_mtu; /* Starts at ipif_ill->ill_max_frag */
- uint_t ipif_saved_mtu; /* Save of mtu during ipif_move() */
in6_addr_t ipif_v6lcl_addr; /* Local IP address for this if. */
in6_addr_t ipif_v6src_addr; /* Source IP address for this if. */
in6_addr_t ipif_v6subnet; /* Subnet prefix for this if. */
@@ -1306,17 +1257,15 @@ typedef struct ipif_s {
uint_t ipif_ob_pkt_count; /* Outbound packets to our dead IREs */
/* Exclusive bit fields, protected by ipsq_t */
unsigned int
- ipif_multicast_up : 1, /* We have joined the allhosts group */
- ipif_replace_zero : 1, /* Replacement for zero */
+ ipif_multicast_up : 1, /* ipif_multicast_up() successful */
ipif_was_up : 1, /* ipif was up before */
ipif_addr_ready : 1, /* DAD is done */
-
ipif_was_dup : 1, /* DAD had failed */
+
+ ipif_joined_allhosts : 1, /* allhosts joined */
ipif_pad_to_31 : 27;
- int ipif_orig_ifindex; /* ifindex before SLIFFAILOVER */
uint_t ipif_seqid; /* unique index across all ills */
- uint_t ipif_orig_ipifid; /* ipif_id before SLIFFAILOVER */
uint_t ipif_state_flags; /* See IPIF_* flag defs above */
uint_t ipif_refcnt; /* active consistent reader cnt */
@@ -1328,6 +1277,16 @@ typedef struct ipif_s {
zoneid_t ipif_zoneid; /* zone ID number */
timeout_id_t ipif_recovery_id; /* Timer for DAD recovery */
boolean_t ipif_trace_disable; /* True when alloc fails */
+ /*
+ * For an IPMP interface, ipif_bound_ill tracks the ill whose hardware
+ * information this ipif is associated with via ARP/NDP. We can use
+ * an ill pointer (rather than an index) because only ills that are
+ * part of a group will be pointed to, and an ill cannot disappear
+ * while it's in a group.
+ */
+ struct ill_s *ipif_bound_ill;
+ struct ipif_s *ipif_bound_next; /* bound ipif chain */
+ boolean_t ipif_bound; /* B_TRUE if we successfully bound */
} ipif_t;
/*
@@ -1405,8 +1364,6 @@ typedef struct ipif_s {
*
* bit fields ill_lock ill_lock
*
- * ipif_orig_ifindex ipsq None
- * ipif_orig_ipifid ipsq None
* ipif_seqid ipsq Write once
*
* ipif_state_flags ill_lock ill_lock
@@ -1414,6 +1371,10 @@ typedef struct ipif_s {
* ipif_ire_cnt ill_lock ill_lock
* ipif_ilm_cnt ill_lock ill_lock
* ipif_saved_ire_cnt
+ *
+ * ipif_bound_ill ipsq + ipmp_lock ipsq OR ipmp_lock
+ * ipif_bound_next ipsq ipsq
+ * ipif_bound ipsq ipsq
*/
#define IP_TR_HASH(tid) ((((uintptr_t)tid) >> 6) & (IP_TR_HASH_MAX - 1))
@@ -1457,103 +1418,154 @@ typedef struct ipif_s {
#define IPI2MODE(ipi) ((ipi)->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT)
/*
- * The IP-MT design revolves around the serialization object ipsq_t.
- * It is associated with an IPMP group. If IPMP is not enabled, there is
- * 1 ipsq_t per phyint. Eg. an ipsq_t would cover both hme0's IPv4 stream
- *
- * ipsq_lock protects
- * ipsq_reentry_cnt, ipsq_writer, ipsq_xopq_mphead, ipsq_xopq_mptail,
- * ipsq_mphead, ipsq_mptail, ipsq_split
- *
- * ipsq_pending_ipif, ipsq_current_ipif, ipsq_pending_mp, ipsq_flags,
- * ipsq_waitfor
- *
- * The fields in the last line above below are set mostly by a writer thread
- * But there is an exception in the last call to ipif_ill_refrele_tail which
- * could also race with a conn close which could be cleaning up the
- * fields. So we choose to protect using ipsq_lock instead of depending on
- * the property of the writer.
- * ill_g_lock protects
- * ipsq_refs, ipsq_phyint_list
- */
-typedef struct ipsq_s {
- kmutex_t ipsq_lock;
- int ipsq_reentry_cnt;
- kthread_t *ipsq_writer; /* current owner (thread id) */
- int ipsq_flags;
- mblk_t *ipsq_xopq_mphead; /* list of excl ops mostly ioctls */
- mblk_t *ipsq_xopq_mptail;
- mblk_t *ipsq_mphead; /* msgs on ipsq linked thru b_next */
- mblk_t *ipsq_mptail; /* msgs on ipsq linked thru b_next */
- int ipsq_current_ioctl; /* current ioctl, or 0 if no ioctl */
- boolean_t ipsq_current_done; /* is the current op done? */
- ipif_t *ipsq_current_ipif; /* ipif associated with current op */
- ipif_t *ipsq_pending_ipif; /* ipif associated w. ipsq_pending_mp */
- mblk_t *ipsq_pending_mp; /* current ioctl mp while waiting for */
- /* response from another module */
- struct ipsq_s *ipsq_next; /* list of all syncq's (ipsq_g_list) */
- uint_t ipsq_refs; /* Number of phyints on this ipsq */
- struct phyint *ipsq_phyint_list; /* List of phyints on this ipsq */
- boolean_t ipsq_split; /* ipsq may need to be split */
- int ipsq_waitfor; /* Values encoded below */
- char ipsq_name[LIFNAMSIZ+1]; /* same as phyint_groupname */
- ip_stack_t *ipsq_ipst; /* Does not have a netstack_hold */
-
+ * The IP-MT design revolves around the serialization objects ipsq_t (IPSQ)
+ * and ipxop_t (exclusive operation or "xop"). Becoming "writer" on an IPSQ
+ * ensures that no other threads can become "writer" on any IPSQs sharing that
+ * IPSQ's xop until the writer thread is done.
+ *
+ * Each phyint points to one IPSQ that remains fixed over the phyint's life.
+ * Each IPSQ points to one xop that can change over the IPSQ's life. If a
+ * phyint is *not* in an IPMP group, then its IPSQ will refer to the IPSQ's
+ * "own" xop (ipsq_ownxop). If a phyint *is* part of an IPMP group, then its
+ * IPSQ will refer to the "group" xop, which is shorthand for the xop of the
+ * IPSQ of the IPMP meta-interface's phyint. Thus, all phyints that are part
+ * of the same IPMP group will have their IPSQs point to the group xop, and
+ * thus becoming "writer" on any phyint in the group will prevent any other
+ * writer on any other phyint in the group. All IPSQs sharing the same xop
+ * are chained together through ipsq_next (in the degenerate common case,
+ * ipsq_next simply refers to itself). Note that the group xop is guaranteed
+ * to exist at least as long as there are members in the group, since the IPMP
+ * meta-interface can only be destroyed if the group is empty.
+ *
+ * Incoming exclusive operation requests are enqueued on the IPSQ they arrived
+ * on rather than the xop. This makes switching xops (as would happen when a
+ * phyint leaves an IPMP group) simple, because after the phyint leaves the
+ * group, any operations enqueued on its IPSQ can be safely processed with
+ * respect to its new xop, and any operations enqueued on the IPSQs of its
+ * former group can be processed with respect to their existing group xop.
+ * Even so, switching xops is a subtle dance; see ipsq_dq() for details.
+ *
+ * An IPSQ's "own" xop is embedded within the IPSQ itself since they have
+ * identical lifetimes, and because doing so simplifies pointer management.
+ * While each phyint and IPSQ point to each other, it is not possible to free
+ * the IPSQ when the phyint is freed, since we may still be *inside* the IPSQ
+ * when the phyint is being freed. Thus, ipsq_phyint is set to NULL when the
+ * phyint is freed, and the IPSQ free is later done in ipsq_exit().
+ *
+ * ipsq_t synchronization: read write
+ *
+ * ipsq_xopq_mphead ipx_lock ipx_lock
+ * ipsq_xopq_mptail ipx_lock ipx_lock
+ * ipsq_switch_mp ipsq_lock ipsq_lock
+ * ipsq_phyint write once write once
+ * ipsq_next RW_READER ill_g_lock RW_WRITER ill_g_lock
+ * ipsq_xop ipsq_lock or ipsq ipsq_lock + ipsq
+ * ipsq_swxop ipsq ipsq
+ * ipsq_ownxop see ipxop_t see ipxop_t
+ * ipsq_ipst write once write once
+ *
+ * ipxop_t synchronization: read write
+ *
+ * ipx_writer ipx_lock ipx_lock
+ * ipx_xop_queued ipx_lock ipx_lock
+ * ipx_mphead ipx_lock ipx_lock
+ * ipx_mptail ipx_lock ipx_lock
+ * ipx_ipsq write once write once
+ * ipx_ipsq_queued ipx_lock ipx_lock
+ * ipx_waitfor ipsq or ipx_lock ipsq + ipx_lock
+ * ipx_reentry_cnt ipsq or ipx_lock ipsq + ipx_lock
+ * ipx_current_done ipsq ipsq
+ * ipx_current_ioctl ipsq ipsq
+ * ipx_current_ipif ipsq or ipx_lock ipsq + ipx_lock
+ * ipx_pending_ipif ipsq or ipx_lock ipsq + ipx_lock
+ * ipx_pending_mp ipsq or ipx_lock ipsq + ipx_lock
+ * ipx_forced ipsq ipsq
+ * ipx_depth ipsq ipsq
+ * ipx_stack ipsq ipsq
+ */
+typedef struct ipxop_s {
+ kmutex_t ipx_lock; /* see above */
+ kthread_t *ipx_writer; /* current owner */
+ mblk_t *ipx_mphead; /* messages tied to this op */
+ mblk_t *ipx_mptail;
+ struct ipsq_s *ipx_ipsq; /* associated ipsq */
+ boolean_t ipx_ipsq_queued; /* ipsq using xop has queued op */
+ int ipx_waitfor; /* waiting; values encoded below */
+ int ipx_reentry_cnt;
+ boolean_t ipx_current_done; /* is the current operation done? */
+ int ipx_current_ioctl; /* current ioctl, or 0 if no ioctl */
+ ipif_t *ipx_current_ipif; /* ipif for current op */
+ ipif_t *ipx_pending_ipif; /* ipif for ipx_pending_mp */
+ mblk_t *ipx_pending_mp; /* current ioctl mp while waiting */
+ boolean_t ipx_forced; /* debugging aid */
#ifdef DEBUG
- int ipsq_depth; /* debugging aid */
-#define IPSQ_STACK_DEPTH 15
- pc_t ipsq_stack[IPSQ_STACK_DEPTH]; /* debugging aid */
+ int ipx_depth; /* debugging aid */
+#define IPX_STACK_DEPTH 15
+ pc_t ipx_stack[IPX_STACK_DEPTH]; /* debugging aid */
#endif
-} ipsq_t;
+} ipxop_t;
-/* ipsq_flags */
-#define IPSQ_GROUP 0x1 /* This ipsq belongs to an IPMP group */
+typedef struct ipsq_s {
+ kmutex_t ipsq_lock; /* see above */
+ mblk_t *ipsq_switch_mp; /* op to handle right after switch */
+ mblk_t *ipsq_xopq_mphead; /* list of excl ops (mostly ioctls) */
+ mblk_t *ipsq_xopq_mptail;
+ struct phyint *ipsq_phyint; /* associated phyint */
+ struct ipsq_s *ipsq_next; /* next ipsq sharing ipsq_xop */
+ struct ipxop_s *ipsq_xop; /* current xop synchronization info */
+ struct ipxop_s *ipsq_swxop; /* switch xop to on ipsq_exit() */
+ struct ipxop_s ipsq_ownxop; /* our own xop (may not be in-use) */
+ ip_stack_t *ipsq_ipst; /* does not have a netstack_hold */
+} ipsq_t;
/*
- * ipsq_waitfor:
- *
- * IPIF_DOWN 1 ipif_down waiting for refcnts to drop
- * ILL_DOWN 2 ill_down waiting for refcnts to drop
- * IPIF_FREE 3 ipif_free waiting for refcnts to drop
- * ILL_FREE 4 ill unplumb waiting for refcnts to drop
- * ILL_MOVE_OK 5 failover waiting for refcnts to drop
+ * ipx_waitfor values:
*/
+enum {
+ IPIF_DOWN = 1, /* ipif_down() waiting for refcnts to drop */
+ ILL_DOWN, /* ill_down() waiting for refcnts to drop */
+ IPIF_FREE, /* ipif_free() waiting for refcnts to drop */
+ ILL_FREE /* ill unplumb waiting for refcnts to drop */
+};
-enum { IPIF_DOWN = 1, ILL_DOWN, IPIF_FREE, ILL_FREE, ILL_MOVE_OK };
+/* Operation types for ipsq_try_enter() */
+#define CUR_OP 0 /* request writer within current operation */
+#define NEW_OP 1 /* request writer for a new operation */
+#define SWITCH_OP 2 /* request writer once IPSQ XOP switches */
-/* Flags passed to ipsq_try_enter */
-#define CUR_OP 0 /* Current ioctl continuing again */
-#define NEW_OP 1 /* New ioctl starting afresh */
+/*
+ * Kstats tracked on each IPMP meta-interface. Order here must match
+ * ipmp_kstats[] in ip/ipmp.c.
+ */
+enum {
+ IPMP_KSTAT_OBYTES, IPMP_KSTAT_OBYTES64, IPMP_KSTAT_RBYTES,
+ IPMP_KSTAT_RBYTES64, IPMP_KSTAT_OPACKETS, IPMP_KSTAT_OPACKETS64,
+ IPMP_KSTAT_OERRORS, IPMP_KSTAT_IPACKETS, IPMP_KSTAT_IPACKETS64,
+ IPMP_KSTAT_IERRORS, IPMP_KSTAT_MULTIRCV, IPMP_KSTAT_MULTIXMT,
+ IPMP_KSTAT_BRDCSTRCV, IPMP_KSTAT_BRDCSTXMT, IPMP_KSTAT_LINK_UP,
+ IPMP_KSTAT_MAX /* keep last */
+};
/*
* phyint represents state that is common to both IPv4 and IPv6 interfaces.
* There is a separate ill_t representing IPv4 and IPv6 which has a
* backpointer to the phyint structure for accessing common state.
- *
- * NOTE : It just stores the group name as there is only one name for
- * IPv4 and IPv6 i.e it is a underlying link property. Actually
- * IPv4 and IPv6 ill are grouped together when their phyints have
- * the same name.
*/
typedef struct phyint {
struct ill_s *phyint_illv4;
struct ill_s *phyint_illv6;
- uint_t phyint_ifindex; /* SIOCLSLIFINDEX */
- char *phyint_groupname; /* SIOCSLIFGROUPNAME */
- uint_t phyint_groupname_len;
+ uint_t phyint_ifindex; /* SIOCSLIFINDEX */
uint64_t phyint_flags;
avl_node_t phyint_avl_by_index; /* avl tree by index */
avl_node_t phyint_avl_by_name; /* avl tree by name */
kmutex_t phyint_lock;
struct ipsq_s *phyint_ipsq; /* back pointer to ipsq */
- struct phyint *phyint_ipsq_next; /* phyint list on this ipsq */
- /* Once Clearview IPMP is added the follow two fields can be removed */
- uint_t phyint_group_ifindex; /* index assigned to group */
- uint_t phyint_hook_ifindex; /* index used with neti/hook */
+ struct ipmp_grp_s *phyint_grp; /* associated IPMP group */
+ char phyint_name[LIFNAMSIZ]; /* physical interface name */
+ uint64_t phyint_kstats0[IPMP_KSTAT_MAX]; /* baseline kstats */
} phyint_t;
#define CACHE_ALIGN_SIZE 64
-
#define CACHE_ALIGN(align_struct) P2ROUNDUP(sizeof (struct align_struct),\
CACHE_ALIGN_SIZE)
struct _phyint_list_s_ {
@@ -1568,34 +1580,6 @@ typedef union phyint_list_u {
#define phyint_list_avl_by_index phyint_list_s.phyint_list_avl_by_index
#define phyint_list_avl_by_name phyint_list_s.phyint_list_avl_by_name
-/*
- * ILL groups. We group ills,
- *
- * - if the ills have the same group name. (New way)
- *
- * ill_group locking notes:
- *
- * illgrp_lock protects ill_grp_ill_schednext.
- *
- * ill_g_lock protects ill_grp_next, illgrp_ill, illgrp_ill_count.
- * Holding ill_g_lock freezes the memberships of ills in IPMP groups.
- * It also freezes the global list of ills and all ipifs in all ills.
- *
- * To remove an ipif from the linked list of ipifs of that ill ipif_free_tail
- * holds both ill_g_lock, and ill_lock. Similarly to remove an ill from the
- * global list of ills, ill_glist_delete() holds ill_g_lock as writer.
- * This simplifies things for ipif_select_source, illgrp_scheduler etc.
- * that need to walk the members of an illgrp. They just hold ill_g_lock
- * as reader to do the walk.
- *
- */
-typedef struct ill_group {
- kmutex_t illgrp_lock;
- struct ill_group *illgrp_next; /* Next ill_group */
- struct ill_s *illgrp_ill_schednext; /* Next ill to be scheduled */
- struct ill_s *illgrp_ill; /* First ill in the group */
- int illgrp_ill_count;
-} ill_group_t;
/*
* Fragmentation hash bucket
@@ -1792,6 +1776,108 @@ typedef struct ill_lso_capab_s ill_lso_capab_t;
#define IS_LOOPBACK(ill) \
((ill)->ill_phyint->phyint_flags & PHYI_LOOPBACK)
+/* Is this an IPMP meta-interface ILL? */
+#define IS_IPMP(ill) \
+ ((ill)->ill_phyint->phyint_flags & PHYI_IPMP)
+
+/* Is this ILL under an IPMP meta-interface? (aka "in a group?") */
+#define IS_UNDER_IPMP(ill) \
+ ((ill)->ill_grp != NULL && !IS_IPMP(ill))
+
+/* Is ill1 in the same illgrp as ill2? */
+#define IS_IN_SAME_ILLGRP(ill1, ill2) \
+ ((ill1)->ill_grp != NULL && ((ill1)->ill_grp == (ill2)->ill_grp))
+
+/* Is ill1 on the same LAN as ill2? */
+#define IS_ON_SAME_LAN(ill1, ill2) \
+ ((ill1) == (ill2) || IS_IN_SAME_ILLGRP(ill1, ill2))
+
+#define ILL_OTHER(ill) \
+ ((ill)->ill_isv6 ? (ill)->ill_phyint->phyint_illv4 : \
+ (ill)->ill_phyint->phyint_illv6)
+
+/*
+ * IPMP group ILL state structure -- up to two per IPMP group (V4 and V6).
+ * Created when the V4 and/or V6 IPMP meta-interface is I_PLINK'd. It is
+ * guaranteed to persist while there are interfaces of that type in the group.
+ * In general, most fields are accessed outside of the IPSQ (e.g., in the
+ * datapath), and thus use locks in addition to the IPSQ for protection.
+ *
+ * synchronization: read write
+ *
+ * ig_if ipsq or ill_g_lock ipsq and ill_g_lock
+ * ig_actif ipsq or ipmp_lock ipsq and ipmp_lock
+ * ig_nactif ipsq or ipmp_lock ipsq and ipmp_lock
+ * ig_next_ill ipsq or ipmp_lock ipsq and ipmp_lock
+ * ig_ipmp_ill write once write once
+ * ig_cast_ill ipsq or ipmp_lock ipsq and ipmp_lock
+ * ig_arpent ipsq ipsq
+ * ig_mtu ipsq ipsq
+ */
+typedef struct ipmp_illgrp_s {
+ list_t ig_if; /* list of all interfaces */
+ list_t ig_actif; /* list of active interfaces */
+ uint_t ig_nactif; /* number of active interfaces */
+ struct ill_s *ig_next_ill; /* next active interface to use */
+ struct ill_s *ig_ipmp_ill; /* backpointer to IPMP meta-interface */
+ struct ill_s *ig_cast_ill; /* nominated ill for multi/broadcast */
+ list_t ig_arpent; /* list of ARP entries */
+ uint_t ig_mtu; /* ig_ipmp_ill->ill_max_mtu */
+} ipmp_illgrp_t;
+
+/*
+ * IPMP group state structure -- one per IPMP group. Created when the
+ * IPMP meta-interface is plumbed; it is guaranteed to persist while there
+ * are interfaces in it.
+ *
+ * ipmp_grp_t synchronization: read write
+ *
+ * gr_name ipmp_lock ipmp_lock
+ * gr_ifname write once write once
+ * gr_mactype ipmp_lock ipmp_lock
+ * gr_phyint write once write once
+ * gr_nif ipmp_lock ipmp_lock
+ * gr_nactif ipsq ipsq
+ * gr_v4 ipmp_lock ipmp_lock
+ * gr_v6 ipmp_lock ipmp_lock
+ * gr_nv4 ipmp_lock ipmp_lock
+ * gr_nv6 ipmp_lock ipmp_lock
+ * gr_pendv4 ipmp_lock ipmp_lock
+ * gr_pendv6 ipmp_lock ipmp_lock
+ * gr_linkdownmp ipsq ipsq
+ * gr_ksp ipmp_lock ipmp_lock
+ * gr_kstats0 atomic atomic
+ */
+typedef struct ipmp_grp_s {
+ char gr_name[LIFGRNAMSIZ]; /* group name */
+ char gr_ifname[LIFNAMSIZ]; /* interface name */
+ t_uscalar_t gr_mactype; /* DLPI mactype of group */
+ phyint_t *gr_phyint; /* IPMP group phyint */
+ uint_t gr_nif; /* number of interfaces in group */
+ uint_t gr_nactif; /* number of active interfaces */
+ ipmp_illgrp_t *gr_v4; /* V4 group information */
+ ipmp_illgrp_t *gr_v6; /* V6 group information */
+ uint_t gr_nv4; /* number of ills in V4 group */
+ uint_t gr_nv6; /* number of ills in V6 group */
+ uint_t gr_pendv4; /* number of pending ills in V4 group */
+ uint_t gr_pendv6; /* number of pending ills in V6 group */
+ mblk_t *gr_linkdownmp; /* message used to bring link down */
+ kstat_t *gr_ksp; /* group kstat pointer */
+ uint64_t gr_kstats0[IPMP_KSTAT_MAX]; /* baseline group kstats */
+} ipmp_grp_t;
+
+/*
+ * IPMP ARP entry -- one per SIOCS*ARP entry tied to the group. Used to keep
+ * ARP up-to-date as the active set of interfaces in the group changes.
+ */
+typedef struct ipmp_arpent_s {
+ mblk_t *ia_area_mp; /* AR_ENTRY_ADD pointer */
+ ipaddr_t ia_ipaddr; /* IP address for this entry */
+ boolean_t ia_proxyarp; /* proxy ARP entry? */
+ boolean_t ia_notified; /* ARP notified about this entry? */
+ list_node_t ia_node; /* next ARP entry in list */
+} ipmp_arpent_t;
+
/*
* IP Lower level Structure.
* Instance data structure in ip_open when there is a device below us.
@@ -1851,6 +1937,7 @@ typedef struct ill_s {
mblk_t *ill_unbind_mp; /* unbind mp from ill_dl_up() */
mblk_t *ill_promiscoff_mp; /* for ill_leave_allmulti() */
mblk_t *ill_dlpi_deferred; /* b_next chain of control messages */
+ mblk_t *ill_ardeact_mp; /* deact mp from ipmp_ill_activate() */
mblk_t *ill_phys_addr_mp; /* mblk which holds ill_phys_addr */
#define ill_last_mp_to_free ill_phys_addr_mp
@@ -1867,21 +1954,19 @@ typedef struct ill_s {
ill_dlpi_style_set : 1,
ill_ifname_pending : 1,
- ill_move_in_progress : 1, /* FAILOVER/FAILBACK in progress */
ill_join_allmulti : 1,
ill_logical_down : 1,
-
ill_is_6to4tun : 1, /* Interface is a 6to4 tunnel */
+
ill_promisc_on_phys : 1, /* phys interface in promisc mode */
ill_dl_up : 1,
ill_up_ipifs : 1,
-
ill_note_link : 1, /* supports link-up notification */
+
ill_capab_reneg : 1, /* capability renegotiation to be done */
ill_dld_capab_inprog : 1, /* direct dld capab call in prog */
ill_need_recover_multicast : 1,
-
- ill_pad_to_bit_31 : 16;
+ ill_pad_to_bit_31 : 17;
/* Following bit fields protected by ill_lock */
uint_t
@@ -1891,10 +1976,8 @@ typedef struct ill_s {
ill_arp_closing : 1,
ill_arp_bringup_pending : 1,
- ill_mtu_userspecified : 1, /* SIOCSLIFLNKINFO has set the mtu */
ill_arp_extend : 1, /* ARP has DAD extensions */
-
- ill_pad_bit_31 : 25;
+ ill_pad_bit_31 : 26;
/*
* Used in SIOCSIFMUXID and SIOCGIFMUXID for 'ifconfig unplumb'.
@@ -1931,6 +2014,7 @@ typedef struct ill_s {
*/
uint8_t ill_max_hops; /* Maximum hops for any logical interface */
uint_t ill_max_mtu; /* Maximum MTU for any logical interface */
+ uint_t ill_user_mtu; /* User-specified MTU via SIOCSLIFLNKINFO */
uint32_t ill_reachable_time; /* Value for ND algorithm in msec */
uint32_t ill_reachable_retrans_time; /* Value for ND algorithm msec */
uint_t ill_max_buf; /* Max # of req to buffer for ND */
@@ -1953,13 +2037,9 @@ typedef struct ill_s {
* of the ipif.
*/
mblk_t *ill_arp_on_mp;
- /* Peer ill of an IPMP move operation */
- struct ill_s *ill_move_peer;
phyint_t *ill_phyint;
uint64_t ill_flags;
- ill_group_t *ill_group;
- struct ill_s *ill_group_next;
kmutex_t ill_lock; /* Please see table below */
/*
@@ -2005,6 +2085,18 @@ typedef struct ill_s {
void *ill_flownotify_mh; /* Tx flow ctl, mac cb handle */
uint_t ill_ilm_cnt; /* ilms referencing this ill */
uint_t ill_ipallmulti_cnt; /* ip_join_allmulti() calls */
+ /*
+ * IPMP fields.
+ */
+ ipmp_illgrp_t *ill_grp; /* IPMP group information */
+ list_node_t ill_actnode; /* next active ill in group */
+ list_node_t ill_grpnode; /* next ill in group */
+ ipif_t *ill_src_ipif; /* source address selection rotor */
+ ipif_t *ill_move_ipif; /* ipif awaiting move to new ill */
+ boolean_t ill_nom_cast; /* nominated for mcast/bcast */
+ uint_t ill_bound_cnt; /* # of data addresses bound to ill */
+ ipif_t *ill_bound_ipif; /* ipif chain bound to ill */
+ timeout_id_t ill_refresh_tid; /* ill refresh retry timeout id */
} ill_t;
/*
@@ -2088,6 +2180,7 @@ typedef struct ill_s {
*
* ill_max_mtu
*
+ * ill_user_mtu ipsq + ill_lock ill_lock
* ill_reachable_time ipsq + ill_lock ill_lock
* ill_reachable_retrans_time ipsq + ill_lock ill_lock
* ill_max_buf ipsq + ill_lock ill_lock
@@ -2102,12 +2195,9 @@ typedef struct ill_s {
* ill_arp_down_mp ipsq ipsq
* ill_arp_del_mapping_mp ipsq ipsq
* ill_arp_on_mp ipsq ipsq
- * ill_move_peer ipsq ipsq
*
* ill_phyint ipsq, ill_g_lock, ill_lock Any of them
* ill_flags ill_lock ill_lock
- * ill_group ipsq, ill_g_lock, ill_lock Any of them
- * ill_group_next ipsq, ill_g_lock, ill_lock Any of them
* ill_nd_lla_mp ipsq + down ill only when ill is up
* ill_nd_lla ipsq + down ill only when ill is up
* ill_nd_lla_len ipsq + down ill only when ill is up
@@ -2122,11 +2212,26 @@ typedef struct ill_s {
* ill_ilm_walker_cnt ill_lock ill_lock
* ill_nce_cnt ill_lock ill_lock
* ill_ilm_cnt ill_lock ill_lock
+ * ill_src_ipif ill_g_lock ill_g_lock
* ill_trace ill_lock ill_lock
* ill_usesrc_grp_next ill_g_usesrc_lock ill_g_usesrc_lock
* ill_dhcpinit atomics atomics
* ill_flownotify_mh write once write once
* ill_capab_pending_cnt ipsq ipsq
+ *
+ * ill_bound_cnt ipsq ipsq
+ * ill_bound_ipif ipsq ipsq
+ * ill_actnode ipsq + ipmp_lock ipsq OR ipmp_lock
+ * ill_grpnode ipsq + ill_g_lock ipsq OR ill_g_lock
+ * ill_src_ipif ill_g_lock ill_g_lock
+ * ill_move_ipif ipsq ipsq
+ * ill_nom_cast ipsq ipsq OR advisory
+ * ill_refresh_tid ill_lock ill_lock
+ * ill_grp (for IPMP ill) write once write once
+ * ill_grp (for underlying ill) ipsq + ill_g_lock ipsq OR ill_g_lock
+ *
+ * NOTE: It's OK to make heuristic decisions on an underlying interface
+ * by using IS_UNDER_IPMP() or comparing ill_grp's raw pointer value.
*/
/*
@@ -2167,7 +2272,7 @@ enum { IF_CMD = 1, LIF_CMD, TUN_CMD, ARP_CMD, XARP_CMD, MSFILT_CMD, MISC_CMD };
#define IPI_MODOK 0x2 /* Permitted on mod instance of IP */
#define IPI_WR 0x4 /* Need to grab writer access */
#define IPI_GET_CMD 0x8 /* branch to mi_copyout on success */
-#define IPI_REPL 0x10 /* valid for replacement ipif created in MOVE */
+/* unused 0x10 */
#define IPI_NULL_BCONT 0x20 /* ioctl has not data and hence no b_cont */
#define IPI_PASS_DOWN 0x40 /* pass this ioctl down when a module only */
@@ -2176,17 +2281,6 @@ extern ip_ioctl_cmd_t ip_misc_ioctl_table[];
extern int ip_ndx_ioctl_count;
extern int ip_misc_ioctl_count;
-#define ILL_CLEAR_MOVE(ill) { \
- ill_t *peer_ill; \
- \
- peer_ill = (ill)->ill_move_peer; \
- ASSERT(peer_ill != NULL); \
- (ill)->ill_move_in_progress = B_FALSE; \
- peer_ill->ill_move_in_progress = B_FALSE; \
- (ill)->ill_move_peer = NULL; \
- peer_ill->ill_move_peer = NULL; \
-}
-
/* Passed down by ARP to IP during I_PLINK/I_PUNLINK */
typedef struct ipmx_s {
char ipmx_name[LIFNAMSIZ]; /* if name */
@@ -2799,19 +2893,11 @@ typedef struct ip_pktinfo {
(!((ipif)->ipif_state_flags & (IPIF_CONDEMNED)) || \
IAM_WRITER_IPIF(ipif))
-/*
- * These macros are used by critical set ioctls and failover ioctls to
- * mark the ipif appropriately before starting the operation and to clear the
- * marks after completing the operation.
- */
-#define IPIF_UNMARK_MOVING(ipif) \
- (ipif)->ipif_state_flags &= ~IPIF_MOVING & ~IPIF_CHANGING;
-
#define ILL_UNMARK_CHANGING(ill) \
(ill)->ill_state_flags &= ~ILL_CHANGING;
/* Macros used to assert that this thread is a writer */
-#define IAM_WRITER_IPSQ(ipsq) ((ipsq)->ipsq_writer == curthread)
+#define IAM_WRITER_IPSQ(ipsq) ((ipsq)->ipsq_xop->ipx_writer == curthread)
#define IAM_WRITER_ILL(ill) IAM_WRITER_IPSQ((ill)->ill_phyint->phyint_ipsq)
#define IAM_WRITER_IPIF(ipif) IAM_WRITER_ILL((ipif)->ipif_ill)
@@ -2837,9 +2923,9 @@ typedef struct ip_pktinfo {
#define RELEASE_ILL_LOCKS(ill_1, ill_2) \
{ \
if (ill_1 != NULL) \
- mutex_exit(&(ill_1)->ill_lock); \
+ mutex_exit(&(ill_1)->ill_lock); \
if (ill_2 != NULL && ill_2 != ill_1) \
- mutex_exit(&(ill_2)->ill_lock); \
+ mutex_exit(&(ill_2)->ill_lock); \
}
/* Get the other protocol instance ill */
@@ -2847,14 +2933,9 @@ typedef struct ip_pktinfo {
((ill)->ill_isv6 ? (ill)->ill_phyint->phyint_illv4 : \
(ill)->ill_phyint->phyint_illv6)
-#define MATCH_V4_ONLY 0x1
-#define MATCH_V6_ONLY 0x2
-#define MATCH_ILL_ONLY 0x4
-
/* ioctl command info: Ioctl properties extracted and stored in here */
typedef struct cmd_info_s
{
- char ci_groupname[LIFNAMSIZ + 1]; /* SIOCSLIFGROUPNAME */
ipif_t *ci_ipif; /* ipif associated with [l]ifreq ioctl's */
sin_t *ci_sin; /* the sin struct passed down */
sin6_t *ci_sin6; /* the sin6_t struct passed down */
@@ -2990,10 +3071,8 @@ extern struct module_info ip_mod_info;
((ipst)->ips_ip6_loopback_out_event.he_interested)
/*
- * Hooks marcos used inside of ip
+ * Hooks macros used inside of ip
*/
-#define IPHA_VHL ipha_version_and_hdr_length
-
#define FW_HOOKS(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst) \
\
if ((_hook).he_interested) { \
@@ -3002,21 +3081,8 @@ extern struct module_info ip_mod_info;
_NOTE(CONSTCOND) \
ASSERT((_ilp != NULL) || (_olp != NULL)); \
\
- _NOTE(CONSTCOND) \
- if ((_ilp != NULL) && \
- (((ill_t *)(_ilp))->ill_phyint != NULL)) \
- info.hpe_ifp = (phy_if_t)((ill_t *) \
- (_ilp))->ill_phyint->phyint_hook_ifindex; \
- else \
- info.hpe_ifp = 0; \
- \
- _NOTE(CONSTCOND) \
- if ((_olp != NULL) && \
- (((ill_t *)(_olp))->ill_phyint != NULL)) \
- info.hpe_ofp = (phy_if_t)((ill_t *) \
- (_olp))->ill_phyint->phyint_hook_ifindex; \
- else \
- info.hpe_ofp = 0; \
+ FW_SET_ILL_INDEX(info.hpe_ifp, (ill_t *)_ilp); \
+ FW_SET_ILL_INDEX(info.hpe_ofp, (ill_t *)_olp); \
info.hpe_protocol = ipst->ips_ipv4_net_data; \
info.hpe_hdr = _iph; \
info.hpe_mp = &(_fm); \
@@ -3026,10 +3092,8 @@ extern struct module_info ip_mod_info;
_event, (hook_data_t)&info) != 0) { \
ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\
(_hook).he_name, (void *)_fm, (void *)_m)); \
- if (_fm != NULL) { \
- freemsg(_fm); \
- _fm = NULL; \
- } \
+ freemsg(_fm); \
+ _fm = NULL; \
_iph = NULL; \
_m = NULL; \
} else { \
@@ -3046,21 +3110,8 @@ extern struct module_info ip_mod_info;
_NOTE(CONSTCOND) \
ASSERT((_ilp != NULL) || (_olp != NULL)); \
\
- _NOTE(CONSTCOND) \
- if ((_ilp != NULL) && \
- (((ill_t *)(_ilp))->ill_phyint != NULL)) \
- info.hpe_ifp = (phy_if_t)((ill_t *) \
- (_ilp))->ill_phyint->phyint_hook_ifindex; \
- else \
- info.hpe_ifp = 0; \
- \
- _NOTE(CONSTCOND) \
- if ((_olp != NULL) && \
- (((ill_t *)(_olp))->ill_phyint != NULL)) \
- info.hpe_ofp = (phy_if_t)((ill_t *) \
- (_olp))->ill_phyint->phyint_hook_ifindex; \
- else \
- info.hpe_ofp = 0; \
+ FW_SET_ILL_INDEX(info.hpe_ifp, (ill_t *)_ilp); \
+ FW_SET_ILL_INDEX(info.hpe_ofp, (ill_t *)_olp); \
info.hpe_protocol = ipst->ips_ipv6_net_data; \
info.hpe_hdr = _iph; \
info.hpe_mp = &(_fm); \
@@ -3070,10 +3121,8 @@ extern struct module_info ip_mod_info;
_event, (hook_data_t)&info) != 0) { \
ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\
(_hook).he_name, (void *)_fm, (void *)_m)); \
- if (_fm != NULL) { \
- freemsg(_fm); \
- _fm = NULL; \
- } \
+ freemsg(_fm); \
+ _fm = NULL; \
_iph = NULL; \
_m = NULL; \
} else { \
@@ -3082,6 +3131,17 @@ extern struct module_info ip_mod_info;
} \
}
+#define FW_SET_ILL_INDEX(fp, ill) \
+ _NOTE(CONSTCOND) \
+ if ((ill) == NULL || (ill)->ill_phyint == NULL) { \
+ (fp) = 0; \
+ _NOTE(CONSTCOND) \
+ } else if (IS_UNDER_IPMP(ill)) { \
+ (fp) = ipmp_ill_get_ipmp_ifindex(ill); \
+ } else { \
+ (fp) = (ill)->ill_phyint->phyint_ifindex; \
+ }
+
/*
* Network byte order macros
*/
@@ -3146,16 +3206,15 @@ struct ipsec_out_s;
struct mac_header_info_s;
-extern boolean_t ip_assign_ifindex(uint_t *, ip_stack_t *);
extern void ill_frag_timer(void *);
extern ill_t *ill_first(int, int, ill_walk_context_t *, ip_stack_t *);
extern ill_t *ill_next(ill_walk_context_t *, ill_t *);
extern void ill_frag_timer_start(ill_t *);
extern void ill_nic_event_dispatch(ill_t *, lif_if_t, nic_event_t,
nic_event_data_t, size_t);
-extern void ill_nic_event_plumb(ill_t *, boolean_t);
extern mblk_t *ip_carve_mp(mblk_t **, ssize_t);
extern mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t);
+extern mblk_t *ip_dlnotify_alloc(uint_t, uint_t);
extern char *ip_dot_addr(ipaddr_t, char *);
extern const char *mac_colon_addr(const uint8_t *, size_t, char *, size_t);
extern void ip_lwput(queue_t *, mblk_t *);
@@ -3239,8 +3298,49 @@ extern int ip_hdr_complete(ipha_t *, zoneid_t, ip_stack_t *);
extern struct qinit iprinitv6;
extern struct qinit ipwinitv6;
-extern void conn_drain_insert(conn_t *connp);
-extern int conn_ipsec_length(conn_t *connp);
+extern void ipmp_init(ip_stack_t *);
+extern void ipmp_destroy(ip_stack_t *);
+extern ipmp_grp_t *ipmp_grp_create(const char *, phyint_t *);
+extern void ipmp_grp_destroy(ipmp_grp_t *);
+extern void ipmp_grp_info(const ipmp_grp_t *, lifgroupinfo_t *);
+extern int ipmp_grp_rename(ipmp_grp_t *, const char *);
+extern ipmp_grp_t *ipmp_grp_lookup(const char *, ip_stack_t *);
+extern int ipmp_grp_vet_phyint(ipmp_grp_t *, phyint_t *);
+extern ipmp_illgrp_t *ipmp_illgrp_create(ill_t *);
+extern void ipmp_illgrp_destroy(ipmp_illgrp_t *);
+extern ill_t *ipmp_illgrp_add_ipif(ipmp_illgrp_t *, ipif_t *);
+extern void ipmp_illgrp_del_ipif(ipmp_illgrp_t *, ipif_t *);
+extern ill_t *ipmp_illgrp_next_ill(ipmp_illgrp_t *);
+extern ill_t *ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *);
+extern ill_t *ipmp_illgrp_cast_ill(ipmp_illgrp_t *);
+extern ill_t *ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *);
+extern ill_t *ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *);
+extern void ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *);
+extern ipmp_arpent_t *ipmp_illgrp_create_arpent(ipmp_illgrp_t *, mblk_t *,
+ boolean_t);
+extern void ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *, ipmp_arpent_t *);
+extern ipmp_arpent_t *ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *, ipaddr_t *);
+extern void ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *);
+extern void ipmp_illgrp_mark_arpent(ipmp_illgrp_t *, ipmp_arpent_t *);
+extern ill_t *ipmp_illgrp_find_ill(ipmp_illgrp_t *, uchar_t *, uint_t);
+extern void ipmp_illgrp_link_grp(ipmp_illgrp_t *, ipmp_grp_t *);
+extern int ipmp_illgrp_unlink_grp(ipmp_illgrp_t *);
+extern uint_t ipmp_ill_get_ipmp_ifindex(const ill_t *);
+extern void ipmp_ill_join_illgrp(ill_t *, ipmp_illgrp_t *);
+extern void ipmp_ill_leave_illgrp(ill_t *);
+extern ill_t *ipmp_ill_hold_ipmp_ill(ill_t *);
+extern boolean_t ipmp_ill_is_active(ill_t *);
+extern void ipmp_ill_refresh_active(ill_t *);
+extern void ipmp_phyint_join_grp(phyint_t *, ipmp_grp_t *);
+extern void ipmp_phyint_leave_grp(phyint_t *);
+extern void ipmp_phyint_refresh_active(phyint_t *);
+extern ill_t *ipmp_ipif_bound_ill(const ipif_t *);
+extern ill_t *ipmp_ipif_hold_bound_ill(const ipif_t *);
+extern boolean_t ipmp_ipif_is_dataaddr(const ipif_t *);
+extern boolean_t ipmp_ipif_is_stubaddr(const ipif_t *);
+
+extern void conn_drain_insert(conn_t *connp);
+extern int conn_ipsec_length(conn_t *connp);
extern void ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *,
ire_t *);
extern ipaddr_t ip_get_dst(ipha_t *);
@@ -3274,9 +3374,6 @@ extern int ip_srcid_report(queue_t *, mblk_t *, caddr_t, cred_t *);
extern uint8_t ipoptp_next(ipoptp_t *);
extern uint8_t ipoptp_first(ipoptp_t *, ipha_t *);
extern int ip_opt_get_user(const ipha_t *, uchar_t *);
-extern ill_t *ip_grab_attach_ill(ill_t *, mblk_t *, int, boolean_t,
- ip_stack_t *);
-extern ire_t *conn_set_outgoing_ill(conn_t *, ire_t *, ill_t **);
extern int ipsec_req_from_conn(conn_t *, ipsec_req_t *, int);
extern int ip_snmp_get(queue_t *q, mblk_t *mctl, int level);
extern int ip_snmp_set(queue_t *q, int, int, uchar_t *, int);
@@ -3295,7 +3392,6 @@ extern void ip_savebuf(void **, uint_t *, boolean_t, const void *, uint_t);
extern boolean_t ipsq_pending_mp_cleanup(ill_t *, conn_t *);
extern void conn_ioctl_cleanup(conn_t *);
extern ill_t *conn_get_held_ill(conn_t *, ill_t **, int *);
-extern ill_t *ip_newroute_get_dst_ill(ill_t *);
struct multidata_s;
struct pdesc_s;
@@ -3314,9 +3410,6 @@ extern boolean_t ip_md_zcopy_attr(struct multidata_s *, struct pdesc_s *,
uint_t);
extern void ip_unbind(conn_t *connp);
-extern phyint_t *phyint_lookup_group(char *, boolean_t, ip_stack_t *);
-extern phyint_t *phyint_lookup_group_ifindex(uint_t, ip_stack_t *);
-
extern void tnet_init(void);
extern void tnet_fini(void);
@@ -3434,6 +3527,8 @@ typedef struct ipobs_cb {
* ihd_ifindex Interface index that the packet was received/sent over.
* For local packets, this is the index of the interface
* associated with the local destination address.
+ * ihd_grifindex IPMP group interface index (zero unless ihd_ifindex
+ * is an IPMP underlying interface).
* ihd_stack Netstack the packet is from.
*/
typedef struct ipobs_hook_data {
@@ -3443,6 +3538,7 @@ typedef struct ipobs_hook_data {
ipobs_hook_type_t ihd_htype;
uint16_t ihd_ipver;
uint64_t ihd_ifindex;
+ uint64_t ihd_grifindex;
netstack_t *ihd_stack;
} ipobs_hook_data_t;
diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c
index 3f967ea183..d484831a3c 100644
--- a/usr/src/uts/common/inet/ip/icmp.c
+++ b/usr/src/uts/common/inet/ip/icmp.c
@@ -1892,7 +1892,6 @@ icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
* case MRT_VERSION:
* case MRT_ASSERT:
* case IP_SEC_OPT:
- * case IP_DONTFAILOVER_IF:
* case IP_NEXTHOP:
*/
default:
@@ -2481,7 +2480,6 @@ icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
case MRT_VERSION:
case MRT_ASSERT:
case IP_SEC_OPT:
- case IP_DONTFAILOVER_IF:
case IP_NEXTHOP:
/*
* "soft" error (negative)
@@ -3014,9 +3012,7 @@ icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
case IPV6_PATHMTU:
return (EINVAL);
- case IPV6_BOUND_PIF:
case IPV6_SEC_OPT:
- case IPV6_DONTFAILOVER_IF:
case IPV6_SRC_PREFERENCES:
case IPV6_V6ONLY:
/* Handled at IP level */
diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c
index 4f15801dfb..24ba9d689c 100644
--- a/usr/src/uts/common/inet/ip/icmp_opt_data.c
+++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -138,9 +138,6 @@ opdes_t icmp_opt_arr[] = {
{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
sizeof (int), 0 /* no ifindex */ },
-{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
- sizeof (struct in_addr), 0 /* not initialized */ },
-
{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
sizeof (int), 0 },
@@ -222,12 +219,6 @@ opdes_t icmp_opt_arr[] = {
{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
sizeof (int), 0 /* no ifindex */ },
-{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
- sizeof (int), 0 /* no ifindex */ },
-
-{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
- sizeof (int), 0 /* no ifindex */ },
-
{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
sizeof (int), 0 },
diff --git a/usr/src/uts/common/inet/ip/igmp.c b/usr/src/uts/common/inet/ip/igmp.c
index 091509c71e..681f198aa7 100644
--- a/usr/src/uts/common/inet/ip/igmp.c
+++ b/usr/src/uts/common/inet/ip/igmp.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -46,7 +46,7 @@
#include <sys/cmn_err.h>
#include <sys/atomic.h>
#include <sys/zone.h>
-
+#include <sys/callb.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <inet/ipclassifier.h>
@@ -83,7 +83,7 @@ static mrec_t *mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
static void mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
mcast_record_t rtype, slist_t *flist);
static mrec_t *mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
-
+static void mcast_signal_restart_thread(ip_stack_t *ipst);
/*
* Macros used to do timer len conversions. Timer values are always
@@ -122,7 +122,7 @@ static mrec_t *mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
* The first multicast join will trigger the igmp timers / mld timers
* The unit for next is milliseconds.
*/
-void
+static void
igmp_start_timers(unsigned next, ip_stack_t *ipst)
{
int time_left;
@@ -207,7 +207,7 @@ igmp_start_timers(unsigned next, ip_stack_t *ipst)
* mld_start_timers:
* The unit for next is milliseconds.
*/
-void
+static void
mld_start_timers(unsigned next, ip_stack_t *ipst)
{
int time_left;
@@ -306,7 +306,8 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
uint32_t group;
uint_t next;
ipif_t *ipif;
- ip_stack_t *ipst;
+ ip_stack_t *ipst;
+ ilm_walker_t ilw;
ASSERT(ill != NULL);
ASSERT(!ill->ill_isv6);
@@ -401,8 +402,7 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
"igmp_input: we are only "
"member src 0x%x ipif_local 0x%x",
(int)ntohl(src),
- (int)
- ntohl(ipif->ipif_lcl_addr));
+ (int)ntohl(ipif->ipif_lcl_addr));
}
mutex_exit(&ill->ill_lock);
return (mp);
@@ -440,23 +440,20 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
}
/*
- * If we belong to the group being reported, and
- * we are a 'Delaying member' in the RFC terminology,
- * stop our timer for that group and 'clear flag' i.e.
- * mark as IGMP_OTHERMEMBER. Do this for all logical
- * interfaces on the given physical interface.
+ * If our ill has ILMs that belong to the group being
+ * reported, and we are a 'Delaying Member' in the RFC
+ * terminology, stop our timer for that group and 'clear
+ * flag' i.e. mark as IGMP_OTHERMEMBER.
*/
- mutex_enter(&ill->ill_lock);
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- ilm = ilm_lookup_ipif(ipif, group);
- if (ilm != NULL) {
+ ilm = ilm_walker_start(&ilw, ill);
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
+ if (ilm->ilm_addr == group) {
++ipst->ips_igmpstat.igps_rcv_ourreports;
ilm->ilm_timer = INFINITY;
ilm->ilm_state = IGMP_OTHERMEMBER;
}
- } /* for */
- mutex_exit(&ill->ill_lock);
+ }
+ ilm_walker_finish(&ilw);
break;
case IGMP_V3_MEMBERSHIP_REPORT:
@@ -485,6 +482,7 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
int timer;
uint_t next, current;
ip_stack_t *ipst;
+ ilm_walker_t ilw;
ipst = ill->ill_ipst;
++ipst->ips_igmpstat.igps_rcv_queries;
@@ -583,11 +581,12 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
* the maximum timeout.
*/
next = (unsigned)INFINITY;
- mutex_enter(&ill->ill_lock);
+ ilm = ilm_walker_start(&ilw, ill);
+ mutex_enter(&ill->ill_lock);
current = CURRENT_MSTIME;
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
/*
* A multicast router joins INADDR_ANY address
* to enable promiscuous reception of all
@@ -610,6 +609,7 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
}
}
mutex_exit(&ill->ill_lock);
+ ilm_walker_finish(&ilw);
return (next);
}
@@ -623,6 +623,7 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
ipaddr_t *src_array;
uint8_t qrv;
ip_stack_t *ipst;
+ ilm_walker_t ilw;
ipst = ill->ill_ipst;
/* make sure numsrc matches packet size */
@@ -693,8 +694,9 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
} else {
/* group or group/source specific query */
+ ilm = ilm_walker_start(&ilw, ill);
mutex_enter(&ill->ill_lock);
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
(ilm->ilm_addr == htonl(INADDR_ANY)) ||
(ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
@@ -749,6 +751,7 @@ group_query:
ilm->ilm_timer += current;
}
mutex_exit(&ill->ill_lock);
+ ilm_walker_finish(&ilw);
}
return (next);
@@ -819,13 +822,22 @@ igmp_joingroup(ilm_t *ilm)
mutex_exit(&ill->ill_lock);
/*
- * To avoid deadlock, we defer igmp_start_timers() to
- * ipsq_exit(). See the comment in ipsq_exit() for details.
+ * We need to restart the IGMP timers, but we can't do it here
+ * since we're inside the IPSQ and thus igmp_start_timers() ->
+ * untimeout() (inside the IPSQ, waiting for a running timeout
+ * to finish) could deadlock with igmp_timeout_handler() ->
+ * ipsq_enter() (running the timeout, waiting to get inside
+ * the IPSQ). We also can't just delay it until after we
+ * ipsq_exit() since we could be inside more than one IPSQ and
+ * thus still have the other IPSQs pinned after we exit -- and
+ * igmp_start_timers() may be trying to enter one of those.
+ * Instead, signal a dedicated thread that will do it for us.
*/
mutex_enter(&ipst->ips_igmp_timer_lock);
ipst->ips_igmp_deferred_next = MIN(timer,
ipst->ips_igmp_deferred_next);
mutex_exit(&ipst->ips_igmp_timer_lock);
+ mcast_signal_restart_thread(ipst);
}
if (ip_debug > 1) {
@@ -897,13 +909,14 @@ mld_joingroup(ilm_t *ilm)
mutex_exit(&ill->ill_lock);
/*
- * To avoid deadlock, we defer mld_start_timers() to
- * ipsq_exit(). See the comment in ipsq_exit() for details.
+ * Signal another thread to restart the timers. See the
+ * comment in igmp_joingroup() for details.
*/
mutex_enter(&ipst->ips_mld_timer_lock);
ipst->ips_mld_deferred_next = MIN(timer,
ipst->ips_mld_deferred_next);
mutex_exit(&ipst->ips_mld_timer_lock);
+ mcast_signal_restart_thread(ipst);
}
if (ip_debug > 1) {
@@ -1073,8 +1086,8 @@ send_to_in:
/*
* Need to set up retransmission state; merge the new info with the
* current state (which may be null). If the timer is not currently
- * running, start it (need to do a delayed start of the timer as
- * we're currently in the sq).
+ * running, signal a thread to restart it -- see the comment in
+ * igmp_joingroup() for details.
*/
rp = mcast_merge_rtx(ilm, rp, flist);
if (ilm->ilm_rtx.rtx_timer == INFINITY) {
@@ -1085,6 +1098,7 @@ send_to_in:
ilm->ilm_rtx.rtx_timer);
ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
mutex_exit(&ipst->ips_igmp_timer_lock);
+ mcast_signal_restart_thread(ipst);
}
mutex_exit(&ill->ill_lock);
@@ -1161,8 +1175,8 @@ send_to_in:
/*
* Need to set up retransmission state; merge the new info with the
* current state (which may be null). If the timer is not currently
- * running, start it (need to do a deferred start of the timer as
- * we're currently in the sq).
+ * running, signal a thread to restart it -- see the comment in
+ * igmp_joingroup() for details.
*/
rp = mcast_merge_rtx(ilm, rp, flist);
ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
@@ -1174,6 +1188,7 @@ send_to_in:
MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer);
ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
mutex_exit(&ipst->ips_mld_timer_lock);
+ mcast_signal_restart_thread(ipst);
}
mutex_exit(&ill->ill_lock);
@@ -1397,12 +1412,10 @@ per_ilm_rtxtimer:
*
* igmp_input() receives igmp queries and responds to the queries
* in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
- * Later the igmp_timer fires, the timeout handler igmp_timerout_handler()
+ * Later the igmp_timer fires, the timeout handler igmp_timeout_handler()
* performs the action exclusively after entering each ill's ipsq as writer.
- * The actual igmp timeout handler needs to run in the ipsq since it has to
- * access the ilm's and we don't want another exclusive operation like
- * say an IPMP failover to be simultaneously moving the ilms from one ill to
- * another.
+ * (The need to enter the IPSQ is largely historical but there are still some
+ * fields like ilm_filter that rely on it.)
*
* The igmp_slowtimeo() function is called thru another timer.
* igmp_slowtimeout_lock protects the igmp_slowtimeout_id
@@ -1420,7 +1433,6 @@ igmp_timeout_handler(void *arg)
ASSERT(arg != NULL);
mutex_enter(&ipst->ips_igmp_timer_lock);
ASSERT(ipst->ips_igmp_timeout_id != 0);
- ipst->ips_igmp_timer_thread = curthread;
ipst->ips_igmp_timer_scheduled_last = 0;
ipst->ips_igmp_time_to_next = 0;
mutex_exit(&ipst->ips_igmp_timer_lock);
@@ -1452,7 +1464,6 @@ igmp_timeout_handler(void *arg)
mutex_enter(&ipst->ips_igmp_timer_lock);
ASSERT(ipst->ips_igmp_timeout_id != 0);
ipst->ips_igmp_timeout_id = 0;
- ipst->ips_igmp_timer_thread = NULL;
mutex_exit(&ipst->ips_igmp_timer_lock);
if (global_next != INFINITY)
@@ -1663,7 +1674,6 @@ mld_timeout_handler(void *arg)
ASSERT(arg != NULL);
mutex_enter(&ipst->ips_mld_timer_lock);
ASSERT(ipst->ips_mld_timeout_id != 0);
- ipst->ips_mld_timer_thread = curthread;
ipst->ips_mld_timer_scheduled_last = 0;
ipst->ips_mld_time_to_next = 0;
mutex_exit(&ipst->ips_mld_timer_lock);
@@ -1695,7 +1705,6 @@ mld_timeout_handler(void *arg)
mutex_enter(&ipst->ips_mld_timer_lock);
ASSERT(ipst->ips_mld_timeout_id != 0);
ipst->ips_mld_timeout_id = 0;
- ipst->ips_mld_timer_thread = NULL;
mutex_exit(&ipst->ips_mld_timer_lock);
if (global_next != INFINITY)
@@ -1871,7 +1880,7 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
int hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
size_t size = hdrlen + sizeof (igmpa_t);
ipif_t *ipif = ilm->ilm_ipif;
- ill_t *ill = ipif->ipif_ill; /* Will be the "lower" ill */
+ ill_t *ill = ipif->ipif_ill;
mblk_t *first_mp;
ipsec_out_t *io;
zoneid_t zoneid;
@@ -1887,14 +1896,6 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
* not get forwarded on other interfaces or looped back, we
* set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop
* to B_FALSE.
- *
- * We also need to make sure that this does not get load balanced
- * if it hits ip_newroute_ipif. So, we initialize ipsec_out_attach_if
- * here. If it gets load balanced, switches supporting igmp snooping
- * will send the packet that it receives for this multicast group
- * to the interface that we are sending on. As we have joined the
- * multicast group on this ill, by sending the packet out on this
- * ill, we receive all the packets back on this ill.
*/
first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
if (first_mp == NULL)
@@ -1909,7 +1910,6 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
io->ipsec_out_len = sizeof (ipsec_out_t);
io->ipsec_out_use_global_policy = B_TRUE;
io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
- io->ipsec_out_attach_if = B_TRUE;
io->ipsec_out_multicast_loop = B_FALSE;
io->ipsec_out_dontroute = B_TRUE;
if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES)
@@ -1995,6 +1995,8 @@ igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist)
zoneid_t zoneid;
ip_stack_t *ipst = ill->ill_ipst;
+ ASSERT(IAM_WRITER_IPIF(ipif));
+
/* if there aren't any records, there's nothing to send */
if (reclist == NULL)
return;
@@ -2022,6 +2024,14 @@ nextpkt:
int srcspace, srcsperpkt;
srcspace = ill->ill_max_frag - (size +
sizeof (grphdra_t));
+
+ /*
+ * Skip if there's not even enough room in
+ * a single packet to send something useful.
+ */
+ if (srcspace <= sizeof (ipaddr_t))
+ continue;
+
srcsperpkt = srcspace / sizeof (ipaddr_t);
/*
* Increment size and numrec, because we will
@@ -2082,7 +2092,6 @@ nextpkt:
io->ipsec_out_len = sizeof (ipsec_out_t);
io->ipsec_out_use_global_policy = B_TRUE;
io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
- io->ipsec_out_attach_if = B_TRUE;
io->ipsec_out_multicast_loop = B_FALSE;
io->ipsec_out_dontroute = B_TRUE;
if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES)
@@ -2188,6 +2197,7 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
uint_t next;
int mldlen;
ip_stack_t *ipst = ill->ill_ipst;
+ ilm_walker_t ilw;
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
@@ -2294,7 +2304,6 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
return;
}
-
/*
* If we belong to the group being reported, and we are a
* 'Delaying member' per the RFC terminology, stop our timer
@@ -2303,8 +2312,8 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
* membership entries for the same group address (one per zone)
* so we need to walk the ill_ilm list.
*/
- mutex_enter(&ill->ill_lock);
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
+ ilm = ilm_walker_start(&ilw, ill);
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
continue;
BUMP_MIB(ill->ill_icmp6_mib,
@@ -2313,7 +2322,7 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
ilm->ilm_timer = INFINITY;
ilm->ilm_state = IGMP_OTHERMEMBER;
}
- mutex_exit(&ill->ill_lock);
+ ilm_walker_finish(&ilw);
break;
}
case MLD_LISTENER_REDUCTION:
@@ -2343,6 +2352,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill)
int timer;
uint_t next, current;
in6_addr_t *v6group;
+ ilm_walker_t ilw;
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
@@ -2397,10 +2407,12 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill)
* maximum timeout.
*/
next = INFINITY;
- mutex_enter(&ill->ill_lock);
+ ilm = ilm_walker_start(&ilw, ill);
+ mutex_enter(&ill->ill_lock);
current = CURRENT_MSTIME;
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
+
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
@@ -2430,6 +2442,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill)
}
}
mutex_exit(&ill->ill_lock);
+ ilm_walker_finish(&ilw);
return (next);
}
@@ -2446,6 +2459,7 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
in6_addr_t *v6group, *src_array;
uint_t next, numsrc, i, mrd, delay, qqi, current;
uint8_t qrv;
+ ilm_walker_t ilw;
v6group = &mld2q->mld2q_addr;
numsrc = ntohs(mld2q->mld2q_numsrc);
@@ -2518,8 +2532,9 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
} else {
/* group or group/source specific query */
+ ilm = ilm_walker_start(&ilw, ill);
mutex_enter(&ill->ill_lock);
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
@@ -2574,6 +2589,7 @@ group_query:
break;
}
mutex_exit(&ill->ill_lock);
+ ilm_walker_finish(&ilw);
}
return (next);
@@ -2591,9 +2607,8 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
ip6_hbh_t *ip6hbh;
struct ip6_opt_router *ip6router;
size_t size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
- ill_t *ill = ilm->ilm_ill; /* Will be the "lower" ill */
+ ill_t *ill = ilm->ilm_ill;
ipif_t *ipif;
- ip6i_t *ip6i;
/*
* We need to place a router alert option in this packet. The length
@@ -2605,30 +2620,14 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
ASSERT(ill->ill_isv6);
- /*
- * We need to make sure that this packet does not get load balanced.
- * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
- * ip_newroute_ipif_v6 knows how to handle such packets.
- * If it gets load balanced, switches supporting MLD snooping
- * (in the future) will send the packet that it receives for this
- * multicast group to the interface that we are sending on. As we have
- * joined the multicast group on this ill, by sending the packet out
- * on this ill, we receive all the packets back on this ill.
- */
- size += sizeof (ip6i_t) + router_alert_length;
+ size += router_alert_length;
mp = allocb(size, BPRI_HI);
if (mp == NULL)
return;
bzero(mp->b_rptr, size);
mp->b_wptr = mp->b_rptr + size;
- ip6i = (ip6i_t *)mp->b_rptr;
- ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
- ip6i->ip6i_nxt = IPPROTO_RAW;
- ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
- ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
-
- ip6h = (ip6_t *)&ip6i[1];
+ ip6h = (ip6_t *)mp->b_rptr;
ip6hbh = (struct ip6_hbh *)&ip6h[1];
ip6router = (struct ip6_opt_router *)&ip6hbh[1];
/*
@@ -2698,7 +2697,6 @@ mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
in6_addr_t *srcarray;
ip6_t *ip6h;
ip6_hbh_t *ip6hbh;
- ip6i_t *ip6i;
struct ip6_opt_router *ip6router;
size_t size, optlen, padlen, icmpsize, rsize;
ipif_t *ipif;
@@ -2707,6 +2705,8 @@ mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
mrec_t *next_reclist = reclist;
boolean_t morepkts;
+ ASSERT(IAM_WRITER_ILL(ill));
+
/* If there aren't any records, there's nothing to send */
if (reclist == NULL)
return;
@@ -2743,6 +2743,14 @@ nextpkt:
int srcspace, srcsperpkt;
srcspace = ill->ill_max_frag -
(size + sizeof (mld2mar_t));
+
+ /*
+ * Skip if there's not even enough room in
+ * a single packet to send something useful.
+ */
+ if (srcspace <= sizeof (in6_addr_t))
+ continue;
+
srcsperpkt = srcspace / sizeof (in6_addr_t);
/*
* Increment icmpsize and size, because we will
@@ -2787,30 +2795,13 @@ nextpkt:
size += rsize;
}
- /*
- * We need to make sure that this packet does not get load balanced.
- * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
- * ip_newroute_ipif_v6 know how to handle such packets.
- * If it gets load balanced, switches supporting MLD snooping
- * (in the future) will send the packet that it receives for this
- * multicast group to the interface that we are sending on. As we have
- * joined the multicast group on this ill, by sending the packet out
- * on this ill, we receive all the packets back on this ill.
- */
- size += sizeof (ip6i_t);
mp = allocb(size, BPRI_HI);
if (mp == NULL)
goto free_reclist;
bzero(mp->b_rptr, size);
mp->b_wptr = mp->b_rptr + size;
- ip6i = (ip6i_t *)mp->b_rptr;
- ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
- ip6i->ip6i_nxt = IPPROTO_RAW;
- ip6i->ip6i_flags = IP6I_ATTACH_IF;
- ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
-
- ip6h = (ip6_t *)&(ip6i[1]);
+ ip6h = (ip6_t *)mp->b_rptr;
ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
@@ -3102,3 +3093,64 @@ mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
return (rtnmrec);
}
+
+/*
+ * Convenience routine to signal the restart-timer thread.
+ *
+ * Sets IP_MRT_RUN in ips_mrt_flags and signals ips_mrt_cv, both while
+ * holding ips_mrt_lock. mcast_restart_timers_thread() checks the flag and
+ * waits on that condition variable under the same lock, so a request made
+ * here is either seen by its flag test or wakes it from cv_wait().
+ *
+ * The callers in this file invoke this routine after they have recorded a
+ * new ips_igmp_deferred_next or ips_mld_deferred_next value and dropped the
+ * corresponding timer lock.
+ */
+static void
+mcast_signal_restart_thread(ip_stack_t *ipst)
+{
+ mutex_enter(&ipst->ips_mrt_lock);
+ ipst->ips_mrt_flags |= IP_MRT_RUN;
+ cv_signal(&ipst->ips_mrt_cv);
+ mutex_exit(&ipst->ips_mrt_lock);
+}
+
+/*
+ * Thread to restart IGMP/MLD timers. See the comment in igmp_joingroup() for
+ * the story behind this unfortunate thread.
+ *
+ * The thread builds a name from the netstack's stack id and passes it, with
+ * ips_mrt_lock, to CALLB_CPR_INIT(). Each pass of the loop then:
+ *
+ * 1. Waits on ips_mrt_cv, bracketed by CALLB_CPR_SAFE_BEGIN/END, until
+ * IP_MRT_RUN or IP_MRT_STOP is set in ips_mrt_flags.
+ * 2. If IP_MRT_STOP is set, leaves the loop with ips_mrt_lock still held.
+ * 3. Otherwise clears IP_MRT_RUN and drops ips_mrt_lock before reading the
+ * deferred values, so a request signalled while this pass is still in
+ * progress sets IP_MRT_RUN again and triggers another pass.
+ * 4. Under ips_igmp_timer_lock, takes ips_igmp_deferred_next and resets it
+ * to INFINITY; after dropping the lock, calls igmp_start_timers() with
+ * the value taken unless that value was INFINITY.
+ * 5. Does the same for MLD using ips_mld_timer_lock,
+ * ips_mld_deferred_next and mld_start_timers().
+ *
+ * On the way out the thread sets IP_MRT_DONE and signals ips_mrt_done_cv
+ * while still holding ips_mrt_lock; CALLB_CPR_EXIT() then drops that lock
+ * before thread_exit() is called.
+ */
+void
+mcast_restart_timers_thread(ip_stack_t *ipst)
+{
+ int next;
+ char name[64];
+ callb_cpr_t cprinfo;
+
+ (void) snprintf(name, sizeof (name), "mcast_restart_timers_thread_%d",
+ ipst->ips_netstack->netstack_stackid);
+ CALLB_CPR_INIT(&cprinfo, &ipst->ips_mrt_lock, callb_generic_cpr, name);
+
+ for (;;) {
+ mutex_enter(&ipst->ips_mrt_lock);
+ while (!(ipst->ips_mrt_flags & (IP_MRT_STOP|IP_MRT_RUN))) {
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(&ipst->ips_mrt_cv, &ipst->ips_mrt_lock);
+ CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_mrt_lock);
+ }
+ if (ipst->ips_mrt_flags & IP_MRT_STOP)
+ break;
+ ipst->ips_mrt_flags &= ~IP_MRT_RUN;
+ mutex_exit(&ipst->ips_mrt_lock);
+
+ mutex_enter(&ipst->ips_igmp_timer_lock);
+ next = ipst->ips_igmp_deferred_next;
+ ipst->ips_igmp_deferred_next = INFINITY;
+ mutex_exit(&ipst->ips_igmp_timer_lock);
+
+ if (next != INFINITY)
+ igmp_start_timers(next, ipst);
+
+ mutex_enter(&ipst->ips_mld_timer_lock);
+ next = ipst->ips_mld_deferred_next;
+ ipst->ips_mld_deferred_next = INFINITY;
+ mutex_exit(&ipst->ips_mld_timer_lock);
+ if (next != INFINITY)
+ mld_start_timers(next, ipst);
+ }
+
+ ipst->ips_mrt_flags |= IP_MRT_DONE;
+ cv_signal(&ipst->ips_mrt_done_cv);
+ CALLB_CPR_EXIT(&cprinfo); /* drops ips_mrt_lock */
+ thread_exit();
+}
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index 1d0bcf37de..dd87a09974 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -170,11 +170,14 @@ typedef struct listptr_s listptr_t;
*/
typedef struct iproutedata_s {
uint_t ird_idx;
+ uint_t ird_flags; /* see below */
listptr_t ird_route; /* ipRouteEntryTable */
listptr_t ird_netmedia; /* ipNetToMediaEntryTable */
listptr_t ird_attrs; /* ipRouteAttributeTable */
} iproutedata_t;
+#define IRD_REPORT_TESTHIDDEN 0x01 /* include IRE_MARK_TESTHIDDEN routes */
+
/*
* Cluster specific hooks. These should be NULL when booted as a non-cluster
*/
@@ -228,31 +231,27 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
* MT level protection given by STREAMS. IP uses a combination of its own
* internal serialization mechanism and standard Solaris locking techniques.
- * The internal serialization is per phyint (no IPMP) or per IPMP group.
- * This is used to serialize plumbing operations, IPMP operations, certain
- * multicast operations, most set ioctls, igmp/mld timers etc.
+ * The internal serialization is per phyint. This is used to serialize
+ * plumbing operations, certain multicast operations, most set ioctls,
+ * igmp/mld timers etc.
*
* Plumbing is a long sequence of operations involving message
* exchanges between IP, ARP and device drivers. Many set ioctls are typically
* involved in plumbing operations. A natural model is to serialize these
* ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
* parallel without any interference. But various set ioctls on hme0 are best
- * serialized. However if the system uses IPMP, the operations are easier if
- * they are serialized on a per IPMP group basis since IPMP operations
- * happen across ill's of a group. Thus the lowest common denominator is to
- * serialize most set ioctls, multicast join/leave operations, IPMP operations
- * igmp/mld timer operations, and processing of DLPI control messages received
- * from drivers on a per IPMP group basis. If the system does not employ
- * IPMP the serialization is on a per phyint basis. This serialization is
- * provided by the ipsq_t and primitives operating on this. Details can
- * be found in ip_if.c above the core primitives operating on ipsq_t.
+ * serialized, along with multicast join/leave operations, igmp/mld timer
+ * operations, and processing of DLPI control messages received from drivers
+ * on a per phyint basis. This serialization is provided by the ipsq_t and
+ * primitives operating on this. Details can be found in ip_if.c above the
+ * core primitives operating on ipsq_t.
*
* Lookups of an ipif or ill by a thread return a refheld ipif / ill.
* Similarly, lookup of an ire by a thread also returns a refheld ire.
* In addition ipif's and ill's referenced by the ire are also indirectly
* refheld. Thus no ipif or ill can vanish nor can critical parameters like
* the ipif's address or netmask change as long as an ipif is refheld
- * directly or indirectly. For example an SIOCLIFADDR ioctl that changes the
+ * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the
* address of an ipif has to go through the ipsq_t. This ensures that only
* 1 such exclusive operation proceeds at any time on the ipif. It then
* deletes all ires associated with this ipif, and waits for all refcnts
@@ -281,33 +280,24 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* - ill_g_lock: This is a global reader/writer lock. Protects the following
* * The AVL tree based global multi list of all ills.
* * The linked list of all ipifs of an ill
- * * The <ill-ipsq> mapping
- * * The ipsq->ipsq_phyint_list threaded by phyint_ipsq_next
- * * The illgroup list threaded by ill_group_next.
+ * * The <ipsq-xop> mapping
* * <ill-phyint> association
* Insertion/deletion of an ill in the system, insertion/deletion of an ipif
- * into an ill, changing the <ill-ipsq> mapping of an ill, insertion/deletion
- * of an ill into the illgrp list, changing the <ill-phyint> assoc of an ill
- * will all have to hold the ill_g_lock as writer for the actual duration
- * of the insertion/deletion/change. More details about the <ill-ipsq> mapping
- * may be found in the IPMP section.
+ * into an ill, changing the <ipsq-xop> mapping of an ill, changing the
+ * <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as
+ * writer for the actual duration of the insertion/deletion/change.
*
* - ill_lock: This is a per ill mutex.
- * It protects some members of the ill and is documented below.
- * It also protects the <ill-ipsq> mapping
- * It also protects the illgroup list threaded by ill_group_next.
+ * It protects some members of the ill_t struct; see ip.h for details.
* It also protects the <ill-phyint> assoc.
* It also protects the list of ipifs hanging off the ill.
*
* - ipsq_lock: This is a per ipsq_t mutex lock.
- * This protects all the other members of the ipsq struct except
- * ipsq_refs and ipsq_phyint_list which are protected by ill_g_lock
+ * This protects some members of the ipsq_t struct; see ip.h for details.
+ * It also protects the <ipsq-xop> mapping
*
- * - illgrp_lock: This is a per ill_group mutex lock.
- * The only thing it protects is the illgrp_ill_schednext member of ill_group
- * which dictates which is the next ill in an ill_group that is to be chosen
- * for sending outgoing packets, through creation of an IRE_CACHE that
- * references this ill.
+ * - ipx_lock: This is a per ipxop_t mutex lock.
+ * This protects some members of the ipxop_t struct; see ip.h for details.
*
* - phyint_lock: This is a per phyint mutex lock. Protects just the
* phyint_flags
@@ -335,27 +325,24 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* Note, it is only necessary to take this lock if the ill_usesrc_grp_next
* field is changing state i.e from NULL to non-NULL or vice-versa. For
* example, it is not necessary to take this lock in the initial portion
- * of ip_sioctl_slifusesrc or at all in ip_sioctl_groupname and
- * ip_sioctl_flags since the these operations are executed exclusively and
- * that ensures that the "usesrc group state" cannot change. The "usesrc
- * group state" change can happen only in the latter part of
- * ip_sioctl_slifusesrc and in ill_delete.
+ * of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these
+ * operations are executed exclusively and that ensures that the "usesrc
+ * group state" cannot change. The "usesrc group state" change can happen
+ * only in the latter part of ip_sioctl_slifusesrc and in ill_delete.
*
- * Changing <ill-phyint>, <ill-ipsq>, <ill-illgroup> assocications.
+ * Changing <ill-phyint>, <ipsq-xop> associations:
*
* To change the <ill-phyint> association, the ill_g_lock must be held
* as writer, and the ill_locks of both the v4 and v6 instance of the ill
* must be held.
*
- * To change the <ill-ipsq> association the ill_g_lock must be held as writer
- * and the ill_lock of the ill in question must be held.
- *
- * To change the <ill-illgroup> association the ill_g_lock must be held as
- * writer and the ill_lock of the ill in question must be held.
+ * To change the <ipsq-xop> association, the ill_g_lock must be held as
+ * writer, the ipsq_lock must be held, and one must be writer on the ipsq.
+ * This is only done when ills are added or removed from IPMP groups.
*
* To add or delete an ipif from the list of ipifs hanging off the ill,
* ill_g_lock (writer) and ill_lock must be held and the thread must be
- * a writer on the associated ipsq,.
+ * a writer on the associated ipsq.
*
* To add or delete an ill to the system, the ill_g_lock must be held as
* writer and the thread must be a writer on the associated ipsq.
@@ -367,8 +354,7 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
*
* Some lock hierarchy scenarios are listed below.
*
- * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock
- * ill_g_lock -> illgrp_lock -> ill_lock
+ * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
* ill_g_lock -> ill_lock(s) -> phyint_lock
* ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock
* ill_g_lock -> ip_addr_avail_lock
@@ -587,8 +573,7 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* back, i.e. the loopback which is required since neither Ethernet drivers
* nor Ethernet hardware loops them back. This is the case when the normal
* routes (ignoring IREs with different zoneids) would send out the packet on
- * the same ill (or ill group) as the ill with which is IRE_LOCAL is
- * associated.
+ * the same ill as the ill with which the IRE_LOCAL is associated.
*
* Multiple zones can share a common broadcast address; typically all zones
* share the 255.255.255.255 address. Incoming as well as locally originated
@@ -695,8 +680,8 @@ static boolean_t ip_rput_multimblk_ipoptions(queue_t *, ill_t *,
mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *);
static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *,
ip_stack_t *);
-static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *,
- uint16_t *);
+static boolean_t ip_rput_fragment(ill_t *, ill_t *, mblk_t **, ipha_t *,
+ uint32_t *, uint16_t *);
int ip_snmp_get(queue_t *, mblk_t *, int);
static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
mib2_ipIfStatsEntry_t *, ip_stack_t *);
@@ -723,9 +708,9 @@ static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
ip_stack_t *ipst);
-static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *,
+static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
ip_stack_t *ipst);
-static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *,
+static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
ip_stack_t *ipst);
static void ip_snmp_get2_v4(ire_t *, iproutedata_t *);
static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
@@ -775,8 +760,6 @@ static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
cred_t *);
-static int ipmp_hook_emulation_set(queue_t *, mblk_t *, char *, caddr_t,
- cred_t *);
static int ip_squeue_switch(int);
static void *ip_kstat_init(netstackid_t, ip_stack_t *);
@@ -946,8 +929,6 @@ static ipndp_t lcl_ndp_arr[] = {
{ ip_cgtp_filter_get, ip_cgtp_filter_set, NULL,
"ip_cgtp_filter" },
#define IPNDP_IPMP_HOOK_OFFSET 10
- { ip_param_generic_get, ipmp_hook_emulation_set, NULL,
- "ipmp_hook_emulation" },
{ ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug,
"ip_debug" },
};
@@ -984,20 +965,19 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
- /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
+ /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD,
IF_CMD, ip_sioctl_get_addr, NULL },
/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
- IPI_GET_CMD | IPI_REPL,
- IF_CMD, ip_sioctl_get_dstaddr, NULL },
+ IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL },
/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
+ IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
/* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
- IPI_MODOK | IPI_GET_CMD | IPI_REPL,
+ IPI_MODOK | IPI_GET_CMD,
IF_CMD, ip_sioctl_get_flags, NULL },
/* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
@@ -1009,31 +989,28 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
/* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_mtu, NULL },
- /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
+ /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD,
IF_CMD, ip_sioctl_get_mtu, NULL },
/* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
- IPI_GET_CMD | IPI_REPL,
- IF_CMD, ip_sioctl_get_brdaddr, NULL },
+ IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL },
/* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_brdaddr, NULL },
/* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
- IPI_GET_CMD | IPI_REPL,
- IF_CMD, ip_sioctl_get_netmask, NULL },
+ IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL },
/* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
/* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
- IPI_GET_CMD | IPI_REPL,
- IF_CMD, ip_sioctl_get_metric, NULL },
+ IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL },
/* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
IF_CMD, ip_sioctl_metric, NULL },
/* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* See 166-168 below for extended SIOC*XARP ioctls */
- /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV,
+ /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
ARP_CMD, ip_sioctl_arp, NULL },
- /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD | IPI_REPL,
+ /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD,
ARP_CMD, ip_sioctl_arp, NULL },
- /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV,
+ /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
ARP_CMD, ip_sioctl_arp, NULL },
/* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
@@ -1098,21 +1075,19 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
/* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
- /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD | IPI_REPL,
+ /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD,
MISC_CMD, ip_sioctl_get_ifnum, NULL },
- /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
+ /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD,
IF_CMD, ip_sioctl_get_muxid, NULL },
/* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- IF_CMD, ip_sioctl_muxid, NULL },
+ IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL },
/* Both if and lif variants share same func */
- /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
+ /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD,
IF_CMD, ip_sioctl_get_lifindex, NULL },
/* Both if and lif variants share same func */
/* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- IF_CMD, ip_sioctl_slifindex, NULL },
+ IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL },
/* copyin size cannot be coded for SIOCGIFCONF */
/* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
@@ -1136,28 +1111,25 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
/* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- LIF_CMD, ip_sioctl_removeif,
+ IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif,
ip_sioctl_removeif_restart },
/* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_PRIV | IPI_WR | IPI_REPL,
+ IPI_GET_CMD | IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_addif, NULL },
#define SIOCLIFADDR_NDX 112
/* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
/* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_addr, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL },
/* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
/* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_dstaddr, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL },
/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
+ IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
/* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_MODOK | IPI_REPL,
+ IPI_GET_CMD | IPI_MODOK,
LIF_CMD, ip_sioctl_get_flags, NULL },
/* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
@@ -1167,58 +1139,48 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
ip_sioctl_get_lifconf, NULL },
/* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_mtu, NULL },
- /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD | IPI_REPL,
+ /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD,
LIF_CMD, ip_sioctl_get_mtu, NULL },
/* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_brdaddr, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL },
/* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_brdaddr, NULL },
/* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_netmask, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL },
/* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
/* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_metric, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL },
/* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_metric, NULL },
/* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_MODOK | IPI_REPL,
+ IPI_PRIV | IPI_WR | IPI_MODOK,
LIF_CMD, ip_sioctl_slifname,
ip_sioctl_slifname_restart },
- /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD | IPI_REPL,
+ /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD,
MISC_CMD, ip_sioctl_get_lifnum, NULL },
/* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_muxid, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL },
/* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- LIF_CMD, ip_sioctl_muxid, NULL },
+ IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL },
/* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_lifindex, 0 },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 },
/* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- LIF_CMD, ip_sioctl_slifindex, 0 },
+ IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 },
/* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_token, NULL },
/* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_token, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL },
/* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
/* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_subnet, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL },
/* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_lnkinfo, NULL },
/* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
/* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
LIF_CMD, ip_siocdelndp_v6, NULL },
/* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
@@ -1231,8 +1193,8 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
MISC_CMD, ip_sioctl_tonlink, NULL },
/* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
MISC_CMD, ip_sioctl_tmysite, NULL },
- /* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), IPI_REPL,
- TUN_CMD, ip_sioctl_tunparam, NULL },
+ /* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), 0,
+ TUN_CMD, ip_sioctl_tunparam, NULL },
/* 148 */ { SIOCSTUNPARAM, sizeof (struct iftun_req),
IPI_PRIV | IPI_WR,
TUN_CMD, ip_sioctl_tunparam, NULL },
@@ -1243,29 +1205,24 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
/* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
/* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
- /* 153 */ { SIOCLIFFAILOVER, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- LIF_CMD, ip_sioctl_move, ip_sioctl_move },
- /* 154 */ { SIOCLIFFAILBACK, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- LIF_CMD, ip_sioctl_move, ip_sioctl_move },
+ /* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
+
+ /* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD |
+ IPI_WR, LIF_CMD, ip_sioctl_get_binding, NULL },
/* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
+ IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_groupname, NULL },
- /* 157 */ { SIOCGLIFOINDEX, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_oindex, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL },
+ /* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t),
+ IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL },
/* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
/* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
- /* 161 */ { SIOCSLIFOINDEX, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
- LIF_CMD, ip_sioctl_slifoindex, NULL },
+ /* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* These are handled in ip_sioctl_copyin_setup itself */
/* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
@@ -1277,22 +1234,20 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
/* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
ip_sioctl_get_lifconf, NULL },
- /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV,
+ /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
XARP_CMD, ip_sioctl_arp, NULL },
- /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD | IPI_REPL,
+ /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD,
XARP_CMD, ip_sioctl_arp, NULL },
- /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV,
+ /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
XARP_CMD, ip_sioctl_arp, NULL },
/* SIOCPOPSOCKFS is not handled by IP */
/* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },
/* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
- IPI_GET_CMD | IPI_REPL,
- LIF_CMD, ip_sioctl_get_lifzone, NULL },
+ IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL },
/* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
- IPI_PRIV | IPI_WR | IPI_REPL,
- LIF_CMD, ip_sioctl_slifzone,
+ IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone,
ip_sioctl_slifzone_restart },
/* 172-174 are SCTP ioctls and not handled by IP */
/* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
@@ -1315,8 +1270,7 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
MSFILT_CMD, ip_sioctl_msfilter, NULL },
/* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR,
MSFILT_CMD, ip_sioctl_msfilter, NULL },
- /* 182 */ { SIOCSIPMPFAILBACK, sizeof (int), IPI_PRIV, MISC_CMD,
- ip_sioctl_set_ipmpfailback, NULL },
+ /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* SIOCSENABLESDP is handled by SDP */
/* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
/* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL },
@@ -1326,7 +1280,7 @@ int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
{ OSIOCGTUNPARAM, sizeof (struct old_iftun_req),
- IPI_GET_CMD | IPI_REPL, TUN_CMD, ip_sioctl_tunparam, NULL },
+ IPI_GET_CMD, TUN_CMD, ip_sioctl_tunparam, NULL },
{ OSIOCSTUNPARAM, sizeof (struct old_iftun_req), IPI_PRIV | IPI_WR,
TUN_CMD, ip_sioctl_tunparam, NULL },
{ I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
@@ -1336,11 +1290,11 @@ ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
{ ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL },
{ ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
{ IP_IOCTL, 0, 0, 0, NULL, NULL },
- { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_REPL | IPI_GET_CMD,
+ { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
MISC_CMD, mrt_ioctl},
- { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_REPL | IPI_GET_CMD,
+ { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_GET_CMD,
MISC_CMD, mrt_ioctl},
- { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_REPL | IPI_GET_CMD,
+ { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD,
MISC_CMD, mrt_ioctl}
};
@@ -1629,8 +1583,6 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
ipif_t *ipif;
mblk_t *first_mp;
ipsec_in_t *ii;
- ire_t *src_ire;
- boolean_t onlink;
timestruc_t now;
uint32_t ill_index;
ip_stack_t *ipst;
@@ -2014,59 +1966,6 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
if (!IS_SIMPLE_IPH(ipha))
icmp_options_update(ipha);
- /*
- * ICMP echo replies should go out on the same interface
- * the request came on as probes used by in.mpathd for detecting
- * NIC failures are ECHO packets. We turn-off load spreading
- * by setting ipsec_in_attach_if to B_TRUE, which is copied
- * to ipsec_out_attach_if by ipsec_in_to_out called later in this
- * function. This is in turn handled by ip_wput and ip_newroute
- * to make sure that the packet goes out on the interface it came
- * in on. If we don't turnoff load spreading, the packets might get
- * dropped if there are no non-FAILED/INACTIVE interfaces for it
- * to go out and in.mpathd would wrongly detect a failure or
- * mis-detect a NIC failure for link failure. As load spreading
- * can happen only if ill_group is not NULL, we do only for
- * that case and this does not affect the normal case.
- *
- * We turn off load spreading only on echo packets that came from
- * on-link hosts. If the interface route has been deleted, this will
- * not be enforced as we can't do much. For off-link hosts, as the
- * default routes in IPv4 does not typically have an ire_ipif
- * pointer, we can't force MATCH_IRE_ILL in ip_wput/ip_newroute.
- * Moreover, expecting a default route through this interface may
- * not be correct. We use ipha_dst because of the swap above.
- */
- onlink = B_FALSE;
- if (icmph->icmph_type == ICMP_ECHO_REPLY && ill->ill_group != NULL) {
- /*
- * First, we need to make sure that it is not one of our
- * local addresses. If we set onlink when it is one of
- * our local addresses, we will end up creating IRE_CACHES
- * for one of our local addresses. Then, we will never
- * accept packets for them afterwards.
- */
- src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL,
- NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (src_ire == NULL) {
- ipif = ipif_get_next_ipif(NULL, ill);
- if (ipif == NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(mp);
- return;
- }
- src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0,
- IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
- NULL, MATCH_IRE_ILL | MATCH_IRE_TYPE, ipst);
- ipif_refrele(ipif);
- if (src_ire != NULL) {
- onlink = B_TRUE;
- ire_refrele(src_ire);
- }
- } else {
- ire_refrele(src_ire);
- }
- }
if (!mctl_present) {
/*
* This packet should go out the same way as it
@@ -2085,20 +1984,7 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
/* This is not a secure packet */
ii->ipsec_in_secure = B_FALSE;
- if (onlink) {
- ii->ipsec_in_attach_if = B_TRUE;
- ii->ipsec_in_ill_index =
- ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index =
- recv_ill->ill_phyint->phyint_ifindex;
- }
first_mp->b_cont = mp;
- } else if (onlink) {
- ii = (ipsec_in_t *)first_mp->b_rptr;
- ii->ipsec_in_attach_if = B_TRUE;
- ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index = recv_ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */
} else {
ii = (ipsec_in_t *)first_mp->b_rptr;
ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */
@@ -3733,7 +3619,6 @@ ipif_dup_recovery(void *arg)
ill_t *ill = ipif->ipif_ill;
mblk_t *arp_add_mp;
mblk_t *arp_del_mp;
- area_t *area;
ip_stack_t *ipst = ill->ill_ipst;
ipif->ipif_recovery_id = 0;
@@ -3744,12 +3629,13 @@ ipif_dup_recovery(void *arg)
*/
if (ill->ill_arp_closing || !(ipif->ipif_flags & IPIF_DUPLICATE) ||
(ipif->ipif_flags & IPIF_POINTOPOINT) ||
- (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))) {
+ (ipif->ipif_state_flags & (IPIF_CONDEMNED))) {
/* No reason to try to bring this address back. */
return;
}
- if ((arp_add_mp = ipif_area_alloc(ipif)) == NULL)
+ /* ACE_F_UNVERIFIED restarts DAD */
+ if ((arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL)
goto alloc_fail;
if (ipif->ipif_arp_del_mp == NULL) {
@@ -3758,10 +3644,6 @@ ipif_dup_recovery(void *arg)
ipif->ipif_arp_del_mp = arp_del_mp;
}
- /* Setting the 'unverified' flag restarts DAD */
- area = (area_t *)arp_add_mp->b_rptr;
- area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR |
- ACE_F_UNVERIFIED;
putnext(ill->ill_rq, arp_add_mp);
return;
@@ -3873,6 +3755,7 @@ ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
EINPROGRESS) {
ipif->ipif_addr_ready = 1;
(void) ipif_up_done(ipif);
+ ASSERT(ill->ill_move_ipif == NULL);
}
continue;
}
@@ -3893,6 +3776,7 @@ ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
ill->ill_net_type == IRE_IF_RESOLVER &&
!(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
ipst->ips_ip_dup_recovery > 0) {
+ ASSERT(ipif->ipif_recovery_id == 0);
ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
}
@@ -4196,8 +4080,9 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid,
{
mblk_t *mp;
ip_pktinfo_t *pinfo;
- ipha_t *ipha;
+ ipha_t *ipha;
struct ether_header *pether;
+ boolean_t ipmp_ill_held = B_FALSE;
mp = allocb(sizeof (ip_pktinfo_t), BPRI_MED);
if (mp == NULL) {
@@ -4205,12 +4090,53 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid,
return (data_mp);
}
- ipha = (ipha_t *)data_mp->b_rptr;
+ ipha = (ipha_t *)data_mp->b_rptr;
pinfo = (ip_pktinfo_t *)mp->b_rptr;
bzero(pinfo, sizeof (ip_pktinfo_t));
pinfo->ip_pkt_flags = (uchar_t)flags;
pinfo->ip_pkt_ulp_type = IN_PKTINFO; /* Tell ULP what type of info */
+ pether = (struct ether_header *)((char *)ipha
+ - sizeof (struct ether_header));
+
+ /*
+ * Make sure the interface is an ethernet type, since this option
+ * is currently supported only on this type of interface. Also make
+ * sure we are pointing correctly above db_base.
+ */
+ if ((flags & IPF_RECVSLLA) &&
+ ((uchar_t *)pether >= data_mp->b_datap->db_base) &&
+ (ill->ill_type == IFT_ETHER) &&
+ (ill->ill_net_type == IRE_IF_RESOLVER)) {
+ pinfo->ip_pkt_slla.sdl_type = IFT_ETHER;
+ bcopy(pether->ether_shost.ether_addr_octet,
+ pinfo->ip_pkt_slla.sdl_data, ETHERADDRL);
+ } else {
+ /*
+ * Clear the bit. Indicate to upper layer that IP is not
+ * sending this ancillary info.
+ */
+ pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA;
+ }
+
+ /*
+ * If `ill' is in an IPMP group, use the IPMP ill to determine
+ * IPF_RECVIF and IPF_RECVADDR. (This currently assumes that
+ * IPF_RECVADDR support on test addresses is not needed.)
+ *
+ * Note that `ill' may already be an IPMP ill if e.g. we're
+ * processing a packet looped back to an IPMP data address
+ * (since those IRE_LOCALs are tied to IPMP ills).
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ if ((ill = ipmp_ill_hold_ipmp_ill(ill)) == NULL) {
+ ip1dbg(("ip_add_info: cannot hold IPMP ill.\n"));
+ freemsg(mp);
+ return (data_mp);
+ }
+ ipmp_ill_held = B_TRUE;
+ }
+
if (flags & (IPF_RECVIF | IPF_RECVADDR))
pinfo->ip_pkt_ifindex = ill->ill_phyint->phyint_ifindex;
if (flags & IPF_RECVADDR) {
@@ -4239,7 +4165,7 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid,
ire = ire_ctable_lookup(ipha->ipha_dst, 0,
IRE_LOCAL | IRE_LOOPBACK,
ipif, zoneid, NULL,
- MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst);
+ MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
if (ire == NULL) {
/*
* packet must have come on a different
@@ -4276,29 +4202,8 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid,
}
}
- pether = (struct ether_header *)((char *)ipha
- - sizeof (struct ether_header));
- /*
- * Make sure the interface is an ethernet type, since this option
- * is currently supported only on this type of interface. Also make
- * sure we are pointing correctly above db_base.
- */
-
- if ((flags & IPF_RECVSLLA) &&
- ((uchar_t *)pether >= data_mp->b_datap->db_base) &&
- (ill->ill_type == IFT_ETHER) &&
- (ill->ill_net_type == IRE_IF_RESOLVER)) {
-
- pinfo->ip_pkt_slla.sdl_type = IFT_ETHER;
- bcopy((uchar_t *)pether->ether_shost.ether_addr_octet,
- (uchar_t *)pinfo->ip_pkt_slla.sdl_data, ETHERADDRL);
- } else {
- /*
- * Clear the bit. Indicate to upper layer that IP is not
- * sending this ancillary info.
- */
- pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA;
- }
+ if (ipmp_ill_held)
+ ill_refrele(ill);
mp->b_datap->db_type = M_CTL;
mp->b_wptr += sizeof (ip_pktinfo_t);
@@ -4946,8 +4851,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
}
}
- if (dst_ire != NULL &&
- dst_ire->ire_type == IRE_LOCAL &&
+ if (dst_ire != NULL && dst_ire->ire_type == IRE_LOCAL &&
dst_ire->ire_zoneid != zoneid && dst_ire->ire_zoneid != ALL_ZONES) {
/*
* If the IRE belongs to a different zone, look for a matching
@@ -4983,7 +4887,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
* Pick a source address so that a proper inbound
* load spreading would happen.
*/
- ill_t *dst_ill = dst_ire->ire_ipif->ipif_ill;
+ ill_t *ire_ill = dst_ire->ire_ipif->ipif_ill;
ipif_t *src_ipif = NULL;
ire_t *ipif_ire;
@@ -4998,10 +4902,10 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
* found above so that upper layers know that the
* destination address is a broadcast address.
*
- * 2) If this is part of a group, select a better
- * source address so that better inbound load
- * balancing happens. Do the same if the ipif
- * is DEPRECATED.
+ * 2) If the ipif is DEPRECATED, select a better
+ * source address. Similarly, if the ipif is on
+ * the IPMP meta-interface, pick a source address
+ * at random to improve inbound load spreading.
*
* 3) If the outgoing interface is part of a usesrc
* group, then try selecting a source address from
@@ -5011,9 +4915,9 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
dst_ire->ire_zoneid != ALL_ZONES) ||
(!(dst_ire->ire_flags & RTF_SETSRC)) &&
(!(dst_ire->ire_type & IRE_BROADCAST) &&
- ((dst_ill->ill_group != NULL) ||
+ (IS_IPMP(ire_ill) ||
(dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) ||
- (dst_ill->ill_usesrc_ifindex != 0)))) {
+ (ire_ill->ill_usesrc_ifindex != 0)))) {
/*
* If the destination is reachable via a
* given gateway, the selected source address
@@ -5035,7 +4939,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
*/
ipaddr_t saddr =
dst_ire->ire_ipif->ipif_src_addr;
- src_ipif = ipif_select_source(dst_ill,
+ src_ipif = ipif_select_source(ire_ill,
saddr, zoneid);
if (src_ipif != NULL) {
if (IS_VNI(src_ipif->ipif_ill)) {
@@ -5478,14 +5382,6 @@ ip_modclose(ill_t *ill)
(void) ill_frag_timeout(ill, 0);
/*
- * If MOVE was in progress, clear the
- * move_in_progress fields also.
- */
- if (ill->ill_move_in_progress) {
- ILL_CLEAR_MOVE(ill);
- }
-
- /*
* Call ill_delete to bring down the ipifs, ilms and ill on
* this ill. Then wait for the refcnts to drop to zero.
* ill_is_freeable checks whether the ill is really quiescent.
@@ -5510,7 +5406,7 @@ ip_modclose(ill_t *ill)
*/
netstack_hold(ipst->ips_netstack);
- /* qprocsoff is called in ill_delete_tail */
+ /* qprocsoff is done via ill_delete_tail */
ill_delete_tail(ill);
ASSERT(ill->ill_ipst == NULL);
@@ -5755,6 +5651,11 @@ ip_stack_shutdown(netstackid_t stackid, void *arg)
ipst->ips_capab_taskq_quit = B_TRUE;
cv_signal(&ipst->ips_capab_taskq_cv);
mutex_exit(&ipst->ips_capab_taskq_lock);
+
+ mutex_enter(&ipst->ips_mrt_lock);
+ ipst->ips_mrt_flags |= IP_MRT_STOP;
+ cv_signal(&ipst->ips_mrt_cv);
+ mutex_exit(&ipst->ips_mrt_lock);
}
/*
@@ -5766,6 +5667,9 @@ ip_stack_fini(netstackid_t stackid, void *arg)
ip_stack_t *ipst = (ip_stack_t *)arg;
int ret;
+#ifdef NS_DEBUG
+ printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid);
+#endif
/*
* At this point, all of the notifications that the events and
* protocols are going away have been run, meaning that we can
@@ -5779,9 +5683,14 @@ ip_stack_fini(netstackid_t stackid, void *arg)
cv_destroy(&ipst->ips_capab_taskq_cv);
list_destroy(&ipst->ips_capab_taskq_list);
-#ifdef NS_DEBUG
- printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid);
-#endif
+ mutex_enter(&ipst->ips_mrt_lock);
+ while (!(ipst->ips_mrt_flags & IP_MRT_DONE))
+ cv_wait(&ipst->ips_mrt_done_cv, &ipst->ips_mrt_lock);
+ mutex_destroy(&ipst->ips_mrt_lock);
+ cv_destroy(&ipst->ips_mrt_cv);
+ cv_destroy(&ipst->ips_mrt_done_cv);
+
+ ipmp_destroy(ipst);
rw_destroy(&ipst->ips_srcid_lock);
ip_kstat_fini(stackid, ipst->ips_ip_mibkp);
@@ -6038,10 +5947,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
"ip_cgtp_filter") == 0);
ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_data =
(caddr_t)&ipst->ips_ip_cgtp_filter;
- ASSERT(strcmp(ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_name,
- "ipmp_hook_emulation") == 0);
- ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_data =
- (caddr_t)&ipst->ips_ipmp_hook_emulation;
(void) ip_param_register(&ipst->ips_ip_g_nd,
ipst->ips_param_arr, A_CNT(lcl_param_arr),
@@ -6053,8 +5958,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
ipst->ips_ip6_kstat =
ip6_kstat_init(stackid, &ipst->ips_ip6_statistics);
- ipst->ips_ipmp_enable_failback = B_TRUE;
-
ipst->ips_ip_src_id = 1;
rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL);
@@ -6062,6 +5965,7 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
ip_net_init(ipst, ns);
ipv4_hook_init(ipst);
ipv6_hook_init(ipst);
+ ipmp_init(ipst);
/*
* Create the taskq dispatcher thread and initialize related stuff.
@@ -6073,6 +5977,15 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
list_create(&ipst->ips_capab_taskq_list, sizeof (mblk_t),
offsetof(mblk_t, b_next));
+ /*
+ * Create the mcast_restart_timers_thread() worker thread.
+ */
+ mutex_init(&ipst->ips_mrt_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ipst->ips_mrt_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&ipst->ips_mrt_done_cv, NULL, CV_DEFAULT, NULL);
+ ipst->ips_mrt_thread = thread_create(NULL, 0,
+ mcast_restart_timers_thread, ipst, 0, &p0, TS_RUN, minclsyspri);
+
major = mod_name_to_major(INET_NAME);
(void) ldi_ident_from_major(major, &ipst->ips_ldi_ident);
return (ipst);
@@ -6109,6 +6022,24 @@ ip_dlpi_alloc(size_t len, t_uscalar_t prim)
}
/*
+ * Allocate and initialize a DLPI notification. (May be called as writer.)
+ *
+ * The message is obtained from ip_dlpi_alloc() as a DL_NOTIFY_IND of
+ * DL_NOTIFY_IND_SIZE bytes, and its dl_notification and dl_data fields
+ * are set from `notification' and `data' respectively. Returns the new
+ * mblk, or NULL if ip_dlpi_alloc() fails.
+ */
+mblk_t *
+ip_dlnotify_alloc(uint_t notification, uint_t data)
+{
+ dl_notify_ind_t *notifyp;
+ mblk_t *mp;
+
+ if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
+ return (NULL);
+
+ notifyp = (dl_notify_ind_t *)mp->b_rptr;
+ notifyp->dl_notification = notification;
+ notifyp->dl_data = data;
+ return (mp);
+}
+
+/*
* Debug formatting routine. Returns a character string representation of the
* addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address
* in the form of a ipaddr_t and calls ip_dot_saddr with a pointer.
@@ -7753,71 +7684,30 @@ ip_net_mask(ipaddr_t addr)
}
/*
- * Select an ill for the packet by considering load spreading across
- * a different ill in the group if dst_ill is part of some group.
- */
-ill_t *
-ip_newroute_get_dst_ill(ill_t *dst_ill)
-{
- ill_t *ill;
-
- /*
- * We schedule irrespective of whether the source address is
- * INADDR_ANY or not. illgrp_scheduler returns a held ill.
- */
- ill = illgrp_scheduler(dst_ill);
- if (ill == NULL)
- return (NULL);
-
- /*
- * For groups with names ip_sioctl_groupname ensures that all
- * ills are of same type. For groups without names, ifgrp_insert
- * ensures this.
- */
- ASSERT(dst_ill->ill_type == ill->ill_type);
-
- return (ill);
-}
-
-/*
- * Helper function for the IPIF_NOFAILOVER/ATTACH_IF interface attachment case.
+ * Helper ill lookup function used by IPsec.
+ *
+ * Looks up the ill for `ifindex' (which must be non-zero) by passing
+ * `ifindex' and `isv6' to ill_lookup_on_ifindex(). If no ill is found,
+ * ipIfStatsOutDiscards is bumped in ipst's ips_ip6_mib (if `isv6') or
+ * ips_ip_mib (otherwise), `first_mp' is freed, and NULL is returned, so
+ * the caller must not touch `first_mp' after a NULL return. Otherwise
+ * the ill produced by the lookup is returned.
+ *
+ * NOTE(review): the ill is presumably returned held, so callers would
+ * need to ill_refrele() it; confirm against ill_lookup_on_ifindex().
*/
ill_t *
-ip_grab_attach_ill(ill_t *ill, mblk_t *first_mp, int ifindex, boolean_t isv6,
- ip_stack_t *ipst)
+ip_grab_ill(mblk_t *first_mp, int ifindex, boolean_t isv6, ip_stack_t *ipst)
{
ill_t *ret_ill;
ASSERT(ifindex != 0);
+
ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL,
ipst);
- if (ret_ill == NULL ||
- (ret_ill->ill_phyint->phyint_flags & PHYI_OFFLINE)) {
+ if (ret_ill == NULL) {
if (isv6) {
- if (ill != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
- } else {
- BUMP_MIB(&ipst->ips_ip6_mib,
- ipIfStatsOutDiscards);
- }
- ip1dbg(("ip_grab_attach_ill (IPv6): "
- "bad ifindex %d.\n", ifindex));
+ BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
+ ip1dbg(("ip_grab_ill (IPv6): bad ifindex %d.\n",
+ ifindex));
} else {
- if (ill != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
- } else {
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutDiscards);
- }
- ip1dbg(("ip_grab_attach_ill (IPv4): "
- "bad ifindex %d.\n", ifindex));
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+ ip1dbg(("ip_grab_ill (IPv4): bad ifindex %d.\n",
+ ifindex));
}
- if (ret_ill != NULL)
- ill_refrele(ret_ill);
freemsg(first_mp);
return (NULL);
}
-
return (ret_ill);
}
@@ -7859,7 +7749,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
ire_t *sire = NULL;
mblk_t *first_mp;
ire_t *save_ire;
- ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER address */
ushort_t ire_marks = 0;
boolean_t mctl_present;
ipsec_out_t *io;
@@ -7873,7 +7762,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
boolean_t multirt_is_resolvable;
boolean_t multirt_resolve_next;
boolean_t unspec_src;
- boolean_t do_attach_ill = B_FALSE;
boolean_t ip_nexthop = B_FALSE;
tsol_ire_gw_secattr_t *attrp = NULL;
tsol_gcgrp_t *gcgrp = NULL;
@@ -7902,22 +7790,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
return;
}
- if (mctl_present && io->ipsec_out_attach_if) {
- /* ip_grab_attach_ill returns a held ill */
- attach_ill = ip_grab_attach_ill(NULL, first_mp,
- io->ipsec_out_ill_index, B_FALSE, ipst);
-
- /* Failure case frees things for us. */
- if (attach_ill == NULL)
- return;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(attach_ill))
- ire_marks = IRE_MARK_HIDDEN;
- }
if (mctl_present && io->ipsec_out_ip_nexthop) {
ip_nexthop = B_TRUE;
nexthop_addr = io->ipsec_out_nexthop_addr;
@@ -7997,31 +7869,15 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
dst = nexthop_addr;
}
}
- } else if (attach_ill == NULL) {
+ } else {
ire = ire_ftable_lookup(dst, 0, 0, 0,
NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp),
MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT |
MATCH_IRE_SECATTR | MATCH_IRE_COMPLETE,
ipst);
- } else {
- /*
- * attach_ill is set only for communicating with
- * on-link hosts. So, don't look for DEFAULT.
- */
- ipif_t *attach_ipif;
-
- attach_ipif = ipif_get_next_ipif(NULL, attach_ill);
- if (attach_ipif == NULL) {
- ill_refrele(attach_ill);
- goto icmp_err_ret;
- }
- ire = ire_ftable_lookup(dst, 0, 0, 0, attach_ipif,
- &sire, zoneid, 0, MBLK_GETLABEL(mp),
- MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL |
- MATCH_IRE_SECATTR, ipst);
- ipif_refrele(attach_ipif);
}
+
ip3dbg(("ip_newroute: ire_ftable_lookup() "
"returned ire %p, sire %p\n", (void *)ire, (void *)sire));
@@ -8122,8 +7978,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
}
ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0,
RTA_DST, ipst);
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
goto icmp_err_ret;
}
@@ -8134,8 +7988,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
*/
if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) ||
(ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) {
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
goto icmp_err_ret;
}
/*
@@ -8157,119 +8009,51 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
sire->ire_last_used_time = lbolt;
}
/*
- * We have a route to reach the destination.
- *
- * 1) If the interface is part of ill group, try to get a new
- * ill taking load spreading into account.
- *
- * 2) After selecting the ill, get a source address that
- * might create good inbound load spreading.
- * ipif_select_source does this for us.
+ * We have a route to reach the destination. Find the
+ * appropriate ill, then get a source address using
+ * ipif_select_source().
*
- * If the application specified the ill (ifindex), we still
- * load spread. Only if the packets needs to go out
- * specifically on a given ill e.g. binding to
- * IPIF_NOFAILOVER address, then we don't try to use a
- * different ill for load spreading.
+ * If we are here trying to create an IRE_CACHE for an offlink
+ * destination and have an IRE_CACHE entry for VNI, then use
+ * ire_stq instead since VNI's queue is a black hole.
*/
- if (attach_ill == NULL) {
- /*
- * Don't perform outbound load spreading in the
- * case of an RTF_MULTIRT route, as we actually
- * typically want to replicate outgoing packets
- * through particular interfaces.
- */
- if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) {
- dst_ill = ire->ire_ipif->ipif_ill;
- /* for uniformity */
- ill_refhold(dst_ill);
- } else {
- /*
- * If we are here trying to create an IRE_CACHE
- * for an offlink destination and have the
- * IRE_CACHE for the next hop and the latter is
- * using virtual IP source address selection i.e
- * it's ire->ire_ipif is pointing to a virtual
- * network interface (vni) then
- * ip_newroute_get_dst_ll() will return the vni
- * interface as the dst_ill. Since the vni is
- * virtual i.e not associated with any physical
- * interface, it cannot be the dst_ill, hence
- * in such a case call ip_newroute_get_dst_ll()
- * with the stq_ill instead of the ire_ipif ILL.
- * The function returns a refheld ill.
- */
- if ((ire->ire_type == IRE_CACHE) &&
- IS_VNI(ire->ire_ipif->ipif_ill))
- dst_ill = ip_newroute_get_dst_ill(
- ire->ire_stq->q_ptr);
- else
- dst_ill = ip_newroute_get_dst_ill(
- ire->ire_ipif->ipif_ill);
- }
- if (dst_ill == NULL) {
- if (ip_debug > 2) {
- pr_addr_dbg("ip_newroute: "
- "no dst ill for dst"
- " %s\n", AF_INET, &dst);
- }
- goto icmp_err_ret;
- }
- } else {
- dst_ill = ire->ire_ipif->ipif_ill;
- /* for uniformity */
+ if ((ire->ire_type == IRE_CACHE) &&
+ IS_VNI(ire->ire_ipif->ipif_ill)) {
+ dst_ill = ire->ire_stq->q_ptr;
ill_refhold(dst_ill);
- /*
- * We should have found a route matching ill as we
- * called ire_ftable_lookup with MATCH_IRE_ILL.
- * Rather than asserting, when there is a mismatch,
- * we just drop the packet.
- */
- if (dst_ill != attach_ill) {
- ip0dbg(("ip_newroute: Packet dropped as "
- "IPIF_NOFAILOVER ill is %s, "
- "ire->ire_ipif->ipif_ill is %s\n",
- attach_ill->ill_name,
- dst_ill->ill_name));
- ill_refrele(attach_ill);
- goto icmp_err_ret;
+ } else {
+ ill_t *ill = ire->ire_ipif->ipif_ill;
+
+ if (IS_IPMP(ill)) {
+ dst_ill =
+ ipmp_illgrp_hold_next_ill(ill->ill_grp);
+ } else {
+ dst_ill = ill;
+ ill_refhold(dst_ill);
}
}
- /* attach_ill can't go in loop. IPMP and CGTP are disjoint */
- if (attach_ill != NULL) {
- ill_refrele(attach_ill);
- attach_ill = NULL;
- do_attach_ill = B_TRUE;
+
+ if (dst_ill == NULL) {
+ if (ip_debug > 2) {
+ pr_addr_dbg("ip_newroute: no dst "
+ "ill for dst %s\n", AF_INET, &dst);
+ }
+ goto icmp_err_ret;
}
- ASSERT(dst_ill != NULL);
ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name));
/*
* Pick the best source address from dst_ill.
*
- * 1) If it is part of a multipathing group, we would
- * like to spread the inbound packets across different
- * interfaces. ipif_select_source picks a random source
- * across the different ills in the group.
- *
- * 2) If it is not part of a multipathing group, we try
- * to pick the source address from the destination
+ * 1) Try to pick the source address from the destination
* route. Clustering assumes that when we have multiple
* prefixes hosted on an interface, the prefix of the
* source address matches the prefix of the destination
* route. We do this only if the address is not
* DEPRECATED.
*
- * 3) If the conn is in a different zone than the ire, we
+ * 2) If the conn is in a different zone than the ire, we
* need to pick a source address from the right zone.
- *
- * NOTE : If we hit case (1) above, the prefix of the source
- * address picked may not match the prefix of the
- * destination routes prefix as ipif_select_source
- * does not look at "dst" while picking a source
- * address.
- * If we want the same behavior as (2), we will need
- * to change the behavior of ipif_select_source.
*/
ASSERT(src_ipif == NULL);
if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) {
@@ -8287,7 +8071,8 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
if (src_ipif == NULL &&
(!unspec_src || ipha->ipha_src != INADDR_ANY)) {
ire_marks |= IRE_MARK_USESRC_CHECK;
- if ((dst_ill->ill_group != NULL) ||
+ if (!IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) &&
+ IS_IPMP(ire->ire_ipif->ipif_ill) ||
(ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) ||
(connp != NULL && ire->ire_zoneid != zoneid &&
ire->ire_zoneid != ALL_ZONES) ||
@@ -8312,6 +8097,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
* as dst_ire source address.
*/
ipaddr_t saddr = ire->ire_ipif->ipif_src_addr;
+
src_ipif = ipif_select_source(dst_ill, saddr,
zoneid);
if (src_ipif == NULL) {
@@ -8319,7 +8105,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
pr_addr_dbg("ip_newroute: "
"no src for dst %s ",
AF_INET, &dst);
- printf("through interface %s\n",
+ printf("on interface %s\n",
dst_ill->ill_name);
}
goto icmp_err_ret;
@@ -8558,6 +8344,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
MULTIRT_DEBUG_TAG(first_mp);
}
}
+
ire_add_then_send(q, ire, xmit_mp);
ire_refrele(save_ire);
@@ -8766,7 +8553,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
"ip_newroute: no "
"src for gw %s ",
AF_INET, &gw);
- printf("through "
+ printf("on "
"interface %s\n",
dst_ill->ill_name);
}
@@ -8867,16 +8654,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
areq = (areq_t *)mp->b_rptr;
addrp = (ipaddr_t *)((char *)areq +
areq->areq_sender_addr_offset);
- if (do_attach_ill) {
- /*
- * This is bind to no failover case.
- * arp packet also must go out on attach_ill.
- */
- ASSERT(ipha->ipha_src != NULL);
- *addrp = ipha->ipha_src;
- } else {
- *addrp = save_ire->ire_src_addr;
- }
+ *addrp = save_ire->ire_src_addr;
ire_refrele(save_ire);
addrp = (ipaddr_t *)((char *)areq +
@@ -9076,14 +8854,10 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
ipaddr_t *addrp;
mblk_t *first_mp;
ire_t *save_ire = NULL;
- ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER */
ipif_t *src_ipif = NULL;
ushort_t ire_marks = 0;
ill_t *dst_ill = NULL;
- boolean_t mctl_present;
- ipsec_out_t *io;
ipha_t *ipha;
- int ihandle = 0;
mblk_t *saved_mp;
ire_t *fire = NULL;
mblk_t *copy_mp = NULL;
@@ -9117,10 +8891,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst),
ipif->ipif_ill->ill_name));
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
- if (mctl_present)
- io = (ipsec_out_t *)first_mp->b_rptr;
-
+ first_mp = mp;
+ if (DB_TYPE(mp) == M_CTL)
+ mp = mp->b_cont;
ipha = (ipha_t *)mp->b_rptr;
/*
@@ -9161,64 +8934,29 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
(void *)ipif, ntohl(dst), (void *)fire));
}
- if (mctl_present && io->ipsec_out_attach_if) {
- attach_ill = ip_grab_attach_ill(NULL, first_mp,
- io->ipsec_out_ill_index, B_FALSE, ipst);
-
- /* Failure case frees things for us. */
- if (attach_ill == NULL) {
- ipif_refrele(ipif);
- if (fire != NULL)
- ire_refrele(fire);
- return;
- }
+ /*
+ * Note: While we pick a dst_ill we are really only
+ * interested in the ill for load spreading. The source
+ * ipif is determined by source address selection below.
+ */
+ if (IS_IPMP(ipif->ipif_ill)) {
+ ipmp_illgrp_t *illg = ipif->ipif_ill->ill_grp;
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(attach_ill)) {
- ire_marks = IRE_MARK_HIDDEN;
- }
- /*
- * ip_wput passes the right ipif for IPIF_NOFAILOVER
- * case.
- */
- dst_ill = ipif->ipif_ill;
- /* attach_ill has been refheld by ip_grab_attach_ill */
- ASSERT(dst_ill == attach_ill);
+ if (CLASSD(ipha_dst))
+ dst_ill = ipmp_illgrp_hold_cast_ill(illg);
+ else
+ dst_ill = ipmp_illgrp_hold_next_ill(illg);
} else {
- /*
- * If the interface belongs to an interface group,
- * make sure the next possible interface in the group
- * is used. This encourages load spreading among
- * peers in an interface group.
- * Note: load spreading is disabled for RTF_MULTIRT
- * routes.
- */
- if ((flags & RTF_MULTIRT) && (fire != NULL) &&
- (fire->ire_flags & RTF_MULTIRT)) {
- /*
- * Don't perform outbound load spreading
- * in the case of an RTF_MULTIRT issued route,
- * we actually typically want to replicate
- * outgoing packets through particular
- * interfaces.
- */
- dst_ill = ipif->ipif_ill;
- ill_refhold(dst_ill);
- } else {
- dst_ill = ip_newroute_get_dst_ill(
- ipif->ipif_ill);
- }
- if (dst_ill == NULL) {
- if (ip_debug > 2) {
- pr_addr_dbg("ip_newroute_ipif: "
- "no dst ill for dst %s\n",
- AF_INET, &dst);
- }
- goto err_ret;
+ dst_ill = ipif->ipif_ill;
+ ill_refhold(dst_ill);
+ }
+
+ if (dst_ill == NULL) {
+ if (ip_debug > 2) {
+ pr_addr_dbg("ip_newroute_ipif: no dst ill "
+ "for dst %s\n", AF_INET, &dst);
}
+ goto err_ret;
}
/*
@@ -9242,7 +8980,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
unspec_src = (connp != NULL && connp->conn_unspec_src);
- if (((!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) ||
+ if (!IS_UNDER_IPMP(ipif->ipif_ill) &&
+ (IS_IPMP(ipif->ipif_ill) ||
+ (!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) ||
(ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_UP)) != IPIF_UP ||
(connp != NULL && ipif->ipif_zoneid != zoneid &&
ipif->ipif_zoneid != ALL_ZONES)) &&
@@ -9256,7 +8996,7 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
"no src for dst %s",
AF_INET, &dst);
}
- ip1dbg((" through interface %s\n",
+ ip1dbg((" on interface %s\n",
dst_ill->ill_name));
goto err_ret;
}
@@ -9291,12 +9031,7 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
goto err_ret;
if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
goto err_ret;
- /*
- * ihandle is needed when the ire is added to
- * cache table.
- */
save_ire = ire;
- ihandle = save_ire->ire_ihandle;
ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, "
"flags %04x\n",
@@ -9328,10 +9063,6 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
ipha->ipha_src = fire->ire_src_addr;
}
} else {
- ASSERT((connp == NULL) ||
- (connp->conn_outgoing_ill != NULL) ||
- (connp->conn_dontroute) ||
- infop->ip_opt_ill_index != 0);
/*
* The only ways we can come here are:
* 1) IP_BOUND_IF socket option is set
@@ -9340,6 +9071,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
* In all cases, the new ire will not be added
* into cache table.
*/
+ ASSERT(connp == NULL || connp->conn_dontroute ||
+ connp->conn_outgoing_ill != NULL ||
+ infop->ip_opt_ill_index != 0);
ire_marks |= IRE_MARK_NOADD;
}
@@ -9374,7 +9108,8 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
(save_ire != NULL ? save_ire->ire_mask : 0),
(fire != NULL) ? /* Parent handle */
fire->ire_phandle : 0,
- ihandle, /* Interface handle */
+ (save_ire != NULL) ? /* Interface handle */
+ save_ire->ire_ihandle : 0,
(fire != NULL) ?
(fire->ire_flags &
(RTF_SETSRC | RTF_MULTIRT)) : 0,
@@ -9533,7 +9268,8 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
(save_ire != NULL ? save_ire->ire_mask : 0),
(fire != NULL) ? /* Parent handle */
fire->ire_phandle : 0,
- ihandle, /* Interface handle */
+ (save_ire != NULL) ? /* Interface handle */
+ save_ire->ire_ihandle : 0,
(fire != NULL) ? /* flags if any */
(fire->ire_flags &
(RTF_SETSRC | RTF_MULTIRT)) : 0,
@@ -9593,12 +9329,20 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
/*
* Fill in the source and dest addrs for the resolver.
* NOTE: this depends on memory layouts imposed by
- * ill_init().
+ * ill_init(). There are corner cases above where we
+ * might've created the IRE with an INADDR_ANY source
+ * address (e.g., if the zeroth ipif on an underlying
+ * ill in an IPMP group is 0.0.0.0, but another ipif
+ * on the ill has a usable test address). If so, tell
+ * ARP to use ipha_src as its sender address.
*/
areq = (areq_t *)mp->b_rptr;
addrp = (ipaddr_t *)((char *)areq +
areq->areq_sender_addr_offset);
- *addrp = ire->ire_src_addr;
+ if (ire->ire_src_addr != INADDR_ANY)
+ *addrp = ire->ire_src_addr;
+ else
+ *addrp = ipha->ipha_src;
addrp = (ipaddr_t *)((char *)areq +
areq->areq_target_addr_offset);
*addrp = dst;
@@ -10136,7 +9880,7 @@ ip_ipsec_load_complete(ipsec_stack_t *ipss)
/*
* Can't be used. Need to call svr4* -> optset directly. the leaf routine
* determines the grp on which it has to become exclusive, queues the mp
- * and sq draining restarts the optmgmt
+ * and IPSQ draining restarts the optmgmt
*/
static boolean_t
ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp)
@@ -10482,28 +10226,6 @@ ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option,
}
switch (option) {
- case IP_DONTFAILOVER_IF:
- /*
- * This option is used by in.mpathd to ensure
- * that IPMP probe packets only go out on the
- * test interfaces. in.mpathd sets this option
- * on the non-failover interfaces.
- * For backward compatibility, this option
- * implicitly sets IP_MULTICAST_IF, as used
- * be done in bind(), so that ip_wput gets
- * this ipif to send mcast packets.
- */
- if (ipif != NULL) {
- ASSERT(addr != INADDR_ANY);
- connp->conn_nofailover_ill = ipif->ipif_ill;
- connp->conn_multicast_ipif = ipif;
- } else {
- ASSERT(addr == INADDR_ANY);
- connp->conn_nofailover_ill = NULL;
- connp->conn_multicast_ipif = NULL;
- }
- break;
-
case IP_MULTICAST_IF:
connp->conn_multicast_ipif = ipif;
break;
@@ -10551,7 +10273,7 @@ ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly,
ill_refrele(ill);
return (0);
}
- if (!ipif_lookup_zoneid_group(ill, connp->conn_zoneid,
+ if (!ipif_lookup_zoneid(ill, connp->conn_zoneid,
0, NULL)) {
ill_refrele(ill);
ill = NULL;
@@ -10596,8 +10318,6 @@ setit:
case IP_BOUND_IF:
connp->conn_incoming_ill = ill;
connp->conn_outgoing_ill = ill;
- connp->conn_orig_bound_ifindex = (ill == NULL) ?
- 0 : ifindex;
break;
case IP_MULTICAST_IF:
@@ -10650,40 +10370,6 @@ setit:
case IPV6_BOUND_IF:
connp->conn_incoming_ill = ill;
connp->conn_outgoing_ill = ill;
- connp->conn_orig_bound_ifindex = (ill == NULL) ?
- 0 : ifindex;
- break;
-
- case IPV6_BOUND_PIF:
- /*
- * Limit all transmit to this ill.
- * Unlike IPV6_BOUND_IF, using this option
- * prevents load spreading and failover from
- * happening when the interface is part of the
- * group. That's why we don't need to remember
- * the ifindex in orig_bound_ifindex as in
- * IPV6_BOUND_IF.
- */
- connp->conn_outgoing_pill = ill;
- break;
-
- case IPV6_DONTFAILOVER_IF:
- /*
- * This option is used by in.mpathd to ensure
- * that IPMP probe packets only go out on the
- * test interfaces. in.mpathd sets this option
- * on the non-failover interfaces.
- */
- connp->conn_nofailover_ill = ill;
- /*
- * For backward compatibility, this option
- * implicitly sets ip_multicast_ill as used in
- * IPV6_MULTICAST_IF so that ip_wput gets
- * this ill to send mcast packets.
- */
- connp->conn_multicast_ill = ill;
- connp->conn_orig_multicast_ifindex = (ill == NULL) ?
- 0 : ifindex;
break;
case IPV6_MULTICAST_IF:
@@ -10700,12 +10386,9 @@ setit:
if (!checkonly) {
if (ifindex == 0) {
connp->conn_multicast_ill = NULL;
- connp->conn_orig_multicast_ifindex = 0;
connp->conn_multicast_ipif = NULL;
} else if (ill != NULL) {
connp->conn_multicast_ill = ill;
- connp->conn_orig_multicast_ifindex =
- ifindex;
}
}
break;
@@ -10867,8 +10550,7 @@ ip_opt_set(queue_t *q, uint_t optset_context, int level, int name,
if (secpolicy_ip_config(cr, B_FALSE) != 0)
return (EPERM);
/* FALLTHRU */
- case IP_MULTICAST_IF:
- case IP_DONTFAILOVER_IF: {
+ case IP_MULTICAST_IF: {
ipaddr_t addr = *i1;
error = ip_opt_set_ipif(connp, addr, checkonly, name,
@@ -11189,8 +10871,6 @@ ip_opt_set(queue_t *q, uint_t optset_context, int level, int name,
case IPPROTO_IPV6:
switch (name) {
case IPV6_BOUND_IF:
- case IPV6_BOUND_PIF:
- case IPV6_DONTFAILOVER_IF:
error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly,
level, name, first_mp);
if (error != 0)
@@ -12288,11 +11968,10 @@ ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha,
* frees mp on failure.
*/
static boolean_t
-ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha,
+ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha,
uint32_t *cksum_val, uint16_t *cksum_flags)
{
uint32_t frag_offset_flags;
- ill_t *ill = (ill_t *)q->q_ptr;
mblk_t *mp = *mpp;
mblk_t *t_mp;
ipaddr_t dst;
@@ -12337,12 +12016,12 @@ ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha,
/*
* We utilize hardware computed checksum info only for UDP since
- * IP fragmentation is a normal occurence for the protocol. In
+ * IP fragmentation is a normal occurrence for the protocol. In
* addition, checksum offload support for IP fragments carrying
* UDP payload is commonly implemented across network adapters.
*/
- ASSERT(ill != NULL);
- if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) &&
+ ASSERT(recv_ill != NULL);
+ if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(recv_ill) &&
(DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
mblk_t *mp1 = mp->b_cont;
int32_t len;
@@ -12808,7 +12487,7 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
goto ipoptions;
/* Check the IP header checksum. */
- if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
+ if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) {
/* Clear the IP header h/w cksum flag */
DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
} else if (!mctl_present) {
@@ -12871,7 +12550,7 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
* Revert to software checksum calculation if the interface
* isn't capable of checksum offload or if IPsec is present.
*/
- if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+ if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum)
hck_flags = DB_CKSUMFLAGS(mp);
if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
@@ -12958,8 +12637,11 @@ fragmented:
* reassembled packet has a valid hardware computed
* checksum information associated with it.
*/
- if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags))
+ if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, &sum,
+ &reass_hck_flags)) {
goto slow_done;
+ }
+
/*
* Make sure that first_mp points back to mp as
* the mp we came in with could have changed in
@@ -13073,7 +12755,7 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
goto ipoptions;
} else if (!mctl_present) {
/* Check the IP header checksum. */
- if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
+ if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) {
/* Clear the IP header h/w cksum flag */
DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
} else if (!mctl_present) {
@@ -13159,7 +12841,7 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
* Revert to software checksum calculation if the interface
* isn't capable of checksum offload or if IPsec is present.
*/
- if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+ if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum)
hck_flags = DB_CKSUMFLAGS(mp);
if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
@@ -13386,7 +13068,7 @@ ipoptions:
u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
- if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) {
+ if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) {
if (mctl_present)
freeb(first_mp);
goto slow_done;
@@ -13530,7 +13212,7 @@ ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
goto ipoptions;
} else {
/* Check the IP header checksum. */
- if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, ill) &&
+ if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill) &&
!mctl_present) {
#define uph ((uint16_t *)ipha)
sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
@@ -13644,7 +13326,7 @@ ipoptions:
u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
- if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL))
+ if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL))
goto slow_done;
/*
* Make sure that first_mp points back to mp as
@@ -13877,6 +13559,11 @@ ip_check_and_align_header(queue_t *q, mblk_t *mp, ip_stack_t *ipst)
return (B_TRUE);
}
+/*
+ * Handle the situation where a packet came in on `ill' but matched an IRE
+ * whose ire_rfq doesn't match `ill'. We return the IRE that should be used
+ * for interface statistics.
+ */
ire_t *
ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
{
@@ -13887,16 +13574,22 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
boolean_t strict_check = B_FALSE;
/*
- * This packet came in on an interface other than the one associated
- * with the first ire we found for the destination address. We do
- * another ire lookup here, using the ingress ill, to see if the
- * interface is in an interface group.
+ * IPMP common case: if IRE and ILL are in the same group, there's no
+ * issue (e.g. packet received on an underlying interface matched an
+ * IRE_LOCAL on its associated group interface).
+ */
+ if (ire->ire_rfq != NULL &&
+ IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) {
+ return (ire);
+ }
+
+ /*
+ * Do another ire lookup here, using the ingress ill, to see if the
+ * interface is in a usesrc group.
* As long as the ills belong to the same group, we don't consider
* them to be arriving on the wrong interface. Thus, if the switch
* is doing inbound load spreading, we won't drop packets when the
- * ip*_strict_dst_multihoming switch is on. Note, the same holds true
- * for 'usesrc groups' where the destination address may belong to
- * another interface to allow multipathing to happen.
+ * ip*_strict_dst_multihoming switch is on.
* We also need to check for IPIF_UNNUMBERED point2point interfaces
* where the local address may not be unique. In this case we were
* at the mercy of the initial ire cache lookup and the IRE_LOCAL it
@@ -13910,18 +13603,18 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
strict_check = B_TRUE;
new_ire = ire_ctable_lookup(*((ipaddr_t *)addr), 0, IRE_LOCAL,
ill->ill_ipif, ALL_ZONES, NULL,
- (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst);
+ (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst);
} else {
ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr));
if (ipst->ips_ipv6_strict_dst_multihoming)
strict_check = B_TRUE;
new_ire = ire_ctable_lookup_v6((in6_addr_t *)addr, NULL,
IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL,
- (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst);
+ (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst);
}
/*
* If the same ire that was returned in ip_input() is found then this
- * is an indication that interface groups are in use. The packet
+ * is an indication that usesrc groups are in use. The packet
* arrived on a different ill in the group than the one associated with
* the destination address. If a different ire was found then the same
* IP address must be hosted on multiple ills. This is possible with
@@ -14075,11 +13768,10 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp)
/*
* Forwarding fastpath exception case:
- * If either of the follwoing case is true, we take
- * the slowpath
+ * If any of the following are true, we take the slowpath:
* o forwarding is not enabled
- * o incoming and outgoing interface are the same, or the same
- * IPMP group
+ * o incoming and outgoing interface are the same, or in the same
+ * IPMP group.
* o corresponding ire is in incomplete state
* o packet needs fragmentation
* o ARP cache is not resolved
@@ -14090,8 +13782,7 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp)
pkt_len = ntohs(ipha->ipha_length);
stq_ill = (ill_t *)ire->ire_stq->q_ptr;
if (!(stq_ill->ill_flags & ILLF_ROUTER) ||
- (ill == stq_ill) ||
- (ill->ill_group != NULL && ill->ill_group == stq_ill->ill_group) ||
+ (ill == stq_ill) || IS_IN_SAME_ILLGRP(ill, stq_ill) ||
(ire->ire_nce == NULL) ||
(pkt_len > ire->ire_max_frag) ||
((fpmp = ire->ire_nce->nce_fp_mp) == NULL) ||
@@ -14185,11 +13876,10 @@ static void
ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
ill_t *ill, boolean_t ll_multicast, boolean_t from_ip_fast_forward)
{
- ill_group_t *ill_group;
- ill_group_t *ire_group;
queue_t *dev_q;
ire_t *src_ire;
ip_stack_t *ipst = ill->ill_ipst;
+ boolean_t same_illgrp = B_FALSE;
ASSERT(ire->ire_stq != NULL);
@@ -14200,11 +13890,8 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
* If the caller of this function is ip_fast_forward() skip the
* next three checks as it does not apply.
*/
- if (from_ip_fast_forward) {
- ill_group = ill->ill_group;
- ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group;
+ if (from_ip_fast_forward)
goto skip;
- }
if (ll_multicast != 0) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
@@ -14230,13 +13917,10 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
goto drop_pkt;
}
- ill_group = ill->ill_group;
- ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group;
/*
* Check if we want to forward this one at this time.
* We allow source routed packets on a host provided that
- * they go out the same interface or same interface group
- * as they came in on.
+ * they go out the same ill or illgrp as they came in on.
*
* XXX To be quicker, we may wish to not chase pointers to
* get the ILLF_ROUTER flag and instead store the
@@ -14245,11 +13929,12 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
* whenever the ILLF_ROUTER flag changes.
*/
skip:
+ same_illgrp = IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr);
+
if (((ill->ill_flags &
- ((ill_t *)ire->ire_stq->q_ptr)->ill_flags &
- ILLF_ROUTER) == 0) &&
- !(ip_source_routed(ipha, ipst) && (ire->ire_rfq == q ||
- (ill_group != NULL && ill_group == ire_group)))) {
+ ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_ROUTER) == 0) &&
+ !(ip_source_routed(ipha, ipst) &&
+ (ire->ire_rfq == q || same_illgrp))) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
if (ip_source_routed(ipha, ipst)) {
q = WR(q);
@@ -14290,12 +13975,10 @@ skip:
ire_t *nhop_ire = NULL;
/*
- * Check whether ire_rfq and q are from the same ill
- * or if they are not same, they at least belong
- * to the same group. If so, send redirects.
+ * Check whether ire_rfq and q are from the same ill or illgrp.
+ * If so, send redirects.
*/
- if ((ire->ire_rfq == q ||
- (ill_group != NULL && ill_group == ire_group)) &&
+ if ((ire->ire_rfq == q || same_illgrp) &&
!ip_source_routed(ipha, ipst)) {
nhop = (ire->ire_gateway_addr != 0 ?
@@ -14396,26 +14079,15 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha,
}
/*
* For multicast we have set dst to be INADDR_BROADCAST
- * for delivering to all STREAMS. IRE_MARK_NORECV is really
- * only for broadcast packets.
+ * for delivering to all STREAMS.
*/
if (!CLASSD(ipha->ipha_dst)) {
ire_t *new_ire;
ipif_t *ipif;
- /*
- * For ill groups, as the switch duplicates broadcasts
- * across all the ports, we need to filter out and
- * send up only one copy. There is one copy for every
- * broadcast address on each ill. Thus, we look for a
- * specific IRE on this ill and look at IRE_MARK_NORECV
- * later to see whether this ill is eligible to receive
- * them or not. ill_nominate_bcast_rcv() nominates only
- * one set of IREs for receiving.
- */
ipif = ipif_get_next_ipif(NULL, ill);
if (ipif == NULL) {
- ire_refrele(ire);
+discard: ire_refrele(ire);
freemsg(mp);
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
return (NULL);
@@ -14425,13 +14097,17 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha,
ipif_refrele(ipif);
if (new_ire != NULL) {
- if (new_ire->ire_marks & IRE_MARK_NORECV) {
- ire_refrele(ire);
+ /*
+ * If the matching IRE_BROADCAST is part of an IPMP
+ * group, then drop the packet unless our ill has been
+ * nominated to receive for the group.
+ */
+ if (IS_IPMP(new_ire->ire_ipif->ipif_ill) &&
+ new_ire->ire_rfq != q) {
ire_refrele(new_ire);
- freemsg(mp);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return (NULL);
+ goto discard;
}
+
/*
* In the special case of multirouted broadcast
* packets, we unconditionally need to "gateway"
@@ -14571,6 +14247,13 @@ ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
ntohs(ipha->ipha_length));
/*
+ * So that we don't end up with dups, only one ill in an IPMP group is
+ * nominated to receive multicast traffic.
+ */
+ if (IS_UNDER_IPMP(ill) && !ill->ill_nom_cast)
+ goto drop_pkt;
+
+ /*
* Forward packets only if we have joined the allmulti
* group on this interface.
*/
@@ -14619,18 +14302,15 @@ ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
}
}
- ILM_WALKER_HOLD(ill);
if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) {
/*
* This might just be caused by the fact that
* multiple IP Multicast addresses map to the same
* link layer multicast - no need to increment counter!
*/
- ILM_WALKER_RELE(ill);
freemsg(mp);
return (B_TRUE);
}
- ILM_WALKER_RELE(ill);
done:
ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp)));
/*
@@ -15498,8 +15178,8 @@ local:
* broadcast ire.
*/
if ((ire->ire_rfq != q) && (ire->ire_type != IRE_BROADCAST)) {
- if ((ire = ip_check_multihome(&ipha->ipha_dst, ire,
- ill)) == NULL) {
+ ire = ip_check_multihome(&ipha->ipha_dst, ire, ill);
+ if (ire == NULL) {
/* Drop packet */
BUMP_MIB(ill->ill_ip_mib,
ipIfStatsForwProhibits);
@@ -15935,19 +15615,12 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
ip1dbg(("ip_rput_dlpi_writer .."));
ill = (ill_t *)q->q_ptr;
- ASSERT(ipsq == ill->ill_phyint->phyint_ipsq);
-
+ ASSERT(ipsq->ipsq_xop == ill->ill_phyint->phyint_ipsq->ipsq_xop);
ASSERT(IAM_WRITER_ILL(ill));
ipst = ill->ill_ipst;
- /*
- * ipsq_pending_mp and ipsq_pending_ipif track each other. i.e.
- * both are null or non-null. However we can assert that only
- * after grabbing the ipsq_lock. So we don't make any assertion
- * here and in other places in the code.
- */
- ipif = ipsq->ipsq_pending_ipif;
+ ipif = ipsq->ipsq_xop->ipx_pending_ipif;
/*
* The current ioctl could have been aborted by the user and a new
* ioctl to bring up another ill could have started. We could still
@@ -16045,9 +15718,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
*/
ASSERT(connp != NULL);
q = CONNP_TO_WQ(connp);
- if (ill->ill_move_in_progress) {
- ILL_CLEAR_MOVE(ill);
- }
(void) ipif_down(ipif, NULL, NULL);
/* error is set below the switch */
}
@@ -16196,45 +15866,31 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
* ill_dl_up(), which stopped ipif_up()'s processing.
*/
if (ill->ill_isv6) {
- /*
- * v6 interfaces.
- * Unlike ARP which has to do another bind
- * and attach, once we get here we are
- * done with NDP. Except in the case of
- * ILLF_XRESOLV, in which case we send an
- * AR_INTERFACE_UP to the external resolver.
- * If all goes well, the ioctl will complete
- * in ip_rput(). If there's an error, we
- * complete it here.
- */
- if ((err = ipif_ndp_up(ipif)) == 0) {
- if (ill->ill_flags & ILLF_XRESOLV) {
- mutex_enter(&connp->conn_lock);
- mutex_enter(&ill->ill_lock);
- success = ipsq_pending_mp_add(
- connp, ipif, q, mp1, 0);
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- if (success) {
- err = ipif_resolver_up(ipif,
- Res_act_initial);
- if (err == EINPROGRESS) {
- freemsg(mp);
- return;
- }
- ASSERT(err != 0);
- mp1 = ipsq_pending_mp_get(ipsq,
- &connp);
- ASSERT(mp1 != NULL);
- } else {
- /* conn has started closing */
- err = EINTR;
- }
- } else { /* Non XRESOLV interface */
- (void) ipif_resolver_up(ipif,
+ if (ill->ill_flags & ILLF_XRESOLV) {
+ mutex_enter(&connp->conn_lock);
+ mutex_enter(&ill->ill_lock);
+ success = ipsq_pending_mp_add(connp, ipif, q,
+ mp1, 0);
+ mutex_exit(&ill->ill_lock);
+ mutex_exit(&connp->conn_lock);
+ if (success) {
+ err = ipif_resolver_up(ipif,
Res_act_initial);
- err = ipif_up_done_v6(ipif);
+ if (err == EINPROGRESS) {
+ freemsg(mp);
+ return;
+ }
+ ASSERT(err != 0);
+ mp1 = ipsq_pending_mp_get(ipsq, &connp);
+ ASSERT(mp1 != NULL);
+ } else {
+ /* conn has started closing */
+ err = EINTR;
}
+ } else { /* Non XRESOLV interface */
+ (void) ipif_resolver_up(ipif, Res_act_initial);
+ if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0)
+ err = ipif_up_done_v6(ipif);
}
} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
/*
@@ -16275,14 +15931,31 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
}
- if (ill->ill_up_ipifs) {
- ill_group_cleanup(ill);
+ /*
+ * If we have a moved ipif to bring up, and everything has
+ * succeeded to this point, bring it up on the IPMP ill.
+ * Otherwise, leave it down -- the admin can try to bring it
+ * up by hand if need be.
+ */
+ if (ill->ill_move_ipif != NULL) {
+ if (err != 0) {
+ ill->ill_move_ipif = NULL;
+ } else {
+ ipif = ill->ill_move_ipif;
+ ill->ill_move_ipif = NULL;
+ err = ipif_up(ipif, q, mp1);
+ if (err == EINPROGRESS) {
+ freemsg(mp);
+ return;
+ }
+ }
}
-
break;
+
case DL_NOTIFY_IND: {
dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr;
ire_t *ire;
+ uint_t orig_mtu;
boolean_t need_ire_walk_v4 = B_FALSE;
boolean_t need_ire_walk_v6 = B_FALSE;
@@ -16322,17 +15995,27 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
* which it is being derived.
*/
mutex_enter(&ill->ill_lock);
+
+ orig_mtu = ill->ill_max_mtu;
ill->ill_max_frag = (uint_t)notify->dl_data;
+ ill->ill_max_mtu = (uint_t)notify->dl_data;
+
+ /*
+ * If ill_user_mtu was set (via SIOCSLIFLNKINFO),
+ * clamp ill_max_mtu at it.
+ */
+ if (ill->ill_user_mtu != 0 &&
+ ill->ill_user_mtu < ill->ill_max_mtu)
+ ill->ill_max_mtu = ill->ill_user_mtu;
/*
- * If an SIOCSLIFLNKINFO has changed the ill_max_mtu
- * leave it alone
+ * If the MTU is unchanged, we're done.
*/
- if (ill->ill_mtu_userspecified) {
+ if (orig_mtu == ill->ill_max_mtu) {
mutex_exit(&ill->ill_lock);
break;
}
- ill->ill_max_mtu = ill->ill_max_frag;
+
if (ill->ill_isv6) {
if (ill->ill_max_mtu < IPV6_MIN_MTU)
ill->ill_max_mtu = IPV6_MIN_MTU;
@@ -16371,7 +16054,14 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
if (need_ire_walk_v6)
ire_walk_v6(ill_mtu_change, (char *)ill,
ALL_ZONES, ipst);
+
+ /*
+ * Refresh IPMP meta-interface MTU if necessary.
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_illgrp_refresh_mtu(ill->ill_grp);
break;
+
case DL_NOTE_LINK_UP:
case DL_NOTE_LINK_DOWN: {
/*
@@ -16385,9 +16075,17 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
went_up = notify->dl_notification == DL_NOTE_LINK_UP;
mutex_enter(&phyint->phyint_lock);
+
new_phyint_flags = went_up ?
phyint->phyint_flags | PHYI_RUNNING :
phyint->phyint_flags & ~PHYI_RUNNING;
+
+ if (IS_IPMP(ill)) {
+ new_phyint_flags = went_up ?
+ new_phyint_flags & ~PHYI_FAILED :
+ new_phyint_flags | PHYI_FAILED;
+ }
+
if (new_phyint_flags != phyint->phyint_flags) {
phyint->phyint_flags = new_phyint_flags;
changed = B_TRUE;
@@ -16474,7 +16172,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
* is invoked from an ill queue, conn_oper_pending_ill is not
* available, but we know the ioctl is pending on ill_wq.)
*/
- uint_t paddrlen, paddroff;
+ uint_t paddrlen, paddroff;
paddrreq = ill->ill_phys_addr_pend;
paddrlen = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_length;
@@ -16592,29 +16290,59 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
freemsg(mp);
- if (mp1 != NULL) {
+ if (mp1 == NULL)
+ return;
+
+ /*
+ * The operation must complete without EINPROGRESS since
+ * ipsq_pending_mp_get() has removed the mblk (mp1). Otherwise,
+ * the operation will be stuck forever inside the IPSQ.
+ */
+ ASSERT(err != EINPROGRESS);
+
+ switch (ipsq->ipsq_xop->ipx_current_ioctl) {
+ case 0:
+ ipsq_current_finish(ipsq);
+ break;
+
+ case SIOCSLIFNAME:
+ case IF_UNITSEL: {
+ ill_t *ill_other = ILL_OTHER(ill);
+
/*
- * The operation must complete without EINPROGRESS
- * since ipsq_pending_mp_get() has removed the mblk
- * from ipsq_pending_mp. Otherwise, the operation
- * will be stuck forever in the ipsq.
+ * If SIOCSLIFNAME or IF_UNITSEL is about to succeed, and the
+ * ill has a peer which is in an IPMP group, then place ill
+ * into the same group. One catch: although ifconfig plumbs
+ * the appropriate IPMP meta-interface prior to plumbing this
+ * ill, it is possible for multiple ifconfig applications to
+ * race (or for another application to adjust plumbing), in
+ * which case the IPMP meta-interface we need will be missing.
+ * If so, kick the phyint out of the group.
*/
- ASSERT(err != EINPROGRESS);
+ if (err == 0 && ill_other != NULL && IS_UNDER_IPMP(ill_other)) {
+ ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
+ ipmp_illgrp_t *illg;
- switch (ipsq->ipsq_current_ioctl) {
- case 0:
- ipsq_current_finish(ipsq);
- break;
+ illg = ill->ill_isv6 ? grp->gr_v6 : grp->gr_v4;
+ if (illg == NULL)
+ ipmp_phyint_leave_grp(ill->ill_phyint);
+ else
+ ipmp_ill_join_illgrp(ill, illg);
+ }
- case SIOCLIFADDIF:
- case SIOCSLIFNAME:
+ if (ipsq->ipsq_xop->ipx_current_ioctl == IF_UNITSEL)
+ ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
+ else
ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
- break;
+ break;
+ }
+ case SIOCLIFADDIF:
+ ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
+ break;
- default:
- ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
- break;
- }
+ default:
+ ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
+ break;
}
}
@@ -16626,20 +16354,16 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
void
ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
- ill_t *ill;
+ ill_t *ill = q->q_ptr;
struct iocblk *iocp;
mblk_t *mp1;
conn_t *connp = NULL;
ip1dbg(("ip_rput_other "));
- ill = (ill_t *)q->q_ptr;
- /*
- * This routine is not a writer in the case of SIOCGTUNPARAM
- * in which case ipsq is NULL.
- */
if (ipsq != NULL) {
ASSERT(IAM_WRITER_IPSQ(ipsq));
- ASSERT(ipsq == ill->ill_phyint->phyint_ipsq);
+ ASSERT(ipsq->ipsq_xop ==
+ ill->ill_phyint->phyint_ipsq->ipsq_xop);
}
switch (mp->b_datap->db_type) {
@@ -16752,7 +16476,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
case DL_IOC_HDR_INFO:
/*
- * If this was the first attempt turn of the
+ * If this was the first attempt, turn off the
* fastpath probing.
*/
mutex_enter(&ill->ill_lock);
@@ -16768,7 +16492,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
freemsg(mp);
break;
- case SIOCSTUNPARAM:
+ case SIOCSTUNPARAM:
case OSIOCSTUNPARAM:
ASSERT(ipsq != NULL);
/*
@@ -17017,14 +16741,13 @@ ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif)
/*
* Find an IRE which matches the destination and the outgoing
* queue in the cache table. All we need is an IRE_CACHE which
- * is pointing at ipif->ipif_ill. If it is part of some ill group,
- * then it is enough to have some IRE_CACHE in the group.
+ * is pointing at ipif->ipif_ill.
*/
if (ipif->ipif_flags & IPIF_POINTOPOINT)
dst = ipif->ipif_pp_dst_addr;
ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, MBLK_GETLABEL(mp),
- MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR, ipst);
+ MATCH_IRE_ILL | MATCH_IRE_SECATTR, ipst);
if (ire == NULL) {
/*
* Mark this packet to make it be delivered to
@@ -17321,7 +17044,8 @@ ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire)
*/
mp->b_datap->db_type = M_DATA;
icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp,
- ip6h, icmp6, ill, B_TRUE, ii->ipsec_in_zoneid);
+ ip6h, icmp6, ill, recv_ill, B_TRUE,
+ ii->ipsec_in_zoneid);
}
if (ill_need_rele)
ill_refrele(ill);
@@ -17357,37 +17081,36 @@ ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire)
}
switch (ipha->ipha_protocol) {
- case IPPROTO_UDP:
- ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire,
- recv_ill);
- if (ire_need_rele)
- ire_refrele(ire);
- break;
- case IPPROTO_TCP:
- if (!ire_need_rele)
- IRE_REFHOLD(ire);
- mp = ip_tcp_input(mp, ipha, ill, B_TRUE,
- ire, ipsec_mp, 0, ill->ill_rq, NULL);
- IRE_REFRELE(ire);
- if (mp != NULL) {
-
- SQUEUE_ENTER(GET_SQUEUE(mp), mp,
- mp, 1, SQ_PROCESS,
- SQTAG_IP_PROTO_AGAIN);
- }
- break;
- case IPPROTO_SCTP:
- if (!ire_need_rele)
- IRE_REFHOLD(ire);
- ip_sctp_input(mp, ipha, ill, B_TRUE, ire,
- ipsec_mp, 0, ill->ill_rq, dst);
- break;
- default:
- ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire,
- recv_ill, 0);
- if (ire_need_rele)
- ire_refrele(ire);
- break;
+ case IPPROTO_UDP:
+ ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire,
+ recv_ill);
+ if (ire_need_rele)
+ ire_refrele(ire);
+ break;
+ case IPPROTO_TCP:
+ if (!ire_need_rele)
+ IRE_REFHOLD(ire);
+ mp = ip_tcp_input(mp, ipha, ill, B_TRUE,
+ ire, ipsec_mp, 0, ill->ill_rq, NULL);
+ IRE_REFRELE(ire);
+ if (mp != NULL) {
+ SQUEUE_ENTER(GET_SQUEUE(mp), mp,
+ mp, 1, SQ_PROCESS,
+ SQTAG_IP_PROTO_AGAIN);
+ }
+ break;
+ case IPPROTO_SCTP:
+ if (!ire_need_rele)
+ IRE_REFHOLD(ire);
+ ip_sctp_input(mp, ipha, ill, B_TRUE, ire,
+ ipsec_mp, 0, ill->ill_rq, dst);
+ break;
+ default:
+ ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire,
+ recv_ill, 0);
+ if (ire_need_rele)
+ ire_refrele(ire);
+ break;
}
} else {
uint32_t rput_flags = 0;
@@ -17621,9 +17344,9 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
*/
ASSERT(!mctl_present);
ASSERT(first_mp == mp);
- if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) {
+ if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL))
return;
- }
+
/*
* Make sure that first_mp points back to mp as
* the mp we came in with could have changed in
@@ -17647,17 +17370,10 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
ilm_t *ilm;
mblk_t *mp1;
zoneid_t last_zoneid;
+ ilm_walker_t ilw;
if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(recv_ill)) {
ASSERT(ire->ire_type == IRE_BROADCAST);
- /*
- * Inactive/Failed interfaces are not supposed to
- * respond to the multicast packets.
- */
- if (ill_is_probeonly(ill)) {
- freemsg(first_mp);
- return;
- }
/*
* In the multicast case, applications may have joined
@@ -17680,11 +17396,9 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
* have been exhausted.
*/
last_zoneid = -1;
- ILM_WALKER_HOLD(recv_ill);
- for (ilm = recv_ill->ill_ilm; ilm != NULL;
- ilm = ilm->ilm_next) {
- if ((ilm->ilm_flags & ILM_DELETED) ||
- ipha->ipha_dst != ilm->ilm_addr ||
+ ilm = ilm_walker_start(&ilw, recv_ill);
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
+ if (ipha->ipha_dst != ilm->ilm_addr ||
ilm->ilm_zoneid == last_zoneid ||
ilm->ilm_zoneid == ire->ire_zoneid ||
ilm->ilm_zoneid == ALL_ZONES ||
@@ -17693,12 +17407,12 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
mp1 = ip_copymsg(first_mp);
if (mp1 == NULL)
continue;
- icmp_inbound(q, mp1, B_TRUE, ill,
+ icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill,
0, sum, mctl_present, B_TRUE,
recv_ill, ilm->ilm_zoneid);
last_zoneid = ilm->ilm_zoneid;
}
- ILM_WALKER_RELE(recv_ill);
+ ilm_walker_finish(&ilw);
} else if (ire->ire_type == IRE_BROADCAST) {
/*
* In the broadcast case, there may be many zones
@@ -18580,14 +18294,13 @@ ip_snmp_get(queue_t *q, mblk_t *mpctl, int level)
return (1);
}
- if ((mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, ipst)) == NULL) {
+ mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, level, ipst);
+ if (mpctl == NULL)
return (1);
- }
- mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, ipst);
- if (mpctl == NULL) {
+ mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, level, ipst);
+ if (mpctl == NULL)
return (1);
- }
if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) {
return (1);
@@ -19048,6 +18761,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
mblk_t *mp_tail = NULL;
ill_walk_context_t ctx;
zoneid_t zoneid;
+ ilm_walker_t ilw;
/*
* make a copy of the original message
@@ -19064,7 +18778,10 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- ILM_WALKER_HOLD(ill);
+ if (IS_UNDER_IPMP(ill))
+ continue;
+
+ ilm = ilm_walker_start(&ilw, ill);
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
if (ipif->ipif_zoneid != zoneid &&
@@ -19074,7 +18791,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
OCTET_LENGTH);
ipm.ipGroupMemberIfIndex.o_length =
mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes);
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
ASSERT(ilm->ilm_ipif != NULL);
ASSERT(ilm->ilm_ill == NULL);
if (ilm->ilm_ipif != ipif)
@@ -19090,7 +18807,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
}
}
}
- ILM_WALKER_RELE(ill);
+ ilm_walker_finish(&ilw);
}
rw_exit(&ipst->ips_ill_g_lock);
optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
@@ -19112,6 +18829,7 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
mblk_t *mp_tail = NULL;
ill_walk_context_t ctx;
zoneid_t zoneid;
+ ilm_walker_t ilw;
/*
* make a copy of the original message
@@ -19127,9 +18845,12 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V6(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- ILM_WALKER_HOLD(ill);
+ if (IS_UNDER_IPMP(ill))
+ continue;
+
+ ilm = ilm_walker_start(&ilw, ill);
ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex;
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
ASSERT(ilm->ilm_ipif == NULL);
ASSERT(ilm->ilm_ill != NULL);
if (ilm->ilm_zoneid != zoneid)
@@ -19145,7 +18866,7 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
(uint_t)sizeof (ipm6)));
}
}
- ILM_WALKER_RELE(ill);
+ ilm_walker_finish(&ilw);
}
rw_exit(&ipst->ips_ill_g_lock);
@@ -19171,6 +18892,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
zoneid_t zoneid;
int i;
slist_t *sl;
+ ilm_walker_t ilw;
/*
* make a copy of the original message
@@ -19187,7 +18909,10 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- ILM_WALKER_HOLD(ill);
+ if (IS_UNDER_IPMP(ill))
+ continue;
+
+ ilm = ilm_walker_start(&ilw, ill);
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
if (ipif->ipif_zoneid != zoneid)
@@ -19196,7 +18921,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
OCTET_LENGTH);
ips.ipGroupSourceIfIndex.o_length =
mi_strlen(ips.ipGroupSourceIfIndex.o_bytes);
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
ASSERT(ilm->ilm_ipif != NULL);
ASSERT(ilm->ilm_ill == NULL);
sl = ilm->ilm_filter;
@@ -19220,7 +18945,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
}
}
}
- ILM_WALKER_RELE(ill);
+ ilm_walker_finish(&ilw);
}
rw_exit(&ipst->ips_ill_g_lock);
optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
@@ -19244,6 +18969,7 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
zoneid_t zoneid;
int i;
slist_t *sl;
+ ilm_walker_t ilw;
/*
* make a copy of the original message
@@ -19259,9 +18985,12 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V6(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- ILM_WALKER_HOLD(ill);
+ if (IS_UNDER_IPMP(ill))
+ continue;
+
+ ilm = ilm_walker_start(&ilw, ill);
ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex;
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
ASSERT(ilm->ilm_ipif == NULL);
ASSERT(ilm->ilm_ill != NULL);
sl = ilm->ilm_filter;
@@ -19279,7 +19008,7 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
}
}
}
- ILM_WALKER_RELE(ill);
+ ilm_walker_finish(&ilw);
}
rw_exit(&ipst->ips_ill_g_lock);
@@ -19345,7 +19074,8 @@ ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
* in one IRE walk.
*/
static mblk_t *
-ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
+ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level,
+ ip_stack_t *ipst)
{
struct opthdr *optp;
mblk_t *mp2ctl; /* Returned */
@@ -19377,6 +19107,14 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
ird.ird_route.lp_head = mpctl->b_cont;
ird.ird_netmedia.lp_head = mp3ctl->b_cont;
ird.ird_attrs.lp_head = mp4ctl->b_cont;
+ /*
+ * If the level has been set to the special EXPER_IP_AND_TESTHIDDEN
+ * value, then also include IRE_MARK_TESTHIDDEN IREs. This is
+ * intended as a temporary solution until a proper MIB API is provided
+ * that provides complete filtering/caller-opt-in.
+ */
+ if (level == EXPER_IP_AND_TESTHIDDEN)
+ ird.ird_flags |= IRD_REPORT_TESTHIDDEN;
zoneid = Q_TO_CONN(q)->conn_zoneid;
ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst);
@@ -19419,7 +19157,8 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
* ipv6NetToMediaEntryTable in an NDP walk.
*/
static mblk_t *
-ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
+ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level,
+ ip_stack_t *ipst)
{
struct opthdr *optp;
mblk_t *mp2ctl; /* Returned */
@@ -19451,6 +19190,14 @@ ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
ird.ird_route.lp_head = mpctl->b_cont;
ird.ird_netmedia.lp_head = mp3ctl->b_cont;
ird.ird_attrs.lp_head = mp4ctl->b_cont;
+ /*
+ * If the level has been set to the special EXPER_IP_AND_TESTHIDDEN
+ * value, then also include IRE_MARK_TESTHIDDEN IREs. This is
+ * intended as a temporary solution until a proper MIB API is provided
+ * that provides complete filtering/caller-opt-in.
+ */
+ if (level == EXPER_IP_AND_TESTHIDDEN)
+ ird.ird_flags |= IRD_REPORT_TESTHIDDEN;
zoneid = Q_TO_CONN(q)->conn_zoneid;
ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst);
@@ -19671,6 +19418,11 @@ ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird)
ASSERT(ire->ire_ipversion == IPV4_VERSION);
+ if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) &&
+ ire->ire_marks & IRE_MARK_TESTHIDDEN) {
+ return;
+ }
+
if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
return;
@@ -19812,6 +19564,11 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
ASSERT(ire->ire_ipversion == IPV6_VERSION);
+ if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) &&
+ ire->ire_marks & IRE_MARK_TESTHIDDEN) {
+ return;
+ }
+
if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
return;
@@ -20518,8 +20275,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller,
boolean_t mctl_present;
ipsec_out_t *io;
int match_flags;
- ill_t *attach_ill = NULL;
- /* Bind to IPIF_NOFAILOVER ill etc. */
ill_t *xmit_ill = NULL; /* IP_PKTINFO etc. */
ipif_t *dst_ipif;
boolean_t multirt_need_resolve = B_FALSE;
@@ -20639,16 +20394,11 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller,
}
/*
- * IP_DONTFAILOVER_IF and IP_BOUND_IF have precedence over ill index
- * passed in IP_PKTINFO.
+ * IP_BOUND_IF has precedence over the ill index passed in IP_PKTINFO.
*/
- if (infop->ip_opt_ill_index != 0 &&
- connp->conn_outgoing_ill == NULL &&
- connp->conn_nofailover_ill == NULL) {
-
- xmit_ill = ill_lookup_on_ifindex(
- infop->ip_opt_ill_index, B_FALSE, NULL, NULL, NULL, NULL,
- ipst);
+ if (infop->ip_opt_ill_index != 0 && connp->conn_outgoing_ill == NULL) {
+ xmit_ill = ill_lookup_on_ifindex(infop->ip_opt_ill_index,
+ B_FALSE, NULL, NULL, NULL, NULL, ipst);
if (xmit_ill == NULL || IS_VNI(xmit_ill))
goto drop_pkt;
@@ -20659,7 +20409,7 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller,
* accessible from all zones i.e has a valid ipif in
* all zones.
*/
- if (!ipif_lookup_zoneid_group(xmit_ill, zoneid, 0, NULL)) {
+ if (!ipif_lookup_zoneid(xmit_ill, zoneid, 0, NULL)) {
goto drop_pkt;
}
}
@@ -20696,18 +20446,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller,
goto version_hdrlen_check;
dst = ipha->ipha_dst;
- if (connp->conn_nofailover_ill != NULL) {
- attach_ill = conn_get_held_ill(connp,
- &connp->conn_nofailover_ill, &err);
- if (err == ILL_LOOKUP_FAILED) {
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- if (need_decref)
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- return;
- }
- }
-
/* If IP_BOUND_IF has been set, use that ill. */
if (connp->conn_outgoing_ill != NULL) {
xmit_ill = conn_get_held_ill(connp,
@@ -20761,9 +20499,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller,
ire = NULL;
}
- if (attach_ill != NULL)
- goto send_from_ill;
-
/*
* We cache IRE_CACHEs to avoid lookups. We don't do
* this for the tcp global queue and listen end point
@@ -21074,45 +20809,21 @@ notdata:
}
ASSERT(first_mp != NULL);
- /*
- * ICMP echo replies attach an ipsec_out and set ipsec_out_attach_if
- * to make sure that this packet goes out on the same interface it
- * came in. We handle that here.
- */
- if (mctl_present) {
- uint_t ifindex;
+ if (mctl_present) {
io = (ipsec_out_t *)first_mp->b_rptr;
- if (io->ipsec_out_attach_if || io->ipsec_out_ip_nexthop) {
+ if (io->ipsec_out_ip_nexthop) {
/*
* We may have lost the conn context if we are
* coming here from ip_newroute(). Copy the
* nexthop information.
*/
- if (io->ipsec_out_ip_nexthop) {
- ip_nexthop = B_TRUE;
- nexthop_addr = io->ipsec_out_nexthop_addr;
+ ip_nexthop = B_TRUE;
+ nexthop_addr = io->ipsec_out_nexthop_addr;
- ipha = (ipha_t *)mp->b_rptr;
- dst = ipha->ipha_dst;
- goto send_from_ill;
- } else {
- ASSERT(io->ipsec_out_ill_index != 0);
- ifindex = io->ipsec_out_ill_index;
- attach_ill = ill_lookup_on_ifindex(ifindex,
- B_FALSE, NULL, NULL, NULL, NULL, ipst);
- if (attach_ill == NULL) {
- ASSERT(xmit_ill == NULL);
- ip1dbg(("ip_output: bad ifindex for "
- "(BIND TO IPIF_NOFAILOVER) %d\n",
- ifindex));
- freemsg(first_mp);
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutDiscards);
- ASSERT(!need_decref);
- return;
- }
- }
+ ipha = (ipha_t *)mp->b_rptr;
+ dst = ipha->ipha_dst;
+ goto send_from_ill;
}
}
@@ -21161,7 +20872,7 @@ hdrtoosmall:
ipha = (ipha_t *)mp->b_rptr;
if (first_mp == NULL) {
- ASSERT(attach_ill == NULL && xmit_ill == NULL);
+ ASSERT(xmit_ill == NULL);
/*
* If we got here because of "goto hdrtoosmall"
* We need to attach a IPSEC_OUT.
@@ -21213,8 +20924,6 @@ version_hdrlen_check:
*/
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutWrongIPVersion);
ASSERT(xmit_ill == NULL);
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
if (need_decref)
mp->b_flag |= MSGHASREF;
(void) ip_output_v6(arg, first_mp, arg2, caller);
@@ -21255,8 +20964,6 @@ version_hdrlen_check:
zoneid, ipst)) {
ASSERT(xmit_ill == NULL);
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
"ip_wput_end: q %p (%S)", q, "badopts");
if (need_decref)
@@ -21295,22 +21002,6 @@ multicast:
*/
ill_t *ill = (ill_t *)q->q_ptr;
- /*
- * Don't honor attach_if for this case. If ill
- * is part of the group, ipif could belong to
- * any ill and we cannot maintain attach_ill
- * and ipif_ill same anymore and the assert
- * below would fail.
- */
- if (mctl_present && io->ipsec_out_attach_if) {
- io->ipsec_out_ill_index = 0;
- io->ipsec_out_attach_if = B_FALSE;
- ASSERT(attach_ill != NULL);
- ill_refrele(attach_ill);
- attach_ill = NULL;
- }
-
- ASSERT(attach_ill == NULL);
ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID);
if (ipif == NULL) {
if (need_decref)
@@ -21429,25 +21120,11 @@ multicast:
first_mp->b_cont = mp;
mctl_present = B_TRUE;
}
- if (attach_ill != NULL) {
- ASSERT(attach_ill == ipif->ipif_ill);
- match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(attach_ill)) {
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- }
- io->ipsec_out_ill_index =
- attach_ill->ill_phyint->phyint_ifindex;
- io->ipsec_out_attach_if = B_TRUE;
- } else {
- match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR;
- io->ipsec_out_ill_index =
- ipif->ipif_ill->ill_phyint->phyint_ifindex;
- }
+ match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
+ io->ipsec_out_ill_index =
+ ipif->ipif_ill->ill_phyint->phyint_ifindex;
+
if (connp != NULL) {
io->ipsec_out_multicast_loop =
connp->conn_multicast_loop;
@@ -21469,9 +21146,7 @@ multicast:
*
* NOTE : We need to do it for non-secure case also as
* this might go out secure if there is a global policy
- * match in ip_wput_ire. For bind to IPIF_NOFAILOVER
- * address, the source should be initialized already and
- * hence we won't be initializing here.
+ * match in ip_wput_ire.
*
* As we do not have the ire yet, it is possible that
* we set the source address here and then later discover
@@ -21507,14 +21182,6 @@ multicast:
zoneid, MBLK_GETLABEL(mp), match_flags, ipst);
}
- /*
- * refrele attach_ill as its not needed anymore.
- */
- if (attach_ill != NULL) {
- ill_refrele(attach_ill);
- attach_ill = NULL;
- }
-
if (ire == NULL) {
/*
* Multicast loopback and multicast forwarding is
@@ -21630,33 +21297,9 @@ noroute:
ipif_refrele(dst_ipif);
}
}
- /*
- * If we are bound to IPIF_NOFAILOVER address, look for
- * an IRE_CACHE matching the ill.
- */
-send_from_ill:
- if (attach_ill != NULL) {
- ipif_t *attach_ipif;
- match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(attach_ill)) {
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- }
-
- attach_ipif = ipif_get_next_ipif(NULL, attach_ill);
- if (attach_ipif == NULL) {
- ip1dbg(("ip_wput: No ipif for attach_ill\n"));
- goto discard_pkt;
- }
- ire = ire_ctable_lookup(dst, 0, 0, attach_ipif,
- zoneid, MBLK_GETLABEL(mp), match_flags, ipst);
- ipif_refrele(attach_ipif);
- } else if (xmit_ill != NULL) {
+send_from_ill:
+ if (xmit_ill != NULL) {
ipif_t *ipif;
/*
@@ -21681,6 +21324,10 @@ send_from_ill:
goto drop_pkt;
}
+ match_flags = 0;
+ if (IS_UNDER_IPMP(xmit_ill))
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
+
/*
* Look for a ire that is part of the group,
* if found use it else call ip_newroute_ipif.
@@ -21689,7 +21336,7 @@ send_from_ill:
* ill is accessible from all zones i.e has a
* valid ipif in all zones.
*/
- match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR;
+ match_flags |= MATCH_IRE_ILL | MATCH_IRE_SECATTR;
ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid,
MBLK_GETLABEL(mp), match_flags, ipst);
/*
@@ -21729,12 +21376,7 @@ send_from_ill:
ipst);
}
if (!ire) {
- /*
- * Make sure we don't load spread if this
- * is IPIF_NOFAILOVER case.
- */
- if ((attach_ill != NULL) ||
- (ip_nexthop && !ignore_nexthop)) {
+ if (ip_nexthop && !ignore_nexthop) {
if (mctl_present) {
io = (ipsec_out_t *)first_mp->b_rptr;
ASSERT(first_mp->b_datap->db_type ==
@@ -21764,15 +21406,8 @@ send_from_ill:
first_mp->b_cont = mp;
mctl_present = B_TRUE;
}
- if (attach_ill != NULL) {
- io->ipsec_out_ill_index = attach_ill->
- ill_phyint->phyint_ifindex;
- io->ipsec_out_attach_if = B_TRUE;
- } else {
- io->ipsec_out_ip_nexthop = ip_nexthop;
- io->ipsec_out_nexthop_addr =
- nexthop_addr;
- }
+ io->ipsec_out_ip_nexthop = ip_nexthop;
+ io->ipsec_out_nexthop_addr = nexthop_addr;
}
noirefound:
/*
@@ -21787,8 +21422,6 @@ noirefound:
ip_newroute(q, first_mp, dst, connp, zoneid, ipst);
TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
"ip_wput_end: q %p (%S)", q, "newroute");
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
if (xmit_ill != NULL)
ill_refrele(xmit_ill);
if (need_decref)
@@ -21869,8 +21502,6 @@ noirefound:
ip_newroute(q, copy_mp, dst, connp, zoneid, ipst);
}
}
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
if (xmit_ill != NULL)
ill_refrele(xmit_ill);
if (need_decref)
@@ -21896,8 +21527,6 @@ drop_pkt:
if (need_decref)
CONN_DEC_REF(connp);
freemsg(first_mp);
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
if (xmit_ill != NULL)
ill_refrele(xmit_ill);
TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
@@ -21923,8 +21552,8 @@ ip_wput(queue_t *q, mblk_t *mp)
/*
*
* The following rules must be observed when accessing any ipif or ill
- * that has been cached in the conn. Typically conn_nofailover_ill,
- * conn_outgoing_ill, conn_multicast_ipif and conn_multicast_ill.
+ * that has been cached in the conn. Typically conn_outgoing_ill,
+ * conn_multicast_ipif and conn_multicast_ill.
*
* Access: The ipif or ill pointed to from the conn can be accessed under
* the protection of the conn_lock or after it has been refheld under the
@@ -21944,10 +21573,8 @@ ip_wput(queue_t *q, mblk_t *mp)
* The list of ipifs hanging off the ill is protected by ill_g_lock and ill_lock
* On the other hand to access ipif->ipif_ill, we need one of either ill_g_lock
* or a reference to the ipif or a reference to an ire that references the
- * ipif. An ipif does not change its ill except for failover/failback. Since
- * failover/failback happens only after bringing down the ipif and making sure
- * the ipif refcnt has gone to zero and holding the ill_g_lock and ill_lock
- * the above holds.
+ * ipif. An ipif only changes its ill when migrating from an underlying ill
+ * to an IPMP ill in ipif_up().
*/
ipif_t *
conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err)
@@ -22302,96 +21929,6 @@ ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire,
zoneid));
}
-ire_t *
-conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill)
-{
- ipaddr_t addr;
- ire_t *save_ire;
- irb_t *irb;
- ill_group_t *illgrp;
- int err;
-
- save_ire = ire;
- addr = ire->ire_addr;
-
- ASSERT(ire->ire_type == IRE_BROADCAST);
-
- illgrp = connp->conn_outgoing_ill->ill_group;
- if (illgrp == NULL) {
- *conn_outgoing_ill = conn_get_held_ill(connp,
- &connp->conn_outgoing_ill, &err);
- if (err == ILL_LOOKUP_FAILED) {
- ire_refrele(save_ire);
- return (NULL);
- }
- return (save_ire);
- }
- /*
- * If IP_BOUND_IF has been done, conn_outgoing_ill will be set.
- * If it is part of the group, we need to send on the ire
- * that has been cleared of IRE_MARK_NORECV and that belongs
- * to this group. This is okay as IP_BOUND_IF really means
- * any ill in the group. We depend on the fact that the
- * first ire in the group is always cleared of IRE_MARK_NORECV
- * if such an ire exists. This is possible only if you have
- * at least one ill in the group that has not failed.
- *
- * First get to the ire that matches the address and group.
- *
- * We don't look for an ire with a matching zoneid because a given zone
- * won't always have broadcast ires on all ills in the group.
- */
- irb = ire->ire_bucket;
- rw_enter(&irb->irb_lock, RW_READER);
- if (ire->ire_marks & IRE_MARK_NORECV) {
- /*
- * If the current zone only has an ire broadcast for this
- * address marked NORECV, the ire we want is ahead in the
- * bucket, so we look it up deliberately ignoring the zoneid.
- */
- for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_addr != addr)
- continue;
- /* skip over deleted ires */
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
- }
- }
- while (ire != NULL) {
- /*
- * If a new interface is coming up, we could end up
- * seeing the loopback ire and the non-loopback ire
- * may not have been added yet. So check for ire_stq
- */
- if (ire->ire_stq != NULL && (ire->ire_addr != addr ||
- ire->ire_ipif->ipif_ill->ill_group == illgrp)) {
- break;
- }
- ire = ire->ire_next;
- }
- if (ire != NULL && ire->ire_addr == addr &&
- ire->ire_ipif->ipif_ill->ill_group == illgrp) {
- IRE_REFHOLD(ire);
- rw_exit(&irb->irb_lock);
- ire_refrele(save_ire);
- *conn_outgoing_ill = ire_to_ill(ire);
- /*
- * Refhold the ill to make the conn_outgoing_ill
- * independent of the ire. ip_wput_ire goes in a loop
- * and may refrele the ire. Since we have an ire at this
- * point we don't need to use ILL_CAN_LOOKUP on the ill.
- */
- ill_refhold(*conn_outgoing_ill);
- return (ire);
- }
- rw_exit(&irb->irb_lock);
- ip1dbg(("conn_set_outgoing_ill: No matching ire\n"));
- /*
- * If we can't find a suitable ire, return the original ire.
- */
- return (save_ire);
-}
-
/*
* This function does the ire_refrele of the ire passed in as the
* argument. As this function looks up more ires i.e broadcast ires,
@@ -22401,7 +21938,6 @@ conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill)
* IPQoS Notes:
* IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for
* IPsec packets are done in ipsec_out_process.
- *
*/
void
ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller,
@@ -22471,9 +22007,8 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller,
if ((first_ire->ire_flags & RTF_MULTIRT) &&
(first_ire->ire_addr == ire->ire_addr) &&
!(first_ire->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) {
+ (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)))
break;
- }
}
if ((first_ire != NULL) && (first_ire != ire)) {
@@ -22489,36 +22024,15 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller,
* conn_outgoing_ill variable is used only in the broadcast loop.
* for performance we don't grab the mutexs in the fastpath
*/
- if ((connp != NULL) &&
- (ire->ire_type == IRE_BROADCAST) &&
- ((connp->conn_nofailover_ill != NULL) ||
- (connp->conn_outgoing_ill != NULL))) {
- /*
- * Bind to IPIF_NOFAILOVER address overrides IP_BOUND_IF
- * option. So, see if this endpoint is bound to a
- * IPIF_NOFAILOVER address. If so, honor it. This implies
- * that if the interface is failed, we will still send
- * the packet on the same ill which is what we want.
- */
+ if (ire->ire_type == IRE_BROADCAST && connp != NULL &&
+ connp->conn_outgoing_ill != NULL) {
conn_outgoing_ill = conn_get_held_ill(connp,
- &connp->conn_nofailover_ill, &err);
+ &connp->conn_outgoing_ill, &err);
if (err == ILL_LOOKUP_FAILED) {
ire_refrele(ire);
freemsg(mp);
return;
}
- if (conn_outgoing_ill == NULL) {
- /*
- * Choose a good ill in the group to send the
- * packets on.
- */
- ire = conn_set_outgoing_ill(connp, ire,
- &conn_outgoing_ill);
- if (ire == NULL) {
- freemsg(mp);
- return;
- }
- }
}
if (mp->b_datap->db_type != M_CTL) {
@@ -22578,7 +22092,7 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller,
if (src_ire != NULL &&
!(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) &&
(!ipst->ips_ip_restrict_interzone_loopback ||
- ire_local_same_ill_group(ire, src_ire))) {
+ ire_local_same_lan(ire, src_ire))) {
if (ipha->ipha_src == INADDR_ANY && !unspec_src)
ipha->ipha_src = src_ire->ire_src_addr;
ire_refrele(src_ire);
@@ -22741,39 +22255,7 @@ another:;
*/
ASSERT(ire->ire_ipversion == IPV4_VERSION);
- /*
- * With IP multipathing, broadcast packets are sent on the ire
- * that has been cleared of IRE_MARK_NORECV and that belongs to
- * the group. However, this ire might not be in the same zone so
- * we can't always use its source address. We look for a
- * broadcast ire in the same group and in the right zone.
- */
- if (ire->ire_type == IRE_BROADCAST &&
- ire->ire_zoneid != zoneid) {
- ire_t *src_ire = ire_ctable_lookup(dst, 0,
- IRE_BROADCAST, ire->ire_ipif, zoneid, NULL,
- (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst);
- if (src_ire != NULL) {
- src = src_ire->ire_src_addr;
- ire_refrele(src_ire);
- } else {
- ire_refrele(ire);
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- freemsg(first_mp);
- if (ill != NULL) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsOutDiscards);
- } else {
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutDiscards);
- }
- return;
- }
- } else {
- src = ire->ire_src_addr;
- }
-
+ src = ire->ire_src_addr;
if (connp == NULL) {
ip1dbg(("ip_wput_ire: no connp and no src "
"address for dst 0x%x, using src 0x%x\n",
@@ -22917,10 +22399,9 @@ another:;
ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t));
io = (ipsec_out_t *)first_mp->b_rptr;
- io->ipsec_out_ill_index = ((ill_t *)stq->q_ptr)->
- ill_phyint->phyint_ifindex;
-
- ipsec_out_process(q, first_mp, ire, ill_index);
+ io->ipsec_out_ill_index =
+ ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex;
+ ipsec_out_process(q, first_mp, ire, 0);
ire_refrele(ire);
if (conn_outgoing_ill != NULL)
ill_refrele(conn_outgoing_ill);
@@ -22960,7 +22441,7 @@ another:;
if (ire1->ire_addr != ire->ire_addr)
continue;
if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))
+ (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))
continue;
/* Got one */
@@ -23147,71 +22628,16 @@ broadcast:
* back outbound packets in different zones but on the
* same ill, as the application would see duplicates.
*
- * If the interfaces are part of the same group,
- * we would want to send only one copy out for
- * whole group.
- *
* This logic assumes that ire_add_v4() groups the
* IRE_BROADCAST entries so that those with the same
- * ire_addr and ill_group are kept together.
+ * ire_addr are kept together.
*/
ire_ill = ire->ire_ipif->ipif_ill;
- if (ire->ire_stq == NULL && ire1->ire_stq != NULL) {
- if (ire_ill->ill_group != NULL &&
- (ire->ire_marks & IRE_MARK_NORECV)) {
- /*
- * If the current zone only has an ire
- * broadcast for this address marked
- * NORECV, the ire we want is ahead in
- * the bucket, so we look it up
- * deliberately ignoring the zoneid.
- */
- for (ire1 = ire->ire_bucket->irb_ire;
- ire1 != NULL;
- ire1 = ire1->ire_next) {
- ire1_ill =
- ire1->ire_ipif->ipif_ill;
- if (ire1->ire_addr != dst)
- continue;
- /* skip over the current ire */
- if (ire1 == ire)
- continue;
- /* skip over deleted ires */
- if (ire1->ire_marks &
- IRE_MARK_CONDEMNED)
- continue;
- /*
- * non-loopback ire in our
- * group: use it for the next
- * pass in the loop
- */
- if (ire1->ire_stq != NULL &&
- ire1_ill->ill_group ==
- ire_ill->ill_group)
- break;
- }
- }
- } else {
+ if (ire->ire_stq != NULL || ire1->ire_stq == NULL) {
while (ire1 != NULL && ire1->ire_addr == dst) {
ire1_ill = ire1->ire_ipif->ipif_ill;
- /*
- * We can have two broadcast ires on the
- * same ill in different zones; here
- * we'll send a copy of the packet on
- * each ill and the fanout code will
- * call conn_wantpacket() to check that
- * the zone has the broadcast address
- * configured on the ill. If the two
- * ires are in the same group we only
- * send one copy up.
- */
- if (ire1_ill != ire_ill &&
- (ire1_ill->ill_group == NULL ||
- ire_ill->ill_group == NULL ||
- ire1_ill->ill_group !=
- ire_ill->ill_group)) {
+ if (ire1_ill != ire_ill)
break;
- }
ire1 = ire1->ire_next;
}
}
@@ -23403,13 +22829,8 @@ multi_loopback:
* logic.
*/
if (ill != NULL) {
- ilm_t *ilm;
-
- ILM_WALKER_HOLD(ill);
- ilm = ilm_lookup_ill(ill, ipha->ipha_dst,
- ALL_ZONES);
- ILM_WALKER_RELE(ill);
- if (ilm != NULL) {
+ if (ilm_lookup_ill(ill, ipha->ipha_dst,
+ ALL_ZONES) != NULL) {
/*
* Pass along the virtual output q.
* ip_wput_local() will distribute the
@@ -23565,18 +22986,17 @@ checksumoptions:
ire1 != NULL;
ire1 = ire1->ire_next) {
if (!(ire1->ire_flags &
- RTF_MULTIRT)) {
+ RTF_MULTIRT))
continue;
- }
+
if (ire1->ire_addr !=
- ire->ire_addr) {
+ ire->ire_addr)
continue;
- }
+
if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED|
- IRE_MARK_HIDDEN)) {
+ (IRE_MARK_CONDEMNED |
+ IRE_MARK_TESTHIDDEN))
continue;
- }
/* Got one */
IRE_REFHOLD(ire1);
@@ -24743,9 +24163,8 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
if ((first_ire->ire_flags & RTF_MULTIRT) &&
(first_ire->ire_addr == ire->ire_addr) &&
!(first_ire->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) {
+ (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)))
break;
- }
}
if (first_ire != NULL) {
@@ -24808,7 +24227,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
if (ire1->ire_addr != ire->ire_addr)
continue;
if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))
+ (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))
continue;
/*
* Ensure we do not exceed the MTU
@@ -25130,10 +24549,9 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
if (ire1->ire_addr != ire->ire_addr)
continue;
if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED|
- IRE_MARK_HIDDEN)) {
+ (IRE_MARK_CONDEMNED |
+ IRE_MARK_TESTHIDDEN))
continue;
- }
/*
* Ensure we do not exceed the MTU
* of the next route.
@@ -25500,6 +24918,7 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire,
ilm_t *ilm;
mblk_t *mp1;
zoneid_t last_zoneid;
+ ilm_walker_t ilw;
if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(ill)) {
ASSERT(ire_type == IRE_BROADCAST);
@@ -25524,11 +24943,9 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire,
* have been exhausted.
*/
last_zoneid = -1;
- ILM_WALKER_HOLD(ill);
- for (ilm = ill->ill_ilm; ilm != NULL;
- ilm = ilm->ilm_next) {
- if ((ilm->ilm_flags & ILM_DELETED) ||
- ipha->ipha_dst != ilm->ilm_addr ||
+ ilm = ilm_walker_start(&ilw, ill);
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
+ if (ipha->ipha_dst != ilm->ilm_addr ||
ilm->ilm_zoneid == last_zoneid ||
ilm->ilm_zoneid == zoneid ||
!(ilm->ilm_ipif->ipif_flags & IPIF_UP))
@@ -25536,12 +24953,12 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire,
mp1 = ip_copymsg(first_mp);
if (mp1 == NULL)
continue;
- icmp_inbound(q, mp1, B_TRUE, ill, 0, 0,
- mctl_present, B_FALSE, ill,
+ icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill,
+ 0, 0, mctl_present, B_FALSE, ill,
ilm->ilm_zoneid);
last_zoneid = ilm->ilm_zoneid;
}
- ILM_WALKER_RELE(ill);
+ ilm_walker_finish(&ilw);
/*
* Loopback case: the sending endpoint has
* IP_MULTICAST_LOOP disabled, therefore we don't
@@ -25859,14 +25276,9 @@ ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif, zoneid_t zoneid)
* caller and hence matching on ILL (MATCH_IRE_ILL) would
* be sufficient rather than MATCH_IRE_IPIF.
*
- * This function is used for sending IGMP packets. We need
- * to make sure that we send the packet out of the interface
- * (ipif->ipif_ill) where we joined the group. This is to
- * prevent from switches doing IGMP snooping to send us multicast
- * packets for a given group on the interface we have joined.
- * If we can't find an ire, igmp_sendpkt has already initialized
- * ipsec_out_attach_if so that this will not be load spread in
- * ip_newroute_ipif.
+ * This function is used for sending IGMP packets. For IPMP,
+ * we sidestep IGMP snooping issues by sending all multicast
+ * traffic on a single interface in the IPMP group.
*/
ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, NULL,
MATCH_IRE_ILL, ipst);
@@ -26035,7 +25447,7 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill,
ip6_t *ip6h1;
uint_t ill_index;
ipsec_out_t *io;
- boolean_t attach_if, hwaccel;
+ boolean_t hwaccel;
uint32_t flags = IP6_NO_IPPOLICY;
int match_flags;
zoneid_t zoneid;
@@ -26052,42 +25464,22 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill,
if (io->ipsec_out_reachable) {
flags |= IPV6_REACHABILITY_CONFIRMATION;
}
- attach_if = io->ipsec_out_attach_if;
hwaccel = io->ipsec_out_accelerated;
zoneid = io->ipsec_out_zoneid;
ASSERT(zoneid != ALL_ZONES);
- match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR;
+ match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
/* Multicast addresses should have non-zero ill_index. */
v6dstp = &ip6h->ip6_dst;
ASSERT(ip6h->ip6_nxt != IPPROTO_RAW);
ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0);
- ASSERT(!attach_if || ill_index != 0);
- if (ill_index != 0) {
- if (ill == NULL) {
- ill = ip_grab_attach_ill(NULL, ipsec_mp, ill_index,
- B_TRUE, ipst);
- /* Failure case frees things for us. */
- if (ill == NULL)
- return;
-
- ill_need_rele = B_TRUE;
- }
- /*
- * If this packet needs to go out on a particular interface
- * honor it.
- */
- if (attach_if) {
- match_flags = MATCH_IRE_ILL;
+ if (ill == NULL && ill_index != 0) {
+ ill = ip_grab_ill(ipsec_mp, ill_index, B_TRUE, ipst);
+ /* Failure case frees things for us. */
+ if (ill == NULL)
+ return;
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill)) {
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- }
- }
+ ill_need_rele = B_TRUE;
}
ASSERT(mp != NULL);
@@ -26138,32 +25530,15 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill,
return;
}
- ip_newroute_ipif_v6(q, ipsec_mp, ipif, *v6dstp,
+ ip_newroute_ipif_v6(q, ipsec_mp, ipif, v6dstp, &ip6h->ip6_src,
unspec_src, zoneid);
ipif_refrele(ipif);
} else {
- if (attach_if) {
- ipif_t *ipif;
-
- ipif = ipif_get_next_ipif(NULL, ill);
- if (ipif == NULL) {
- if (ill_need_rele)
- ill_refrele(ill);
- freemsg(ipsec_mp);
- return;
- }
- ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif,
- zoneid, MBLK_GETLABEL(mp), match_flags, ipst);
- ire_need_rele = B_TRUE;
- ipif_refrele(ipif);
+ if (ire_arg != NULL) {
+ ire = ire_arg;
} else {
- if (ire_arg != NULL) {
- ire = ire_arg;
- } else {
- ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL,
- ipst);
- ire_need_rele = B_TRUE;
- }
+ ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, ipst);
+ ire_need_rele = B_TRUE;
}
if (ire != NULL)
goto send;
@@ -26350,7 +25725,6 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill,
ipha_t *ipha1;
uint_t ill_index;
ipsec_out_t *io;
- boolean_t attach_if;
int match_flags;
irb_t *irb = NULL;
boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE;
@@ -26372,39 +25746,19 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill,
io = (ipsec_out_t *)ipsec_mp->b_rptr;
ill_index = io->ipsec_out_ill_index;
- attach_if = io->ipsec_out_attach_if;
zoneid = io->ipsec_out_zoneid;
ASSERT(zoneid != ALL_ZONES);
ipst = io->ipsec_out_ns->netstack_ip;
ASSERT(io->ipsec_out_ns != NULL);
- match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR;
- if (ill_index != 0) {
- if (ill == NULL) {
- ill = ip_grab_attach_ill(NULL, ipsec_mp,
- ill_index, B_FALSE, ipst);
-
- /* Failure case frees things for us. */
- if (ill == NULL)
- return;
-
- ill_need_rele = B_TRUE;
- }
- /*
- * If this packet needs to go out on a particular interface
- * honor it.
- */
- if (attach_if) {
- match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
+ match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
+ if (ill == NULL && ill_index != 0) {
+ ill = ip_grab_ill(ipsec_mp, ill_index, B_FALSE, ipst);
+ /* Failure case frees things for us. */
+ if (ill == NULL)
+ return;
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill)) {
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- }
- }
+ ill_need_rele = B_TRUE;
}
if (CLASSD(dst)) {
@@ -26474,17 +25828,12 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill,
ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT,
zoneid, &zero_info);
} else {
- if (attach_if) {
- ire = ire_ctable_lookup(dst, 0, 0, ill->ill_ipif,
- zoneid, MBLK_GETLABEL(mp), match_flags, ipst);
+ if (ire_arg != NULL) {
+ ire = ire_arg;
+ ire_need_rele = B_FALSE;
} else {
- if (ire_arg != NULL) {
- ire = ire_arg;
- ire_need_rele = B_FALSE;
- } else {
- ire = ire_cache_lookup(dst, zoneid,
- MBLK_GETLABEL(mp), ipst);
- }
+ ire = ire_cache_lookup(dst, zoneid,
+ MBLK_GETLABEL(mp), ipst);
}
if (ire != NULL) {
goto send;
@@ -26613,11 +25962,9 @@ send:
(void *)ire->ire_ipif, (void *)ipif));
/*
- * Multiroute the secured packet, unless IPsec really
- * requires the packet to go out only through a particular
- * interface.
+ * Multiroute the secured packet.
*/
- if ((ire->ire_flags & RTF_MULTIRT) && !attach_if) {
+ if (ire->ire_flags & RTF_MULTIRT) {
ire_t *first_ire;
irb = ire->ire_bucket;
ASSERT(irb != NULL);
@@ -26634,9 +25981,8 @@ send:
if ((first_ire->ire_flags & RTF_MULTIRT) &&
(first_ire->ire_addr == ire->ire_addr) &&
!(first_ire->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) {
+ (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)))
break;
- }
}
if ((first_ire != NULL) && (first_ire != ire)) {
@@ -26657,11 +26003,6 @@ send:
multirt_send = B_TRUE;
max_frag = ire->ire_max_frag;
- } else {
- if ((ire->ire_flags & RTF_MULTIRT) && attach_if) {
- ip1dbg(("ip_wput_ipsec_out: ignoring multirouting "
- "flag, attach_if %d\n", attach_if));
- }
}
/*
@@ -26689,7 +26030,7 @@ send:
if (ire1->ire_addr != ire->ire_addr)
continue;
if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))
+ (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))
continue;
/* No loopback here */
if (ire1->ire_stq == NULL)
@@ -27155,10 +26496,8 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index)
* before sending it the accelerated packet.
*/
if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) {
- int ifindex;
ill = ire_to_ill(ire);
- ifindex = ill->ill_phyint->phyint_ifindex;
- io->ipsec_out_capab_ill_index = ifindex;
+ io->ipsec_out_capab_ill_index = ill->ill_phyint->phyint_ifindex;
}
/*
@@ -27284,17 +26623,18 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index)
}
}
/*
- * We are done with IPsec processing. Send it over
- * the wire.
+ * We are done with IPsec processing. Send it over the wire.
*/
done:
mp = ipsec_mp->b_cont;
ipha = (ipha_t *)mp->b_rptr;
if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
- ip_wput_ipsec_out(q, ipsec_mp, ipha, ill, ire);
+ ip_wput_ipsec_out(q, ipsec_mp, ipha, ire->ire_ipif->ipif_ill,
+ ire);
} else {
ip6h = (ip6_t *)ipha;
- ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ill, ire);
+ ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ire->ire_ipif->ipif_ill,
+ ire);
}
if (ill != NULL && ill_need_rele)
ill_refrele(ill);
@@ -27356,18 +26696,16 @@ ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
ipip = ip_sioctl_lookup(iocp->ioc_cmd);
if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) {
/*
- * Special case where ipsq_current_ipif is not set:
+ * Special case where ipx_current_ipif is not set:
* ill_phyint_reinit merged the v4 and v6 into a single ipsq.
- * ill could also have become part of a ipmp group in the
- * process, we are here as were not able to complete the
- * operation in ipif_set_values because we could not become
- * exclusive on the new ipsq, In such a case ipsq_current_ipif
- * will not be set so we need to set it.
+ * We are here as we were not able to complete the operation in
+ * ipif_set_values because we could not become exclusive on
+ * the new ipsq.
*/
ill_t *ill = q->q_ptr;
ipsq_current_start(ipsq, ill->ill_ipif, ipip->ipi_cmd);
}
- ASSERT(ipsq->ipsq_current_ipif != NULL);
+ ASSERT(ipsq->ipsq_xop->ipx_current_ipif != NULL);
if (ipip->ipi_cmd_type == IF_CMD) {
/* This a old style SIOC[GS]IF* command */
@@ -27381,8 +26719,8 @@ ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
sin = NULL;
}
- err = (*ipip->ipi_func_restart)(ipsq->ipsq_current_ipif, sin, q, mp,
- ipip, mp1->b_rptr);
+ err = (*ipip->ipi_func_restart)(ipsq->ipsq_xop->ipx_current_ipif, sin,
+ q, mp, ipip, mp1->b_rptr);
ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
}
@@ -27424,6 +26762,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
ip_extract_func_t *extract_funcp;
cmd_info_t ci;
int err;
+ boolean_t entered_ipsq = B_FALSE;
ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd));
@@ -27505,18 +26844,21 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
return;
}
+ ASSERT(ci.ci_ipif != NULL);
+
/*
- * If ipsq is non-null, we are already being called exclusively on an
- * ill but in the case of a failover in progress it is the "from" ill,
- * rather than the "to" ill (which is the ill ptr passed in).
- * In order to ensure we are exclusive on both ILLs we rerun
- * ipsq_try_enter() here, ipsq's support recursive entry.
+ * If ipsq is non-NULL, we are already being called exclusively.
*/
ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq));
- ASSERT(ci.ci_ipif != NULL);
-
- ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl,
- NEW_OP, B_TRUE);
+ if (ipsq == NULL) {
+ ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl,
+ NEW_OP, B_TRUE);
+ if (ipsq == NULL) {
+ ipif_refrele(ci.ci_ipif);
+ return;
+ }
+ entered_ipsq = B_TRUE;
+ }
/*
* Release the ipif so that ipif_down and friends that wait for
@@ -27525,8 +26867,6 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
* the ipif.
*/
ipif_refrele(ci.ci_ipif);
- if (ipsq == NULL)
- return;
ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd);
@@ -27535,19 +26875,12 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
* where we set the IPIF_CHANGING flag. This ensures that there won't
* be any new references to the ipif. This helps functions that go
* through this path and end up trying to wait for the refcnts
- * associated with the ipif to go down to zero. Some exceptions are
- * Failover, Failback, and Groupname commands that operate on more than
- * just the ci.ci_ipif. These commands internally determine the
- * set of ipif's they operate on and set and clear the IPIF_CHANGING
- * flags on that set. Another exception is the Removeif command that
- * sets the IPIF_CONDEMNED flag internally after identifying the right
- * ipif to operate on.
+ * associated with the ipif to go down to zero. The exception is
+ * SIOCLIFREMOVEIF, which sets IPIF_CONDEMNED internally after
+ * identifying the right ipif to operate on.
*/
mutex_enter(&(ci.ci_ipif)->ipif_ill->ill_lock);
- if (ipip->ipi_cmd != SIOCLIFREMOVEIF &&
- ipip->ipi_cmd != SIOCLIFFAILOVER &&
- ipip->ipi_cmd != SIOCLIFFAILBACK &&
- ipip->ipi_cmd != SIOCSLIFGROUPNAME)
+ if (ipip->ipi_cmd != SIOCLIFREMOVEIF)
(ci.ci_ipif)->ipif_state_flags |= IPIF_CHANGING;
mutex_exit(&(ci.ci_ipif)->ipif_ill->ill_lock);
@@ -27560,7 +26893,8 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
- ipsq_exit(ipsq);
+ if (entered_ipsq)
+ ipsq_exit(ipsq);
}
/*
@@ -27708,7 +27042,7 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
* Refhold the conn, till the ioctl completes. This is
* needed in case the ioctl ends up in the pending mp
* list. Every mp in the ill_pending_mp list and
- * the ipsq_pending_mp must have a refhold on the conn
+ * the ipx_pending_mp must have a refhold on the conn
* to resume processing. The refhold is released when
* the ioctl completes. (normally or abnormally)
* In all cases ip_ioctl_finish is called to finish
@@ -27753,8 +27087,25 @@ nak:
if (CONN_Q(q))
goto nak;
- /* Finish socket ioctls passed through to ARP. */
- ip_sioctl_iocack(q, mp);
+ /*
+ * Finish socket ioctls passed through to ARP. We use the
+ * ioc_cmd values we set in ip_sioctl_arp() to decide whether
+ * we need to become writer before calling ip_sioctl_iocack().
+ * Note that qwriter_ip() will release the refhold, and that a
+ * refhold is OK without ILL_CAN_LOOKUP() since we're on the
+ * ill stream.
+ */
+ iocp = (struct iocblk *)mp->b_rptr;
+ if (iocp->ioc_cmd == AR_ENTRY_SQUERY) {
+ ip_sioctl_iocack(NULL, q, mp, NULL);
+ return;
+ }
+
+ ASSERT(iocp->ioc_cmd == AR_ENTRY_DELETE ||
+ iocp->ioc_cmd == AR_ENTRY_ADD);
+ ill = q->q_ptr;
+ ill_refhold(ill);
+ qwriter_ip(ill, q, mp, ip_sioctl_iocack, CUR_OP, B_FALSE);
return;
case M_FLUSH:
if (*mp->b_rptr & FLUSHW)
@@ -28021,11 +27372,11 @@ nak:
gw_addr_v6 = ire->ire_gateway_addr_v6;
mutex_exit(&ire->ire_lock);
if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
- nce = ndp_lookup_v6(ill,
+ nce = ndp_lookup_v6(ill, B_FALSE,
&ire->ire_addr_v6, B_FALSE);
} else {
- nce = ndp_lookup_v6(ill, &gw_addr_v6,
- B_FALSE);
+ nce = ndp_lookup_v6(ill, B_FALSE,
+ &gw_addr_v6, B_FALSE);
}
if (nce != NULL) {
nce_resolv_failed(nce);
@@ -28061,10 +27412,11 @@ nak:
gw_addr_v6 = ire->ire_gateway_addr_v6;
mutex_exit(&ire->ire_lock);
if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
- nce = ndp_lookup_v6(ill, &ire->ire_addr_v6,
- B_FALSE);
+ nce = ndp_lookup_v6(ill, B_FALSE,
+ &ire->ire_addr_v6, B_FALSE);
} else {
- nce = ndp_lookup_v6(ill, &gw_addr_v6, B_FALSE);
+ nce = ndp_lookup_v6(ill, B_FALSE,
+ &gw_addr_v6, B_FALSE);
}
if (nce != NULL) {
/*
@@ -28238,13 +27590,14 @@ nak:
fake_ire = (ire_t *)mp->b_rptr;
/*
- * By the time we come back here from ARP the incomplete ire
- * created in ire_forward() could have been removed. We use
- * the parameters stored in the fake_ire to specify the real
- * ire as explicitly as possible. This avoids problems when
- * IPMP groups are configured as an ipif can 'float'
- * across several ill queues. We can be confident that the
- * the inability to find an ire is because it no longer exists.
+ * By the time we come back here from ARP the logical outgoing
+ * interface of the incomplete ire we added in ire_forward()
+ * could have disappeared, causing the incomplete ire to also
+ * disappear. So we need to retrieve the proper ipif for the
+ * ire before looking in ctable. In the case of IPMP, the
+ * ipif may be on the IPMP ill, so look it up based on the
+ * ire_ipif_ifindex we stashed back in ire_init_common().
+ * Then, we can verify that ire_ipif_seqid still exists.
*/
ill = ill_lookup_on_ifindex(fake_ire->ire_ipif_ifindex, B_FALSE,
NULL, NULL, NULL, NULL, ipst);
@@ -28299,6 +27652,7 @@ nak:
freemsg(mp); /* fake ire */
return;
}
+
nce = ire->ire_nce;
DTRACE_PROBE2(ire__arpresolve__type,
ire_t *, ire, nce_t *, nce);
@@ -29030,7 +28384,7 @@ boolean_t
conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags,
zoneid_t zoneid)
{
- ill_t *in_ill;
+ ill_t *bound_ill;
boolean_t found;
ipif_t *ipif;
ire_t *ire;
@@ -29045,32 +28399,15 @@ conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags,
* unicast, broadcast and multicast reception to
* conn_incoming_ill. conn_wantpacket itself is called
* only for BROADCAST and multicast.
- *
- * 1) ip_rput supresses duplicate broadcasts if the ill
- * is part of a group. Hence, we should be receiving
- * just one copy of broadcast for the whole group.
- * Thus, if it is part of the group the packet could
- * come on any ill of the group and hence we need a
- * match on the group. Otherwise, match on ill should
- * be sufficient.
- *
- * 2) ip_rput does not suppress duplicate multicast packets.
- * If there are two interfaces in a ill group and we have
- * 2 applications (conns) joined a multicast group G on
- * both the interfaces, ilm_lookup_ill filter in ip_rput
- * will give us two packets because we join G on both the
- * interfaces rather than nominating just one interface
- * for receiving multicast like broadcast above. So,
- * we have to call ilg_lookup_ill to filter out duplicate
- * copies, if ill is part of a group.
- */
- in_ill = connp->conn_incoming_ill;
- if (in_ill != NULL) {
- if (in_ill->ill_group == NULL) {
- if (in_ill != ill)
+ */
+ bound_ill = connp->conn_incoming_ill;
+ if (bound_ill != NULL) {
+ if (IS_IPMP(bound_ill)) {
+ if (bound_ill->ill_grp != ill->ill_grp)
+ return (B_FALSE);
+ } else {
+ if (bound_ill != ill)
return (B_FALSE);
- } else if (in_ill->ill_group != ill->ill_group) {
- return (B_FALSE);
}
}
@@ -29079,15 +28416,14 @@ conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags,
return (B_TRUE);
/*
* The conn is in a different zone; we need to check that this
- * broadcast address is configured in the application's zone and
- * on one ill in the group.
+ * broadcast address is configured in the application's zone.
*/
ipif = ipif_get_next_ipif(NULL, ill);
if (ipif == NULL)
return (B_FALSE);
ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif,
connp->conn_zoneid, NULL,
- (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst);
+ (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst);
ipif_refrele(ipif);
if (ire != NULL) {
ire_refrele(ire);
@@ -29171,7 +28507,7 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
ipsq = ill->ill_phyint->phyint_ipsq;
- ipif = ipsq->ipsq_pending_ipif;
+ ipif = ipsq->ipsq_xop->ipx_pending_ipif;
mp1 = ipsq_pending_mp_get(ipsq, &connp);
ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
if (mp1 == NULL) {
@@ -29181,12 +28517,12 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
/*
- * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
+ * If an IOCTL is waiting on this (ipx_current_ioctl != 0), then we
* must have an associated conn_t. Otherwise, we're bringing this
* interface back up as part of handling an asynchronous event (e.g.,
* physical address change).
*/
- if (ipsq->ipsq_current_ioctl != 0) {
+ if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
ASSERT(connp != NULL);
q = CONNP_TO_WQ(connp);
} else {
@@ -29219,16 +28555,28 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg)
return;
}
- if (ill->ill_up_ipifs)
- ill_group_cleanup(ill);
+ /*
+ * If we have a moved ipif to bring up, and everything has succeeded
+ * to this point, bring it up on the IPMP ill. Otherwise, leave it
+ * down -- the admin can try to bring it up by hand if need be.
+ */
+ if (ill->ill_move_ipif != NULL) {
+ ipif = ill->ill_move_ipif;
+ ill->ill_move_ipif = NULL;
+ if (err == 0) {
+ err = ipif_up(ipif, q, mp1);
+ if (err == EINPROGRESS)
+ return;
+ }
+ }
/*
* The operation must complete without EINPROGRESS since
- * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
- * Otherwise, the operation will be stuck forever in the ipsq.
+ * ipsq_pending_mp_get() has removed the mblk. Otherwise, the
+ * operation will be stuck forever in the ipsq.
*/
ASSERT(err != EINPROGRESS);
- if (ipsq->ipsq_current_ioctl != 0)
+ if (ipsq->ipsq_xop->ipx_current_ioctl != 0)
ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
else
ipsq_current_finish(ipsq);
@@ -29649,124 +28997,6 @@ ip_int_set(queue_t *q, mblk_t *mp, char *value,
return (0);
}
-/*
- * Handle changes to ipmp_hook_emulation ndd variable.
- * Need to update phyint_hook_ifindex.
- * Also generate a nic plumb event should a new ifidex be assigned to a group.
- */
-static void
-ipmp_hook_emulation_changed(ip_stack_t *ipst)
-{
- phyint_t *phyi;
- phyint_t *phyi_tmp;
- char *groupname;
- int namelen;
- ill_t *ill;
- boolean_t new_group;
-
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- /*
- * Group indicies are stored in the phyint - a common structure
- * to both IPv4 and IPv6.
- */
- phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
- for (; phyi != NULL;
- phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
- phyi, AVL_AFTER)) {
- /* Ignore the ones that do not have a group */
- if (phyi->phyint_groupname_len == 0)
- continue;
-
- /*
- * Look for other phyint in group.
- * Clear name/namelen so the lookup doesn't find ourselves.
- */
- namelen = phyi->phyint_groupname_len;
- groupname = phyi->phyint_groupname;
- phyi->phyint_groupname_len = 0;
- phyi->phyint_groupname = NULL;
-
- phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst);
- /* Restore */
- phyi->phyint_groupname_len = namelen;
- phyi->phyint_groupname = groupname;
-
- new_group = B_FALSE;
- if (ipst->ips_ipmp_hook_emulation) {
- /*
- * If the group already exists and has already
- * been assigned a group ifindex, we use the existing
- * group_ifindex, otherwise we pick a new group_ifindex
- * here.
- */
- if (phyi_tmp != NULL &&
- phyi_tmp->phyint_group_ifindex != 0) {
- phyi->phyint_group_ifindex =
- phyi_tmp->phyint_group_ifindex;
- } else {
- /* XXX We need a recovery strategy here. */
- if (!ip_assign_ifindex(
- &phyi->phyint_group_ifindex, ipst))
- cmn_err(CE_PANIC,
- "ip_assign_ifindex() failed");
- new_group = B_TRUE;
- }
- } else {
- phyi->phyint_group_ifindex = 0;
- }
- if (ipst->ips_ipmp_hook_emulation)
- phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex;
- else
- phyi->phyint_hook_ifindex = phyi->phyint_ifindex;
-
- /*
- * For IP Filter to find out the relationship between
- * names and interface indicies, we need to generate
- * a NE_PLUMB event when a new group can appear.
- * We always generate events when a new interface appears
- * (even when ipmp_hook_emulation is set) so there
- * is no need to generate NE_PLUMB events when
- * ipmp_hook_emulation is turned off.
- * And since it isn't critical for IP Filter to get
- * the NE_UNPLUMB events we skip those here.
- */
- if (new_group) {
- /*
- * First phyint in group - generate group PLUMB event.
- * Since we are not running inside the ipsq we do
- * the dispatch immediately.
- */
- if (phyi->phyint_illv4 != NULL)
- ill = phyi->phyint_illv4;
- else
- ill = phyi->phyint_illv6;
-
- if (ill != NULL)
- ill_nic_event_plumb(ill, B_TRUE);
- }
- }
- rw_exit(&ipst->ips_ill_g_lock);
-}
-
-/* ARGSUSED */
-static int
-ipmp_hook_emulation_set(queue_t *q, mblk_t *mp, char *value,
- caddr_t addr, cred_t *cr)
-{
- int *v = (int *)addr;
- long new_value;
- ip_stack_t *ipst = CONNQ_TO_IPST(q);
-
- if (ddi_strtol(value, NULL, 10, &new_value) != 0)
- return (EINVAL);
-
- if (*v != new_value) {
- *v = new_value;
- ipmp_hook_emulation_changed(ipst);
- }
- return (0);
-}
-
static void *
ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp)
{
@@ -30448,12 +29678,12 @@ next_mp:
arpce->nce_state = ND_INCOMPLETE;
mutex_exit(&arpce->nce_lock);
+
/*
* Note that ire_add() (called from ire_forward())
* holds a ref on the ire until ARP is completed.
*/
-
- ire_arpresolve(ire, ire_to_ill(ire));
+ ire_arpresolve(ire);
return (LOOKUP_IN_PROGRESS);
default:
ASSERT(0);
@@ -30596,7 +29826,7 @@ ip_get_zoneid_v6(in6_addr_t *addr, mblk_t *mp, const ill_t *ill,
return (ALL_ZONES);
if (IN6_IS_ADDR_LINKLOCAL(addr)) {
- ire_flags |= MATCH_IRE_ILL_GROUP;
+ ire_flags |= MATCH_IRE_ILL;
ipif_arg = ill->ill_ipif;
}
if (lookup_zoneid != ALL_ZONES)
@@ -30648,20 +29878,24 @@ void
ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst,
const ill_t *ill, int ipver, uint32_t hlen, ip_stack_t *ipst)
{
+ mblk_t *mp2;
ipobs_cb_t *ipobs_cb;
+ ipobs_hook_data_t *ihd;
+ uint64_t grifindex = 0;
ASSERT(DB_TYPE(mp) == M_DATA);
+ if (IS_UNDER_IPMP(ill))
+ grifindex = ipmp_ill_get_ipmp_ifindex(ill);
+
mutex_enter(&ipst->ips_ipobs_cb_lock);
ipst->ips_ipobs_cb_nwalkers++;
mutex_exit(&ipst->ips_ipobs_cb_lock);
for (ipobs_cb = list_head(&ipst->ips_ipobs_cb_list); ipobs_cb != NULL;
ipobs_cb = list_next(&ipst->ips_ipobs_cb_list, ipobs_cb)) {
- mblk_t *mp2 = allocb(sizeof (ipobs_hook_data_t),
- BPRI_HI);
+ mp2 = allocb(sizeof (ipobs_hook_data_t), BPRI_HI);
if (mp2 != NULL) {
- ipobs_hook_data_t *ihd =
- (ipobs_hook_data_t *)mp2->b_rptr;
+ ihd = (ipobs_hook_data_t *)mp2->b_rptr;
if (((ihd->ihd_mp = dupmsg(mp)) == NULL) &&
((ihd->ihd_mp = copymsg(mp)) == NULL)) {
freemsg(mp2);
@@ -30673,6 +29907,7 @@ ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst,
ihd->ihd_zsrc = zsrc;
ihd->ihd_zdst = zdst;
ihd->ihd_ifindex = ill->ill_phyint->phyint_ifindex;
+ ihd->ihd_grifindex = grifindex;
ihd->ihd_stack = ipst->ips_netstack;
mp2->b_wptr += sizeof (*ihd);
ipobs_cb->ipobs_cbfunc(mp2);
diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c
index fe326778c2..6e63af32b3 100644
--- a/usr/src/uts/common/inet/ip/ip6.c
+++ b/usr/src/uts/common/inet/ip/ip6.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -95,7 +95,6 @@
#include <sys/pattr.h>
#include <inet/ipclassifier.h>
#include <inet/ipsecah.h>
-#include <inet/udp_impl.h>
#include <inet/rawip_impl.h>
#include <inet/rts_impl.h>
#include <sys/squeue_impl.h>
@@ -186,7 +185,7 @@ const in6_addr_t ipv6_solicited_node_mcast =
#define IP6_MBLK_HDR_ERR 1
#define IP6_MBLK_LEN_ERR 2
-static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *ill,
+static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *, ill_t *,
boolean_t, zoneid_t);
static void icmp_pkt_v6(queue_t *, mblk_t *, void *, size_t,
const in6_addr_t *, boolean_t, zoneid_t, ip_stack_t *);
@@ -208,11 +207,13 @@ static void ip_fanout_udp_v6(queue_t *, mblk_t *, ip6_t *, uint32_t,
ill_t *, ill_t *, uint_t, boolean_t, zoneid_t);
static int ip_process_options_v6(queue_t *, mblk_t *, ip6_t *,
uint8_t *, uint_t, uint8_t, ip_stack_t *);
-static mblk_t *ip_rput_frag_v6(queue_t *, mblk_t *, ip6_t *,
+static mblk_t *ip_rput_frag_v6(ill_t *, ill_t *, mblk_t *, ip6_t *,
ip6_frag_t *, uint_t, uint_t *, uint32_t *, uint16_t *);
static boolean_t ip_source_routed_v6(ip6_t *, mblk_t *, ip_stack_t *);
static void ip_wput_ire_v6(queue_t *, mblk_t *, ire_t *, int, int,
- conn_t *, int, int, int, zoneid_t);
+ conn_t *, int, int, zoneid_t);
+static boolean_t ipif_lookup_testaddr_v6(ill_t *, const in6_addr_t *,
+ ipif_t **);
/*
* A template for an IPv6 AR_ENTRY_QUERY
@@ -248,15 +249,14 @@ static areq_t ipv6_areq_template = {
* call icmp_inbound_v6() for each relevant zone.
*/
static void
-icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length,
- boolean_t mctl_present, uint_t flags, zoneid_t zoneid, mblk_t *dl_mp)
+icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill,
+ uint_t hdr_length, boolean_t mctl_present, uint_t flags, zoneid_t zoneid,
+ mblk_t *dl_mp)
{
icmp6_t *icmp6;
ip6_t *ip6h;
boolean_t interested;
- ip6i_t *ip6i;
in6_addr_t origsrc;
- ire_t *ire;
mblk_t *first_mp;
ipsec_in_t *ii;
ip_stack_t *ipst = ill->ill_ipst;
@@ -344,7 +344,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length,
break;
case ICMP6_PACKET_TOO_BIG:
- icmp_inbound_too_big_v6(q, first_mp, ill, mctl_present,
+ icmp_inbound_too_big_v6(q, first_mp, ill, inill, mctl_present,
zoneid);
return;
case ICMP6_ECHO_REQUEST:
@@ -422,66 +422,6 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length,
* checksum field. The checksum is calculated in ip_wput_v6.
*/
icmp6->icmp6_cksum = ip6h->ip6_plen;
- /*
- * ICMP echo replies should go out on the same interface
- * the request came on as probes used by in.mpathd for
- * detecting NIC failures are ECHO packets. We turn-off load
- * spreading by allocating a ip6i and setting ip6i_attach_if
- * to B_TRUE which is handled both by ip_wput_v6 and
- * ip_newroute_v6. If we don't turnoff load spreading,
- * the packets might get dropped if there are no
- * non-FAILED/INACTIVE interfaces for it to go out on and
- * in.mpathd would wrongly detect a failure or mis-detect
- * a NIC failure as a link failure. As load spreading can
- * happen only if ill_group is not NULL, we do only for
- * that case and this does not affect the normal case.
- *
- * We force this only on echo packets that came from on-link
- * hosts. We restrict this to link-local addresses which
- * is used by in.mpathd for probing. In the IPv6 case,
- * default routes typically have an ire_ipif pointer and
- * hence a MATCH_IRE_ILL later in ip_newroute_v6/ip_wput_v6
- * might work. As a default route out of this interface
- * may not be present, enforcing this packet to go out in
- * this case may not work.
- */
- if (ill->ill_group != NULL &&
- IN6_IS_ADDR_LINKLOCAL(&origsrc)) {
- /*
- * If we are sending replies to ourselves, don't
- * set ATTACH_IF as we may not be able to find
- * the IRE_LOCAL on this ill i.e setting ATTACH_IF
- * causes ip_wput_v6 to look for an IRE_LOCAL on
- * "ill" which it may not find and will try to
- * create an IRE_CACHE for our local address. Once
- * we do this, we will try to forward all packets
- * meant to our LOCAL address.
- */
- ire = ire_cache_lookup_v6(&ip6h->ip6_dst, ALL_ZONES,
- NULL, ipst);
- if (ire == NULL || ire->ire_type != IRE_LOCAL) {
- mp = ip_add_info_v6(mp, NULL, &ip6h->ip6_dst);
- if (mp == NULL) {
- BUMP_MIB(ill->ill_icmp6_mib,
- ipv6IfIcmpInErrors);
- if (ire != NULL)
- ire_refrele(ire);
- if (mctl_present)
- freeb(first_mp);
- return;
- } else if (mctl_present) {
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
- }
- ip6i = (ip6i_t *)mp->b_rptr;
- ip6i->ip6i_flags = IP6I_ATTACH_IF;
- ip6i->ip6i_ifindex =
- ill->ill_phyint->phyint_ifindex;
- }
- if (ire != NULL)
- ire_refrele(ire);
- }
if (!mctl_present) {
/*
@@ -529,7 +469,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length,
if (mctl_present)
freeb(first_mp);
/* XXX may wish to pass first_mp up to ndp_input someday. */
- ndp_input(ill, mp, dl_mp);
+ ndp_input(inill, mp, dl_mp);
return;
case ND_NEIGHBOR_ADVERT:
@@ -538,7 +478,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length,
if (mctl_present)
freeb(first_mp);
/* XXX may wish to pass first_mp up to ndp_input someday. */
- ndp_input(ill, mp, dl_mp);
+ ndp_input(inill, mp, dl_mp);
return;
case ND_REDIRECT: {
@@ -579,7 +519,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length,
}
if (interested) {
icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill,
- mctl_present, zoneid);
+ inill, mctl_present, zoneid);
} else {
freemsg(first_mp);
}
@@ -592,7 +532,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length,
*/
/* ARGSUSED */
static void
-icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill,
+icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill,
boolean_t mctl_present, zoneid_t zoneid)
{
ip6_t *ip6h;
@@ -658,11 +598,10 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill,
* sufficient. Same link local addresses for different ILL's is
* possible.
*/
-
if (IN6_IS_ADDR_LINKLOCAL(&inner_ip6h->ip6_dst)) {
first_ire = ire_ctable_lookup_v6(&inner_ip6h->ip6_dst, NULL,
IRE_CACHE, ill->ill_ipif, ALL_ZONES, NULL,
- MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst);
+ MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
if (first_ire == NULL) {
if (ip_debug > 2) {
@@ -773,7 +712,7 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill,
}
rw_exit(&irb->irb_lock);
}
- icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill,
+ icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, inill,
mctl_present, zoneid);
}
@@ -783,7 +722,8 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill,
*/
void
icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
- icmp6_t *icmp6, ill_t *ill, boolean_t mctl_present, zoneid_t zoneid)
+ icmp6_t *icmp6, ill_t *ill, ill_t *inill, boolean_t mctl_present,
+ zoneid_t zoneid)
{
uint16_t *up; /* Pointer to ports in ULP header */
uint32_t ports; /* reversed ports for fanout */
@@ -861,7 +801,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
((uint16_t *)&ports)[0] = up[1];
((uint16_t *)&ports)[1] = up[0];
- ip_fanout_udp_v6(q, first_mp, &rip6h, ports, ill, ill,
+ ip_fanout_udp_v6(q, first_mp, &rip6h, ports, ill, inill,
IP6_NO_IPPOLICY, mctl_present, zoneid);
return;
}
@@ -908,7 +848,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
up = (uint16_t *)((uchar_t *)ip6h + hdr_length);
((uint16_t *)&ports)[0] = up[1];
((uint16_t *)&ports)[1] = up[0];
- ip_fanout_sctp(first_mp, ill, (ipha_t *)ip6h, ports, 0,
+ ip_fanout_sctp(first_mp, inill, (ipha_t *)ip6h, ports, 0,
mctl_present, IP6_NO_IPPOLICY, zoneid);
return;
case IPPROTO_ESP:
@@ -940,7 +880,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
ASSERT(ill != NULL);
ii->ipsec_in_ill_index =
ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index = ii->ipsec_in_ill_index;
+ ii->ipsec_in_rill_index =
+ inill->ill_phyint->phyint_ifindex;
first_mp->b_cont->b_datap->db_type = M_CTL;
} else {
/*
@@ -970,7 +911,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
mp->b_datap->db_type = M_CTL;
ii->ipsec_in_ill_index =
ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index = ii->ipsec_in_ill_index;
+ ii->ipsec_in_rill_index =
+ inill->ill_phyint->phyint_ifindex;
}
if (!ipsec_loaded(ipss)) {
@@ -985,7 +927,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
if (ipsec_rc == IPSEC_STATUS_FAILED)
return;
- ip_fanout_proto_again(first_mp, ill, ill, NULL);
+ ip_fanout_proto_again(first_mp, ill, inill, NULL);
return;
}
case IPPROTO_ENCAP:
@@ -1083,8 +1025,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
* doing here.
*/
icmp_inbound_error_fanout_v6(q, first_mp,
- (ip6_t *)mp->b_rptr, icmp6, ill, mctl_present,
- zoneid);
+ (ip6_t *)mp->b_rptr, icmp6, ill, inill,
+ mctl_present, zoneid);
return;
}
/* FALLTHRU */
@@ -1096,7 +1038,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
rip6h.ip6_src = ip6h->ip6_dst;
rip6h.ip6_dst = ip6h->ip6_src;
rip6h.ip6_nxt = nexthdr;
- ip_fanout_proto_v6(q, first_mp, &rip6h, ill, ill, nexthdr, 0,
+ ip_fanout_proto_v6(q, first_mp, &rip6h, ill, inill, nexthdr, 0,
IP6_NO_IPPOLICY, mctl_present, zoneid);
return;
}
@@ -1194,9 +1136,8 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill)
* redirect packet.)
*/
- prev_ire = ire_route_lookup_v6(dst, 0, src, 0, ipif, NULL,
- ALL_ZONES, NULL, MATCH_IRE_GW | MATCH_IRE_ILL_GROUP |
- MATCH_IRE_DEFAULT, ipst);
+ prev_ire = ire_route_lookup_v6(dst, 0, src, 0, ipif, NULL, ALL_ZONES,
+ NULL, MATCH_IRE_GW | MATCH_IRE_ILL | MATCH_IRE_DEFAULT, ipst);
/*
* Check that
@@ -1260,6 +1201,7 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill)
opt = ndp_get_option(opt, optlen, ND_OPT_TARGET_LINKADDR);
if (opt != NULL) {
err = ndp_lookup_then_add_v6(ill,
+ B_FALSE, /* don't match across illgrp */
(uchar_t *)&opt[1], /* Link layer address */
gateway,
&ipv6_all_ones, /* prefix mask */
@@ -1367,8 +1309,7 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill)
*/
redir_ire = ire_ftable_lookup_v6(dst, 0, src, IRE_HOST,
ire->ire_ipif, NULL, ALL_ZONES, 0, NULL,
- (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP),
- ipst);
+ (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst);
ire_refrele(ire); /* Held in ire_add_v6 */
@@ -1457,15 +1398,11 @@ icmp_pick_source_v6(queue_t *wq, in6_addr_t *origsrc, in6_addr_t *origdst,
BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutNoRoutes);
return (NULL);
}
- /*
- * Does not matter whether we use ire_stq or ire_ipif here.
- * Just pick an ill for ICMP replies.
- */
ASSERT(ire->ire_ipif != NULL);
ill = ire->ire_ipif->ipif_ill;
ire_refrele(ire);
}
- ipif = ipif_select_source_v6(ill, origsrc, RESTRICT_TO_NONE,
+ ipif = ipif_select_source_v6(ill, origsrc, B_FALSE,
IPV6_PREFER_SRC_DEFAULT, zoneid);
if (ipif != NULL) {
*src = ipif->ipif_v6src_addr;
@@ -1858,7 +1795,7 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp,
mp = icmp_pkt_err_ok_v6(q, mp, llbcast, B_FALSE, ipst);
if (mp == NULL)
return;
- nce = ndp_lookup_v6(ill, targetp, B_FALSE);
+ nce = ndp_lookup_v6(ill, B_TRUE, targetp, B_FALSE);
if (nce != NULL && nce->nce_state != ND_INCOMPLETE) {
ll_opt_len = (sizeof (nd_opt_hdr_t) +
ill->ill_phys_addr_length + 7)/8 * 8;
@@ -1908,31 +1845,8 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp,
rdh->nd_opt_rh_reserved1 = 0;
rdh->nd_opt_rh_reserved2 = 0;
/* ipif_v6src_addr contains the link-local source address */
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- if (ill->ill_group != NULL) {
- /*
- * The receiver of the redirect will verify whether it
- * had a route through us (srcp that we will use in
- * the redirect) or not. As we load spread even link-locals,
- * we don't know which source address the receiver of
- * redirect has in its route for communicating with us.
- * Thus we randomly choose a source here and finally we
- * should get to the right one and it will eventually
- * accept the redirect from us. We can't call
- * ip_lookup_scope_v6 because we don't have the right
- * link-local address here. Thus we randomly choose one.
- */
- int cnt = ill->ill_group->illgrp_ill_count;
+ srcp = &ill->ill_ipif->ipif_v6src_addr;
- ill = ill->ill_group->illgrp_ill;
- cnt = ++ipst->ips_icmp_redirect_v6_src_index % cnt;
- while (cnt--)
- ill = ill->ill_group_next;
- srcp = &ill->ill_ipif->ipif_v6src_addr;
- } else {
- srcp = &ill->ill_ipif->ipif_v6src_addr;
- }
- rw_exit(&ipst->ips_ill_g_lock);
/* Redirects sent by router, and router is global zone */
icmp_pkt_v6(q, mp, buf, len, srcp, B_FALSE, GLOBAL_ZONEID, ipst);
kmem_free(buf, len);
@@ -2231,6 +2145,7 @@ ip_bind_post_handling_v6(conn_t *connp, mblk_t *mp,
if (version_changed) {
ip_setpktversion(connp, connp->conn_pkt_isv6, B_TRUE, ipst);
}
+
/*
* Pass the IPSEC headers size in ire_ipsec_overhead.
* We can't do this in ip_bind_insert_ire because the policy
@@ -2771,8 +2686,8 @@ ip_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol,
}
if (ip6_asp_can_lookup(ipst)) {
src_ipif = ipif_select_source_v6(dst_ill,
- v6dst, RESTRICT_TO_NONE,
- connp->conn_src_preferences, zoneid);
+ v6dst, B_FALSE, connp->conn_src_preferences,
+ zoneid);
ip6_asp_table_refrele(ipst);
if (src_ipif == NULL) {
pr_addr_dbg("ip_bind_connected_v6: "
@@ -3111,7 +3026,15 @@ ip_add_info_v6(mblk_t *mp, ill_t *ill, const in6_addr_t *dst)
ip6i->ip6i_nxt = IPPROTO_RAW;
if (ill != NULL) {
ip6i->ip6i_flags = IP6I_IFINDEX;
- ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
+ /*
+ * If `ill' is in an IPMP group, make sure we use the IPMP
+ * interface index so that e.g. IPV6_RECVPKTINFO will get the
+ * IPMP interface index and not an underlying interface index.
+ */
+ if (IS_UNDER_IPMP(ill))
+ ip6i->ip6i_ifindex = ipmp_ill_get_ipmp_ifindex(ill);
+ else
+ ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
} else {
ip6i->ip6i_flags = 0;
}
@@ -4257,33 +4180,6 @@ ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h)
}
/*
- * Select an ill for the packet by considering load spreading across
- * a different ill in the group if dst_ill is part of some group.
- */
-static ill_t *
-ip_newroute_get_dst_ill_v6(ill_t *dst_ill)
-{
- ill_t *ill;
-
- /*
- * We schedule irrespective of whether the source address is
- * INADDR_UNSPECIED or not.
- */
- ill = illgrp_scheduler(dst_ill);
- if (ill == NULL)
- return (NULL);
-
- /*
- * For groups with names ip_sioctl_groupname ensures that all
- * ills are of same type. For groups without names, ifgrp_insert
- * ensures this.
- */
- ASSERT(dst_ill->ill_type == ill->ill_type);
-
- return (ill);
-}
-
-/*
* IPv6 -
* ip_newroute_v6 is called by ip_rput_data_v6 or ip_wput_v6 whenever we need
* to send out a packet to a destination address for which we do not have
@@ -4303,14 +4199,6 @@ ip_newroute_get_dst_ill_v6(ill_t *dst_ill)
* node sits at a site boundary).
* We create the cache entries in the regular ctable since
* it can not "confuse" things for other destinations.
- * table.
- *
- * When ill is part of a ill group, we subject the packets
- * to load spreading even if the ill is specified by the
- * means described above. We disable only for IPV6_BOUND_PIF
- * and for the cases where IP6I_ATTACH_IF is set i.e NS/NA/
- * Echo replies to link-local destinations have IP6I_ATTACH_IF
- * set.
*
* NOTE : These are the scopes of some of the variables that point at IRE,
* which needs to be followed while making any future modifications
@@ -4327,8 +4215,6 @@ ip_newroute_get_dst_ill_v6(ill_t *dst_ill)
*
* Thus on failures, we have to REFRELE only ire and sire, if they
* are not NULL.
- *
- * v6srcp may be used in the future. Currently unused.
*/
/* ARGSUSED */
void
@@ -4346,10 +4232,8 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
int err = 0;
mblk_t *first_mp;
ipsec_out_t *io;
- ill_t *attach_ill = NULL;
ushort_t ire_marks = 0;
int match_flags;
- boolean_t ip6i_present;
ire_t *first_sire = NULL;
mblk_t *copy_mp = NULL;
mblk_t *xmit_mp = NULL;
@@ -4359,7 +4243,6 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
boolean_t multirt_is_resolvable;
boolean_t multirt_resolve_next;
boolean_t need_rele = B_FALSE;
- boolean_t do_attach_ill = B_FALSE;
boolean_t ip6_asp_table_held = B_FALSE;
tsol_ire_gw_secattr_t *attrp = NULL;
tsol_gcgrp_t *gcgrp = NULL;
@@ -4376,39 +4259,12 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
io = NULL;
}
- /*
- * If this end point is bound to IPIF_NOFAILOVER, set bnf_ill and
- * bind_to_nofailover B_TRUE. We can't use conn to determine as it
- * could be NULL.
- *
- * This information can appear either in an ip6i_t or an IPSEC_OUT
- * message.
- */
ip6h = (ip6_t *)mp->b_rptr;
- ip6i_present = (ip6h->ip6_nxt == IPPROTO_RAW);
- if (ip6i_present || (io != NULL && io->ipsec_out_attach_if)) {
- if (!ip6i_present ||
- ((ip6i_t *)ip6h)->ip6i_flags & IP6I_ATTACH_IF) {
- attach_ill = ip_grab_attach_ill(ill, first_mp,
- (ip6i_present ? ((ip6i_t *)ip6h)->ip6i_ifindex :
- io->ipsec_out_ill_index), B_TRUE, ipst);
- /* Failure case frees things for us. */
- if (attach_ill == NULL)
- return;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(attach_ill))
- ire_marks = IRE_MARK_HIDDEN;
- }
- }
if (IN6_IS_ADDR_LOOPBACK(v6dstp)) {
ip1dbg(("ip_newroute_v6: dst with loopback addr\n"));
goto icmp_err_ret;
- } else if ((v6srcp != NULL) && IN6_IS_ADDR_LOOPBACK(v6srcp)) {
+ } else if (IN6_IS_ADDR_LOOPBACK(v6srcp)) {
ip1dbg(("ip_newroute_v6: src with loopback addr\n"));
goto icmp_err_ret;
}
@@ -4436,30 +4292,24 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0,
NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp),
match_flags, ipst);
+ } else {
+ match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
+ MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL;
+ match_flags |= MATCH_IRE_PARENT | MATCH_IRE_SECATTR;
+
/*
- * ire_add_then_send -> ip_newroute_v6 in the CGTP case passes
- * in a NULL ill, but the packet could be a neighbor
- * solicitation/advertisment and could have a valid attach_ill.
+ * Because nce_xmit() calls ip_output_v6() and NCEs are always
+ * tied to an underlying interface, IS_UNDER_IPMP() may be
+ * true even when building IREs that will be used for data
+ * traffic. As such, use the packet's source address to
+ * determine whether the traffic is test traffic, and set
+ * MATCH_IRE_MARK_TESTHIDDEN if so.
*/
- if (attach_ill != NULL)
- ill_refrele(attach_ill);
- } else {
- if (attach_ill != NULL) {
- /*
- * attach_ill is set only for communicating with
- * on-link hosts. So, don't look for DEFAULT.
- * ip_wput_v6 passes the right ill in this case and
- * hence we can assert.
- */
- ASSERT(ill == attach_ill);
- ill_refrele(attach_ill);
- do_attach_ill = B_TRUE;
- match_flags = MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL;
- } else {
- match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL_GROUP;
+ if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) {
+ if (ipif_lookup_testaddr_v6(ill, v6srcp, NULL))
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
}
- match_flags |= MATCH_IRE_PARENT | MATCH_IRE_SECATTR;
+
ire = ire_ftable_lookup_v6(v6dstp, NULL, NULL, 0, ill->ill_ipif,
&sire, zoneid, 0, MBLK_GETLABEL(mp), match_flags, ipst);
}
@@ -4601,106 +4451,56 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
}
/*
- * We have a route to reach the destination.
- *
- * 1) If the interface is part of ill group, try to get a new
- * ill taking load spreading into account.
+ * We have a route to reach the destination. Find the
+ * appropriate ill, then get a source address that matches the
+ * right scope via ipif_select_source_v6().
*
- * 2) After selecting the ill, get a source address that might
- * create good inbound load spreading and that matches the
- * right scope. ipif_select_source_v6 does this for us.
+ * If we are here trying to create an IRE_CACHE for an offlink
+ * destination and have an IRE_CACHE entry for VNI, then use
+ * ire_stq instead since VNI's queue is a black hole.
*
- * If the application specified the ill (ifindex), we still
- * load spread. Only if the packets needs to go out specifically
- * on a given ill e.g. bind to IPIF_NOFAILOVER address,
- * IPV6_BOUND_PIF we don't try to use a different ill for load
- * spreading.
+ * Note: While we pick a dst_ill we are really only interested
+ * in the ill for load spreading. The source ipif is
+ * determined by source address selection below.
*/
- if (!do_attach_ill) {
- /*
- * If the interface belongs to an interface group,
- * make sure the next possible interface in the group
- * is used. This encourages load spreading among
- * peers in an interface group. However, in the case
- * of multirouting, load spreading is not used, as we
- * actually want to replicate outgoing packets through
- * particular interfaces.
- *
- * Note: While we pick a dst_ill we are really only
- * interested in the ill for load spreading.
- * The source ipif is determined by source address
- * selection below.
- */
- if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) {
- dst_ill = ire->ire_ipif->ipif_ill;
- /* For uniformity do a refhold */
- ill_refhold(dst_ill);
+ if ((ire->ire_type == IRE_CACHE) &&
+ IS_VNI(ire->ire_ipif->ipif_ill)) {
+ dst_ill = ire->ire_stq->q_ptr;
+ ill_refhold(dst_ill);
+ } else {
+ ill_t *ill = ire->ire_ipif->ipif_ill;
+
+ if (IS_IPMP(ill)) {
+ dst_ill =
+ ipmp_illgrp_hold_next_ill(ill->ill_grp);
} else {
- /*
- * If we are here trying to create an IRE_CACHE
- * for an offlink destination and have the
- * IRE_CACHE for the next hop and the latter is
- * using virtual IP source address selection i.e
- * it's ire->ire_ipif is pointing to a virtual
- * network interface (vni) then
- * ip_newroute_get_dst_ll() will return the vni
- * interface as the dst_ill. Since the vni is
- * virtual i.e not associated with any physical
- * interface, it cannot be the dst_ill, hence
- * in such a case call ip_newroute_get_dst_ll()
- * with the stq_ill instead of the ire_ipif ILL.
- * The function returns a refheld ill.
- */
- if ((ire->ire_type == IRE_CACHE) &&
- IS_VNI(ire->ire_ipif->ipif_ill))
- dst_ill = ip_newroute_get_dst_ill_v6(
- ire->ire_stq->q_ptr);
- else
- dst_ill = ip_newroute_get_dst_ill_v6(
- ire->ire_ipif->ipif_ill);
+ dst_ill = ill;
+ ill_refhold(dst_ill);
}
- if (dst_ill == NULL) {
- if (ip_debug > 2) {
- pr_addr_dbg("ip_newroute_v6 : no dst "
- "ill for dst %s\n",
- AF_INET6, v6dstp);
- }
- goto icmp_err_ret;
- } else if (dst_ill->ill_group == NULL && ill != NULL &&
- dst_ill != ill) {
- /*
- * If "ill" is not part of any group, we should
- * have found a route matching "ill" as we
- * called ire_ftable_lookup_v6 with
- * MATCH_IRE_ILL_GROUP.
- * Rather than asserting when there is a
- * mismatch, we just drop the packet.
- */
- ip0dbg(("ip_newroute_v6: BOUND_IF failed : "
- "dst_ill %s ill %s\n",
- dst_ill->ill_name,
- ill->ill_name));
- goto icmp_err_ret;
+ }
+
+ if (dst_ill == NULL) {
+ if (ip_debug > 2) {
+ pr_addr_dbg("ip_newroute_v6 : no dst "
+ "ill for dst %s\n", AF_INET6, v6dstp);
}
- } else {
- dst_ill = ire->ire_ipif->ipif_ill;
- /* For uniformity do refhold */
- ill_refhold(dst_ill);
+ goto icmp_err_ret;
+ }
+
+ if (ill != NULL && dst_ill != ill &&
+ !IS_IN_SAME_ILLGRP(dst_ill, ill)) {
/*
- * We should have found a route matching ill as we
- * called ire_ftable_lookup_v6 with MATCH_IRE_ILL.
- * Rather than asserting, while there is a mismatch,
- * we just drop the packet.
+ * We should have found a route matching "ill"
+ * as we called ire_ftable_lookup_v6 with
+ * MATCH_IRE_ILL. Rather than asserting when
+ * there is a mismatch, we just drop the packet.
*/
- if (dst_ill != ill) {
- ip0dbg(("ip_newroute_v6: Packet dropped as "
- "IP6I_ATTACH_IF ill is %s, "
- "ire->ire_ipif->ipif_ill is %s\n",
- ill->ill_name,
- dst_ill->ill_name));
- goto icmp_err_ret;
- }
+ ip0dbg(("ip_newroute_v6: BOUND_IF failed: "
+ "dst_ill %s ill %s\n", dst_ill->ill_name,
+ ill->ill_name));
+ goto icmp_err_ret;
}
+
/*
* Pick a source address which matches the scope of the
* destination address.
@@ -4708,7 +4508,20 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
* parent ire (sire).
*/
ASSERT(src_ipif == NULL);
- if (ire->ire_type == IRE_IF_RESOLVER &&
+
+ /*
+ * Because nce_xmit() calls ip_output_v6() and NCEs are always
+ * tied to the underlying interface, IS_UNDER_IPMP() may be
+ * true even when building IREs that will be used for data
+ * traffic. As such, see if the packet's source address is a
+ * test address, and if so use that test address's ipif for
+ * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in
+ * ire_add_v6() can work properly.
+ */
+ if (ill != NULL && IS_UNDER_IPMP(ill))
+ (void) ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif);
+
+ if (src_ipif == NULL && ire->ire_type == IRE_IF_RESOLVER &&
!IN6_IS_ADDR_UNSPECIFIED(&v6gw) &&
ip6_asp_can_lookup(ipst)) {
/*
@@ -4718,10 +4531,10 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
*/
ip6_asp_table_held = B_TRUE;
src_ipif = ipif_select_source_v6(dst_ill, &v6gw,
- RESTRICT_TO_GROUP, IPV6_PREFER_SRC_DEFAULT, zoneid);
+ B_TRUE, IPV6_PREFER_SRC_DEFAULT, zoneid);
if (src_ipif != NULL)
ire_marks |= IRE_MARK_USESRC_CHECK;
- } else {
+ } else if (src_ipif == NULL) {
if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) {
/*
* Check that the ipif matching the requested
@@ -4732,14 +4545,9 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
NULL, NULL, NULL, NULL, ipst);
}
if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) {
- uint_t restrict_ill = RESTRICT_TO_NONE;
-
- if (ip6i_present && ((ip6i_t *)ip6h)->ip6i_flags
- & IP6I_ATTACH_IF)
- restrict_ill = RESTRICT_TO_ILL;
ip6_asp_table_held = B_TRUE;
src_ipif = ipif_select_source_v6(dst_ill,
- v6dstp, restrict_ill,
+ v6dstp, B_FALSE,
IPV6_PREFER_SRC_DEFAULT, zoneid);
if (src_ipif != NULL)
ire_marks |= IRE_MARK_USESRC_CHECK;
@@ -4750,7 +4558,7 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
if (ip_debug > 2) {
/* ip1dbg */
pr_addr_dbg("ip_newroute_v6: no src for "
- "dst %s\n, ", AF_INET6, v6dstp);
+ "dst %s\n", AF_INET6, v6dstp);
printf("ip_newroute_v6: interface name %s\n",
dst_ill->ill_name);
}
@@ -4837,14 +4645,7 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
"ire_ihandle_lookup_offlink_v6 failed\n"));
goto icmp_err_ret;
}
- /*
- * Assume DL_UNITDATA_REQ is same for all physical
- * interfaces in the ifgrp. If it isn't, this code will
- * have to be seriously rewhacked to allow the
- * fastpath probing (such that I cache the link
- * header in the IRE_CACHE) to work over ifgrps.
- * We have what we need to build an IRE_CACHE.
- */
+
/*
* Note: the new ire inherits RTF_SETSRC
* and RTF_MULTIRT to propagate these flags from prefix
@@ -5659,24 +5460,22 @@ icmp_err_ret:
*/
void
ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
- in6_addr_t v6dst, int unspec_src, zoneid_t zoneid)
+ const in6_addr_t *v6dstp, const in6_addr_t *v6srcp, int unspec_src,
+ zoneid_t zoneid)
{
ire_t *ire = NULL;
ipif_t *src_ipif = NULL;
int err = 0;
ill_t *dst_ill = NULL;
ire_t *save_ire;
- ushort_t ire_marks = 0;
ipsec_out_t *io;
- ill_t *attach_ill = NULL;
ill_t *ill;
- ip6_t *ip6h;
mblk_t *first_mp;
- boolean_t ip6i_present;
ire_t *fire = NULL;
mblk_t *copy_mp = NULL;
+ const in6_addr_t *ire_v6srcp;
+ boolean_t probe = B_FALSE;
boolean_t multirt_resolve_next;
- in6_addr_t *v6dstp = &v6dst;
boolean_t ipif_held = B_FALSE;
boolean_t ill_held = B_FALSE;
boolean_t ip6_asp_table_held = B_FALSE;
@@ -5728,35 +5527,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
if (!(ill->ill_flags & ILLF_MULTICAST)) {
goto err_ret;
}
- /*
- * If this end point is bound to IPIF_NOFAILOVER, set bnf_ill
- * and bind_to_nofailover B_TRUE. We can't use conn to determine
- * as it could be NULL.
- *
- * This information can appear either in an ip6i_t or an
- * IPSEC_OUT message.
- */
- ip6h = (ip6_t *)mp->b_rptr;
- ip6i_present = (ip6h->ip6_nxt == IPPROTO_RAW);
- if (ip6i_present || (io != NULL && io->ipsec_out_attach_if)) {
- if (!ip6i_present ||
- ((ip6i_t *)ip6h)->ip6i_flags & IP6I_ATTACH_IF) {
- attach_ill = ip_grab_attach_ill(ill, first_mp,
- (ip6i_present ?
- ((ip6i_t *)ip6h)->ip6i_ifindex :
- io->ipsec_out_ill_index), B_TRUE, ipst);
- /* Failure case frees things for us. */
- if (attach_ill == NULL)
- return;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(attach_ill))
- ire_marks = IRE_MARK_HIDDEN;
- }
- }
/*
* We check if an IRE_OFFSUBNET for the addr that goes through
@@ -5770,76 +5540,93 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
(void *)ipif, ntohl(V4_PART_OF_V6((*v6dstp))),
(void *)fire));
+ ASSERT(src_ipif == NULL);
+
/*
- * If the application specified the ill (ifindex), we still
- * load spread. Only if the packets needs to go out specifically
- * on a given ill e.g. binding to IPIF_NOFAILOVER address or
- * IPV6_BOUND_PIF, or there is a parent ire entry that specified
- * multirouting, then we don't try to use a different ill for
- * load spreading.
+ * Because nce_xmit() calls ip_output_v6() and NCEs are always
+ * tied to the underlying interface, IS_UNDER_IPMP() may be
+ * true even when building IREs that will be used for data
+ * traffic. As such, see if the packet's source address is a
+ * test address, and if so use that test address's ipif for
+ * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in
+ * ire_add_v6() can work properly.
+ */
+ if (IS_UNDER_IPMP(ill))
+ probe = ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif);
+
+ /*
+ * Determine the outbound (destination) ill for this route.
+ * If IPMP is not in use, that's the same as our ill. If IPMP
+ * is in use and we're on the IPMP interface, or we're on an
+ * underlying ill but sending data traffic, use a suitable
+ * destination ill from the group. The latter case covers a
+ * subtle edge condition with multicast: when we bring up an
+ * IPv6 data address, we will create an NCE on an underlying
+ * interface, and send solicitations to ff02::1, which would
+ * take us through here, and cause us to create an IRE for
+ * ff02::1. To meet our defined semantics for multicast (and
+ * ensure there aren't unexpected echoes), that IRE needs to
+ * use the IPMP group's nominated multicast interface.
+ *
+ * Note: the source ipif is determined by source address
+ * selection later.
*/
- if (attach_ill == NULL) {
- /*
- * If the interface belongs to an interface group,
- * make sure the next possible interface in the group
- * is used. This encourages load spreading among peers
- * in an interface group.
- *
- * Note: While we pick a dst_ill we are really only
- * interested in the ill for load spreading. The source
- * ipif is determined by source address selection below.
- */
- if ((fire != NULL) && (fire->ire_flags & RTF_MULTIRT)) {
- dst_ill = ipif->ipif_ill;
- /* For uniformity do a refhold */
- ill_refhold(dst_ill);
+ if (IS_IPMP(ill) || (IS_UNDER_IPMP(ill) && !probe)) {
+ ill_t *ipmp_ill;
+ ipmp_illgrp_t *illg;
+
+ if (IS_UNDER_IPMP(ill)) {
+ ipmp_ill = ipmp_ill_hold_ipmp_ill(ill);
} else {
- /* refheld by ip_newroute_get_dst_ill_v6 */
- dst_ill =
- ip_newroute_get_dst_ill_v6(ipif->ipif_ill);
+ ipmp_ill = ill;
+ ill_refhold(ipmp_ill); /* for symmetry */
}
- if (dst_ill == NULL) {
- if (ip_debug > 2) {
- pr_addr_dbg("ip_newroute_ipif_v6: "
- "no dst ill for dst %s\n",
- AF_INET6, v6dstp);
- }
+
+ if (ipmp_ill == NULL)
goto err_ret;
- }
+
+ illg = ipmp_ill->ill_grp;
+ if (IN6_IS_ADDR_MULTICAST(v6dstp))
+ dst_ill = ipmp_illgrp_hold_cast_ill(illg);
+ else
+ dst_ill = ipmp_illgrp_hold_next_ill(illg);
+
+ ill_refrele(ipmp_ill);
} else {
- dst_ill = ipif->ipif_ill;
- /*
- * ip_wput_v6 passes the right ipif for IPIF_NOFAILOVER
- * and IPV6_BOUND_PIF case.
- */
- ASSERT(dst_ill == attach_ill);
- /* attach_ill is already refheld */
+ dst_ill = ill;
+ ill_refhold(dst_ill); /* for symmetry */
+ }
+
+ if (dst_ill == NULL) {
+ if (ip_debug > 2) {
+ pr_addr_dbg("ip_newroute_ipif_v6: "
+ "no dst ill for dst %s\n",
+ AF_INET6, v6dstp);
+ }
+ goto err_ret;
}
+
/*
* Pick a source address which matches the scope of the
* destination address.
* For RTF_SETSRC routes, the source address is imposed by the
* parent ire (fire).
*/
- ASSERT(src_ipif == NULL);
- if ((fire != NULL) && (fire->ire_flags & RTF_SETSRC)) {
+
+ if (src_ipif == NULL && fire != NULL &&
+ (fire->ire_flags & RTF_SETSRC)) {
/*
* Check that the ipif matching the requested source
* address still exists.
*/
- src_ipif =
- ipif_lookup_addr_v6(&fire->ire_src_addr_v6,
+ src_ipif = ipif_lookup_addr_v6(&fire->ire_src_addr_v6,
NULL, zoneid, NULL, NULL, NULL, NULL, ipst);
}
- if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) {
- uint_t restrict_ill = RESTRICT_TO_NONE;
- if (ip6i_present && ((ip6i_t *)ip6h)->ip6i_flags
- & IP6I_ATTACH_IF)
- restrict_ill = RESTRICT_TO_ILL;
+ if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) {
ip6_asp_table_held = B_TRUE;
src_ipif = ipif_select_source_v6(dst_ill, v6dstp,
- restrict_ill, IPV6_PREFER_SRC_DEFAULT, zoneid);
+ B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid);
}
if (src_ipif == NULL) {
@@ -5847,16 +5634,20 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
if (ip_debug > 2) {
/* ip1dbg */
pr_addr_dbg("ip_newroute_ipif_v6: "
- "no src for dst %s\n,",
+ "no src for dst %s\n",
AF_INET6, v6dstp);
printf(" through interface %s\n",
dst_ill->ill_name);
}
goto err_ret;
}
+ ire_v6srcp = &ipv6_all_zeros;
src_ipif = ipif;
ipif_refhold(src_ipif);
+ } else {
+ ire_v6srcp = &src_ipif->ipif_v6src_addr;
}
+
ire = ipif_to_ire_v6(ipif);
if (ire == NULL) {
if (ip_debug > 2) {
@@ -5903,7 +5694,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
}
}
- ASSERT((attach_ill == NULL) || (dst_ill == attach_ill));
switch (ire->ire_type) {
case IRE_IF_NORESOLVER: {
/*
@@ -5921,7 +5711,7 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
ire = ire_create_v6(
v6dstp, /* dest address */
&ipv6_all_ones, /* mask */
- &src_ipif->ipif_v6src_addr, /* source address */
+ ire_v6srcp, /* source address */
NULL, /* gateway address */
&save_ire->ire_max_frag,
NULL, /* no src nce */
@@ -5946,8 +5736,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
break;
}
- ire->ire_marks |= ire_marks;
-
err = ndp_noresolver(dst_ill, v6dstp);
if (err != 0) {
ire_refrele(save_ire);
@@ -6051,7 +5839,7 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
ire = ire_create_v6(
v6dstp, /* dest address */
&ipv6_all_ones, /* mask */
- &src_ipif->ipif_v6src_addr, /* source address */
+ ire_v6srcp, /* source address */
NULL, /* gateway address */
&save_ire->ire_max_frag,
NULL, /* src nce */
@@ -6076,8 +5864,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
break;
}
- ire->ire_marks |= ire_marks;
-
/* Resolve and add ire to the ctable */
err = ndp_resolver(dst_ill, v6dstp, first_mp, zoneid);
switch (err) {
@@ -6273,8 +6059,8 @@ err_ret:
ipif_refrele(ipif);
if (src_ipif != NULL)
ipif_refrele(src_ipif);
+
/* Multicast - no point in trying to generate ICMP error */
- ASSERT((attach_ill == NULL) || (dst_ill == attach_ill));
if (dst_ill != NULL) {
ill = dst_ill;
ill_held = B_TRUE;
@@ -6499,7 +6285,7 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
&ip6h->ip6_dst)) {
ipif = ipif_select_source_v6(
ill, &ip6h->ip6_src,
- RESTRICT_TO_GROUP,
+ B_TRUE,
IPV6_PREFER_SRC_DEFAULT,
ALL_ZONES);
if (ipif != NULL) {
@@ -7050,7 +6836,7 @@ ipsec_needs_processing_v6(mblk_t *mp, uint8_t *nexthdr)
*/
static boolean_t
ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present,
- ill_t *ill, mblk_t *hada_mp, zoneid_t zoneid)
+ ill_t *ill, ill_t *inill, mblk_t *hada_mp, zoneid_t zoneid)
{
mblk_t *mp;
uint8_t nexthdr;
@@ -7093,7 +6879,7 @@ ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present,
*/
ii = (ipsec_in_t *)first_mp->b_rptr;
ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index = ii->ipsec_in_ill_index;
+ ii->ipsec_in_rill_index = inill->ill_phyint->phyint_ifindex;
first_mp->b_cont = mp;
}
/*
@@ -7122,7 +6908,7 @@ ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present,
switch (ipsec_rc) {
case IPSEC_STATUS_SUCCESS:
/* we're done with IPsec processing, send it up */
- ip_fanout_proto_again(first_mp, ill, ill, NULL);
+ ip_fanout_proto_again(first_mp, ill, inill, NULL);
break;
case IPSEC_STATUS_FAILED:
BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards);
@@ -7225,7 +7011,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
ip6_hbh_t *hbhhdr;
boolean_t ll_multicast = (flags & IP6_IN_LLMCAST);
conn_t *connp;
- ilm_t *ilm;
uint32_t ports;
zoneid_t zoneid = GLOBAL_ZONEID;
uint16_t hck_flags, reass_hck_flags;
@@ -7347,10 +7132,8 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
/*
* XXX TODO Give to mrouted to for multicast forwarding.
*/
- ILM_WALKER_HOLD(ill);
- ilm = ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, ALL_ZONES);
- ILM_WALKER_RELE(ill);
- if (ilm == NULL) {
+ if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE,
+ ALL_ZONES) == NULL) {
if (ip_debug > 3) {
/* ip2dbg */
pr_addr_dbg("ip_rput_data_v6: got mcast packet"
@@ -7405,7 +7188,7 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst)) {
ire = ire_ctable_lookup_v6(&ip6h->ip6_dst, NULL,
IRE_CACHE|IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL,
- MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst);
+ MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
} else {
ire = ire_cache_lookup_v6(&ip6h->ip6_dst, ALL_ZONES,
MBLK_GETLABEL(mp), ipst);
@@ -7466,9 +7249,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
}
/* we have a matching IRE */
if (ire->ire_stq != NULL) {
- ill_group_t *ill_group;
- ill_group_t *ire_group;
-
/*
* To be quicker, we may wish not to chase pointers
* (ire->ire_ipif->ipif_ill...) and instead store the
@@ -7483,7 +7263,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
no_forward = ((ill->ill_flags &
ire->ire_ipif->ipif_ill->ill_flags & ILLF_ROUTER) == 0);
-
ASSERT(first_mp == mp);
/*
* This ire has a send-to queue - forward the packet.
@@ -7568,10 +7347,8 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
* we're forwarding onto the same link), conditionally send
* a redirect message.
*/
- ill_group = ill->ill_group;
- ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group;
- if (ire->ire_rfq != q && (ill_group == NULL ||
- ill_group != ire_group)) {
+ if (ire->ire_rfq != q &&
+ !IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) {
if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst) ||
IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) {
BUMP_MIB(ill->ill_ip_mib,
@@ -8006,7 +7783,10 @@ tcp_fanout:
* where there is no conn.
*/
if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
- ASSERT(!IS_LOOPBACK((ill)));
+ ilm_t *ilm;
+ ilm_walker_t ilw;
+
+ ASSERT(!IS_LOOPBACK(ill));
/*
* In the multicast case, applications may have
* joined the group from different zones, so we
@@ -8015,32 +7795,32 @@ tcp_fanout:
* structures (ilm) on the receive ill and send
* a copy of the packet up each matching one.
*/
- ILM_WALKER_HOLD(ill);
- for (ilm = ill->ill_ilm; ilm != NULL;
- ilm = ilm->ilm_next) {
- if (ilm->ilm_flags & ILM_DELETED)
- continue;
+ ilm = ilm_walker_start(&ilw, inill);
+ for (; ilm != NULL;
+ ilm = ilm_walker_step(&ilw, ilm)) {
if (!IN6_ARE_ADDR_EQUAL(
&ilm->ilm_v6addr, &ip6h->ip6_dst))
continue;
- if (!ipif_lookup_zoneid(ill,
- ilm->ilm_zoneid, IPIF_UP, NULL))
+ if (!ipif_lookup_zoneid(
+ ilw.ilw_walk_ill, ilm->ilm_zoneid,
+ IPIF_UP, NULL))
continue;
first_mp1 = ip_copymsg(first_mp);
if (first_mp1 == NULL)
continue;
- icmp_inbound_v6(q, first_mp1, ill,
+ icmp_inbound_v6(q, first_mp1,
+ ilw.ilw_walk_ill, inill,
hdr_len, mctl_present, 0,
ilm->ilm_zoneid, dl_mp);
}
- ILM_WALKER_RELE(ill);
+ ilm_walker_finish(&ilw);
} else {
first_mp1 = ip_copymsg(first_mp);
if (first_mp1 != NULL)
icmp_inbound_v6(q, first_mp1, ill,
- hdr_len, mctl_present, 0, zoneid,
- dl_mp);
+ inill, hdr_len, mctl_present, 0,
+ zoneid, dl_mp);
}
}
/* FALLTHRU */
@@ -8082,7 +7862,7 @@ tcp_fanout:
/* Check if AH is present. */
if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill,
- hada_mp, zoneid)) {
+ inill, hada_mp, zoneid)) {
ip0dbg(("dst early hada drop\n"));
return;
}
@@ -8206,7 +7986,7 @@ tcp_fanout:
/* Restore the flags */
DB_CKSUMFLAGS(mp) = hck_flags;
- mp = ip_rput_frag_v6(q, mp, ip6h, fraghdr,
+ mp = ip_rput_frag_v6(ill, inill, mp, ip6h, fraghdr,
remlen - used, &prev_nexthdr_offset,
&reass_sum, &reass_hck_flags);
if (mp == NULL) {
@@ -8249,7 +8029,7 @@ tcp_fanout:
/* Check if AH is present. */
if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill,
- hada_mp, zoneid)) {
+ inill, hada_mp, zoneid)) {
ip0dbg(("routing hada drop\n"));
return;
}
@@ -8322,7 +8102,7 @@ tcp_fanout:
ii->ipsec_in_ill_index =
ill->ill_phyint->phyint_ifindex;
ii->ipsec_in_rill_index =
- ii->ipsec_in_ill_index;
+ inill->ill_phyint->phyint_ifindex;
first_mp->b_cont = mp;
/*
* Cache hardware acceleration info.
@@ -8480,11 +8260,10 @@ hada_drop:
* nexthdr field when reassembly completes.
*/
static mblk_t *
-ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
+ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset,
uint32_t *cksum_val, uint16_t *cksum_flags)
{
- ill_t *ill = (ill_t *)q->q_ptr;
uint32_t ident = ntohl(fraghdr->ip6f_ident);
uint16_t offset;
boolean_t more_frags;
@@ -8518,8 +8297,8 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
* addition, checksum offload support for IP fragments carrying
* UDP payload is commonly implemented across network adapters.
*/
- ASSERT(ill != NULL);
- if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) &&
+ ASSERT(inill != NULL);
+ if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(inill) &&
(DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
mblk_t *mp1 = mp->b_cont;
int32_t len;
@@ -8581,7 +8360,7 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
freemsg(mp);
return (NULL);
}
- icmp_param_problem_v6(WR(q), mp, ICMP6_PARAMPROB_HEADER,
+ icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER,
(uint32_t)((char *)&ip6h->ip6_plen -
(char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst);
return (NULL);
@@ -8607,7 +8386,7 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
freemsg(mp);
return (NULL);
}
- icmp_param_problem_v6(WR(q), mp, ICMP6_PARAMPROB_HEADER,
+ icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER,
(uint32_t)((char *)&fraghdr->ip6f_offlg -
(char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst);
return (NULL);
@@ -9204,16 +8983,14 @@ ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst)
* The routine can handle an ICMPv6 header that is not in the first mblk.
*
* The order to determine the outgoing interface is as follows:
- * 1. IPV6_BOUND_PIF is set, use that ill (conn_outgoing_pill)
- * 2. If conn_nofailover_ill is set then use that ill.
- * 3. If an ip6i_t with IP6I_IFINDEX set then use that ill.
- * 4. If q is an ill queue and (link local or multicast destination) then
+ * 1. If an ip6i_t with IP6I_IFINDEX set then use that ill.
+ * 2. If q is an ill queue and (link local or multicast destination) then
* use that ill.
- * 5. If IPV6_BOUND_IF has been set use that ill.
- * 6. For multicast: if IPV6_MULTICAST_IF has been set use it. Otherwise
+ * 3. If IPV6_BOUND_IF has been set use that ill.
+ * 4. For multicast: if IPV6_MULTICAST_IF has been set use it. Otherwise
* look for the best IRE match for the unspecified group to determine
* the ill.
- * 7. For unicast: Just do an IRE lookup for the best match.
+ * 5. For unicast: Just do an IRE lookup for the best match.
*
* arg2 is always a queue_t *.
* When that queue is an ill_t (i.e. q_next != NULL), then arg must be
@@ -9238,12 +9015,10 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
int unspec_src;
boolean_t do_outrequests; /* Increment OutRequests? */
mib2_ipIfStatsEntry_t *mibptr;
- int match_flags = MATCH_IRE_ILL_GROUP;
- boolean_t attach_if = B_FALSE;
+ int match_flags = MATCH_IRE_ILL;
mblk_t *first_mp;
boolean_t mctl_present;
ipsec_out_t *io;
- boolean_t drop_if_delayed = B_FALSE;
boolean_t multirt_need_resolve = B_FALSE;
mblk_t *copy_mp = NULL;
int err = 0;
@@ -9574,16 +9349,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
*/
mp->b_rptr = (uchar_t *)ip6h;
- /*
- * IP6I_ATTACH_IF is set in this function when we had a
- * conn and it was either bound to the IPFF_NOFAILOVER address
- * or IPV6_BOUND_PIF was set. These options override other
- * options that set the ifindex. We come here with
- * IP6I_ATTACH_IF set when we can't find the ire and
- * ip_newroute_v6 is feeding the packet for second time.
- */
- if ((ip6i->ip6i_flags & IP6I_IFINDEX) ||
- (ip6i->ip6i_flags & IP6I_ATTACH_IF)) {
+ if (ip6i->ip6i_flags & IP6I_IFINDEX) {
ASSERT(ip6i->ip6i_ifindex != 0);
if (ill != NULL)
ill_refrele(ill);
@@ -9603,33 +9369,13 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
return;
}
mibptr = ill->ill_ip_mib;
- if (ip6i->ip6i_flags & IP6I_IFINDEX) {
- /*
- * Preserve the index so that when we return
- * from IPSEC processing, we know where to
- * send the packet.
- */
- if (mctl_present) {
- ASSERT(io != NULL);
- io->ipsec_out_ill_index =
- ip6i->ip6i_ifindex;
- }
- }
- if (ip6i->ip6i_flags & IP6I_ATTACH_IF) {
- /*
- * This is a multipathing probe packet that has
- * been delayed in ND resolution. Drop the
- * packet for the reasons mentioned in
- * nce_queue_mp()
- */
- if ((ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) &&
- (ip6i->ip6i_flags & IP6I_ND_DELAYED)) {
- freemsg(first_mp);
- ill_refrele(ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
+ /*
+ * Preserve the index so that when we return from
+ * IPSEC processing, we know where to send the packet.
+ */
+ if (mctl_present) {
+ ASSERT(io != NULL);
+ io->ipsec_out_ill_index = ip6i->ip6i_ifindex;
}
}
if (ip6i->ip6i_flags & IP6I_VERIFY_SRC) {
@@ -9698,114 +9444,20 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
if (IN6_IS_ADDR_MULTICAST(v6dstp))
goto ipv6multicast;
- /* 1. IPV6_BOUND_PIF takes precedence over all the ifindex settings. */
- if (connp != NULL && connp->conn_outgoing_pill != NULL) {
- ill_t *conn_outgoing_pill;
-
- conn_outgoing_pill = conn_get_held_ill(connp,
- &connp->conn_outgoing_pill, &err);
- if (err == ILL_LOOKUP_FAILED) {
- if (ill != NULL)
- ill_refrele(ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- return;
- }
- if (conn_outgoing_pill != NULL) {
- if (ill != NULL)
- ill_refrele(ill);
- ill = conn_outgoing_pill;
- attach_if = B_TRUE;
- match_flags = MATCH_IRE_ILL;
- mibptr = ill->ill_ip_mib;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill))
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- goto send_from_ill;
- }
- }
-
- /* 2. If ipc_nofailover_ill is set then use that ill. */
- if (connp != NULL && connp->conn_nofailover_ill != NULL) {
- ill_t *conn_nofailover_ill;
-
- conn_nofailover_ill = conn_get_held_ill(connp,
- &connp->conn_nofailover_ill, &err);
- if (err == ILL_LOOKUP_FAILED) {
- if (ill != NULL)
- ill_refrele(ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- return;
- }
- if (conn_nofailover_ill != NULL) {
- if (ill != NULL)
- ill_refrele(ill);
- ill = conn_nofailover_ill;
- attach_if = B_TRUE;
- /*
- * Assumes that ipc_nofailover_ill is used only for
- * multipathing probe packets. These packets are better
- * dropped, if they are delayed in ND resolution, for
- * the reasons described in nce_queue_mp().
- * IP6I_DROP_IFDELAYED will be set later on in this
- * function for this packet.
- */
- drop_if_delayed = B_TRUE;
- match_flags = MATCH_IRE_ILL;
- mibptr = ill->ill_ip_mib;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill))
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- goto send_from_ill;
- }
- }
-
- /*
- * Redo 1. If we did not find an IRE_CACHE the first time, we should
- * have an ip6i_t with IP6I_ATTACH_IF if IPV6_BOUND_PIF or
- * bind to the IPIF_NOFAILOVER address was used on this endpoint.
- */
- if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_ATTACH_IF)) {
- ASSERT(ip6i->ip6i_ifindex != 0);
- attach_if = B_TRUE;
- ASSERT(ill != NULL);
- match_flags = MATCH_IRE_ILL;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill))
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- goto send_from_ill;
- }
-
- /* 3. If an ip6i_t with IP6I_IFINDEX set then use that ill. */
+ /* 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. */
if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) {
ASSERT(ill != NULL);
goto send_from_ill;
}
/*
- * 4. If q is an ill queue and (link local or multicast destination)
+ * 2. If q is an ill queue and there's a link-local destination
* then use that ill.
*/
- if (ill != NULL && IN6_IS_ADDR_LINKLOCAL(v6dstp)) {
+ if (ill != NULL && IN6_IS_ADDR_LINKLOCAL(v6dstp))
goto send_from_ill;
- }
- /* 5. If IPV6_BOUND_IF has been set use that ill. */
+ /* 3. If IPV6_BOUND_IF has been set use that ill. */
if (connp != NULL && connp->conn_outgoing_ill != NULL) {
ill_t *conn_outgoing_ill;
@@ -9827,7 +9479,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
}
/*
- * 6. For unicast: Just do an IRE lookup for the best match.
+ * 5. For unicast: Just do an IRE lookup for the best match.
* If we get here for a link-local address it is rather random
* what interface we pick on a multihomed host.
* *If* there is an IRE_CACHE (and the link-local address
@@ -9913,7 +9565,6 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
}
BUMP_MIB(mibptr, ipIfStatsHCOutRequests);
}
- ASSERT(!attach_if);
/*
* Check if the ire has the RTF_MULTIRT flag, inherited
@@ -9966,7 +9617,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
}
}
ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request,
- connp, caller, 0, ip6i_flags, zoneid);
+ connp, caller, ip6i_flags, zoneid);
if (need_decref) {
CONN_DEC_REF(connp);
connp = NULL;
@@ -10086,9 +9737,6 @@ ipv6multicast:
ip2dbg(("ip_wput_v6: multicast\n"));
/*
- * 1. IPV6_BOUND_PIF takes precedence over all the ifindex settings
- * 2. If conn_nofailover_ill is set then use that ill.
- *
* Hold the conn_lock till we refhold the ill of interest that is
* pointed to from the conn. Since we cannot do an ill/ipif_refrele
* while holding any locks, postpone the refrele until after the
@@ -10100,79 +9748,12 @@ ipv6multicast:
} else {
conn_lock_held = B_FALSE;
}
- if (connp != NULL && connp->conn_outgoing_pill != NULL) {
- err = ill_check_and_refhold(connp->conn_outgoing_pill);
- if (err == ILL_LOOKUP_FAILED) {
- ip1dbg(("ip_output_v6: multicast"
- " conn_outgoing_pill no ipif\n"));
-multicast_discard:
- ASSERT(saved_ill == NULL);
- if (conn_lock_held)
- mutex_exit(&connp->conn_lock);
- if (ill != NULL)
- ill_refrele(ill);
- freemsg(first_mp);
- if (do_outrequests)
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
- saved_ill = ill;
- ill = connp->conn_outgoing_pill;
- attach_if = B_TRUE;
- match_flags = MATCH_IRE_ILL;
- mibptr = ill->ill_ip_mib;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill))
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- } else if (connp != NULL && connp->conn_nofailover_ill != NULL) {
- err = ill_check_and_refhold(connp->conn_nofailover_ill);
- if (err == ILL_LOOKUP_FAILED) {
- ip1dbg(("ip_output_v6: multicast"
- " conn_nofailover_ill no ipif\n"));
- goto multicast_discard;
- }
- saved_ill = ill;
- ill = connp->conn_nofailover_ill;
- attach_if = B_TRUE;
- match_flags = MATCH_IRE_ILL;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill))
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- } else if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_ATTACH_IF)) {
- /*
- * Redo 1. If we did not find an IRE_CACHE the first time,
- * we should have an ip6i_t with IP6I_ATTACH_IF if
- * IPV6_BOUND_PIF or bind to the IPIF_NOFAILOVER address was
- * used on this endpoint.
- */
- ASSERT(ip6i->ip6i_ifindex != 0);
- attach_if = B_TRUE;
- ASSERT(ill != NULL);
- match_flags = MATCH_IRE_ILL;
-
- /*
- * Check if we need an ire that will not be
- * looked up by anybody else i.e. HIDDEN.
- */
- if (ill_is_probeonly(ill))
- match_flags |= MATCH_IRE_MARK_HIDDEN;
- } else if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) {
- /* 3. If an ip6i_t with IP6I_IFINDEX set then use that ill. */
-
+ if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) {
+ /* 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. */
ASSERT(ill != NULL);
} else if (ill != NULL) {
/*
- * 4. If q is an ill queue and (link local or multicast
+ * 2. If q is an ill queue and (link local or multicast
* destination) then use that ill.
* We don't need the ipif initialization here.
* This useless assert below is just to prevent lint from
@@ -10181,9 +9762,9 @@ multicast_discard:
ASSERT(ill != NULL);
} else if (connp != NULL) {
/*
- * 5. If IPV6_BOUND_IF has been set use that ill.
+ * 3. If IPV6_BOUND_IF has been set use that ill.
*
- * 6. For multicast: if IPV6_MULTICAST_IF has been set use it.
+ * 4. For multicast: if IPV6_MULTICAST_IF has been set use it.
* Otherwise look for the best IRE match for the unspecified
* group to determine the ill.
*
@@ -10198,7 +9779,18 @@ multicast_discard:
if (err == ILL_LOOKUP_FAILED) {
ip1dbg(("ip_output_v6: multicast"
" conn_outgoing_ill no ipif\n"));
- goto multicast_discard;
+multicast_discard:
+ ASSERT(saved_ill == NULL);
+ if (conn_lock_held)
+ mutex_exit(&connp->conn_lock);
+ if (ill != NULL)
+ ill_refrele(ill);
+ freemsg(first_mp);
+ if (do_outrequests)
+ BUMP_MIB(mibptr, ipIfStatsOutDiscards);
+ if (need_decref)
+ CONN_DEC_REF(connp);
+ return;
}
ill = connp->conn_outgoing_ill;
} else if (connp->conn_multicast_ill != NULL) {
@@ -10239,8 +9831,6 @@ multicast_discard:
*/
mutex_enter(&connp->conn_lock);
connp->conn_multicast_ill = ill;
- connp->conn_orig_multicast_ifindex =
- ill->ill_phyint->phyint_ifindex;
mutex_exit(&connp->conn_lock);
}
}
@@ -10307,11 +9897,55 @@ multicast_discard:
send_from_ill:
ASSERT(ill != NULL);
ASSERT(mibptr == ill->ill_ip_mib);
+
if (do_outrequests) {
BUMP_MIB(mibptr, ipIfStatsHCOutRequests);
do_outrequests = B_FALSE;
}
+ /*
+ * Because nce_xmit() calls ip_output_v6() and NCEs are always tied to
+ * an underlying interface, IS_UNDER_IPMP() may be true even when
+ * building IREs that will be used for data traffic. As such, use the
+ * packet's source address to determine whether the traffic is test
+ * traffic, and set MATCH_IRE_MARK_TESTHIDDEN if so.
+ *
+ * Separately, we also need to mark probe packets so that ND can
+ * process them specially; see the comments in nce_queue_mp_common().
+ */
+ if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
+ ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL)) {
+ if (ip6i == NULL) {
+ if ((mp = ip_add_info_v6(mp, NULL, v6dstp)) == NULL) {
+ if (mctl_present)
+ freeb(first_mp);
+ goto discard;
+ }
+
+ if (mctl_present)
+ first_mp->b_cont = mp;
+ else
+ first_mp = mp;
+
+ /* ndp_resolver() expects a pulled-up message */
+ if (MBLKL(mp) == sizeof (ip6i_t) &&
+ pullupmsg(mp, -1) == 0) {
+ ip1dbg(("ip_output_v6: pullupmsg failed\n"));
+discard: BUMP_MIB(mibptr, ipIfStatsOutDiscards);
+ ill_refrele(ill);
+ if (need_decref)
+ CONN_DEC_REF(connp);
+ return;
+ }
+ ip6i = (ip6i_t *)mp->b_rptr;
+ ip6h = (ip6_t *)&ip6i[1];
+ v6dstp = &ip6h->ip6_dst;
+ mp->b_rptr = (uchar_t *)ip6h; /* rewound below */
+ }
+ ip6i->ip6i_flags |= IP6I_IPMP_PROBE;
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
+ }
+
if (io != NULL)
io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
@@ -10390,9 +10024,7 @@ send_from_ill:
ill->ill_name, (void *)ire,
ill->ill_phyint->phyint_ifindex));
ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request,
- connp, caller,
- (attach_if ? ill->ill_phyint->phyint_ifindex : 0),
- ip6i_flags, zoneid);
+ connp, caller, ip6i_flags, zoneid);
ire_refrele(ire);
if (need_decref) {
CONN_DEC_REF(connp);
@@ -10422,7 +10054,8 @@ send_from_ill:
return;
}
ip_newroute_ipif_v6(q, copy_mp, ipif,
- ip6h->ip6_dst, unspec_src, zoneid);
+ &ip6h->ip6_dst, &ip6h->ip6_src, unspec_src,
+ zoneid);
ipif_refrele(ipif);
} else {
ip_newroute_v6(q, copy_mp, &ip6h->ip6_dst,
@@ -10440,12 +10073,11 @@ send_from_ill:
/* Update rptr if there was an ip6i_t header. */
if (ip6i != NULL)
mp->b_rptr -= sizeof (ip6i_t);
- if (unspec_src || attach_if) {
+ if (unspec_src) {
if (ip6i == NULL) {
/*
* Add ip6i_t header to carry unspec_src
- * or attach_if until the packet comes back in
- * ip_wput_v6.
+ * until the packet comes back in ip_wput_v6.
*/
if (mctl_present) {
first_mp->b_cont =
@@ -10481,28 +10113,15 @@ send_from_ill:
ip6h = (ip6_t *)&ip6i[1];
v6dstp = &ip6h->ip6_dst;
}
- if (unspec_src)
- ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
- if (attach_if) {
- /*
- * Bind to nofailover/BOUND_PIF overrides ifindex.
- */
- ip6i->ip6i_flags |= IP6I_ATTACH_IF;
- ip6i->ip6i_flags &= ~IP6I_IFINDEX;
- ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
- if (drop_if_delayed) {
- /* This is a multipathing probe packet */
- ip6i->ip6i_flags |= IP6I_DROP_IFDELAYED;
- }
- }
+ ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
if (mctl_present) {
ASSERT(io != NULL);
io->ipsec_out_unspec_src = unspec_src;
}
}
if (IN6_IS_ADDR_MULTICAST(v6dstp)) {
- ip_newroute_ipif_v6(q, first_mp, ill->ill_ipif, *v6dstp,
- unspec_src, zoneid);
+ ip_newroute_ipif_v6(q, first_mp, ill->ill_ipif, v6dstp,
+ &ip6h->ip6_src, unspec_src, zoneid);
} else {
ip_newroute_v6(q, first_mp, v6dstp, &ip6h->ip6_src, ill,
zoneid, ipst);
@@ -10544,14 +10163,6 @@ ip_wput_v6(queue_t *q, mblk_t *mp)
ip_output_v6(GLOBAL_ZONEID, mp, q, IP_WPUT);
}
-static void
-ipsec_out_attach_if(ipsec_out_t *io, int attach_index)
-{
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- io->ipsec_out_attach_if = B_TRUE;
- io->ipsec_out_ill_index = attach_index;
-}
-
/*
* NULL send-to queue - packet is to be delivered locally.
*/
@@ -10731,6 +10342,8 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp,
*/
if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
!IS_LOOPBACK(ill)) {
+ ilm_walker_t ilw;
+
/*
* In the multicast case, applications may have
* joined the group from different zones, so we
@@ -10742,11 +10355,9 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp,
* on the loopback interface (PHYI_LOOPBACK flag
* set) as they must stay in the sender's zone.
*/
- ILM_WALKER_HOLD(ill);
- for (ilm = ill->ill_ilm; ilm != NULL;
- ilm = ilm->ilm_next) {
- if (ilm->ilm_flags & ILM_DELETED)
- continue;
+ ilm = ilm_walker_start(&ilw, ill);
+ for (; ilm != NULL;
+ ilm = ilm_walker_step(&ilw, ilm)) {
if (!IN6_ARE_ADDR_EQUAL(
&ilm->ilm_v6addr, &ip6h->ip6_dst))
continue;
@@ -10754,23 +10365,24 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp,
IP_FF_NO_MCAST_LOOP) &&
ilm->ilm_zoneid == ire->ire_zoneid)
continue;
- if (!ipif_lookup_zoneid(ill,
- ilm->ilm_zoneid, IPIF_UP, NULL))
+ if (!ipif_lookup_zoneid(
+ ilw.ilw_walk_ill, ilm->ilm_zoneid,
+ IPIF_UP, NULL))
continue;
first_mp1 = ip_copymsg(first_mp);
if (first_mp1 == NULL)
continue;
- icmp_inbound_v6(q, first_mp1, ill,
- hdr_length, mctl_present,
- IP6_NO_IPPOLICY, ilm->ilm_zoneid,
- NULL);
+ icmp_inbound_v6(q, first_mp1,
+ ilw.ilw_walk_ill, ill, hdr_length,
+ mctl_present, IP6_NO_IPPOLICY,
+ ilm->ilm_zoneid, NULL);
}
- ILM_WALKER_RELE(ill);
+ ilm_walker_finish(&ilw);
} else {
first_mp1 = ip_copymsg(first_mp);
if (first_mp1 != NULL)
- icmp_inbound_v6(q, first_mp1, ill,
+ icmp_inbound_v6(q, first_mp1, ill, ill,
hdr_length, mctl_present,
IP6_NO_IPPOLICY, ire->ire_zoneid,
NULL);
@@ -10823,8 +10435,7 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp,
*/
static void
ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
- int cksum_request, conn_t *connp, int caller, int attach_index, int flags,
- zoneid_t zoneid)
+ int cksum_request, conn_t *connp, int caller, int flags, zoneid_t zoneid)
{
ip6_t *ip6h;
uint8_t nexthdr;
@@ -10917,7 +10528,7 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
if (src_ire != NULL &&
!(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) &&
(!ipst->ips_ip_restrict_interzone_loopback ||
- ire_local_same_ill_group(ire, src_ire))) {
+ ire_local_same_lan(ire, src_ire))) {
if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
!unspec_src) {
ip6h->ip6_src = src_ire->ire_src_addr_v6;
@@ -10974,20 +10585,14 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
/*
* Select the source address using ipif_select_source_v6.
*/
- if (attach_index != 0) {
- ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst,
- RESTRICT_TO_ILL, IPV6_PREFER_SRC_DEFAULT, zoneid);
- } else {
- ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst,
- RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, zoneid);
- }
+ ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst, B_FALSE,
+ IPV6_PREFER_SRC_DEFAULT, zoneid);
if (ipif == NULL) {
if (ip_debug > 2) {
/* ip1dbg */
pr_addr_dbg("ip_wput_ire_v6: no src for "
- "dst %s\n, ", AF_INET6, &ip6h->ip6_dst);
- printf("ip_wput_ire_v6: interface name %s\n",
- ill->ill_name);
+ "dst %s\n", AF_INET6, &ip6h->ip6_dst);
+ printf("through interface %s\n", ill->ill_name);
}
freemsg(first_mp);
return;
@@ -10998,12 +10603,8 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
if ((connp != NULL && connp->conn_multicast_loop) ||
!IS_LOOPBACK(ill)) {
- ilm_t *ilm;
-
- ILM_WALKER_HOLD(ill);
- ilm = ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, ALL_ZONES);
- ILM_WALKER_RELE(ill);
- if (ilm != NULL) {
+ if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE,
+ ALL_ZONES) != NULL) {
mblk_t *nmp;
int fanout_flags = 0;
@@ -11417,8 +11018,6 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
}
/* Do IPSEC processing first */
if (mctl_present) {
- if (attach_index != 0)
- ipsec_out_attach_if(io, attach_index);
ipsec_out_process(q, first_mp, ire, ill_index);
return;
}
@@ -11456,8 +11055,6 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
max_frag, B_FALSE, B_TRUE, zoneid, ipst);
return;
}
- if (attach_index != 0)
- ipsec_out_attach_if(io, attach_index);
ipsec_out_process(q, first_mp, ire, ill_index);
return;
}
@@ -11948,8 +11545,8 @@ boolean_t
conn_wantpacket_v6(conn_t *connp, ill_t *ill, ip6_t *ip6h, int fanout_flags,
zoneid_t zoneid)
{
- ill_t *in_ill;
- boolean_t wantpacket = B_TRUE;
+ ill_t *bound_ill;
+ boolean_t wantpacket;
in6_addr_t *v6dst_ptr = &ip6h->ip6_dst;
in6_addr_t *v6src_ptr = &ip6h->ip6_src;
@@ -11958,42 +11555,16 @@ conn_wantpacket_v6(conn_t *connp, ill_t *ill, ip6_t *ip6h, int fanout_flags,
* unicast and multicast reception to conn_incoming_ill.
* conn_wantpacket_v6 is called both for unicast and
* multicast.
- *
- * 1) The unicast copy of the packet can come anywhere in
- * the ill group if it is part of the group. Thus, we
- * need to check to see whether the ill group matches
- * if in_ill is part of a group.
- *
- * 2) ip_rput does not suppress duplicate multicast packets.
- * If there are two interfaces in a ill group and we have
- * 2 applications (conns) joined a multicast group G on
- * both the interfaces, ilm_lookup_ill filter in ip_rput
- * will give us two packets because we join G on both the
- * interfaces rather than nominating just one interface
- * for receiving multicast like broadcast above. So,
- * we have to call ilg_lookup_ill to filter out duplicate
- * copies, if ill is part of a group, to supress duplicates.
*/
- in_ill = connp->conn_incoming_ill;
- if (in_ill != NULL) {
- mutex_enter(&connp->conn_lock);
- in_ill = connp->conn_incoming_ill;
- mutex_enter(&ill->ill_lock);
- /*
- * No IPMP, and the packet did not arrive on conn_incoming_ill
- * OR, IPMP in use and the packet arrived on an IPMP group
- * different from the conn_incoming_ill's IPMP group.
- * Reject the packet.
- */
- if ((in_ill->ill_group == NULL && in_ill != ill) ||
- (in_ill->ill_group != NULL &&
- in_ill->ill_group != ill->ill_group)) {
- wantpacket = B_FALSE;
+ bound_ill = connp->conn_incoming_ill;
+ if (bound_ill != NULL) {
+ if (IS_IPMP(bound_ill)) {
+ if (bound_ill->ill_grp != ill->ill_grp)
+ return (B_FALSE);
+ } else {
+ if (bound_ill != ill)
+ return (B_FALSE);
}
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- if (!wantpacket)
- return (B_FALSE);
}
if (connp->conn_multi_router)
@@ -12140,7 +11711,7 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp,
(IN6_ARE_ADDR_EQUAL(&first_ire->ire_addr_v6,
&ire->ire_addr_v6)) &&
!(first_ire->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)))
+ (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)))
break;
}
@@ -12204,8 +11775,7 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp,
&ire->ire_addr_v6))
continue;
if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED|
- IRE_MARK_HIDDEN))
+ IRE_MARK_CONDEMNED)
continue;
/* Got one */
@@ -13279,3 +12849,31 @@ ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah)
size += ehdrlen;
}
}
+
+/*
+ * Check whether `v6srcp' is an address on underlying interface `ill'; returns
+ * B_TRUE if a matching ipif is found, else B_FALSE.  On success, a non-NULL
+ * `ipifp' is set to the held ipif; if `ipifp' is NULL the hold is dropped.
+ * NOTE: if this is not called from inside the IPSQ (ill_g_lock is not held),
+ * `ill' may be removed from the group during or after this lookup.
+ */
+static boolean_t
+ipif_lookup_testaddr_v6(ill_t *ill, const in6_addr_t *v6srcp, ipif_t **ipifp)
+{
+ ipif_t *ipif;
+
+ ipif = ipif_lookup_addr_exact_v6(v6srcp, ill, ill->ill_ipst);
+ if (ipif != NULL) {
+ if (ipifp != NULL)
+ *ipifp = ipif;
+ else
+ ipif_refrele(ipif);
+ return (B_TRUE);
+ }
+
+ if (ip_debug > 2) {
+ pr_addr_dbg("ipif_lookup_testaddr_v6: cannot find ipif for "
+ "src %s\n", AF_INET6, v6srcp);
+ }
+ return (B_FALSE);
+}
diff --git a/usr/src/uts/common/inet/ip/ip6_if.c b/usr/src/uts/common/inet/ip/ip6_if.c
index 81447c2e30..c729118fec 100644
--- a/usr/src/uts/common/inet/ip/ip6_if.c
+++ b/usr/src/uts/common/inet/ip/ip6_if.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -53,7 +53,6 @@
#include <netinet/igmp_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
-#include <netinet/in.h>
#include <inet/common.h>
#include <inet/nd.h>
@@ -178,10 +177,12 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst,
} else if (IPIF_CAN_WAIT(ipif, q)) {
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ill->ill_lock);
rw_exit(&ipst->ips_ill_g_lock);
ipsq_enq(ipsq, q, mp, func, NEW_OP,
ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
RELEASE_CONN_LOCK(q);
if (error != NULL)
@@ -202,16 +203,12 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst,
}
/*
- * Look for an ipif with the specified address. For point-point links
- * we look for matches on either the destination address and the local
- * address, but we ignore the check on the local address if IPIF_UNNUMBERED
- * is set.
- * Matches on a specific ill if match_ill is set.
+ * Common function for ipif_lookup_addr_v6() and ipif_lookup_addr_exact_v6().
*/
-/* ARGSUSED */
-ipif_t *
-ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid,
- queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
+static ipif_t *
+ipif_lookup_addr_common_v6(const in6_addr_t *addr, ill_t *match_ill,
+ boolean_t match_illgrp, zoneid_t zoneid, queue_t *q, mblk_t *mp,
+ ipsq_func_t func, int *error, ip_stack_t *ipst)
{
ipif_t *ipif;
ill_t *ill;
@@ -230,7 +227,8 @@ ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid,
repeat:
ill = ILL_START_WALK_V6(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if (match_ill != NULL && ill != match_ill) {
+ if (match_ill != NULL && ill != match_ill &&
+ (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
continue;
}
GRAB_CONN_LOCK(q);
@@ -257,10 +255,12 @@ repeat:
} else if (IPIF_CAN_WAIT(ipif, q)) {
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ill->ill_lock);
rw_exit(&ipst->ips_ill_g_lock);
ipsq_enq(ipsq, q, mp, func, NEW_OP,
ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
RELEASE_CONN_LOCK(q);
if (error != NULL)
@@ -323,11 +323,41 @@ ip_addr_exists_v6(const in6_addr_t *addr, zoneid_t zoneid,
}
/*
+ * Lookup an ipif with the specified address. For point-to-point links we
+ * look for matches on either the destination address or the local address,
+ * but we skip the local address check if IPIF_UNNUMBERED is set. If the
+ * `match_ill' argument is non-NULL, the lookup is restricted to that ill
+ * (or illgrp if `match_ill' is in an IPMP group).
+ */
+ipif_t *
+ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid,
+ queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
+{
+ return (ipif_lookup_addr_common_v6(addr, match_ill, B_TRUE, zoneid, q,
+ mp, func, error, ipst));
+}
+
+/*
+ * Special abbreviated version of ipif_lookup_addr_v6() that doesn't match
+ * `match_ill' across the IPMP group. This function is only needed in some
+ * corner-cases; almost everything should use ipif_lookup_addr_v6().
+ */
+ipif_t *
+ipif_lookup_addr_exact_v6(const in6_addr_t *addr, ill_t *match_ill,
+ ip_stack_t *ipst)
+{
+ ASSERT(match_ill != NULL);
+ return (ipif_lookup_addr_common_v6(addr, match_ill, B_FALSE, ALL_ZONES,
+ NULL, NULL, NULL, NULL, ipst));
+}
+
+/*
* Look for an ipif with the specified address. For point-point links
* we look for matches on either the destination address and the local
* address, but we ignore the check on the local address if IPIF_UNNUMBERED
* is set.
- * Matches on a specific ill if match_ill is set.
+ * If the `match_ill' argument is non-NULL, the lookup is restricted to that
+ * ill (or illgrp if `match_ill' is in an IPMP group).
* Return the zoneid for the ipif. ALL_ZONES if none found.
*/
zoneid_t
@@ -348,7 +378,8 @@ ipif_lookup_addr_zoneid_v6(const in6_addr_t *addr, ill_t *match_ill,
repeat:
ill = ILL_START_WALK_V6(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if (match_ill != NULL && ill != match_ill) {
+ if (match_ill != NULL && ill != match_ill &&
+ !IS_IN_SAME_ILLGRP(ill, match_ill)) {
continue;
}
mutex_enter(&ill->ill_lock);
@@ -1120,11 +1151,10 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
boolean_t
ill_setdefaulttoken(ill_t *ill)
{
- int i;
+ int i;
in6_addr_t v6addr, v6mask;
- if (!MEDIA_V6INTFID(ill->ill_media, ill->ill_phys_addr_length,
- ill->ill_phys_addr, &v6addr))
+ if (!MEDIA_V6INTFID(ill->ill_media, ill, &v6addr))
return (B_FALSE);
(void) ip_plen_to_mask_v6(IPV6_TOKEN_LEN, &v6mask);
@@ -1161,7 +1191,7 @@ ipif_set_tun_auto_addr(ipif_t *ipif, struct iftun_req *ta)
{
sin6_t sin6;
sin_t *sin;
- ill_t *ill = ipif->ipif_ill;
+ ill_t *ill = ipif->ipif_ill;
tun_t *tp = (tun_t *)ill->ill_wq->q_next->q_ptr;
if (ta->ifta_saddr.ss_family != AF_INET ||
@@ -1227,7 +1257,7 @@ ipif_set_tun_llink(ill_t *ill, struct iftun_req *ta)
if ((ta->ifta_flags & IFTUN_DST) &&
IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)) {
- sin6_t sin6;
+ sin6_t sin6;
ASSERT(!(ipif->ipif_flags & IPIF_UP));
bzero(&sin6, sizeof (sin6_t));
@@ -1344,13 +1374,22 @@ ipif_ndp_setup_multicast(ipif_t *ipif, nce_t **ret_nce)
if (ret_nce != NULL)
*ret_nce = NULL;
+
+ /*
+ * IPMP meta-interfaces don't have any inherent multicast mappings,
+ * and instead use the ones on the underlying interfaces.
+ */
+ if (IS_IPMP(ill))
+ return (0);
+
/*
* Delete the mapping nce. Normally these should not exist
* as a previous ipif_down -> ipif_ndp_down should have deleted
* all the nces. But they can exist if ip_rput_dlpi_writer
- * calls this when PHYI_MULTI_BCAST is set.
+ * calls this when PHYI_MULTI_BCAST is set. Mappings are always
+ * tied to the underlying ill, so don't match across the illgrp.
*/
- mnce = ndp_lookup_v6(ill, &v6_mcast_addr, B_FALSE);
+ mnce = ndp_lookup_v6(ill, B_FALSE, &v6_mcast_addr, B_FALSE);
if (mnce != NULL) {
ndp_delete(mnce);
NCE_REFRELE(mnce);
@@ -1424,13 +1463,15 @@ ipif_ndp_setup_multicast(ipif_t *ipif, nce_t **ret_nce)
* Get the resolver set up for a new ipif. (Always called as writer.)
*/
int
-ipif_ndp_up(ipif_t *ipif)
+ipif_ndp_up(ipif_t *ipif, boolean_t initial)
{
ill_t *ill = ipif->ipif_ill;
int err = 0;
nce_t *nce = NULL;
nce_t *mnce = NULL;
+ boolean_t added_ipif = B_FALSE;
+ ASSERT(IAM_WRITER_ILL(ill));
ip1dbg(("ipif_ndp_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));
/*
@@ -1464,7 +1505,10 @@ ipif_ndp_up(ipif_t *ipif)
if ((ipif->ipif_flags & (IPIF_UNNUMBERED|IPIF_NOLOCAL)) == 0) {
uint16_t flags;
- uchar_t *hw_addr = NULL;
+ uint16_t state;
+ uchar_t *hw_addr = NULL;
+ ill_t *bound_ill;
+ ipmp_illgrp_t *illg = ill->ill_grp;
/* Permanent entries don't need NUD */
flags = NCE_F_PERMANENT | NCE_F_NONUD;
@@ -1474,26 +1518,65 @@ ipif_ndp_up(ipif_t *ipif)
if (ipif->ipif_flags & IPIF_ANYCAST)
flags |= NCE_F_ANYCAST;
- if (ill->ill_net_type == IRE_IF_RESOLVER) {
- hw_addr = ill->ill_nd_lla;
-
- if (ill->ill_move_in_progress) {
- /*
- * Addresses are failing over to this ill.
- * Don't wait for NUD to see this change.
- * Publish our new link-layer address.
- */
- flags |= NCE_F_UNSOL_ADV;
+ if (IS_IPMP(ill)) {
+ ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
+ /*
+ * If we're here via ipif_up(), then the ipif won't be
+ * bound yet -- add it to the group, which will bind
+ * it if possible. (We would add it in ipif_up(), but
+ * deleting on failure there is gruesome.) If we're
+ * here via ipmp_ill_bind_ipif(), then the ipif has
+ * already been added to the group and we just need to
+ * use the binding.
+ */
+ if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) {
+ bound_ill = ipmp_illgrp_add_ipif(illg, ipif);
+ if (bound_ill == NULL) {
+ /*
+ * We couldn't bind the ipif to an ill
+ * yet, so we have nothing to publish.
+ * Set ipif_addr_ready so that this
+ * address can be used locally for now.
+ * The routing socket message will be
+ * sent from ipif_up_done_v6().
+ */
+ ipif->ipif_addr_ready = 1;
+ return (0);
+ }
+ added_ipif = B_TRUE;
}
+ hw_addr = bound_ill->ill_nd_lla;
+ } else {
+ bound_ill = ill;
+ if (ill->ill_net_type == IRE_IF_RESOLVER)
+ hw_addr = ill->ill_nd_lla;
+ }
+
+ /*
+ * If this is an initial bring-up (or the ipif was never
+ * completely brought up), do DAD. Otherwise, we're here
+ * because IPMP has rebound an address to this ill: send
+ * unsolicited advertisements to inform others.
+ */
+ if (initial || !ipif->ipif_addr_ready) {
+ state = ND_PROBE;
+ } else {
+ state = ND_REACHABLE;
+ flags |= NCE_F_UNSOL_ADV;
}
- err = ndp_lookup_then_add_v6(ill,
+ /*
+ * NOTE: for IPMP, local addresses are always associated with
+ * the ill they're bound to, so don't match across the illgrp.
+ */
+ err = ndp_lookup_then_add_v6(bound_ill,
+ B_FALSE,
hw_addr,
&ipif->ipif_v6lcl_addr,
&ipv6_all_ones,
&ipv6_all_zeros,
0,
flags,
- ND_PROBE, /* Causes Duplicate Address Detection to run */
+ state,
&nce);
switch (err) {
case 0:
@@ -1509,19 +1592,11 @@ ipif_ndp_up(ipif_t *ipif)
NCE_REFRELE(nce);
ip1dbg(("ipif_ndp_up: NCE already exists for %s\n",
ill->ill_name));
- if (mnce != NULL) {
- ndp_delete(mnce);
- NCE_REFRELE(mnce);
- }
- return (err);
+ goto fail;
default:
- ip1dbg(("ipif_ndp_up: NCE creation failed %s\n",
+ ip1dbg(("ipif_ndp_up: NCE creation failed for %s\n",
ill->ill_name));
- if (mnce != NULL) {
- ndp_delete(mnce);
- NCE_REFRELE(mnce);
- }
- return (err);
+ goto fail;
}
} else {
/* No local NCE for this entry */
@@ -1532,6 +1607,15 @@ ipif_ndp_up(ipif_t *ipif)
if (mnce != NULL)
NCE_REFRELE(mnce);
return (0);
+fail:
+ if (mnce != NULL) {
+ ndp_delete(mnce);
+ NCE_REFRELE(mnce);
+ }
+ if (added_ipif)
+ ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
+
+ return (err);
}
/* Remove all cache entries for this logical interface */
@@ -1539,23 +1623,42 @@ void
ipif_ndp_down(ipif_t *ipif)
{
nce_t *nce;
+ ill_t *ill = ipif->ipif_ill;
+
+ ASSERT(IAM_WRITER_ILL(ill));
if (ipif->ipif_isv6) {
- nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr,
- B_FALSE);
- if (nce != NULL) {
- ndp_delete(nce);
- NCE_REFRELE(nce);
+ ill_t *bound_ill;
+
+ if (IS_IPMP(ill))
+ bound_ill = ipmp_ipif_bound_ill(ipif);
+ else
+ bound_ill = ill;
+
+ if (bound_ill != NULL) {
+ nce = ndp_lookup_v6(bound_ill,
+ B_FALSE, /* see comment in ipif_ndp_up() */
+ &ipif->ipif_v6lcl_addr,
+ B_FALSE);
+ if (nce != NULL) {
+ ndp_delete(nce);
+ NCE_REFRELE(nce);
+ }
}
+
+ /*
+ * Make IPMP aware of the deleted data address.
+ */
+ if (IS_IPMP(ill))
+ ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
}
+
/*
* Remove mapping and all other nces dependent on this ill
* when the last ipif is going away.
*/
- if (ipif->ipif_ill->ill_ipif_up_count == 0) {
- ndp_walk(ipif->ipif_ill, (pfi_t)ndp_delete_per_ill,
- (uchar_t *)ipif->ipif_ill, ipif->ipif_ill->ill_ipst);
- }
+ if (ill->ill_ipif_up_count == 0)
+ ndp_walk(ill, (pfi_t)ndp_delete_per_ill, ill, ill->ill_ipst);
}
/*
@@ -1936,9 +2039,7 @@ rule_preferred(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
}
/*
- * Prefer source addresses that are assigned to the outgoing interface, or
- * to an interface that is in the same IPMP group as the outgoing
- * interface.
+ * Prefer source addresses that are assigned to the outgoing interface.
*/
/* ARGSUSED3 */
static rule_res_t
@@ -1955,15 +2056,11 @@ rule_interface(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
return (CAND_TIE);
if (!bc->cand_matchedinterface_set) {
- bc->cand_matchedinterface = (bc->cand_ill == dstill ||
- (dstill->ill_group != NULL &&
- dstill->ill_group == bc->cand_ill->ill_group));
+ bc->cand_matchedinterface = bc->cand_ill == dstill;
bc->cand_matchedinterface_set = B_TRUE;
}
- cc->cand_matchedinterface = (cc->cand_ill == dstill ||
- (dstill->ill_group != NULL &&
- dstill->ill_group == cc->cand_ill->ill_group));
+ cc->cand_matchedinterface = cc->cand_ill == dstill;
cc->cand_matchedinterface_set = B_TRUE;
if (bc->cand_matchedinterface == cc->cand_matchedinterface)
@@ -2134,6 +2231,13 @@ rule_addr_type(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
static rule_res_t
rule_prefix(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, ip_stack_t *ipst)
{
+ /*
+ * For IPMP, we always want to choose a random source address from
+ * among any equally usable addresses, so always report a tie.
+ */
+ if (IS_IPMP(dstinfo->dst_ill))
+ return (CAND_TIE);
+
if (!bc->cand_common_pref_set) {
bc->cand_common_pref = ip_common_prefix_v6(&bc->cand_srcaddr,
dstinfo->dst_addr);
@@ -2177,10 +2281,9 @@ rule_must_be_last(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
* specification's algorithm could traverse the list of addresses once for
* every rule).
*
- * The restrict_ill argument restricts the algorithm to chose a source
- * address that is assigned to the destination ill or an ill in the same
- * IPMP group as the destination ill. This is used when the destination
- * address is a link-local or multicast address, and when
+ * The restrict_ill argument restricts the algorithm to choose a source
+ * address that is assigned to the destination ill. This is used when
+ * the destination address is a link-local or multicast address, and when
* ipv6_strict_dst_multihoming is turned on.
*
* src_prefs is the caller's set of source address preferences. If source
@@ -2192,13 +2295,13 @@ rule_must_be_last(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
*/
ipif_t *
ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
- uint_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid)
+ boolean_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid)
{
dstinfo_t dstinfo;
char dstr[INET6_ADDRSTRLEN];
char sstr[INET6_ADDRSTRLEN];
- ipif_t *ipif;
- ill_t *ill, *usesrc_ill = NULL;
+ ipif_t *ipif, *start_ipif, *next_ipif;
+ ill_t *ill, *usesrc_ill = NULL, *ipmp_ill = NULL;
ill_walk_context_t ctx;
cand_t best_c; /* The best candidate */
cand_t curr_c; /* The current candidate */
@@ -2247,6 +2350,16 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
} else {
return (NULL);
}
+ } else if (IS_UNDER_IPMP(dstill)) {
+ /*
+ * Test addresses should never be used for source address
+ * selection, so if we were passed an underlying ill, switch
+ * to the IPMP meta-interface.
+ */
+ if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(dstill)) != NULL)
+ dstinfo.dst_ill = ipmp_ill;
+ else
+ return (NULL);
} else {
dstinfo.dst_ill = dstill;
}
@@ -2286,10 +2399,7 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
*/
if (IN6_IS_ADDR_LINKLOCAL(dst) || IN6_IS_ADDR_MULTICAST(dst) ||
ipst->ips_ipv6_strict_dst_multihoming || usesrc_ill != NULL) {
- if (restrict_ill == RESTRICT_TO_NONE)
- dstinfo.dst_restrict_ill = RESTRICT_TO_GROUP;
- else
- dstinfo.dst_restrict_ill = restrict_ill;
+ dstinfo.dst_restrict_ill = B_TRUE;
} else {
dstinfo.dst_restrict_ill = restrict_ill;
}
@@ -2297,39 +2407,41 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
bzero(&best_c, sizeof (cand_t));
/*
- * Take a pass through the list of IPv6 interfaces to chose the
- * best possible source address. If restrict_ill is true, we only
- * iterate through the ill's that are in the same IPMP group as the
- * destination's outgoing ill. If restrict_ill is false, we walk
- * the entire list of IPv6 ill's.
+ * Take a pass through the list of IPv6 interfaces to choose the best
+ * possible source address. If restrict_ill is set, just use dst_ill.
*/
- if (dstinfo.dst_restrict_ill != RESTRICT_TO_NONE) {
- if (dstinfo.dst_ill->ill_group != NULL &&
- dstinfo.dst_restrict_ill == RESTRICT_TO_GROUP) {
- ill = dstinfo.dst_ill->ill_group->illgrp_ill;
- } else {
- ill = dstinfo.dst_ill;
- }
- } else {
+ if (dstinfo.dst_restrict_ill)
+ ill = dstinfo.dst_ill;
+ else
ill = ILL_START_WALK_V6(&ctx, ipst);
- }
- while (ill != NULL) {
+ for (; ill != NULL; ill = ill_next(&ctx, ill)) {
ASSERT(ill->ill_isv6);
/*
- * Avoid FAILED/OFFLINE ills.
- * Global and site local addresses will failover and
- * will be available on the new ill.
- * But link local addresses don't move.
+ * Test addresses should never be used for source address
+ * selection, so ignore underlying ills.
*/
- if (dstinfo.dst_restrict_ill != RESTRICT_TO_ILL &&
- ill->ill_phyint->phyint_flags &
- (PHYI_OFFLINE | PHYI_FAILED))
- goto next_ill;
+ if (IS_UNDER_IPMP(ill))
+ continue;
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
+ /*
+ * For source address selection, we treat the ipif list as
+ * circular and continue until we get back to where we
+ * started. This allows IPMP to vary source address selection
+ * (which improves inbound load spreading) by caching its last
+ * ending point and starting from there. NOTE: we don't have
+ * to worry about ill_src_ipif changing ills since that can't
+ * happen on the IPMP ill.
+ */
+ start_ipif = ill->ill_ipif;
+ if (IS_IPMP(ill) && ill->ill_src_ipif != NULL)
+ start_ipif = ill->ill_src_ipif;
+
+ ipif = start_ipif;
+ do {
+ if ((next_ipif = ipif->ipif_next) == NULL)
+ next_ipif = ill->ill_ipif;
if (!IPIF_VALID_IPV6_SOURCE(ipif))
continue;
@@ -2387,9 +2499,8 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
*/
for (index = 0; rules[index] != NULL; index++) {
/* Apply a comparison rule. */
- rule_result =
- (rules[index])(&best_c, &curr_c, &dstinfo,
- ipst);
+ rule_result = (rules[index])(&best_c, &curr_c,
+ &dstinfo, ipst);
if (rule_result == CAND_AVOID) {
/*
* The best candidate is still the
@@ -2417,21 +2528,29 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
* have been prefered as the best candidate so far.
*/
ASSERT(rule_result != CAND_TIE);
+ } while ((ipif = next_ipif) != start_ipif);
+
+ /*
+ * For IPMP, update the source ipif rotor to the next ipif,
+ * provided we can look it up. (We must not use it if it's
+ * IPIF_CONDEMNED since we may have grabbed ill_g_lock after
+ * ipif_free() checked ill_src_ipif.)
+ */
+ if (IS_IPMP(ill) && ipif != NULL) {
+ mutex_enter(&ipif->ipif_ill->ill_lock);
+ next_ipif = ipif->ipif_next;
+ if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif))
+ ill->ill_src_ipif = next_ipif;
+ else
+ ill->ill_src_ipif = NULL;
+ mutex_exit(&ipif->ipif_ill->ill_lock);
}
/*
- * We may be walking the linked-list of ill's in an
- * IPMP group or traversing the IPv6 ill avl tree. If it is a
- * usesrc ILL then it can't be part of IPMP group and we
- * will exit the while loop.
+ * Only one ill to consider if dst_restrict_ill is set.
*/
-next_ill:
- if (dstinfo.dst_restrict_ill == RESTRICT_TO_ILL)
- ill = NULL;
- else if (dstinfo.dst_restrict_ill == RESTRICT_TO_GROUP)
- ill = ill->ill_group_next;
- else
- ill = ill_next(&ctx, ill);
+ if (dstinfo.dst_restrict_ill)
+ break;
}
ipif = best_c.cand_ipif;
@@ -2444,6 +2563,9 @@ next_ill:
if (usesrc_ill != NULL)
ill_refrele(usesrc_ill);
+ if (ipmp_ill != NULL)
+ ill_refrele(ipmp_ill);
+
if (dst_rhtp != NULL)
TPC_RELE(dst_rhtp);
@@ -2474,8 +2596,7 @@ next_ill:
* ipif_update_other_ipifs calls us.
*
* If old_ipif is NULL, just redo the source address selection
- * if needed. This happens when illgrp_insert or ipif_up_done_v6
- * calls us.
+ * if needed. This happens when ipif_up_done_v6 calls us.
*/
void
ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif)
@@ -2561,8 +2682,7 @@ ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif)
if (ip6_asp_can_lookup(ipst)) {
ip6_asp_table_held = B_TRUE;
nipif = ipif_select_source_v6(ill, &ipif->ipif_v6subnet,
- RESTRICT_TO_GROUP, IPV6_PREFER_SRC_DEFAULT,
- ipif->ipif_zoneid);
+ B_TRUE, IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid);
}
if (nipif == NULL) {
/* Last resort - all ipif's have IPIF_NOLOCAL */
@@ -2630,13 +2750,9 @@ ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif)
* Find the IRE_INTERFACE for such ipif's and recreate them
* to use an different source address following the rules in
* ipif_up_done_v6.
- *
- * This function takes an illgrp as an argument so that illgrp_delete
- * can call this to update source address even after deleting the
- * old_ipif->ipif_ill from the ill group.
*/
void
-ipif_update_other_ipifs_v6(ipif_t *old_ipif, ill_group_t *illgrp)
+ipif_update_other_ipifs_v6(ipif_t *old_ipif)
{
ipif_t *ipif;
ill_t *ill;
@@ -2651,23 +2767,9 @@ ipif_update_other_ipifs_v6(ipif_t *old_ipif, ill_group_t *illgrp)
inet_ntop(AF_INET6, &old_ipif->ipif_v6lcl_addr,
buf, sizeof (buf))));
- /*
- * If this part of a group, look at all ills as ipif_select_source
- * borrows a source address across all the ills in the group.
- */
- if (illgrp != NULL)
- ill = illgrp->illgrp_ill;
-
- /* Don't need a lock since this is a writer */
- for (; ill != NULL; ill = ill->ill_group_next) {
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
-
- if (ipif == old_ipif)
- continue;
-
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
+ if (ipif != old_ipif)
ipif_recreate_interface_routes_v6(old_ipif, ipif);
- }
}
}
@@ -2828,12 +2930,10 @@ ipif_up_done_v6(ipif_t *ipif)
boolean_t flush_ire_cache = B_TRUE;
int err;
char buf[INET6_ADDRSTRLEN];
- phyint_t *phyi;
ire_t **ipif_saved_irep = NULL;
int ipif_saved_ire_cnt;
int cnt;
boolean_t src_ipif_held = B_FALSE;
- boolean_t ire_added = B_FALSE;
boolean_t loopback = B_FALSE;
boolean_t ip6_asp_table_held = B_FALSE;
ip_stack_t *ipst = ill->ill_ipst;
@@ -2868,8 +2968,8 @@ ipif_up_done_v6(ipif_t *ipif)
break;
}
if (flush_ire_cache)
- ire_walk_ill_v6(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE,
- IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill);
+ ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
+ IRE_CACHE, ill_ipif_cache_delete, ill, ill);
/*
* Figure out which way the send-to queue should go. Only
@@ -2900,7 +3000,9 @@ ipif_up_done_v6(ipif_t *ipif)
ipif->ipif_ire_type = IRE_LOCAL;
}
- if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) {
+ if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) ||
+ ((ipif->ipif_flags & IPIF_DEPRECATED) &&
+ !(ipif->ipif_flags & IPIF_NOFAILOVER))) {
/*
* Can't use our source address. Select a different
* source address for the IRE_INTERFACE and IRE_LOCAL
@@ -2908,7 +3010,7 @@ ipif_up_done_v6(ipif_t *ipif)
if (ip6_asp_can_lookup(ipst)) {
ip6_asp_table_held = B_TRUE;
src_ipif = ipif_select_source_v6(ipif->ipif_ill,
- &ipif->ipif_v6subnet, RESTRICT_TO_NONE,
+ &ipif->ipif_v6subnet, B_FALSE,
IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid);
}
if (src_ipif == NULL)
@@ -3090,9 +3192,9 @@ ipif_up_done_v6(ipif_t *ipif)
ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
/*
- * Need to atomically check for ip_addr_availablity_check
- * now under ill_g_lock, and if it fails got bad, and remove
- * from group also
+ * Need to atomically check for IP address availability under
+ * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new
+ * ills or new ipifs can be added while we are checking availability.
*/
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
mutex_enter(&ipst->ips_ip_addr_avail_lock);
@@ -3125,9 +3227,7 @@ ipif_up_done_v6(ipif_t *ipif)
}
/*
- * Add in all newly created IREs. We want to add before
- * we call ifgrp_insert which wants to know whether
- * IRE_IF_RESOLVER exists or not.
+ * Add in all newly created IREs.
*
* NOTE : We refrele the ire though we may branch to "bad"
* later on where we do ire_delete. This is okay
@@ -3148,36 +3248,6 @@ ipif_up_done_v6(ipif_t *ipif)
ip6_asp_table_refrele(ipst);
ip6_asp_table_held = B_FALSE;
}
- ire_added = B_TRUE;
-
- /*
- * Form groups if possible.
- *
- * If we are supposed to be in a ill_group with a name, insert it
- * now as we know that at least one ipif is UP. Otherwise form
- * nameless groups.
- *
- * If ip_enable_group_ifs is set and ipif address is not ::0, insert
- * this ipif into the appropriate interface group, or create a
- * new one. If this is already in a nameless group, we try to form
- * a bigger group looking at other ills potentially sharing this
- * ipif's prefix.
- */
- phyi = ill->ill_phyint;
- if (phyi->phyint_groupname_len != 0) {
- ASSERT(phyi->phyint_groupname != NULL);
- if (ill->ill_ipif_up_count == 1) {
- ASSERT(ill->ill_group == NULL);
- err = illgrp_insert(&ipst->ips_illgrp_head_v6, ill,
- phyi->phyint_groupname, NULL, B_TRUE);
- if (err != 0) {
- ip1dbg(("ipif_up_done_v6: illgrp allocation "
- "failed, error %d\n", err));
- goto bad;
- }
- }
- ASSERT(ill->ill_group != NULL);
- }
/* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */
ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt;
@@ -3190,19 +3260,23 @@ ipif_up_done_v6(ipif_t *ipif)
*/
ill_recover_multicast(ill);
}
- /* Join the allhosts multicast address and the solicited node MC */
- ipif_multicast_up(ipif);
- if (!loopback) {
+ if (ill->ill_ipif_up_count == 1) {
/*
- * See whether anybody else would benefit from the
- * new ipif that we added. We call this always rather
- * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST
- * ipif for the benefit of illgrp_insert (done above)
- * which does not do source address selection as it does
- * not want to re-create interface routes that we are
- * having reference to it here.
+ * Since the interface is now up, it may now be active.
*/
+ if (IS_UNDER_IPMP(ill))
+ ipmp_ill_refresh_active(ill);
+ }
+
+ /* Join the allhosts multicast address and the solicited node MC */
+ ipif_multicast_up(ipif);
+
+ /*
+ * See if anybody else would benefit from our new ipif.
+ */
+ if (!loopback &&
+ !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) {
ill_update_source_selection(ill);
}
@@ -3238,29 +3312,11 @@ ipif_up_done_v6(ipif_t *ipif)
bad:
if (ip6_asp_table_held)
ip6_asp_table_refrele(ipst);
- /*
- * We don't have to bother removing from ill groups because
- *
- * 1) For groups with names, we insert only when the first ipif
- * comes up. In that case if it fails, it will not be in any
- * group. So, we need not try to remove for that case.
- *
- * 2) For groups without names, either we tried to insert ipif_ill
- * in a group as singleton or found some other group to become
- * a bigger group. For the former, if it fails we don't have
- * anything to do as ipif_ill is not in the group and for the
- * latter, there are no failures in illgrp_insert/illgrp_delete
- * (ENOMEM can't occur for this. Check ifgrp_insert).
- */
while (irep > ire_array) {
irep--;
- if (*irep != NULL) {
+ if (*irep != NULL)
ire_delete(*irep);
- if (ire_added)
- ire_refrele(*irep);
- }
-
}
(void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
@@ -3272,8 +3328,7 @@ bad:
ipif_refrele(src_ipif);
ipif_ndp_down(ipif);
- if (ipif->ipif_ill->ill_flags & ILLF_XRESOLV)
- ipif_arp_down(ipif);
+ ipif_resolver_down(ipif);
return (err);
}
@@ -3286,15 +3341,14 @@ int
ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
- in6_addr_t addr;
sin6_t *sin6;
nce_t *nce;
struct lifreq *lifr;
lif_nd_req_t *lnr;
- mblk_t *mp1;
+ ill_t *ill = ipif->ipif_ill;
+ ire_t *ire;
- mp1 = mp->b_cont->b_cont;
- lifr = (struct lifreq *)mp1->b_rptr;
+ lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr;
lnr = &lifr->lifr_nd;
/* Only allow for logical unit zero i.e. not on "le0:17" */
if (ipif->ipif_id != 0)
@@ -3307,8 +3361,28 @@ ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
return (EAFNOSUPPORT);
sin6 = (sin6_t *)&lnr->lnr_addr;
- addr = sin6->sin6_addr;
- nce = ndp_lookup_v6(ipif->ipif_ill, &addr, B_FALSE);
+
+ /*
+ * Since ND mappings must be consistent across an IPMP group, prohibit
+ * deleting ND mappings on underlying interfaces. Also, since ND
+ * mappings for IPMP data addresses are owned by IP itself, prohibit
+ * deleting them.
+ */
+ if (IS_UNDER_IPMP(ill))
+ return (EPERM);
+
+ if (IS_IPMP(ill)) {
+ ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL,
+ ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL,
+ ill->ill_ipst);
+ if (ire != NULL) {
+ ire_refrele(ire);
+ return (EPERM);
+ }
+ }
+
+ /* See comment in ndp_query() regarding IS_IPMP(ill) usage */
+ nce = ndp_lookup_v6(ill, IS_IPMP(ill), &sin6->sin6_addr, B_FALSE);
if (nce == NULL)
return (ESRCH);
ndp_delete(nce);
@@ -3354,11 +3428,11 @@ int
ip_siocsetndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
+ sin6_t *sin6;
ill_t *ill = ipif->ipif_ill;
struct lifreq *lifr;
lif_nd_req_t *lnr;
-
- ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
+ ire_t *ire;
lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr;
lnr = &lifr->lifr_nd;
@@ -3372,5 +3446,26 @@ ip_siocsetndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
if (lnr->lnr_addr.ss_family != AF_INET6)
return (EAFNOSUPPORT);
+ sin6 = (sin6_t *)&lnr->lnr_addr;
+
+ /*
+ * Since ND mappings must be consistent across an IPMP group, prohibit
+ * updating ND mappings on underlying interfaces. Also, since ND
+ * mappings for IPMP data addresses are owned by IP itself, prohibit
+ * updating them.
+ */
+ if (IS_UNDER_IPMP(ill))
+ return (EPERM);
+
+ if (IS_IPMP(ill)) {
+ ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL,
+ ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL,
+ ill->ill_ipst);
+ if (ire != NULL) {
+ ire_refrele(ire);
+ return (EPERM);
+ }
+ }
+
return (ndp_sioc_update(ill, lnr));
}
diff --git a/usr/src/uts/common/inet/ip/ip6_ire.c b/usr/src/uts/common/inet/ip/ip6_ire.c
index 41461ca96f..0d0f3621f5 100644
--- a/usr/src/uts/common/inet/ip/ip6_ire.c
+++ b/usr/src/uts/common/inet/ip/ip6_ire.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -73,7 +73,6 @@ static ire_t *ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *,
const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *);
static ire_t *ip6_ctable_lookup_impl(ire_ctable_args_t *);
-
/*
* Initialize the ire that is specific to IPv6 part and call
* ire_init_common to finish it.
@@ -261,13 +260,11 @@ ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst)
* Make sure we follow ire_ipif.
*
* We need to determine the interface route through
- * which the gateway will be reached. We don't really
- * care which interface is picked if the interface is
- * part of a group.
+ * which the gateway will be reached.
*/
if (ire->ire_ipif != NULL) {
ipif = ire->ire_ipif;
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
}
switch (ire->ire_type) {
@@ -409,35 +406,54 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
ire_t *ire = *ire_p;
int error;
ip_stack_t *ipst = ire->ire_ipst;
+ uint_t marks = 0;
ASSERT(ire->ire_ipversion == IPV6_VERSION);
ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
ASSERT(ire->ire_nce == NULL);
+ /*
+ * IREs with source addresses hosted on interfaces that are under IPMP
+ * should be hidden so that applications don't accidentally end up
+ * sending packets with test addresses as their source addresses, or
+ * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here.
+ * (We let IREs with unspecified source addresses slip through since
+ * ire_send_v6() will delete them automatically.)
+ */
+ if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) &&
+ !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) {
+ DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
+ marks |= IRE_MARK_TESTHIDDEN;
+ }
+
/* Find the appropriate list head. */
switch (ire->ire_type) {
case IRE_HOST:
ire->ire_mask_v6 = ipv6_all_ones;
ire->ire_masklen = IPV6_ABITS;
+ ire->ire_marks |= marks;
if ((ire->ire_flags & RTF_SETSRC) == 0)
ire->ire_src_addr_v6 = ipv6_all_zeros;
break;
case IRE_CACHE:
+ ire->ire_mask_v6 = ipv6_all_ones;
+ ire->ire_masklen = IPV6_ABITS;
+ ire->ire_marks |= marks;
+ break;
case IRE_LOCAL:
case IRE_LOOPBACK:
ire->ire_mask_v6 = ipv6_all_ones;
ire->ire_masklen = IPV6_ABITS;
break;
case IRE_PREFIX:
- if ((ire->ire_flags & RTF_SETSRC) == 0)
- ire->ire_src_addr_v6 = ipv6_all_zeros;
- break;
case IRE_DEFAULT:
+ ire->ire_marks |= marks;
if ((ire->ire_flags & RTF_SETSRC) == 0)
ire->ire_src_addr_v6 = ipv6_all_zeros;
break;
case IRE_IF_RESOLVER:
case IRE_IF_NORESOLVER:
+ ire->ire_marks |= marks;
break;
default:
printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n",
@@ -543,9 +559,8 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
* 2) We could have multiple packets trying to create
* an IRE_CACHE for the same ill.
*
- * Moreover, IPIF_NOFAILOVER and IPV6_BOUND_PIF endpoints wants
- * to go out on a particular ill. Rather than looking at the
- * packet, we depend on the above for MATCH_IRE_ILL here.
+ * Rather than looking at the packet, we depend on the above for
+ * MATCH_IRE_ILL here.
*
* Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have
* multiple IRE_CACHES for an ill for the same destination
@@ -555,20 +570,15 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
*/
if (ire->ire_ipif != NULL)
flags |= MATCH_IRE_IPIF;
+
/*
- * If we are creating hidden ires, make sure we search on
- * this ill (MATCH_IRE_ILL) and a hidden ire, while we are
- * searching for duplicates below. Otherwise we could
- * potentially find an IRE on some other interface
- * and it may not be a IRE marked with IRE_MARK_HIDDEN. We
- * shouldn't do this as this will lead to an infinite loop as
- * eventually we need an hidden ire for this packet to go
- * out. MATCH_IRE_ILL is already marked above.
+ * If we are creating a hidden IRE, make sure we search for
+ * hidden IREs when searching for duplicates below.
+ * Otherwise, we might find an IRE on some other interface
+ * that's not marked hidden.
*/
- if (ire->ire_marks & IRE_MARK_HIDDEN) {
- ASSERT(ire->ire_type == IRE_CACHE);
- flags |= MATCH_IRE_MARK_HIDDEN;
- }
+ if (ire->ire_marks & IRE_MARK_TESTHIDDEN)
+ flags |= MATCH_IRE_MARK_TESTHIDDEN;
/*
* Start the atomic add of the ire. Grab the ill locks,
@@ -692,7 +702,7 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
}
}
if (ire->ire_type == IRE_CACHE) {
- in6_addr_t gw_addr_v6;
+ const in6_addr_t *addr_v6;
ill_t *ill = ire_to_ill(ire);
char buf[INET6_ADDRSTRLEN];
nce_t *nce;
@@ -712,12 +722,12 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
* time on the list and rts_setgwr_v6 could not
* be changing this.
*/
- gw_addr_v6 = ire->ire_gateway_addr_v6;
- if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
- nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, B_TRUE);
- } else {
- nce = ndp_lookup_v6(ill, &gw_addr_v6, B_TRUE);
- }
+ addr_v6 = &ire->ire_gateway_addr_v6;
+ if (IN6_IS_ADDR_UNSPECIFIED(addr_v6))
+ addr_v6 = &ire->ire_addr_v6;
+
+ /* nce fastpath is per-ill; don't match across illgrp */
+ nce = ndp_lookup_v6(ill, B_FALSE, addr_v6, B_TRUE);
if (nce == NULL)
goto failed;
@@ -1217,28 +1227,29 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
in6_addr_t gw_addr_v6;
ill_t *ire_ill = NULL, *dst_ill;
ill_t *ipif_ill = NULL;
- ill_group_t *ire_ill_group = NULL;
- ill_group_t *ipif_ill_group = NULL;
ipif_t *src_ipif;
ASSERT(ire->ire_ipversion == IPV6_VERSION);
ASSERT(addr != NULL);
ASSERT(mask != NULL);
ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
- ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) ||
+ ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
(ipif != NULL && ipif->ipif_isv6));
/*
- * HIDDEN cache entries have to be looked up specifically with
- * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set
- * when the interface is FAILED or INACTIVE. In that case,
- * any IRE_CACHES that exists should be marked with
- * IRE_MARK_HIDDEN. So, we don't really need to match below
- * for IRE_MARK_HIDDEN. But we do so for consistency.
+ * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it
+ * is in fact hidden, to ensure the caller gets the right one. One
+ * exception: if the caller passed MATCH_IRE_IHANDLE, then they
+ * already know the identity of the given IRE_INTERFACE entry and
+ * there's no point trying to hide it from them.
*/
- if (!(match_flags & MATCH_IRE_MARK_HIDDEN) &&
- (ire->ire_marks & IRE_MARK_HIDDEN))
- return (B_FALSE);
+ if (ire->ire_marks & IRE_MARK_TESTHIDDEN) {
+ if (match_flags & MATCH_IRE_IHANDLE)
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
+
+ if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN))
+ return (B_FALSE);
+ }
if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
ire->ire_zoneid != ALL_ZONES) {
@@ -1288,7 +1299,7 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
*/
if ((dst_ill->ill_usesrc_ifindex != 0) &&
(src_ipif = ipif_select_source_v6(dst_ill, addr,
- RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, zoneid))
+ B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid))
!= NULL) {
ip3dbg(("ire_match_args: src_ipif %p"
" dst_ill %p", (void *)src_ipif,
@@ -1326,20 +1337,20 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
gw_addr_v6 = ire->ire_gateway_addr_v6;
mutex_exit(&ire->ire_lock);
}
+
/*
- * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that
- * somebody wants to send out on a particular interface which
- * is given by ire_stq and hence use ire_stq to derive the ill
- * value. ire_ipif for IRE_CACHES is just the
- * means of getting a source address i.e ire_src_addr_v6 =
- * ire->ire_ipif->ipif_src_addr_v6.
+ * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to
+ * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means
+ * of getting a source address -- i.e., ire_src_addr_v6 ==
+ * ire->ire_ipif->ipif_v6src_addr). ire_to_ill() handles this.
+ *
+ * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group.
+ * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for
+ * IPMP test traffic), then the ill must match exactly.
*/
- if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) {
+ if (match_flags & MATCH_IRE_ILL) {
ire_ill = ire_to_ill(ire);
- if (ire_ill != NULL)
- ire_ill_group = ire_ill->ill_group;
ipif_ill = ipif->ipif_ill;
- ipif_ill_group = ipif_ill->ill_group;
}
/* No ire_addr_v6 bits set past the mask */
@@ -1357,17 +1368,14 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
&ipif->ipif_v6src_addr)) &&
((!(match_flags & MATCH_IRE_IPIF)) ||
(ire->ire_ipif == ipif)) &&
- ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) ||
- (ire->ire_type != IRE_CACHE ||
- ire->ire_marks & IRE_MARK_HIDDEN)) &&
+ ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) ||
+ (ire->ire_marks & IRE_MARK_TESTHIDDEN)) &&
((!(match_flags & MATCH_IRE_ILL)) ||
- (ire_ill == ipif_ill)) &&
+ (ire_ill == ipif_ill ||
+ (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) &&
+ ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) &&
((!(match_flags & MATCH_IRE_IHANDLE)) ||
(ire->ire_ihandle == ihandle)) &&
- ((!(match_flags & MATCH_IRE_ILL_GROUP)) ||
- (ire_ill == ipif_ill) ||
- (ire_ill_group != NULL &&
- ire_ill_group == ipif_ill_group)) &&
((!(match_flags & MATCH_IRE_SECATTR)) ||
(!is_system_labeled()) ||
(tsol_ire_match_gwattr(ire, tsl) == 0))) {
@@ -1391,8 +1399,7 @@ ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
* ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
* MATCH_IRE_ILL is set.
*/
- if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
- (ipif == NULL))
+ if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
return (NULL);
/*
@@ -1477,8 +1484,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
* ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
* MATCH_IRE_ILL is set.
*/
- if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
- (ipif == NULL))
+ if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
return (NULL);
/*
@@ -1661,8 +1667,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
mutex_enter(&ire->ire_lock);
gw_addr_v6 = ire->ire_gateway_addr_v6;
mutex_exit(&ire->ire_lock);
- match_flags = MATCH_IRE_ILL_GROUP |
- MATCH_IRE_SECATTR;
+ match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
rire = ire_ctable_lookup_v6(&gw_addr_v6, NULL,
0, ire->ire_ipif, zoneid, tsl, match_flags,
ipst);
@@ -1703,7 +1708,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
if (ire->ire_ipif != NULL) {
ire_match_flags |=
- MATCH_IRE_ILL_GROUP;
+ MATCH_IRE_ILL;
}
rire = ire_route_lookup_v6(&gw_addr_v6,
NULL, NULL, IRE_INTERFACE,
@@ -1791,21 +1796,8 @@ found_ire_held:
*/
saved_ire = ire;
- /*
- * Currently MATCH_IRE_ILL is never used with
- * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while
- * sending out packets as MATCH_IRE_ILL is used only
- * for communicating with on-link hosts. We can't assert
- * that here as RTM_GET calls this function with
- * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE.
- * We have already used the MATCH_IRE_ILL in determining
- * the right prefix route at this point. To match the
- * behavior of how we locate routes while sending out
- * packets, we don't want to use MATCH_IRE_ILL below
- * while locating the interface route.
- */
if (ire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
mutex_enter(&ire->ire_lock);
gw_addr_v6 = ire->ire_gateway_addr_v6;
@@ -1958,9 +1950,7 @@ ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway,
}
/*
- * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers
- * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get
- * to the hidden ones.
+ * Lookup cache.
*
* In general the zoneid has to match (where ALL_ZONES match all of them).
* But for IRE_LOCAL we also need to handle the case where L2 should
@@ -1968,8 +1958,7 @@ ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway,
* Ethernet drivers nor Ethernet hardware loops back packets sent to their
* own MAC address. This loopback is needed when the normal
* routes (ignoring IREs with different zoneids) would send out the packet on
- * the same ill (or ill group) as the ill with which this IRE_LOCAL is
- * associated.
+ * the same ill as the ill with which this IRE_LOCAL is associated.
*
* Earlier versions of this code always matched an IRE_LOCAL independently of
* the zoneid. We preserve that earlier behavior when
@@ -1986,7 +1975,7 @@ ire_cache_lookup_v6(const in6_addr_t *addr, zoneid_t zoneid,
ipst->ips_ip6_cache_table_size)];
rw_enter(&irb_ptr->irb_lock, RW_READER);
for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) {
- if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN))
+ if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN))
continue;
if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, addr)) {
/*
@@ -2125,13 +2114,8 @@ ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire)
ASSERT(cire != NULL && pire != NULL);
match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
- /*
- * ip_newroute_v6 calls ire_ftable_lookup with MATCH_IRE_ILL only
- * for on-link hosts. We should never be here for onlink.
- * Thus, use MATCH_IRE_ILL_GROUP.
- */
if (pire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
/*
* We know that the mask of the interface ire equals cire->ire_cmask.
* (When ip_newroute_v6() created 'cire' for an on-link destn. it set
@@ -2168,7 +2152,7 @@ ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire)
*/
match_flags = MATCH_IRE_TYPE;
if (pire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
mutex_enter(&pire->ire_lock);
gw_addr = pire->ire_gateway_addr_v6;
@@ -2210,24 +2194,30 @@ ire_t *
ipif_to_ire_v6(const ipif_t *ipif)
{
ire_t *ire;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF;
+
+ /*
+ * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN
+ * so that they aren't accidentally returned. However, if the
+ * caller's ipif is on an ill under IPMP, there's no need to hide 'em.
+ */
+ if (IS_UNDER_IPMP(ipif->ipif_ill))
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
ASSERT(ipif->ipif_isv6);
if (ipif->ipif_ire_type == IRE_LOOPBACK) {
ire = ire_ctable_lookup_v6(&ipif->ipif_v6lcl_addr, NULL,
- IRE_LOOPBACK, ipif, ALL_ZONES, NULL,
- (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ipst);
+ IRE_LOOPBACK, ipif, ALL_ZONES, NULL, match_flags, ipst);
} else if (ipif->ipif_flags & IPIF_POINTOPOINT) {
/* In this case we need to lookup destination address. */
ire = ire_ftable_lookup_v6(&ipif->ipif_v6pp_dst_addr,
&ipv6_all_ones, NULL, IRE_INTERFACE, ipif, NULL, ALL_ZONES,
- 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
- MATCH_IRE_MASK), ipst);
+ 0, NULL, (match_flags | MATCH_IRE_MASK), ipst);
} else {
ire = ire_ftable_lookup_v6(&ipif->ipif_v6subnet,
&ipif->ipif_v6net_mask, NULL, IRE_INTERFACE, ipif, NULL,
- ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
- MATCH_IRE_MASK), ipst);
+ ALL_ZONES, 0, NULL, (match_flags | MATCH_IRE_MASK), ipst);
}
return (ire);
}
@@ -2296,7 +2286,7 @@ ire_multirt_need_resolve_v6(const in6_addr_t *v6dstp, const ts_label_t *tsl,
continue;
if (!IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, v6dstp))
continue;
- if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN))
+ if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN))
continue;
unres_cnt--;
}
@@ -2434,7 +2424,7 @@ ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
continue;
if (cire->ire_marks &
(IRE_MARK_CONDEMNED|
- IRE_MARK_HIDDEN))
+ IRE_MARK_TESTHIDDEN))
continue;
if (cire->ire_gw_secattr != NULL &&
@@ -2635,8 +2625,7 @@ ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
&cire->ire_addr_v6, &v6dst))
continue;
if (cire->ire_marks &
- (IRE_MARK_CONDEMNED|
- IRE_MARK_HIDDEN))
+ IRE_MARK_CONDEMNED)
continue;
if (cire->ire_gw_secattr != NULL &&
@@ -2845,8 +2834,7 @@ ip6_ctable_lookup_impl(ire_ctable_args_t *margs)
ire_t *ire;
ip_stack_t *ipst = margs->ict_ipst;
- if ((margs->ict_flags &
- (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
+ if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) &&
(margs->ict_ipif == NULL)) {
return (NULL);
}
diff --git a/usr/src/uts/common/inet/ip/ip6_rts.c b/usr/src/uts/common/inet/ip/ip6_rts.c
index 7d2ddd5c04..dcf429c8ba 100644
--- a/usr/src/uts/common/inet/ip/ip6_rts.c
+++ b/usr/src/uts/common/inet/ip/ip6_rts.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -38,8 +38,6 @@
* @(#)rtsock.c 8.6 (Berkeley) 2/11/95
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This file contains routines that processes routing socket requests.
*/
@@ -216,5 +214,5 @@ ip_rts_change_v6(int type, const in6_addr_t *dst_addr,
rtm->rtm_errno = error;
rtm->rtm_flags |= RTF_DONE;
rtm->rtm_addrs = rtm_addrs;
- rts_queue_input(mp, NULL, AF_INET6, ipst);
+ rts_queue_input(mp, NULL, AF_INET6, RTSQ_ALL, ipst);
}
diff --git a/usr/src/uts/common/inet/ip/ip_ftable.c b/usr/src/uts/common/inet/ip/ip_ftable.c
index 4fa3c7a74d..31f83c842d 100644
--- a/usr/src/uts/common/inet/ip/ip_ftable.c
+++ b/usr/src/uts/common/inet/ip/ip_ftable.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -67,7 +67,6 @@
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
-#include <sys/kmem.h>
#include <inet/tcp.h>
#include <inet/ipclassifier.h>
#include <sys/zone.h>
@@ -159,8 +158,7 @@ ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
* ire_match_args() will dereference ipif MATCH_IRE_SRC or
* MATCH_IRE_ILL is set.
*/
- if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
- (ipif == NULL))
+ if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
return (NULL);
(void) memset(&rdst, 0, sizeof (rdst));
@@ -290,28 +288,16 @@ found_ire_held:
*/
save_ire = ire;
+ if (ire->ire_ipif != NULL)
+ match_flags |= MATCH_IRE_ILL;
+
/*
- * Currently MATCH_IRE_ILL is never used with
- * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while
- * sending out packets as MATCH_IRE_ILL is used only
- * for communicating with on-link hosts. We can't assert
- * that here as RTM_GET calls this function with
- * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE.
- * We have already used the MATCH_IRE_ILL in determining
- * the right prefix route at this point. To match the
- * behavior of how we locate routes while sending out
- * packets, we don't want to use MATCH_IRE_ILL below
- * while locating the interface route.
- *
* ire_ftable_lookup may end up with an incomplete IRE_CACHE
* entry for the gateway (i.e., one for which the
* ire_nce->nce_state is not yet ND_REACHABLE). If the caller
* has specified MATCH_IRE_COMPLETE, such entries will not
* be returned; instead, we return the IF_RESOLVER ire.
*/
- if (ire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL_GROUP;
-
ire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, 0,
ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst);
DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire,
@@ -532,7 +518,7 @@ ire_ftable_lookup_simple(ipaddr_t addr,
}
}
if (ire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
ire = ire_route_lookup(ire->ire_gateway_addr, 0,
0, 0, ire->ire_ipif, NULL, zoneid, NULL, match_flags, ipst);
@@ -678,13 +664,11 @@ ire_lookup_multi(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst)
* Make sure we follow ire_ipif.
*
* We need to determine the interface route through
- * which the gateway will be reached. We don't really
- * care which interface is picked if the interface is
- * part of a group.
+ * which the gateway will be reached.
*/
if (ire->ire_ipif != NULL) {
ipif = ire->ire_ipif;
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
}
switch (ire->ire_type) {
@@ -854,40 +838,26 @@ ire_get_next_default_ire(ire_t *ire, ire_t *ire_origin)
}
static ipif_t *
-ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, ill_t *dst_ill,
+ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire,
int zoneid, ushort_t *marks)
{
ipif_t *src_ipif;
- ip_stack_t *ipst = dst_ill->ill_ipst;
+ ill_t *ill = ire->ire_ipif->ipif_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
/*
- * Pick the best source address from dst_ill.
+ * Pick the best source address from ill.
*
- * 1) If it is part of a multipathing group, we would
- * like to spread the inbound packets across different
- * interfaces. ipif_select_source picks a random source
- * across the different ills in the group.
- *
- * 2) If it is not part of a multipathing group, we try
- * to pick the source address from the destination
+ * 1) Try to pick the source address from the destination
* route. Clustering assumes that when we have multiple
* prefixes hosted on an interface, the prefix of the
* source address matches the prefix of the destination
* route. We do this only if the address is not
* DEPRECATED.
*
- * 3) If the conn is in a different zone than the ire, we
+ * 2) If the conn is in a different zone than the ire, we
* need to pick a source address from the right zone.
- *
- * NOTE : If we hit case (1) above, the prefix of the source
- * address picked may not match the prefix of the
- * destination routes prefix as ipif_select_source
- * does not look at "dst" while picking a source
- * address.
- * If we want the same behavior as (2), we will need
- * to change the behavior of ipif_select_source.
*/
-
if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) {
/*
* The RTF_SETSRC flag is set in the parent ire (sire).
@@ -899,13 +869,10 @@ ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, ill_t *dst_ill,
return (src_ipif);
}
*marks |= IRE_MARK_USESRC_CHECK;
- if ((dst_ill->ill_group != NULL) ||
+ if (IS_IPMP(ill) ||
(ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) ||
- (dst_ill->ill_usesrc_ifindex != 0)) {
- src_ipif = ipif_select_source(dst_ill, dst, zoneid);
- if (src_ipif == NULL)
- return (NULL);
-
+ (ill->ill_usesrc_ifindex != 0)) {
+ src_ipif = ipif_select_source(ill, dst, zoneid);
} else {
src_ipif = ire->ire_ipif;
ASSERT(src_ipif != NULL);
@@ -1071,18 +1038,20 @@ create_irecache:
sire->ire_last_used_time = lbolt;
}
- /* Obtain dst_ill */
- dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill);
+ dst_ill = ire->ire_ipif->ipif_ill;
+ if (IS_IPMP(dst_ill))
+ dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp);
+ else
+ ill_refhold(dst_ill);
+
if (dst_ill == NULL) {
- ip2dbg(("ire_forward no dst ill; ire 0x%p\n",
- (void *)ire));
+ ip2dbg(("ire_forward no dst ill; ire 0x%p\n", (void *)ire));
goto icmp_err_ret;
}
ASSERT(src_ipif == NULL);
/* Now obtain the src_ipif */
- src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill,
- zoneid, &ire_marks);
+ src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks);
if (src_ipif == NULL)
goto icmp_err_ret;
@@ -1254,18 +1223,13 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action,
ire_t *sire = NULL, *save_ire;
ill_t *dst_ill = NULL;
int error;
- zoneid_t zoneid;
+ zoneid_t zoneid = GLOBAL_ZONEID;
ipif_t *src_ipif = NULL;
mblk_t *res_mp;
ushort_t ire_marks = 0;
- zoneid = GLOBAL_ZONEID;
-
-
ire = ire_ftable_lookup_simple(dst, &sire, zoneid,
- MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_RJ_BHOLE, ipst);
-
+ MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE, ipst);
if (ire == NULL) {
ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst);
goto icmp_err_ret;
@@ -1288,9 +1252,7 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action,
* nexthop router, just hand over the cache entry
* and we are done.
*/
-
if (ire->ire_type & IRE_CACHE) {
-
/*
* If we are using this ire cache entry as a
* gateway to forward packets, chances are we
@@ -1334,18 +1296,21 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action,
UPDATE_OB_PKT_COUNT(sire);
}
- /* Obtain dst_ill */
- dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill);
+ dst_ill = ire->ire_ipif->ipif_ill;
+ if (IS_IPMP(dst_ill))
+ dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp);
+ else
+ ill_refhold(dst_ill); /* for symmetry */
+
if (dst_ill == NULL) {
- ip2dbg(("ire_forward no dst ill; ire 0x%p\n",
+ ip2dbg(("ire_forward_simple: no dst ill; ire 0x%p\n",
(void *)ire));
goto icmp_err_ret;
}
ASSERT(src_ipif == NULL);
/* Now obtain the src_ipif */
- src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill,
- zoneid, &ire_marks);
+ src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks);
if (src_ipif == NULL)
goto icmp_err_ret;
@@ -1720,33 +1685,24 @@ ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT |
MATCH_IRE_RECURSIVE| MATCH_IRE_RJ_BHOLE|
- MATCH_IRE_SECATTR);
+ MATCH_IRE_SECATTR | MATCH_IRE_ILL);
/*
* If supplied ifindex is non-null, the only valid
- * nexthop is one off of the interface or group corresponding
+ * nexthop is one off of the interface corresponding
* to the specified ifindex.
*/
ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
NULL, NULL, NULL, NULL, ipst);
if (ill != NULL) {
- match_flags |= MATCH_IRE_ILL;
+ supplied_ipif = ipif_get_next_ipif(NULL, ill);
} else {
- /* Fallback to group names if hook_emulation set */
- if (ipst->ips_ipmp_hook_emulation) {
- ill = ill_group_lookup_on_ifindex(ifindex,
- B_FALSE, ipst);
- }
- if (ill == NULL) {
- ip1dbg(("ipfil_sendpkt: Could not find"
- " route to dst\n"));
- value = ECOMM;
- freemsg(mp);
- goto discard;
- }
- match_flags |= MATCH_IRE_ILL_GROUP;
+ ip1dbg(("ipfil_sendpkt: Could not find"
+ " route to dst\n"));
+ value = ECOMM;
+ freemsg(mp);
+ goto discard;
}
- supplied_ipif = ipif_get_next_ipif(NULL, ill);
ire = ire_route_lookup(dst, 0, 0, 0, supplied_ipif,
&sire, zoneid, MBLK_GETLABEL(mp), match_flags, ipst);
@@ -2325,9 +2281,9 @@ ire_round_robin(irb_t *irb_ptr, zoneid_t zoneid, ire_ftable_args_t *margs,
* interested in routers that are
* reachable through ipifs within our zone.
*/
- if (ire->ire_ipif != NULL) {
- match_flags |= MATCH_IRE_ILL_GROUP;
- }
+ if (ire->ire_ipif != NULL)
+ match_flags |= MATCH_IRE_ILL;
+
rire = ire_route_lookup(ire->ire_gateway_addr, 0, 0,
IRE_INTERFACE, ire->ire_ipif, NULL, zoneid, margs->ift_tsl,
match_flags, ipst);
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index 0597245499..9771c87721 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -46,6 +46,7 @@
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
+#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
@@ -61,10 +62,10 @@
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
-#include <sys/strsun.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
+#include <sys/md5.h>
#include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
@@ -85,7 +86,6 @@
#include <inet/tun.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
-#include <inet/mib2.h>
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
@@ -93,7 +93,6 @@
#include <inet/ipsec_impl.h>
#include <sys/iphada.h>
-
#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
@@ -158,7 +157,7 @@ static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
static void ipsq_delete(ipsq_t *);
static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type,
- boolean_t initialize);
+ boolean_t initialize, boolean_t insert);
static void ipif_check_bcast_ires(ipif_t *test_ipif);
static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
@@ -169,7 +168,6 @@ static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void ipif_free(ipif_t *ipif);
static void ipif_free_tail(ipif_t *ipif);
static void ipif_mtu_change(ire_t *ire, char *ipif_arg);
-static void ipif_multicast_down(ipif_t *ipif);
static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
static void ipif_set_default(ipif_t *ipif);
static int ipif_set_values(queue_t *q, mblk_t *mp,
@@ -179,8 +177,7 @@ static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
static ipif_t *ipif_lookup_on_name(char *name, size_t namelen,
boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *);
-static int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp);
-static void ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp);
+static void ipif_update_other_ipifs(ipif_t *old_ipif);
static int ill_alloc_ppa(ill_if_t *, ill_t *);
static int ill_arp_off(ill_t *ill);
@@ -192,33 +189,18 @@ static void ill_down(ill_t *ill);
static void ill_downi(ire_t *ire, char *ill_arg);
static void ill_free_mib(ill_t *ill);
static void ill_glist_delete(ill_t *);
-static boolean_t ill_has_usable_ipif(ill_t *);
-static int ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int);
-static void ill_nominate_bcast_rcv(ill_group_t *illgrp);
-static void ill_phyint_free(ill_t *ill);
static void ill_phyint_reinit(ill_t *ill);
static void ill_set_nce_router_flags(ill_t *, boolean_t);
static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
-static void ill_signal_ipsq_ills(ipsq_t *, boolean_t);
-static boolean_t ill_split_ipsq(ipsq_t *cur_sq);
-static void ill_stq_cache_delete(ire_t *, char *);
-
-static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *);
-static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *);
-static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
- in6_addr_t *);
-static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
- ipaddr_t *);
-static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *);
-static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
- in6_addr_t *);
-static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
- ipaddr_t *);
-
+static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
+static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
+static ip_v6mapinfo_func_t ip_ether_v6mapinfo, ip_ib_v6mapinfo;
+static ip_v4mapinfo_func_t ip_ether_v4mapinfo, ip_ib_v4mapinfo;
static void ipif_save_ire(ipif_t *, ire_t *);
static void ipif_remove_ire(ipif_t *, ire_t *);
static void ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *);
static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
+static void phyint_free(phyint_t *);
/*
* Per-ill IPsec capabilities management.
@@ -250,18 +232,14 @@ static void ill_capability_ack_thr(void *);
static void ill_capability_lso_enable(ill_t *);
static void ill_capability_send(ill_t *, mblk_t *);
-static void illgrp_cache_delete(ire_t *, char *);
-static void illgrp_delete(ill_t *ill);
-static void illgrp_reset_schednext(ill_t *ill);
-
static ill_t *ill_prev_usesrc(ill_t *);
static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void ill_disband_usesrc_group(ill_t *);
static void conn_cleanup_stale_ire(conn_t *, caddr_t);
#ifdef DEBUG
-static void ill_trace_cleanup(const ill_t *);
-static void ipif_trace_cleanup(const ipif_t *);
+static void ill_trace_cleanup(const ill_t *);
+static void ipif_trace_cleanup(const ipif_t *);
#endif
/*
@@ -491,6 +469,7 @@ static nv_t ipif_nv_tbl[] = {
{ PHYI_STANDBY, "STANDBY" },
{ PHYI_INACTIVE, "INACTIVE" },
{ PHYI_OFFLINE, "OFFLINE" },
+ { PHYI_IPMP, "IPMP" }
};
static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
@@ -508,7 +487,8 @@ static ip_m_t ip_m_tbl[] = {
ip_ether_v6intfid },
{ DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo,
ip_ib_v6intfid },
- { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL},
+ { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL },
+ { SUNW_DL_IPMP, IFT_OTHER, NULL, NULL, ip_ipmp_v6intfid },
{ DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
ip_nodef_v6intfid }
};
@@ -529,14 +509,6 @@ static ipif_t ipif_zero;
*/
uint_t ill_no_arena = 12; /* Setable in /etc/system */
-static uint_t
-ipif_rand(ip_stack_t *ipst)
-{
- ipst->ips_ipif_src_random = ipst->ips_ipif_src_random * 1103515245 +
- 12345;
- return ((ipst->ips_ipif_src_random >> 16) & 0x7fff);
-}
-
/*
* Allocate per-interface mibs.
* Returns true if ok. False otherwise.
@@ -623,7 +595,7 @@ ill_allocate_mibs(ill_t *ill)
* (Always called as writer.)
*/
mblk_t *
-ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr)
+ill_arp_alloc(ill_t *ill, const uchar_t *template, caddr_t addr)
{
arc_t *arc = (arc_t *)template;
char *cp;
@@ -669,17 +641,69 @@ ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr)
}
mblk_t *
-ipif_area_alloc(ipif_t *ipif)
+ipif_area_alloc(ipif_t *ipif, uint_t optflags)
{
- return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template,
- (char *)&ipif->ipif_lcl_addr));
+ caddr_t addr;
+ mblk_t *mp;
+ area_t *area;
+ uchar_t *areap;
+ ill_t *ill = ipif->ipif_ill;
+
+ if (ill->ill_isv6) {
+ ASSERT(ill->ill_flags & ILLF_XRESOLV);
+ addr = (caddr_t)&ipif->ipif_v6lcl_addr;
+ areap = (uchar_t *)&ip6_area_template;
+ } else {
+ addr = (caddr_t)&ipif->ipif_lcl_addr;
+ areap = (uchar_t *)&ip_area_template;
+ }
+
+ if ((mp = ill_arp_alloc(ill, areap, addr)) == NULL)
+ return (NULL);
+
+ /*
+ * IPMP requires that the hardware address be included in all
+ * AR_ENTRY_ADD requests so that ARP can deduce the arl to send on.
+ * If there are no active underlying ills in the group (and thus no
+ * hardware address), DAD will be deferred until an underlying ill
+ * becomes active.
+ */
+ if (IS_IPMP(ill)) {
+ if ((ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) {
+ freemsg(mp);
+ return (NULL);
+ }
+ } else {
+ ill_refhold(ill);
+ }
+
+ area = (area_t *)mp->b_rptr;
+ area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR;
+ area->area_flags |= optflags;
+ area->area_hw_addr_length = ill->ill_phys_addr_length;
+ bcopy(ill->ill_phys_addr, mp->b_rptr + area->area_hw_addr_offset,
+ area->area_hw_addr_length);
+
+ ill_refrele(ill);
+ return (mp);
}
mblk_t *
ipif_ared_alloc(ipif_t *ipif)
{
- return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template,
- (char *)&ipif->ipif_lcl_addr));
+ caddr_t addr;
+ uchar_t *aredp;
+
+ if (ipif->ipif_ill->ill_isv6) {
+ ASSERT(ipif->ipif_ill->ill_flags & ILLF_XRESOLV);
+ addr = (caddr_t)&ipif->ipif_v6lcl_addr;
+ aredp = (uchar_t *)&ip6_ared_template;
+ } else {
+ addr = (caddr_t)&ipif->ipif_lcl_addr;
+ aredp = (uchar_t *)&ip_ared_template;
+ }
+
+ return (ill_arp_alloc(ipif->ipif_ill, aredp, addr));
}
mblk_t *
@@ -689,6 +713,19 @@ ill_ared_alloc(ill_t *ill, ipaddr_t addr)
(char *)&addr));
}
+mblk_t *
+ill_arie_alloc(ill_t *ill, const char *grifname, const void *template)
+{
+ mblk_t *mp = ill_arp_alloc(ill, template, 0);
+ arie_t *arie;
+
+ if (mp != NULL) {
+ arie = (arie_t *)mp->b_rptr;
+ (void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ);
+ }
+ return (mp);
+}
+
/*
* Completely vaporize a lower level tap and all associated interfaces.
* ill_delete is called only out of ip_close when the device control
@@ -751,6 +788,12 @@ ill_delete(ill_t *ill)
ip_purge_allmulti(ill);
/*
+ * If the ill being deleted is under IPMP, boot it out of the illgrp.
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_ill_leave_illgrp(ill);
+
+ /*
* ill_down will arrange to blow off any IRE's dependent on this
* ILL, and shut down fragmentation reassembly.
*/
@@ -890,8 +933,19 @@ ill_delete_tail(ill_t *ill)
* ill references.
*/
ASSERT(ilm_walk_ill(ill) == 0);
+
/*
- * Take us out of the list of ILLs. ill_glist_delete -> ill_phyint_free
+ * If this ill is an IPMP meta-interface, blow away the illgrp. This
+ * is safe to do because the illgrp has already been unlinked from the
+ * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
+ */
+ if (IS_IPMP(ill)) {
+ ipmp_illgrp_destroy(ill->ill_grp);
+ ill->ill_grp = NULL;
+ }
+
+ /*
+ * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
* could free the phyint. No more reference to the phyint after this
* point.
*/
@@ -1139,7 +1193,7 @@ ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id)
* Add the pending mp to the list. There can be only 1 pending mp
* in the list. Any exclusive ioctl that needs to wait for a response
* from another module or driver needs to use this function to set
- * the ipsq_pending_mp to the ioctl mblk and wait for the response from
+ * the ipx_pending_mp to the ioctl mblk and wait for the response from
* the other module/driver. This is also used while waiting for the
* ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
*/
@@ -1147,19 +1201,19 @@ boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
int waitfor)
{
- ipsq_t *ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
+ ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;
ASSERT(IAM_WRITER_IPIF(ipif));
ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
- ASSERT(ipsq->ipsq_pending_mp == NULL);
+ ASSERT(ipx->ipx_pending_mp == NULL);
/*
* The caller may be using a different ipif than the one passed into
* ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
* ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
- * that `ipsq_current_ipif == ipif'.
+ * that `ipx_current_ipif == ipif'.
*/
- ASSERT(ipsq->ipsq_current_ipif != NULL);
+ ASSERT(ipx->ipx_current_ipif != NULL);
/*
* M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls,
@@ -1180,8 +1234,8 @@ ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
if (connp->conn_state_flags & CONN_CLOSING)
return (B_FALSE);
}
- mutex_enter(&ipsq->ipsq_lock);
- ipsq->ipsq_pending_ipif = ipif;
+ mutex_enter(&ipx->ipx_lock);
+ ipx->ipx_pending_ipif = ipif;
/*
* Note down the queue in b_queue. This will be returned by
* ipsq_pending_mp_get. Caller will then use these values to restart
@@ -1189,38 +1243,40 @@ ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
*/
add_mp->b_next = NULL;
add_mp->b_queue = q;
- ipsq->ipsq_pending_mp = add_mp;
- ipsq->ipsq_waitfor = waitfor;
+ ipx->ipx_pending_mp = add_mp;
+ ipx->ipx_waitfor = waitfor;
+ mutex_exit(&ipx->ipx_lock);
if (connp != NULL)
connp->conn_oper_pending_ill = ipif->ipif_ill;
- mutex_exit(&ipsq->ipsq_lock);
+
return (B_TRUE);
}
/*
- * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp
+ * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
* queued in the list.
*/
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
mblk_t *curr = NULL;
+ ipxop_t *ipx = ipsq->ipsq_xop;
- mutex_enter(&ipsq->ipsq_lock);
*connpp = NULL;
- if (ipsq->ipsq_pending_mp == NULL) {
- mutex_exit(&ipsq->ipsq_lock);
+ mutex_enter(&ipx->ipx_lock);
+ if (ipx->ipx_pending_mp == NULL) {
+ mutex_exit(&ipx->ipx_lock);
return (NULL);
}
/* There can be only 1 such excl message */
- curr = ipsq->ipsq_pending_mp;
- ASSERT(curr != NULL && curr->b_next == NULL);
- ipsq->ipsq_pending_ipif = NULL;
- ipsq->ipsq_pending_mp = NULL;
- ipsq->ipsq_waitfor = 0;
- mutex_exit(&ipsq->ipsq_lock);
+ curr = ipx->ipx_pending_mp;
+ ASSERT(curr->b_next == NULL);
+ ipx->ipx_pending_ipif = NULL;
+ ipx->ipx_pending_mp = NULL;
+ ipx->ipx_waitfor = 0;
+ mutex_exit(&ipx->ipx_lock);
if (CONN_Q(curr->b_queue)) {
/*
@@ -1237,7 +1293,7 @@ ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
}
/*
- * Cleanup the ioctl mp queued in ipsq_pending_mp
+ * Cleanup the ioctl mp queued in ipx_pending_mp
* - Called in the ill_delete path
* - Called in the M_ERROR or M_HANGUP path on the ill.
* - Called in the conn close path.
@@ -1246,48 +1302,41 @@ boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
mblk_t *mp;
- ipsq_t *ipsq;
+ ipxop_t *ipx;
queue_t *q;
ipif_t *ipif;
ASSERT(IAM_WRITER_ILL(ill));
- ipsq = ill->ill_phyint->phyint_ipsq;
- mutex_enter(&ipsq->ipsq_lock);
+ ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
+
/*
- * If connp is null, unconditionally clean up the ipsq_pending_mp.
+ * If connp is null, unconditionally clean up the ipx_pending_mp.
* This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl
* even if it is meant for another ill, since we have to enqueue
- * a new mp now in ipsq_pending_mp to complete the ipif_down.
+ * a new mp now in ipx_pending_mp to complete the ipif_down.
* If connp is non-null we are called from the conn close path.
*/
- mp = ipsq->ipsq_pending_mp;
+ mutex_enter(&ipx->ipx_lock);
+ mp = ipx->ipx_pending_mp;
if (mp == NULL || (connp != NULL &&
mp->b_queue != CONNP_TO_WQ(connp))) {
- mutex_exit(&ipsq->ipsq_lock);
+ mutex_exit(&ipx->ipx_lock);
return (B_FALSE);
}
- /* Now remove from the ipsq_pending_mp */
- ipsq->ipsq_pending_mp = NULL;
+ /* Now remove from the ipx_pending_mp */
+ ipx->ipx_pending_mp = NULL;
q = mp->b_queue;
mp->b_next = NULL;
mp->b_prev = NULL;
mp->b_queue = NULL;
- /* If MOVE was in progress, clear the move_in_progress fields also. */
- ill = ipsq->ipsq_pending_ipif->ipif_ill;
- if (ill->ill_move_in_progress) {
- ILL_CLEAR_MOVE(ill);
- } else if (ill->ill_up_ipifs) {
- ill_group_cleanup(ill);
- }
-
- ipif = ipsq->ipsq_pending_ipif;
- ipsq->ipsq_pending_ipif = NULL;
- ipsq->ipsq_waitfor = 0;
- ipsq->ipsq_current_ipif = NULL;
- ipsq->ipsq_current_ioctl = 0;
- ipsq->ipsq_current_done = B_TRUE;
- mutex_exit(&ipsq->ipsq_lock);
+ ipif = ipx->ipx_pending_ipif;
+ ipx->ipx_pending_ipif = NULL;
+ ipx->ipx_waitfor = 0;
+ ipx->ipx_current_ipif = NULL;
+ ipx->ipx_current_ioctl = 0;
+ ipx->ipx_current_done = B_TRUE;
+ mutex_exit(&ipx->ipx_lock);
if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
if (connp == NULL) {
@@ -1437,7 +1486,7 @@ conn_ioctl_cleanup(conn_t *connp)
* Is any exclusive ioctl pending ? If so clean it up. If the
* ioctl has not yet started, the mp is pending in the list headed by
* ipsq_xopq_head. If the ioctl has started the mp could be present in
- * ipsq_pending_mp. If the ioctl timed out in the streamhead but
+ * ipx_pending_mp. If the ioctl timed out in the streamhead but
* is currently executing now the mp is not queued anywhere but
* conn_oper_pending_ill is null. The conn close will wait
* till the conn_ref drops to zero.
@@ -1468,9 +1517,9 @@ conn_ioctl_cleanup(conn_t *connp)
ill_waiter_dcr(ill);
/*
* Check whether this ioctl has started and is
- * pending now in ipsq_pending_mp. If it is not
- * found there then check whether this ioctl has
- * not even started and is in the ipsq_xopq list.
+ * pending. If it is not found there then check
+ * whether this ioctl has not even started and is in
+ * the ipsq_xopq list.
*/
if (!ipsq_pending_mp_cleanup(ill, connp))
ipsq_xopq_mp_cleanup(ill, connp);
@@ -1506,16 +1555,11 @@ conn_cleanup_ill(conn_t *connp, caddr_t arg)
if (connp->conn_multicast_ill == ill) {
/* Revert to late binding */
connp->conn_multicast_ill = NULL;
- connp->conn_orig_multicast_ifindex = 0;
}
if (connp->conn_incoming_ill == ill)
connp->conn_incoming_ill = NULL;
if (connp->conn_outgoing_ill == ill)
connp->conn_outgoing_ill = NULL;
- if (connp->conn_outgoing_pill == ill)
- connp->conn_outgoing_pill = NULL;
- if (connp->conn_nofailover_ill == ill)
- connp->conn_nofailover_ill = NULL;
if (connp->conn_dhcpinit_ill == ill) {
connp->conn_dhcpinit_ill = NULL;
ASSERT(ill->ill_dhcpinit != 0);
@@ -1524,11 +1568,11 @@ conn_cleanup_ill(conn_t *connp, caddr_t arg)
if (connp->conn_ire_cache != NULL) {
ire = connp->conn_ire_cache;
/*
- * ip_newroute creates IRE_CACHE with ire_stq coming from
- * interface X and ipif coming from interface Y, if interface
- * X and Y are part of the same IPMPgroup. Thus whenever
- * interface X goes down, remove all references to it by
- * checking both on ire_ipif and ire_stq.
+ * Source address selection makes it possible for IRE_CACHE
+ * entries to be created with ire_stq coming from interface X
+ * and ipif coming from interface Y. Thus whenever interface
+ * X goes down, remove all references to it by checking both
+ * on ire_ipif and ire_stq.
*/
if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
(ire->ire_type == IRE_CACHE &&
@@ -1601,14 +1645,10 @@ ill_down(ill_t *ill)
ip_stack_t *ipst = ill->ill_ipst;
/* Blow off any IREs dependent on this ILL. */
- ire_walk(ill_downi, (char *)ill, ipst);
+ ire_walk(ill_downi, ill, ipst);
/* Remove any conn_*_ill depending on this ill */
ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);
-
- if (ill->ill_group != NULL) {
- illgrp_delete(ill);
- }
}
/*
@@ -1621,9 +1661,9 @@ ill_downi(ire_t *ire, char *ill_arg)
ill_t *ill = (ill_t *)ill_arg;
/*
- * ip_newroute creates IRE_CACHE with ire_stq coming from
- * interface X and ipif coming from interface Y, if interface
- * X and Y are part of the same IPMP group. Thus whenever interface
+ * Source address selection makes it possible for IRE_CACHE
+ * entries to be created with ire_stq coming from interface X
+ * and ipif coming from interface Y. Thus whenever interface
* X goes down, remove all references to it by checking both
* on ire_ipif and ire_stq.
*/
@@ -3696,16 +3736,39 @@ nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp,
}
/*
- * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an
- * IPMP group, make sure all ill's in the group adopt the new policy. Send
- * up RTS_IFINFO routing socket messages for each interface whose flags we
- * change.
+ * Helper function for ill_forward_set().
+ */
+static void
+ill_forward_set_on_ill(ill_t *ill, boolean_t enable)
+{
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
+
+ ip1dbg(("ill_forward_set: %s %s forwarding on %s",
+ (enable ? "Enabling" : "Disabling"),
+ (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
+ mutex_enter(&ill->ill_lock);
+ if (enable)
+ ill->ill_flags |= ILLF_ROUTER;
+ else
+ ill->ill_flags &= ~ILLF_ROUTER;
+ mutex_exit(&ill->ill_lock);
+ if (ill->ill_isv6)
+ ill_set_nce_router_flags(ill, enable);
+ /* Notify routing socket listeners of this change. */
+ ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
+}
+
+/*
+ * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing
+ * socket messages for each interface whose flags we change.
*/
int
ill_forward_set(ill_t *ill, boolean_t enable)
{
- ill_group_t *illgrp;
- ip_stack_t *ipst = ill->ill_ipst;
+ ipmp_illgrp_t *illg;
+ ip_stack_t *ipst = ill->ill_ipst;
ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
@@ -3716,47 +3779,23 @@ ill_forward_set(ill_t *ill, boolean_t enable)
if (IS_LOOPBACK(ill))
return (EINVAL);
- /*
- * If the ill is in an IPMP group, set the forwarding policy on all
- * members of the group to the same value.
- */
- illgrp = ill->ill_group;
- if (illgrp != NULL) {
- ill_t *tmp_ill;
+ if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
+ /*
+ * Update all of the interfaces in the group.
+ */
+ illg = ill->ill_grp;
+ ill = list_head(&illg->ig_if);
+ for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
+ ill_forward_set_on_ill(ill, enable);
- for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL;
- tmp_ill = tmp_ill->ill_group_next) {
- ip1dbg(("ill_forward_set: %s %s forwarding on %s",
- (enable ? "Enabling" : "Disabling"),
- (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"),
- tmp_ill->ill_name));
- mutex_enter(&tmp_ill->ill_lock);
- if (enable)
- tmp_ill->ill_flags |= ILLF_ROUTER;
- else
- tmp_ill->ill_flags &= ~ILLF_ROUTER;
- mutex_exit(&tmp_ill->ill_lock);
- if (tmp_ill->ill_isv6)
- ill_set_nce_router_flags(tmp_ill, enable);
- /* Notify routing socket listeners of this change. */
- ip_rts_ifmsg(tmp_ill->ill_ipif);
- }
- } else {
- ip1dbg(("ill_forward_set: %s %s forwarding on %s",
- (enable ? "Enabling" : "Disabling"),
- (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
- mutex_enter(&ill->ill_lock);
- if (enable)
- ill->ill_flags |= ILLF_ROUTER;
- else
- ill->ill_flags &= ~ILLF_ROUTER;
- mutex_exit(&ill->ill_lock);
- if (ill->ill_isv6)
- ill_set_nce_router_flags(ill, enable);
- /* Notify routing socket listeners of this change. */
- ip_rts_ifmsg(ill->ill_ipif);
+ /*
+ * Update the IPMP meta-interface.
+ */
+ ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable);
+ return (0);
}
+ ill_forward_set_on_ill(ill, enable);
return (0);
}
@@ -3772,7 +3811,12 @@ ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
nce_t *nce;
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE);
+ /*
+ * NOTE: we're called separately for each ill in an illgrp,
+ * so don't match across the illgrp.
+ */
+ nce = ndp_lookup_v6(ill, B_FALSE, &ipif->ipif_v6lcl_addr,
+ B_FALSE);
if (nce != NULL) {
mutex_enter(&nce->nce_lock);
if (enable)
@@ -3928,36 +3972,45 @@ ill_next(ill_walk_context_t *ctx, ill_t *lastill)
}
/*
- * Check interface name for correct format which is name+ppa.
- * name can contain characters and digits, the right most digits
- * make up the ppa number. use of octal is not allowed, name must contain
- * a ppa, return pointer to the start of ppa.
- * In case of error return NULL.
+ * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
+ * The final number (PPA) must not have any leading zeros. Upon success, a
+ * pointer to the start of the PPA is returned; otherwise NULL is returned.
*/
static char *
ill_get_ppa_ptr(char *name)
{
- int namelen = mi_strlen(name);
+ int namelen = strlen(name);
+ int end_ndx = namelen - 1;
+ int ppa_ndx, i;
- int len = namelen;
+ /*
+ * Check that the first character is [a-zA-Z], and that the last
+ * character is [0-9].
+ */
+ if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx]))
+ return (NULL);
- name += len;
- while (len > 0) {
- name--;
- if (*name < '0' || *name > '9')
+ /*
+ * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
+ */
+ for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--)
+ if (!isdigit(name[ppa_ndx - 1]))
break;
- len--;
- }
- /* empty string, all digits, or no trailing digits */
- if (len == 0 || len == (int)namelen)
+ if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx)
return (NULL);
- name++;
- /* check for attempted use of octal */
- if (*name == '0' && len != (int)namelen - 1)
- return (NULL);
- return (name);
+ /*
+ * Check that the intermediate characters are [a-zA-Z0-9._]
+ */
+ for (i = 1; i < ppa_ndx; i++) {
+ if (!isalpha(name[i]) && !isdigit(name[i]) &&
+ name[i] != '.' && name[i] != '_') {
+ return (NULL);
+ }
+ }
+
+ return (name + ppa_ndx);
}
/*
@@ -4037,8 +4090,10 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp,
} else if (ILL_CAN_WAIT(ill, q)) {
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ill->ill_lock);
ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
RELEASE_CONN_LOCK(q);
if (error != NULL)
@@ -4102,6 +4157,7 @@ static void
ill_glist_delete(ill_t *ill)
{
ip_stack_t *ipst;
+ phyint_t *phyi;
if (ill == NULL)
return;
@@ -4139,8 +4195,41 @@ ill_glist_delete(ill_t *ill)
ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
ill->ill_name_length);
- ill_phyint_free(ill);
+ ASSERT(ill->ill_phyint != NULL);
+ phyi = ill->ill_phyint;
+ ill->ill_phyint = NULL;
+
+ /*
+ * ill_init allocates a phyint always to store the copy
+ * of flags relevant to phyint. At that point in time, we could
+ * not assign the name and hence phyint_illv4/v6 could not be
+ * initialized. Later in ipif_set_values, we assign the name to
+ * the ill, at which point in time we assign phyint_illv4/v6.
+ * Thus we don't rely on phyint_illv6 to be initialized always.
+ */
+ if (ill->ill_flags & ILLF_IPV6)
+ phyi->phyint_illv6 = NULL;
+ else
+ phyi->phyint_illv4 = NULL;
+
+ if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
+ rw_exit(&ipst->ips_ill_g_lock);
+ return;
+ }
+
+ /*
+ * There are no ills left on this phyint; pull it out of the phyint
+ * avl trees, and free it.
+ */
+ if (phyi->phyint_ifindex > 0) {
+ avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
+ phyi);
+ avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
+ phyi);
+ }
rw_exit(&ipst->ips_ill_g_lock);
+
+ phyint_free(phyi);
}
/*
@@ -4367,30 +4456,32 @@ ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
return (0);
}
-/* Initialize the per phyint (per IPMP group) ipsq used for serialization */
+/* Initialize the per phyint ipsq used for serialization */
static boolean_t
-ipsq_init(ill_t *ill)
+ipsq_init(ill_t *ill, boolean_t enter)
{
ipsq_t *ipsq;
+ ipxop_t *ipx;
- /* Init the ipsq and impicitly enter as writer */
- ill->ill_phyint->phyint_ipsq =
- kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP);
- if (ill->ill_phyint->phyint_ipsq == NULL)
+ if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL)
return (B_FALSE);
- ipsq = ill->ill_phyint->phyint_ipsq;
- ipsq->ipsq_phyint_list = ill->ill_phyint;
- ill->ill_phyint->phyint_ipsq_next = NULL;
+
+ ill->ill_phyint->phyint_ipsq = ipsq;
+ ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop;
+ ipx->ipx_ipsq = ipsq;
+ ipsq->ipsq_next = ipsq;
+ ipsq->ipsq_phyint = ill->ill_phyint;
mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0);
- ipsq->ipsq_refs = 1;
- ipsq->ipsq_writer = curthread;
- ipsq->ipsq_reentry_cnt = 1;
+ mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0);
ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */
+ if (enter) {
+ ipx->ipx_writer = curthread;
+ ipx->ipx_forced = B_FALSE;
+ ipx->ipx_reentry_cnt = 1;
#ifdef DEBUG
- ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack,
- IPSQ_STACK_DEPTH);
+ ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
#endif
- (void) strcpy(ipsq->ipsq_name, ill->ill_name);
+ }
return (B_TRUE);
}
@@ -4468,7 +4559,7 @@ ill_init(queue_t *q, ill_t *ill)
ill->ill_ppa = UINT_MAX;
ill->ill_fastpath_list = &ill->ill_fastpath_list;
- if (!ipsq_init(ill)) {
+ if (!ipsq_init(ill, B_TRUE)) {
freemsg(info_mp);
mi_free(frag_ptr);
mi_free(ill->ill_phyint);
@@ -4589,29 +4680,16 @@ loopback_kstat_update(kstat_t *ksp, int rw)
}
/*
- * Has ifindex been plumbed already.
- * Compares both phyint_ifindex and phyint_group_ifindex.
+ * Has ifindex been plumbed already?
*/
static boolean_t
phyint_exists(uint_t index, ip_stack_t *ipst)
{
- phyint_t *phyi;
-
ASSERT(index != 0);
ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
- /*
- * Indexes are stored in the phyint - a common structure
- * to both IPv4 and IPv6.
- */
- phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
- for (; phyi != NULL;
- phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
- phyi, AVL_AFTER)) {
- if (phyi->phyint_ifindex == index ||
- phyi->phyint_group_ifindex == index)
- return (B_TRUE);
- }
- return (B_FALSE);
+
+ return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
+ &index, NULL) != NULL);
}
/* Pick a unique ifindex */
@@ -4675,9 +4753,9 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
{
ill_t *ill;
ipif_t *ipif;
+ ipsq_t *ipsq;
kstat_named_t *kn;
boolean_t isloopback;
- ipsq_t *old_ipsq;
in6_addr_t ov6addr;
isloopback = mi_strcmp(name, ipif_loopback_name) == 0;
@@ -4761,16 +4839,10 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
ill->ill_net_type = IRE_LOOPBACK;
/* Initialize the ipsq */
- if (!ipsq_init(ill))
+ if (!ipsq_init(ill, B_FALSE))
goto done;
- ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL;
- ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--;
- ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0);
-#ifdef DEBUG
- ill->ill_phyint->phyint_ipsq->ipsq_depth = 0;
-#endif
- ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE);
+ ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE);
if (ipif == NULL)
goto done;
@@ -4807,7 +4879,7 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
ill->ill_frag_free_num_pkts = 0;
ill->ill_last_frag_clean_time = 0;
- old_ipsq = ill->ill_phyint->phyint_ipsq;
+ ipsq = ill->ill_phyint->phyint_ipsq;
if (ill_glist_insert(ill, "lo", isv6) != 0)
cmn_err(CE_PANIC, "cannot insert loopback interface");
@@ -4824,13 +4896,11 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
sctp_update_ipif_addr(ipif, ov6addr);
/*
- * If the ipsq was changed in ill_phyint_reinit free the old ipsq.
+ * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs.
+ * If so, free our original one.
*/
- if (old_ipsq != ill->ill_phyint->phyint_ipsq) {
- /* Loopback ills aren't in any IPMP group */
- ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP));
- ipsq_delete(old_ipsq);
- }
+ if (ipsq != ill->ill_phyint->phyint_ipsq)
+ ipsq_delete(ipsq);
/*
* Delay this till the ipif is allocated as ipif_allocate
@@ -4871,12 +4941,10 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
done:
if (ill != NULL) {
if (ill->ill_phyint != NULL) {
- ipsq_t *ipsq;
-
ipsq = ill->ill_phyint->phyint_ipsq;
if (ipsq != NULL) {
- ipsq->ipsq_ipst = NULL;
- kmem_free(ipsq, sizeof (ipsq_t));
+ ipsq->ipsq_phyint = NULL;
+ ipsq_delete(ipsq);
}
mi_free(ill->ill_phyint);
}
@@ -4954,9 +5022,11 @@ ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp,
} else if (ILL_CAN_WAIT(ill, q)) {
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
rw_exit(&ipst->ips_ill_g_lock);
mutex_exit(&ill->ill_lock);
ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
RELEASE_CONN_LOCK(q);
if (err != NULL)
@@ -5294,6 +5364,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
dl_info_ack_t *dlia;
ip_m_t *ipm;
dl_qos_cl_sel1_t *sel1;
+ int min_mtu;
ASSERT(IAM_WRITER_ILL(ill));
@@ -5336,7 +5407,14 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
ill->ill_bcast_addr_length = brdcst_addr_length;
ill->ill_phys_addr_length = phys_addr_length;
ill->ill_sap_length = sap_length;
- ill->ill_max_frag = dlia->dl_max_sdu;
+
+ /*
+ * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU,
+ * but we must ensure a minimum IP MTU is used since other bits of
+ * IP will fly apart otherwise.
+ */
+ min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
+ ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
ill->ill_max_mtu = ill->ill_max_frag;
ill->ill_type = ipm->ip_m_type;
@@ -5358,7 +5436,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
* the wakeup.
*/
(void) ipif_allocate(ill, 0, IRE_LOCAL,
- dlia->dl_provider_style == DL_STYLE2 ? B_FALSE : B_TRUE);
+ dlia->dl_provider_style != DL_STYLE2, B_TRUE);
mutex_enter(&ill->ill_lock);
ASSERT(ill->ill_dlpi_style_set == 0);
ill->ill_dlpi_style_set = 1;
@@ -5397,8 +5475,13 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
/*
* Free ill_resolver_mp and ill_bcast_mp as things could have
* changed now.
+ *
+ * NOTE: The IPMP meta-interface is special-cased because it starts
+ * with no underlying interfaces (and thus an unknown broadcast
+ * address length), but we enforce that an interface is broadcast-
+ * capable as part of allowing it to join a group.
*/
- if (ill->ill_bcast_addr_length == 0) {
+ if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) {
if (ill->ill_resolver_mp != NULL)
freemsg(ill->ill_resolver_mp);
if (ill->ill_bcast_mp != NULL)
@@ -5451,6 +5534,11 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
if (!ill->ill_isv6)
ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
}
+
+ /* For IPMP, PHYI_IPMP should already be set by ipif_allocate() */
+ if (ill->ill_mactype == SUNW_DL_IPMP)
+ ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP);
+
/* By default an interface does not support any CoS marking */
ill->ill_flags &= ~ILLF_COS_ENABLED;
@@ -5552,16 +5640,18 @@ ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
}
/*
- * Find any non-virtual, not condemned, and up multicast capable interface
- * given an IP instance and zoneid. Order of preference is:
+ * Find a multicast-capable ipif given an IP instance and zoneid.
+ * The ipif must be up, and its ill must be multicast-capable, not
+ * condemned, not an underlying interface in an IPMP group, and
+ * not a VNI interface. Order of preference:
*
- * 1. normal
- * 1.1 normal, but deprecated
- * 2. point to point
- * 2.1 point to point, but deprecated
- * 3. link local
- * 3.1 link local, but deprecated
- * 4. loopback.
+ * 1a. normal
+ * 1b. normal, but deprecated
+ * 2a. point to point
+ * 2b. point to point, but deprecated
+ * 3a. link local
+ * 3b. link local, but deprecated
+ * 4. loopback.
*/
ipif_t *
ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
@@ -5580,7 +5670,7 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
mutex_enter(&ill->ill_lock);
- if (IS_VNI(ill) || !ILL_CAN_LOOKUP(ill) ||
+ if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || !ILL_CAN_LOOKUP(ill) ||
!(ill->ill_flags & ILLF_MULTICAST)) {
mutex_exit(&ill->ill_lock);
continue;
@@ -5736,10 +5826,12 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp,
} else if (IPIF_CAN_WAIT(ipif, q)) {
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ill->ill_lock);
rw_exit(&ipst->ips_ill_g_lock);
ipsq_enq(ipsq, q, mp, func, NEW_OP,
ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
RELEASE_CONN_LOCK(q);
if (error != NULL)
@@ -5761,15 +5853,12 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp,
}
/*
- * Look for an ipif with the specified address. For point-point links
- * we look for matches on either the destination address and the local
- * address, but we ignore the check on the local address if IPIF_UNNUMBERED
- * is set.
- * Matches on a specific ill if match_ill is set.
+ * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact().
*/
-ipif_t *
-ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q,
- mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
+static ipif_t *
+ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, boolean_t match_illgrp,
+ zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error,
+ ip_stack_t *ipst)
{
ipif_t *ipif;
ill_t *ill;
@@ -5788,7 +5877,8 @@ ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q,
repeat:
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if (match_ill != NULL && ill != match_ill) {
+ if (match_ill != NULL && ill != match_ill &&
+ (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
continue;
}
GRAB_CONN_LOCK(q);
@@ -5817,10 +5907,12 @@ repeat:
} else if (IPIF_CAN_WAIT(ipif, q)) {
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ill->ill_lock);
rw_exit(&ipst->ips_ill_g_lock);
ipsq_enq(ipsq, q, mp, func, NEW_OP,
ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
RELEASE_CONN_LOCK(q);
if (error != NULL)
@@ -5894,11 +5986,40 @@ ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
}
/*
+ * Lookup an ipif with the specified address. For point-to-point links we
+ * look for matches on either the destination address or the local address,
+ * but we skip the local address check if IPIF_UNNUMBERED is set. If the
+ * `match_ill' argument is non-NULL, the lookup is restricted to that ill
+ * (or illgrp if `match_ill' is in an IPMP group).
+ */
+ipif_t *
+ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q,
+ mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
+{
+ return (ipif_lookup_addr_common(addr, match_ill, B_TRUE, zoneid, q, mp,
+ func, error, ipst));
+}
+
+/*
+ * Special abbreviated version of ipif_lookup_addr() that doesn't match
+ * `match_ill' across the IPMP group. This function is only needed in some
+ * corner-cases; almost everything should use ipif_lookup_addr().
+ */
+static ipif_t *
+ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
+{
+ ASSERT(match_ill != NULL);
+ return (ipif_lookup_addr_common(addr, match_ill, B_FALSE, ALL_ZONES,
+ NULL, NULL, NULL, NULL, ipst));
+}
+
+/*
* Look for an ipif with the specified address. For point-point links
* we look for matches on either the destination address and the local
* address, but we ignore the check on the local address if IPIF_UNNUMBERED
* is set.
- * Matches on a specific ill if match_ill is set.
+ * If the `match_ill' argument is non-NULL, the lookup is restricted to that
+ * ill (or illgrp if `match_ill' is in an IPMP group).
* Return the zoneid for the ipif which matches. ALL_ZONES if no match.
*/
zoneid_t
@@ -5918,7 +6039,8 @@ ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
repeat:
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if (match_ill != NULL && ill != match_ill) {
+ if (match_ill != NULL && ill != match_ill &&
+ !IS_IN_SAME_ILLGRP(ill, match_ill)) {
continue;
}
mutex_enter(&ill->ill_lock);
@@ -6008,7 +6130,7 @@ ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
/*
* The callers of this function wants to know the
* interface on which they have to send the replies
- * back. For IRE_CACHES that have ire_stq and ire_ipif
+ * back. For IREs that have ire_stq and ire_ipif
* derived from different ills, we really don't care
* what we return here.
*/
@@ -6109,30 +6231,6 @@ ipif_is_freeable(ipif_t *ipif)
}
/*
- * This func does not prevent refcnt from increasing. But if
- * the caller has taken steps to that effect, then this func
- * can be used to determine whether the ipifs marked with IPIF_MOVING
- * have become quiescent and can be moved in a failover/failback.
- */
-static ipif_t *
-ill_quiescent_to_move(ill_t *ill)
-{
- ipif_t *ipif;
-
- ASSERT(MUTEX_HELD(&ill->ill_lock));
-
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if (ipif->ipif_state_flags & IPIF_MOVING) {
- if (ipif->ipif_refcnt != 0 ||
- !IPIF_DOWN_OK(ipif)) {
- return (ipif);
- }
- }
- }
- return (NULL);
-}
-
-/*
* The ipif/ill/ire has been refreled. Do the tail processing.
* Determine if the ipif or ill in question has become quiescent and if so
* wakeup close and/or restart any queued pending ioctl that is waiting
@@ -6144,87 +6242,61 @@ ipif_ill_refrele_tail(ill_t *ill)
mblk_t *mp;
conn_t *connp;
ipsq_t *ipsq;
+ ipxop_t *ipx;
ipif_t *ipif;
dl_notify_ind_t *dlindp;
ASSERT(MUTEX_HELD(&ill->ill_lock));
- if ((ill->ill_state_flags & ILL_CONDEMNED) &&
- ill_is_freeable(ill)) {
- /* ill_close may be waiting */
+ if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) {
+ /* ip_modclose() may be waiting */
cv_broadcast(&ill->ill_cv);
}
- /* ipsq can't change because ill_lock is held */
ipsq = ill->ill_phyint->phyint_ipsq;
- if (ipsq->ipsq_waitfor == 0) {
- /* Not waiting for anything, just return. */
- mutex_exit(&ill->ill_lock);
- return;
- }
- ASSERT(ipsq->ipsq_pending_mp != NULL &&
- ipsq->ipsq_pending_ipif != NULL);
- /*
- * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF.
- * Last ipif going down needs to down the ill, so ill_ire_cnt must
- * be zero for restarting an ioctl that ends up downing the ill.
- */
- ipif = ipsq->ipsq_pending_ipif;
- if (ipif->ipif_ill != ill) {
- /* The ioctl is pending on some other ill. */
- mutex_exit(&ill->ill_lock);
- return;
- }
+ mutex_enter(&ipsq->ipsq_lock);
+ ipx = ipsq->ipsq_xop;
+ mutex_enter(&ipx->ipx_lock);
+ if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */
+ goto unlock;
+
+ ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL);
+
+ ipif = ipx->ipx_pending_ipif;
+ if (ipif->ipif_ill != ill) /* wait is for another ill; bail */
+ goto unlock;
- switch (ipsq->ipsq_waitfor) {
+ switch (ipx->ipx_waitfor) {
case IPIF_DOWN:
- if (!ipif_is_quiescent(ipif)) {
- mutex_exit(&ill->ill_lock);
- return;
- }
+ if (!ipif_is_quiescent(ipif))
+ goto unlock;
break;
case IPIF_FREE:
- if (!ipif_is_freeable(ipif)) {
- mutex_exit(&ill->ill_lock);
- return;
- }
+ if (!ipif_is_freeable(ipif))
+ goto unlock;
break;
-
case ILL_DOWN:
- if (!ill_is_quiescent(ill)) {
- mutex_exit(&ill->ill_lock);
- return;
- }
+ if (!ill_is_quiescent(ill))
+ goto unlock;
break;
case ILL_FREE:
/*
- * case ILL_FREE arises only for loopback. otherwise ill_delete
- * waits synchronously in ip_close, and no message is queued in
- * ipsq_pending_mp at all in this case
+ * ILL_FREE is only for loopback; normal ill teardown waits
+ * synchronously in ip_modclose() without using ipx_waitfor,
+ * handled by the cv_broadcast() at the top of this function.
*/
- if (!ill_is_freeable(ill)) {
- mutex_exit(&ill->ill_lock);
- return;
- }
- break;
-
- case ILL_MOVE_OK:
- if (ill_quiescent_to_move(ill) != NULL) {
- mutex_exit(&ill->ill_lock);
- return;
- }
+ if (!ill_is_freeable(ill))
+ goto unlock;
break;
default:
- cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n",
- (void *)ipsq, ipsq->ipsq_waitfor);
+ cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n",
+ (void *)ipsq, ipx->ipx_waitfor);
}
- /*
- * Incr refcnt for the qwriter_ip call below which
- * does a refrele
- */
- ill_refhold_locked(ill);
+ ill_refhold_locked(ill); /* for qwriter_ip() call below */
+ mutex_exit(&ipx->ipx_lock);
mp = ipsq_pending_mp_get(ipsq, &connp);
+ mutex_exit(&ipsq->ipsq_lock);
mutex_exit(&ill->ill_lock);
ASSERT(mp != NULL);
@@ -6249,6 +6321,7 @@ ipif_ill_refrele_tail(ill_t *ill)
return;
default:
ASSERT(0);
+ ill_refrele(ill);
}
break;
@@ -6268,6 +6341,11 @@ ipif_ill_refrele_tail(ill_t *ill)
cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
"db_type %d\n", (void *)mp, mp->b_datap->db_type);
}
+ return;
+unlock:
+ mutex_exit(&ipsq->ipsq_lock);
+ mutex_exit(&ipx->ipx_lock);
+ mutex_exit(&ill->ill_lock);
}
#ifdef DEBUG
@@ -6902,10 +6980,23 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
ipif = ipif_arg;
if (ipif_arg != NULL)
match_flags |= MATCH_IRE_ILL;
+again:
gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL,
ALL_ZONES, 0, NULL, match_flags, ipst);
- if (gw_ire == NULL)
+ if (gw_ire == NULL) {
+ /*
+ * With IPMP, we allow host routes to influence in.mpathd's
+ * target selection. However, if the test addresses are on
+ * their own network, the above lookup will fail since the
+ * underlying IRE_INTERFACEs are marked hidden. So allow
+ * hidden test IREs to be found and try again.
+ */
+ if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) {
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
+ goto again;
+ }
return (ENETUNREACH);
+ }
/*
* We create one of three types of IREs as a result of this request
@@ -7355,9 +7446,11 @@ void
ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
ill_t *pending_ill)
{
- conn_t *connp = NULL;
+ conn_t *connp;
+ ipxop_t *ipx = ipsq->ipsq_xop;
ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
+ ASSERT(MUTEX_HELD(&ipx->ipx_lock));
ASSERT(func != NULL);
mp->b_queue = q;
@@ -7366,14 +7459,14 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
switch (type) {
case CUR_OP:
- if (ipsq->ipsq_mptail != NULL) {
- ASSERT(ipsq->ipsq_mphead != NULL);
- ipsq->ipsq_mptail->b_next = mp;
+ if (ipx->ipx_mptail != NULL) {
+ ASSERT(ipx->ipx_mphead != NULL);
+ ipx->ipx_mptail->b_next = mp;
} else {
- ASSERT(ipsq->ipsq_mphead == NULL);
- ipsq->ipsq_mphead = mp;
+ ASSERT(ipx->ipx_mphead == NULL);
+ ipx->ipx_mphead = mp;
}
- ipsq->ipsq_mptail = mp;
+ ipx->ipx_mptail = mp;
break;
case NEW_OP:
@@ -7385,6 +7478,15 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
ipsq->ipsq_xopq_mphead = mp;
}
ipsq->ipsq_xopq_mptail = mp;
+ ipx->ipx_ipsq_queued = B_TRUE;
+ break;
+
+ case SWITCH_OP:
+ ASSERT(ipsq->ipsq_swxop != NULL);
+ /* only one switch operation is currently allowed */
+ ASSERT(ipsq->ipsq_switch_mp == NULL);
+ ipsq->ipsq_switch_mp = mp;
+ ipx->ipx_ipsq_queued = B_TRUE;
break;
default:
cmn_err(CE_PANIC, "ipsq_enq %d type \n", type);
@@ -7392,55 +7494,273 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
if (CONN_Q(q) && pending_ill != NULL) {
connp = Q_TO_CONN(q);
-
ASSERT(MUTEX_HELD(&connp->conn_lock));
connp->conn_oper_pending_ill = pending_ill;
}
}
/*
- * Return the mp at the head of the ipsq. After emptying the ipsq
- * look at the next ioctl, if this ioctl is complete. Otherwise
- * return, we will resume when we complete the current ioctl.
- * The current ioctl will wait till it gets a response from the
- * driver below.
+ * Dequeue the next message that requested exclusive access to this IPSQ's
+ * xop. Specifically:
+ *
+ * 1. If we're still processing the current operation on `ipsq', then
+ * dequeue the next message for the operation (from ipx_mphead), or
+ * return NULL if there are no queued messages for the operation.
+ * These messages are queued via CUR_OP to qwriter_ip() and friends.
+ *
+ * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is
+ * not set) see if the ipsq has requested an xop switch. If so, switch
+ * `ipsq' to a different xop. Xop switches only happen when joining or
+ * leaving IPMP groups and require a careful dance -- see the comments
+ * in-line below for details. If we're leaving a group xop or if we're
+ * joining a group xop and become writer on it, then we proceed to (3).
+ * Otherwise, we return NULL and exit the xop.
+ *
+ * 3. For each IPSQ in the xop, return any switch operation stored on
+ * ipsq_switch_mp (set via SWITCH_OP); these must be processed before
+ * any other messages queued on the IPSQ. Otherwise, dequeue the next
+ * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead.
+ * Note that if the phyint tied to `ipsq' is not using IPMP there will
+ * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for
+ * each phyint in the group, including the IPMP meta-interface phyint.
*/
static mblk_t *
ipsq_dq(ipsq_t *ipsq)
{
+ ill_t *illv4, *illv6;
mblk_t *mp;
+ ipsq_t *xopipsq;
+ ipsq_t *leftipsq = NULL;
+ ipxop_t *ipx;
+ phyint_t *phyi = ipsq->ipsq_phyint;
+ ip_stack_t *ipst = ipsq->ipsq_ipst;
+ boolean_t emptied = B_FALSE;
- ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
+ /*
+ * Grab all the locks we need in the defined order (ill_g_lock ->
+ * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next.
+ */
+ rw_enter(&ipst->ips_ill_g_lock,
+ ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER);
+ mutex_enter(&ipsq->ipsq_lock);
+ ipx = ipsq->ipsq_xop;
+ mutex_enter(&ipx->ipx_lock);
- mp = ipsq->ipsq_mphead;
- if (mp != NULL) {
- ipsq->ipsq_mphead = mp->b_next;
- if (ipsq->ipsq_mphead == NULL)
- ipsq->ipsq_mptail = NULL;
- mp->b_next = NULL;
- return (mp);
+ /*
+ * Dequeue the next message associated with the current exclusive
+ * operation, if any.
+ */
+ if ((mp = ipx->ipx_mphead) != NULL) {
+ ipx->ipx_mphead = mp->b_next;
+ if (ipx->ipx_mphead == NULL)
+ ipx->ipx_mptail = NULL;
+ mp->b_next = (void *)ipsq;
+ goto out;
}
- if (ipsq->ipsq_current_ipif != NULL)
- return (NULL);
- mp = ipsq->ipsq_xopq_mphead;
- if (mp != NULL) {
- ipsq->ipsq_xopq_mphead = mp->b_next;
- if (ipsq->ipsq_xopq_mphead == NULL)
- ipsq->ipsq_xopq_mptail = NULL;
- mp->b_next = NULL;
- return (mp);
+
+ if (ipx->ipx_current_ipif != NULL)
+ goto empty;
+
+ if (ipsq->ipsq_swxop != NULL) {
+ /*
+ * The exclusive operation that is now being completed has
+ * requested a switch to a different xop. This happens
+ * when an interface joins or leaves an IPMP group. Joins
+ * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()).
+ * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb
+ * (phyint_free()), or interface plumb for an ill type
+ * not in the IPMP group (ip_rput_dlpi_writer()).
+ *
+ * Xop switches are not allowed on the IPMP meta-interface.
+ */
+ ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP));
+ ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
+ DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq);
+
+ if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) {
+ /*
+ * We're switching back to our own xop, so we have two
+ * xop's to drain/exit: our own, and the group xop
+ * that we are leaving.
+ *
+ * First, pull ourselves out of the group ipsq list.
+ * This is safe since we're writer on ill_g_lock.
+ */
+ ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop);
+
+ xopipsq = ipx->ipx_ipsq;
+ while (xopipsq->ipsq_next != ipsq)
+ xopipsq = xopipsq->ipsq_next;
+
+ xopipsq->ipsq_next = ipsq->ipsq_next;
+ ipsq->ipsq_next = ipsq;
+ ipsq->ipsq_xop = ipsq->ipsq_swxop;
+ ipsq->ipsq_swxop = NULL;
+
+ /*
+ * Second, prepare to exit the group xop. The actual
+ * ipsq_exit() is done at the end of this function
+ * since we cannot hold any locks across ipsq_exit().
+ * Note that although we drop the group's ipx_lock, no
+ * threads can proceed since we're still ipx_writer.
+ */
+ leftipsq = xopipsq;
+ mutex_exit(&ipx->ipx_lock);
+
+ /*
+ * Third, set ipx to point to our own xop (which was
+ * inactive and therefore can be entered).
+ */
+ ipx = ipsq->ipsq_xop;
+ mutex_enter(&ipx->ipx_lock);
+ ASSERT(ipx->ipx_writer == NULL);
+ ASSERT(ipx->ipx_current_ipif == NULL);
+ } else {
+ /*
+ * We're switching from our own xop to a group xop.
+ * The requestor of the switch must ensure that the
+ * group xop cannot go away (e.g. by ensuring the
+ * phyint associated with the xop cannot go away).
+ *
+ * If we can become writer on our new xop, then we'll
+ * do the drain. Otherwise, the current writer of our
+ * new xop will do the drain when it exits.
+ *
+ * First, splice ourselves into the group IPSQ list.
+ * This is safe since we're writer on ill_g_lock.
+ */
+ ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
+
+ xopipsq = ipsq->ipsq_swxop->ipx_ipsq;
+ while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq)
+ xopipsq = xopipsq->ipsq_next;
+
+ xopipsq->ipsq_next = ipsq;
+ ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq;
+ ipsq->ipsq_xop = ipsq->ipsq_swxop;
+ ipsq->ipsq_swxop = NULL;
+
+ /*
+ * Second, exit our own xop, since it's now unused.
+ * This is safe since we've got the only reference.
+ */
+ ASSERT(ipx->ipx_writer == curthread);
+ ipx->ipx_writer = NULL;
+ VERIFY(--ipx->ipx_reentry_cnt == 0);
+ ipx->ipx_ipsq_queued = B_FALSE;
+ mutex_exit(&ipx->ipx_lock);
+
+ /*
+ * Third, set ipx to point to our new xop, and check
+ * if we can become writer on it. If we cannot, then
+ * the current writer will drain the IPSQ group when
+ * it exits. Our ipsq_xop is guaranteed to be stable
+ * because we're still holding ipsq_lock.
+ */
+ ipx = ipsq->ipsq_xop;
+ mutex_enter(&ipx->ipx_lock);
+ if (ipx->ipx_writer != NULL ||
+ ipx->ipx_current_ipif != NULL) {
+ goto out;
+ }
+ }
+
+ /*
+ * Fourth, become writer on our new ipx before we continue
+ * with the drain. Note that we never dropped ipsq_lock
+ * above, so no other thread could've raced with us to
+ * become writer first. Also, we're holding ipx_lock, so
+ * no other thread can examine the ipx right now.
+ */
+ ASSERT(ipx->ipx_current_ipif == NULL);
+ ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
+ VERIFY(ipx->ipx_reentry_cnt++ == 0);
+ ipx->ipx_writer = curthread;
+ ipx->ipx_forced = B_FALSE;
+#ifdef DEBUG
+ ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
+#endif
}
- return (NULL);
+
+ xopipsq = ipsq;
+ do {
+ /*
+ * So that other operations operate on a consistent and
+ * complete phyint, a switch message on an IPSQ must be
+ * handled prior to any other operations on that IPSQ.
+ */
+ if ((mp = xopipsq->ipsq_switch_mp) != NULL) {
+ xopipsq->ipsq_switch_mp = NULL;
+ ASSERT(mp->b_next == NULL);
+ mp->b_next = (void *)xopipsq;
+ goto out;
+ }
+
+ if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) {
+ xopipsq->ipsq_xopq_mphead = mp->b_next;
+ if (xopipsq->ipsq_xopq_mphead == NULL)
+ xopipsq->ipsq_xopq_mptail = NULL;
+ mp->b_next = (void *)xopipsq;
+ goto out;
+ }
+ } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
+empty:
+ /*
+ * There are no messages. Further, we are holding ipx_lock, hence no
+ * new messages can end up on any IPSQ in the xop.
+ */
+ ipx->ipx_writer = NULL;
+ ipx->ipx_forced = B_FALSE;
+ VERIFY(--ipx->ipx_reentry_cnt == 0);
+ ipx->ipx_ipsq_queued = B_FALSE;
+ emptied = B_TRUE;
+#ifdef DEBUG
+ ipx->ipx_depth = 0;
+#endif
+out:
+ mutex_exit(&ipx->ipx_lock);
+ mutex_exit(&ipsq->ipsq_lock);
+
+ /*
+ * If we completely emptied the xop, then wake up any threads waiting
+ * to enter any of the IPSQ's associated with it.
+ */
+ if (emptied) {
+ xopipsq = ipsq;
+ do {
+ if ((phyi = xopipsq->ipsq_phyint) == NULL)
+ continue;
+
+ illv4 = phyi->phyint_illv4;
+ illv6 = phyi->phyint_illv6;
+
+ GRAB_ILL_LOCKS(illv4, illv6);
+ if (illv4 != NULL)
+ cv_broadcast(&illv4->ill_cv);
+ if (illv6 != NULL)
+ cv_broadcast(&illv6->ill_cv);
+ RELEASE_ILL_LOCKS(illv4, illv6);
+ } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+
+ /*
+ * Now that all locks are dropped, exit the IPSQ we left.
+ */
+ if (leftipsq != NULL)
+ ipsq_exit(leftipsq);
+
+ return (mp);
}
/*
* Enter the ipsq corresponding to ill, by waiting synchronously till
* we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
* will have to drain completely before ipsq_enter returns success.
- * ipsq_current_ipif will be set if some exclusive ioctl is in progress,
- * and the ipsq_exit logic will start the next enqueued ioctl after
- * completion of the current ioctl. If 'force' is used, we don't wait
- * for the enqueued ioctls. This is needed when a conn_close wants to
+ * ipx_current_ipif will be set if some exclusive op is in progress,
+ * and the ipsq_exit logic will start the next enqueued op after
+ * completion of the current op. If 'force' is used, we don't wait
+ * for the enqueued ops. This is needed when a conn_close wants to
* enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
* of an ill can also use this option. But we dont' use it currently.
*/
@@ -7449,13 +7769,16 @@ boolean_t
ipsq_enter(ill_t *ill, boolean_t force, int type)
{
ipsq_t *ipsq;
+ ipxop_t *ipx;
boolean_t waited_enough = B_FALSE;
/*
- * Holding the ill_lock prevents <ill-ipsq> assocs from changing.
- * Since the <ill-ipsq> assocs could change while we wait for the
- * writer, it is easier to wait on a fixed global rather than try to
- * cv_wait on a changing ipsq.
+ * Note that the relationship between ill and ipsq is fixed as long as
+ * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the
+ * relationship between the IPSQ and xop cannot change. However,
+ * since we cannot hold ipsq_lock across the cv_wait(), it may change
+ * while we're waiting. We wait on ill_cv and rely on ipsq_exit()
+ * waking up all ills in the xop when it becomes available.
*/
mutex_enter(&ill->ill_lock);
for (;;) {
@@ -7466,34 +7789,35 @@ ipsq_enter(ill_t *ill, boolean_t force, int type)
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
- if (ipsq->ipsq_writer == NULL &&
- (type == CUR_OP || ipsq->ipsq_current_ipif == NULL ||
- waited_enough)) {
+ ipx = ipsq->ipsq_xop;
+ mutex_enter(&ipx->ipx_lock);
+
+ if (ipx->ipx_writer == NULL && (type == CUR_OP ||
+ ipx->ipx_current_ipif == NULL || waited_enough))
break;
- } else if (ipsq->ipsq_writer != NULL) {
+
+ if (!force || ipx->ipx_writer != NULL) {
+ mutex_exit(&ipx->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
cv_wait(&ill->ill_cv, &ill->ill_lock);
} else {
+ mutex_exit(&ipx->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
- if (force) {
- (void) cv_timedwait(&ill->ill_cv,
- &ill->ill_lock,
- lbolt + ENTER_SQ_WAIT_TICKS);
- waited_enough = B_TRUE;
- continue;
- } else {
- cv_wait(&ill->ill_cv, &ill->ill_lock);
- }
+ (void) cv_timedwait(&ill->ill_cv,
+ &ill->ill_lock, lbolt + ENTER_SQ_WAIT_TICKS);
+ waited_enough = B_TRUE;
}
}
- ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL);
- ASSERT(ipsq->ipsq_reentry_cnt == 0);
- ipsq->ipsq_writer = curthread;
- ipsq->ipsq_reentry_cnt++;
+ ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
+ ASSERT(ipx->ipx_reentry_cnt == 0);
+ ipx->ipx_writer = curthread;
+ ipx->ipx_forced = (ipx->ipx_current_ipif != NULL);
+ ipx->ipx_reentry_cnt++;
#ifdef DEBUG
- ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IPSQ_STACK_DEPTH);
+ ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
#endif
+ mutex_exit(&ipx->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
mutex_exit(&ill->ill_lock);
return (B_TRUE);
@@ -7513,14 +7837,13 @@ ill_perim_exit(ill_t *ill)
/*
* The ipsq_t (ipsq) is the synchronization data structure used to serialize
- * certain critical operations like plumbing (i.e. most set ioctls),
- * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP
- * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per
- * IPMP group. The ipsq serializes exclusive ioctls issued by applications
- * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple
- * threads executing in the ipsq. Responses from the driver pertain to the
- * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated
- * as part of bringing up the interface) and are enqueued in ipsq_mphead.
+ * certain critical operations like plumbing (i.e. most set ioctls), multicast
+ * joins, igmp/mld timers, etc. There is one ipsq per phyint. The ipsq
+ * serializes exclusive ioctls issued by applications on a per ipsq basis in
+ * ipsq_xopq_mphead. It also protects against multiple threads executing in
+ * the ipsq. Responses from the driver pertain to the current ioctl (say a
+ * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing
+ * up the interface) and are enqueued in ipx_mphead.
*
* If a thread does not want to reenter the ipsq when it is already writer,
* it must make sure that the specified reentry point to be called later
@@ -7528,29 +7851,33 @@ ill_perim_exit(ill_t *ill)
* point must never ever try to enter the ipsq again. Otherwise it can lead
* to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
* When the thread that is currently exclusive finishes, it (ipsq_exit)
- * dequeues the requests waiting to become exclusive in ipsq_mphead and calls
- * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit
+ * dequeues the requests waiting to become exclusive in ipx_mphead and calls
+ * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit
* proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
* ioctl if the current ioctl has completed. If the current ioctl is still
* in progress it simply returns. The current ioctl could be waiting for
- * a response from another module (arp_ or the driver or could be waiting for
- * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp
- * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the
+ * a response from another module (arp or the driver) or could be waiting for
+ * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp
+ * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the
* execution of the ioctl and ipsq_exit does not start the next ioctl unless
- * ipsq_current_ipif is clear which happens only on ioctl completion.
+ * ipx_current_ipif is NULL which happens only once the ioctl is complete and
+ * all associated DLPI operations have completed.
*/
/*
- * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of
- * ipif or ill can be specified). The caller ensures ipif or ill is valid by
- * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued
- * completion.
+ * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif'
+ * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ
+ * on success, or NULL on failure. The caller ensures ipif/ill is valid by
+ * refholding it as necessary. If the IPSQ cannot be entered and `func' is
+ * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ
+ * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
*/
ipsq_t *
ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
ipsq_func_t func, int type, boolean_t reentry_ok)
{
ipsq_t *ipsq;
+ ipxop_t *ipx;
/* Only 1 of ipif or ill can be specified */
ASSERT((ipif != NULL) ^ (ill != NULL));
@@ -7558,13 +7885,15 @@ ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
ill = ipif->ipif_ill;
/*
- * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock
- * ipsq of an ill can't change when ill_lock is held.
+ * lock ordering: conn_lock -> ill_lock -> ipsq_lock -> ipx_lock.
+ * ipx of an ipsq can't change when ipsq_lock is held.
*/
GRAB_CONN_LOCK(q);
mutex_enter(&ill->ill_lock);
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ ipx = ipsq->ipsq_xop;
+ mutex_enter(&ipx->ipx_lock);
/*
* 1. Enter the ipsq if we are already writer and reentry is ok.
@@ -7572,30 +7901,32 @@ ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
* 'func' nor any of its callees must ever attempt to enter the ipsq
* again. Otherwise it can lead to an infinite loop
* 2. Enter the ipsq if there is no current writer and this attempted
- * entry is part of the current ioctl or operation
+ * entry is part of the current operation
* 3. Enter the ipsq if there is no current writer and this is a new
- * ioctl (or operation) and the ioctl (or operation) queue is
- * empty and there is no ioctl (or operation) currently in progress
+ * operation and the operation queue is empty and there is no
+ * operation currently in progress
*/
- if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) ||
- (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL &&
- ipsq->ipsq_current_ipif == NULL))) ||
- (ipsq->ipsq_writer == curthread && reentry_ok)) {
+ if ((ipx->ipx_writer == curthread && reentry_ok) ||
+ (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP &&
+ !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL)))) {
/* Success. */
- ipsq->ipsq_reentry_cnt++;
- ipsq->ipsq_writer = curthread;
+ ipx->ipx_reentry_cnt++;
+ ipx->ipx_writer = curthread;
+ ipx->ipx_forced = B_FALSE;
+ mutex_exit(&ipx->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
mutex_exit(&ill->ill_lock);
RELEASE_CONN_LOCK(q);
#ifdef DEBUG
- ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack,
- IPSQ_STACK_DEPTH);
+ ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
#endif
return (ipsq);
}
- ipsq_enq(ipsq, q, mp, func, type, ill);
+ if (func != NULL)
+ ipsq_enq(ipsq, q, mp, func, type, ill);
+ mutex_exit(&ipx->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
mutex_exit(&ill->ill_lock);
RELEASE_CONN_LOCK(q);
@@ -7630,188 +7961,58 @@ qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
}
/*
- * If there are more than ILL_GRP_CNT ills in a group,
- * we use kmem alloc'd buffers, else use the stack
- */
-#define ILL_GRP_CNT 14
-/*
- * Drain the ipsq, if there are messages on it, and then leave the ipsq.
- * Called by a thread that is currently exclusive on this ipsq.
+ * Exit the specified IPSQ. If this is the final exit on it then drain it
+ * prior to exiting. Caller must be writer on the specified IPSQ.
*/
void
ipsq_exit(ipsq_t *ipsq)
{
+ mblk_t *mp;
+ ipsq_t *mp_ipsq;
queue_t *q;
- mblk_t *mp;
- ipsq_func_t func;
- int next;
- ill_t **ill_list = NULL;
- size_t ill_list_size = 0;
- int cnt = 0;
- boolean_t need_ipsq_free = B_FALSE;
- ip_stack_t *ipst = ipsq->ipsq_ipst;
+ phyint_t *phyi;
+ ipsq_func_t func;
ASSERT(IAM_WRITER_IPSQ(ipsq));
- mutex_enter(&ipsq->ipsq_lock);
- ASSERT(ipsq->ipsq_reentry_cnt >= 1);
- if (ipsq->ipsq_reentry_cnt != 1) {
- ipsq->ipsq_reentry_cnt--;
- mutex_exit(&ipsq->ipsq_lock);
+
+ ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1);
+ if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) {
+ ipsq->ipsq_xop->ipx_reentry_cnt--;
return;
}
- mp = ipsq_dq(ipsq);
- while (mp != NULL) {
-again:
- mutex_exit(&ipsq->ipsq_lock);
- func = (ipsq_func_t)mp->b_prev;
- q = (queue_t *)mp->b_queue;
- mp->b_prev = NULL;
- mp->b_queue = NULL;
-
- /*
- * If 'q' is an conn queue, it is valid, since we did a
- * a refhold on the connp, at the start of the ioctl.
- * If 'q' is an ill queue, it is valid, since close of an
- * ill will clean up the 'ipsq'.
- */
- (*func)(ipsq, q, mp, NULL);
-
- mutex_enter(&ipsq->ipsq_lock);
+ for (;;) {
+ phyi = ipsq->ipsq_phyint;
mp = ipsq_dq(ipsq);
- }
-
- mutex_exit(&ipsq->ipsq_lock);
-
- /*
- * Need to grab the locks in the right order. Need to
- * atomically check (under ipsq_lock) that there are no
- * messages before relinquishing the ipsq. Also need to
- * atomically wakeup waiters on ill_cv while holding ill_lock.
- * Holding ill_g_lock ensures that ipsq list of ills is stable.
- * If we need to call ill_split_ipsq and change <ill-ipsq> we need
- * to grab ill_g_lock as writer.
- */
- rw_enter(&ipst->ips_ill_g_lock,
- ipsq->ipsq_split ? RW_WRITER : RW_READER);
+ mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next;
- /* ipsq_refs can't change while ill_g_lock is held as reader */
- if (ipsq->ipsq_refs != 0) {
- /* At most 2 ills v4/v6 per phyint */
- cnt = ipsq->ipsq_refs << 1;
- ill_list_size = cnt * sizeof (ill_t *);
/*
- * If memory allocation fails, we will do the split
- * the next time ipsq_exit is called for whatever reason.
- * As long as the ipsq_split flag is set the need to
- * split is remembered.
+ * If we've changed to a new IPSQ, and the phyint associated
+ * with the old one has gone away, free the old IPSQ. Note
+ * that this cannot happen while the IPSQ is in a group.
*/
- ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP);
- if (ill_list != NULL)
- cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt);
- }
- mutex_enter(&ipsq->ipsq_lock);
- mp = ipsq_dq(ipsq);
- if (mp != NULL) {
- /* oops, some message has landed up, we can't get out */
- if (ill_list != NULL)
- ill_unlock_ills(ill_list, cnt);
- rw_exit(&ipst->ips_ill_g_lock);
- if (ill_list != NULL)
- kmem_free(ill_list, ill_list_size);
- ill_list = NULL;
- ill_list_size = 0;
- cnt = 0;
- goto again;
- }
+ if (mp_ipsq != ipsq && phyi == NULL) {
+ ASSERT(ipsq->ipsq_next == ipsq);
+ ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
+ ipsq_delete(ipsq);
+ }
- /*
- * Split only if no ioctl is pending and if memory alloc succeeded
- * above.
- */
- if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL &&
- ill_list != NULL) {
- /*
- * No new ill can join this ipsq since we are holding the
- * ill_g_lock. Hence ill_split_ipsq can safely traverse the
- * ipsq. ill_split_ipsq may fail due to memory shortage.
- * If so we will retry on the next ipsq_exit.
- */
- ipsq->ipsq_split = ill_split_ipsq(ipsq);
- }
+ if (mp == NULL)
+ break;
- /*
- * We are holding the ipsq lock, hence no new messages can
- * land up on the ipsq, and there are no messages currently.
- * Now safe to get out. Wake up waiters and relinquish ipsq
- * atomically while holding ill locks.
- */
- ipsq->ipsq_writer = NULL;
- ipsq->ipsq_reentry_cnt--;
- ASSERT(ipsq->ipsq_reentry_cnt == 0);
-#ifdef DEBUG
- ipsq->ipsq_depth = 0;
-#endif
- mutex_exit(&ipsq->ipsq_lock);
- /*
- * For IPMP this should wake up all ills in this ipsq.
- * We need to hold the ill_lock while waking up waiters to
- * avoid missed wakeups. But there is no need to acquire all
- * the ill locks and then wakeup. If we have not acquired all
- * the locks (due to memory failure above) ill_signal_ipsq_ills
- * wakes up ills one at a time after getting the right ill_lock
- */
- ill_signal_ipsq_ills(ipsq, ill_list != NULL);
- if (ill_list != NULL)
- ill_unlock_ills(ill_list, cnt);
- if (ipsq->ipsq_refs == 0)
- need_ipsq_free = B_TRUE;
- rw_exit(&ipst->ips_ill_g_lock);
- if (ill_list != 0)
- kmem_free(ill_list, ill_list_size);
+ q = mp->b_queue;
+ func = (ipsq_func_t)mp->b_prev;
+ ipsq = mp_ipsq;
+ mp->b_next = mp->b_prev = NULL;
+ mp->b_queue = NULL;
- if (need_ipsq_free) {
/*
- * Free the ipsq. ipsq_refs can't increase because ipsq can't be
- * looked up. ipsq can be looked up only thru ill or phyint
- * and there are no ills/phyint on this ipsq.
+ * If 'q' is a conn queue, it is valid, since we did a
+ * refhold on the conn at the start of the ioctl.
+ * If 'q' is an ill queue, it is valid, since close of an
+ * ill will clean up its IPSQ.
*/
- ipsq_delete(ipsq);
- }
-
- /*
- * Now that we're outside the IPSQ, start any IGMP/MLD timers. We
- * can't start these inside the IPSQ since e.g. igmp_start_timers() ->
- * untimeout() (inside the IPSQ, waiting for an executing timeout to
- * finish) could deadlock with igmp_timeout_handler() -> ipsq_enter()
- * (executing the timeout, waiting to get inside the IPSQ).
- *
- * However, there is one exception to the above: if this thread *is*
- * the IGMP/MLD timeout handler thread, then we must not start its
- * timer until the current handler is done.
- */
- mutex_enter(&ipst->ips_igmp_timer_lock);
- if (curthread != ipst->ips_igmp_timer_thread) {
- next = ipst->ips_igmp_deferred_next;
- ipst->ips_igmp_deferred_next = INFINITY;
- mutex_exit(&ipst->ips_igmp_timer_lock);
-
- if (next != INFINITY)
- igmp_start_timers(next, ipst);
- } else {
- mutex_exit(&ipst->ips_igmp_timer_lock);
- }
-
- mutex_enter(&ipst->ips_mld_timer_lock);
- if (curthread != ipst->ips_mld_timer_thread) {
- next = ipst->ips_mld_deferred_next;
- ipst->ips_mld_deferred_next = INFINITY;
- mutex_exit(&ipst->ips_mld_timer_lock);
-
- if (next != INFINITY)
- mld_start_timers(next, ipst);
- } else {
- mutex_exit(&ipst->ips_mld_timer_lock);
+ (*func)(ipsq, q, mp, NULL);
}
}
@@ -7822,15 +8023,17 @@ again:
void
ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
{
+ ipxop_t *ipx = ipsq->ipsq_xop;
+
ASSERT(IAM_WRITER_IPSQ(ipsq));
+ ASSERT(ipx->ipx_current_ipif == NULL);
+ ASSERT(ipx->ipx_current_ioctl == 0);
- mutex_enter(&ipsq->ipsq_lock);
- ASSERT(ipsq->ipsq_current_ipif == NULL);
- ASSERT(ipsq->ipsq_current_ioctl == 0);
- ipsq->ipsq_current_done = B_FALSE;
- ipsq->ipsq_current_ipif = ipif;
- ipsq->ipsq_current_ioctl = ioccmd;
- mutex_exit(&ipsq->ipsq_lock);
+ ipx->ipx_current_done = B_FALSE;
+ ipx->ipx_current_ioctl = ioccmd;
+ mutex_enter(&ipx->ipx_lock);
+ ipx->ipx_current_ipif = ipif;
+ mutex_exit(&ipx->ipx_lock);
}
/*
@@ -7844,17 +8047,18 @@ ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
void
ipsq_current_finish(ipsq_t *ipsq)
{
- ipif_t *ipif = ipsq->ipsq_current_ipif;
+ ipxop_t *ipx = ipsq->ipsq_xop;
t_uscalar_t dlpi_pending = DL_PRIM_INVAL;
+ ipif_t *ipif = ipx->ipx_current_ipif;
ASSERT(IAM_WRITER_IPSQ(ipsq));
/*
- * For SIOCSLIFREMOVEIF, the ipif has been already been blown away
+ * For SIOCLIFREMOVEIF, the ipif has already been blown away
* (but in that case, IPIF_CHANGING will already be clear and no
* pending DLPI messages can remain).
*/
- if (ipsq->ipsq_current_ioctl != SIOCLIFREMOVEIF) {
+ if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) {
ill_t *ill = ipif->ipif_ill;
mutex_enter(&ill->ill_lock);
@@ -7863,12 +8067,14 @@ ipsq_current_finish(ipsq_t *ipsq)
mutex_exit(&ill->ill_lock);
}
- mutex_enter(&ipsq->ipsq_lock);
- ipsq->ipsq_current_ioctl = 0;
- ipsq->ipsq_current_done = B_TRUE;
- if (dlpi_pending == DL_PRIM_INVAL)
- ipsq->ipsq_current_ipif = NULL;
- mutex_exit(&ipsq->ipsq_lock);
+ ASSERT(!ipx->ipx_current_done);
+ ipx->ipx_current_done = B_TRUE;
+ ipx->ipx_current_ioctl = 0;
+ if (dlpi_pending == DL_PRIM_INVAL) {
+ mutex_enter(&ipx->ipx_lock);
+ ipx->ipx_current_ipif = NULL;
+ mutex_exit(&ipx->ipx_lock);
+ }
}
/*
@@ -7884,123 +8090,38 @@ ipsq_flush(ill_t *ill)
mblk_t *prev;
mblk_t *mp;
mblk_t *mp_next;
- ipsq_t *ipsq;
+ ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
ASSERT(IAM_WRITER_ILL(ill));
- ipsq = ill->ill_phyint->phyint_ipsq;
+
/*
* Flush any messages sent up by the driver.
*/
- mutex_enter(&ipsq->ipsq_lock);
- for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) {
+ mutex_enter(&ipx->ipx_lock);
+ for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) {
mp_next = mp->b_next;
q = mp->b_queue;
if (q == ill->ill_rq || q == ill->ill_wq) {
- /* Remove the mp from the ipsq */
+ /* dequeue mp */
if (prev == NULL)
- ipsq->ipsq_mphead = mp->b_next;
+ ipx->ipx_mphead = mp->b_next;
else
prev->b_next = mp->b_next;
- if (ipsq->ipsq_mptail == mp) {
+ if (ipx->ipx_mptail == mp) {
ASSERT(mp_next == NULL);
- ipsq->ipsq_mptail = prev;
+ ipx->ipx_mptail = prev;
}
inet_freemsg(mp);
} else {
prev = mp;
}
}
- mutex_exit(&ipsq->ipsq_lock);
+ mutex_exit(&ipx->ipx_lock);
(void) ipsq_pending_mp_cleanup(ill, NULL);
ipsq_xopq_mp_cleanup(ill, NULL);
ill_pending_mp_cleanup(ill);
}
-/* ARGSUSED */
-int
-ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
- ip_ioctl_cmd_t *ipip, void *ifreq)
-{
- ill_t *ill;
- struct lifreq *lifr = (struct lifreq *)ifreq;
- boolean_t isv6;
- conn_t *connp;
- ip_stack_t *ipst;
-
- connp = Q_TO_CONN(q);
- ipst = connp->conn_netstack->netstack_ip;
- isv6 = connp->conn_af_isv6;
- /*
- * Set original index.
- * Failover and failback move logical interfaces
- * from one physical interface to another. The
- * original index indicates the parent of a logical
- * interface, in other words, the physical interface
- * the logical interface will be moved back to on
- * failback.
- */
-
- /*
- * Don't allow the original index to be changed
- * for non-failover addresses, autoconfigured
- * addresses, or IPv6 link local addresses.
- */
- if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != NULL) ||
- (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) {
- return (EINVAL);
- }
- /*
- * The new original index must be in use by some
- * physical interface.
- */
- ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL,
- NULL, NULL, ipst);
- if (ill == NULL)
- return (ENXIO);
- ill_refrele(ill);
-
- ipif->ipif_orig_ifindex = lifr->lifr_index;
- /*
- * When this ipif gets failed back, don't
- * preserve the original id, as it is no
- * longer applicable.
- */
- ipif->ipif_orig_ipifid = 0;
- /*
- * For IPv4, change the original index of any
- * multicast addresses associated with the
- * ipif to the new value.
- */
- if (!isv6) {
- ilm_t *ilm;
-
- mutex_enter(&ipif->ipif_ill->ill_lock);
- for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL;
- ilm = ilm->ilm_next) {
- if (ilm->ilm_ipif == ipif) {
- ilm->ilm_orig_ifindex = lifr->lifr_index;
- }
- }
- mutex_exit(&ipif->ipif_ill->ill_lock);
- }
- return (0);
-}
-
-/* ARGSUSED */
-int
-ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
- ip_ioctl_cmd_t *ipip, void *ifreq)
-{
- struct lifreq *lifr = (struct lifreq *)ifreq;
-
- /*
- * Get the original interface index i.e the one
- * before FAILOVER if it ever happened.
- */
- lifr->lifr_index = ipif->ipif_orig_ifindex;
- return (0);
-}
-
/*
* Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls,
* refhold and return the associated ipif
@@ -8087,8 +8208,6 @@ int
ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
cmd_info_t *ci, ipsq_func_t func)
{
- sin_t *sin;
- sin6_t *sin6;
char *name;
struct ifreq *ifr;
struct lifreq *lifr;
@@ -8132,9 +8251,8 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
* be trusted.
*/
ifr->ifr_name[IFNAMSIZ - 1] = '\0';
- sin = (sin_t *)&ifr->ifr_addr;
name = ifr->ifr_name;
- ci->ci_sin = sin;
+ ci->ci_sin = (sin_t *)&ifr->ifr_addr;
ci->ci_sin6 = NULL;
ci->ci_lifr = (struct lifreq *)ifr;
} else {
@@ -8148,14 +8266,8 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
*/
lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
name = lifr->lifr_name;
- sin = (sin_t *)&lifr->lifr_addr;
- sin6 = (sin6_t *)&lifr->lifr_addr;
- if (ipip->ipi_cmd == SIOCSLIFGROUPNAME) {
- (void) strncpy(ci->ci_groupname, lifr->lifr_groupname,
- LIFNAMSIZ);
- }
- ci->ci_sin = sin;
- ci->ci_sin6 = sin6;
+ ci->ci_sin = (sin_t *)&lifr->lifr_addr;
+ ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr;
ci->ci_lifr = lifr;
}
@@ -8181,21 +8293,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
if (ipif == NULL) {
if (err == EINPROGRESS)
return (err);
- if (ipip->ipi_cmd == SIOCLIFFAILOVER ||
- ipip->ipi_cmd == SIOCLIFFAILBACK) {
- /*
- * Need to try both v4 and v6 since this
- * ioctl can come down either v4 or v6
- * socket. The lifreq.lifr_family passed
- * down by this ioctl is AF_UNSPEC.
- */
- ipif = ipif_lookup_on_name(name,
- mi_strlen(name), B_FALSE, &exists, !isv6,
- zoneid, (connp == NULL) ? q :
- CONNP_TO_WQ(connp), mp, func, &err, ipst);
- if (err == EINPROGRESS)
- return (err);
- }
err = 0; /* Ensure we don't use it below */
}
}
@@ -8221,15 +8318,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
if (ipif == NULL)
return (ENXIO);
- /*
- * Allow only GET operations if this ipif has been created
- * temporarily due to a MOVE operation.
- */
- if (ipif->ipif_replace_zero && !(ipip->ipi_flags & IPI_REPL)) {
- ipif_refrele(ipif);
- return (EINVAL);
- }
-
ci->ci_ipif = ipif;
return (0);
}
@@ -8247,15 +8335,15 @@ ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V4(&ctx, ipst);
-
- while (ill != NULL) {
+ for (; ill != NULL; ill = ill_next(&ctx, ill)) {
+ if (IS_UNDER_IPMP(ill))
+ continue;
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
if (ipif->ipif_zoneid == zoneid ||
ipif->ipif_zoneid == ALL_ZONES)
numifs++;
}
- ill = ill_next(&ctx, ill);
}
rw_exit(&ipst->ips_ill_g_lock);
return (numifs);
@@ -8283,6 +8371,9 @@ ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst)
ill = ILL_START_WALK_ALL(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
+ if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP))
+ continue;
+
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
if ((ipif->ipif_flags & IPIF_NOXMIT) &&
@@ -8491,6 +8582,8 @@ ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
+ if (IS_UNDER_IPMP(ill))
+ continue;
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
if (zoneid != ipif->ipif_zoneid &&
@@ -8760,6 +8853,9 @@ ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ill_first(list, list, &ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
+ if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP))
+ continue;
+
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
if ((ipif->ipif_flags & IPIF_NOXMIT) &&
@@ -8795,6 +8891,7 @@ ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
ipif_get_name(ipif, lifr->lifr_name,
sizeof (lifr->lifr_name));
+ lifr->lifr_type = ill->ill_type;
if (ipif->ipif_isv6) {
sin6 = (sin6_t *)&lifr->lifr_addr;
*sin6 = sin6_null;
@@ -8828,23 +8925,6 @@ lif_copydone:
return (0);
}
-/* ARGSUSED */
-int
-ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin,
- queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
-{
- ip_stack_t *ipst;
-
- if (q->q_next == NULL)
- ipst = CONNQ_TO_IPST(q);
- else
- ipst = ILLQ_TO_IPST(q);
-
- /* Existence of b_cont->b_cont checked in ip_wput_nondata */
- ipst->ips_ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr;
- return (0);
-}
-
static void
ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
{
@@ -9038,8 +9118,7 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid);
} else {
src_ipif = ipif_select_source_v6(dst_ill,
- daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT,
- zoneid);
+ daddr, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid);
}
if (src_ipif == NULL)
goto next_dst;
@@ -9325,10 +9404,14 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
struct arpreq *ar;
struct xarpreq *xar;
int flags, alength;
- char *lladdr;
- ip_stack_t *ipst;
+ uchar_t *lladdr;
+ ire_t *ire;
+ ip_stack_t *ipst;
ill_t *ill = ipif->ipif_ill;
+ ill_t *proxy_ill = NULL;
+ ipmp_arpent_t *entp = NULL;
boolean_t if_arp_ioctl = B_FALSE;
+ boolean_t proxyarp = B_FALSE;
ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
connp = Q_TO_CONN(q);
@@ -9340,7 +9423,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ar = NULL;
flags = xar->xarp_flags;
- lladdr = LLADDR(&xar->xarp_ha);
+ lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
/*
* Validate against user's link layer address length
@@ -9359,7 +9442,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
xar = NULL;
flags = ar->arp_flags;
- lladdr = ar->arp_ha.sa_data;
+ lladdr = (uchar_t *)ar->arp_ha.sa_data;
/*
* Theoretically, the sa_family could tell us what link
* layer type this operation is trying to deal with. By
@@ -9379,6 +9462,51 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
}
}
+ ipaddr = sin->sin_addr.s_addr;
+
+ /*
+ * IPMP ARP special handling:
+ *
+ * 1. Since ARP mappings must appear consistent across the group,
+ * prohibit changing ARP mappings on the underlying interfaces.
+ *
+ * 2. Since ARP mappings for IPMP data addresses are maintained by
+ * IP itself, prohibit changing them.
+ *
+ * 3. For proxy ARP, use a functioning hardware address in the group,
+ * provided one exists. If one doesn't, just add the entry as-is;
+ * ipmp_illgrp_refresh_arpent() will refresh it if things change.
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP)
+ return (EPERM);
+ }
+ if (IS_IPMP(ill)) {
+ ipmp_illgrp_t *illg = ill->ill_grp;
+
+ switch (ipip->ipi_cmd) {
+ case SIOCSARP:
+ case SIOCSXARP:
+ proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength);
+ if (proxy_ill != NULL) {
+ proxyarp = B_TRUE;
+ if (!ipmp_ill_is_active(proxy_ill))
+ proxy_ill = ipmp_illgrp_next_ill(illg);
+ if (proxy_ill != NULL)
+ lladdr = proxy_ill->ill_phys_addr;
+ }
+ /* FALLTHRU */
+ case SIOCDARP:
+ case SIOCDXARP:
+ ire = ire_ctable_lookup(ipaddr, 0, IRE_LOCAL, NULL,
+ ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
+ if (ire != NULL) {
+ ire_refrele(ire);
+ return (EPERM);
+ }
+ }
+ }
+
/*
* We are going to pass up to ARP a packet chain that looks
* like:
@@ -9400,8 +9528,6 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
return (ENOMEM);
}
- ipaddr = sin->sin_addr.s_addr;
-
mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template,
(caddr_t)&ipaddr);
if (mp2 == NULL) {
@@ -9481,6 +9607,30 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
area->area_flags |= ACE_F_AUTHORITY;
/*
+ * If this is a permanent AR_ENTRY_ADD on the IPMP interface, track it
+ * so that IP can update ARP as the active ills in the group change.
+ */
+ if (IS_IPMP(ill) && area->area_cmd == AR_ENTRY_ADD &&
+ (area->area_flags & ACE_F_PERMANENT)) {
+ entp = ipmp_illgrp_create_arpent(ill->ill_grp, mp2, proxyarp);
+
+ /*
+ * The second part of the conditional below handles a corner
+ * case: if this is proxy ARP and the IPMP group has no active
+ * interfaces, we can't send the request to ARP now since it
+ * won't be able to build an ACE. So we return success and
+ * notify ARP about the proxy ARP entry once an interface
+ * becomes active.
+ */
+ if (entp == NULL || (proxyarp && proxy_ill == NULL)) {
+ mp2->b_cont = NULL;
+ inet_freemsg(mp1);
+ inet_freemsg(pending_mp);
+ return (entp == NULL ? ENOMEM : 0);
+ }
+ }
+
+ /*
* Before sending 'mp' to ARP, we have to clear the b_next
* and b_prev. Otherwise if STREAMS encounters such a message
* in freemsg(), (because ARP can close any time) it can cause
@@ -9497,7 +9647,12 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
mutex_enter(&connp->conn_lock);
mutex_enter(&ill->ill_lock);
/* conn has not yet started closing, hence this can't fail */
- VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0);
+ if (ipip->ipi_flags & IPI_WR) {
+ VERIFY(ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp),
+ pending_mp, 0) != 0);
+ } else {
+ VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0);
+ }
mutex_exit(&ill->ill_lock);
mutex_exit(&connp->conn_lock);
@@ -9506,6 +9661,13 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
* M_IOCACK, and will be handed to ip_sioctl_iocack() for completion.
*/
putnext(ill->ill_rq, mp1);
+
+ /*
+ * If we created an IPMP ARP entry, mark that we've notified ARP.
+ */
+ if (entp != NULL)
+ ipmp_illgrp_mark_arpent(ill->ill_grp, entp);
+
return (EINPROGRESS);
}
@@ -9564,55 +9726,114 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
mp, func, &err, ipst);
if (ipif == NULL)
return (err);
- if (ipif->ipif_id != 0 ||
- ipif->ipif_net_type != IRE_IF_RESOLVER) {
+ if (ipif->ipif_id != 0) {
ipif_refrele(ipif);
return (ENXIO);
}
} else {
/*
- * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with sdl_nlen ==
- * 0: use the IP address to figure out the ill. In the IPMP
- * case, a simple forwarding table lookup will return the
- * IRE_IF_RESOLVER for the first interface in the group, which
- * might not be the interface on which the requested IP
- * address was resolved due to the ill selection algorithm
- * (see ip_newroute_get_dst_ill()). So we do a cache table
- * lookup first: if the IRE cache entry for the IP address is
- * still there, it will contain the ill pointer for the right
- * interface, so we use that. If the cache entry has been
- * flushed, we fall back to the forwarding table lookup. This
- * should be rare enough since IRE cache entries have a longer
- * life expectancy than ARP cache entries.
+ * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen
+ * of 0: use the IP address to find the ipif. If the IP
+ * address is an IPMP test address, ire_ftable_lookup() will
+ * find the wrong ill, so we first do an ipif_lookup_addr().
*/
- ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL,
- ipst);
- if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) ||
- ((ill = ire_to_ill(ire)) == NULL) ||
- (ill->ill_net_type != IRE_IF_RESOLVER)) {
- if (ire != NULL)
- ire_refrele(ire);
- ire = ire_ftable_lookup(sin->sin_addr.s_addr,
- 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0,
- NULL, MATCH_IRE_TYPE, ipst);
+ ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES,
+ CONNP_TO_WQ(connp), mp, func, &err, ipst);
+ if (ipif == NULL) {
+ ire = ire_ftable_lookup(sin->sin_addr.s_addr, 0, 0,
+ IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, NULL,
+ MATCH_IRE_TYPE, ipst);
if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) {
-
if (ire != NULL)
ire_refrele(ire);
return (ENXIO);
}
+ ipif = ill->ill_ipif;
+ ipif_refhold(ipif);
+ ire_refrele(ire);
}
- ASSERT(ire != NULL && ill != NULL);
- ipif = ill->ill_ipif;
- ipif_refhold(ipif);
- ire_refrele(ire);
}
+
+ if (ipif->ipif_net_type != IRE_IF_RESOLVER) {
+ ipif_refrele(ipif);
+ return (ENXIO);
+ }
+
ci->ci_sin = sin;
ci->ci_ipif = ipif;
return (0);
}
/*
+ * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the
+ * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is
+ * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it
+ * up and thus an ill can join that illgrp.
+ *
+ * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than
+ * open()/close() primarily because close() is not allowed to fail or block
+ * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason
+ * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure
+ * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the
+ * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts
+ * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent
+ * state if I_UNLINK didn't occur.
+ *
+ * Note that for each plumb/unplumb operation, we may end up here more than
+ * once because of the way ifconfig works. However, it's OK to link the same
+ * illgrp more than once, or unlink an illgrp that's already unlinked.
+ */
+static int
+ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd)
+{
+ int err;
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ ASSERT(IS_IPMP(ill));
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ switch (ioccmd) {
+ case I_LINK:
+ return (ENOTSUP);
+
+ case I_PLINK:
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp);
+ rw_exit(&ipst->ips_ipmp_lock);
+ break;
+
+ case I_PUNLINK:
+ /*
+ * Require all UP ipifs be brought down prior to unlinking the
+ * illgrp so any associated IREs (and other state) are torched.
+ */
+ if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
+ return (EBUSY);
+
+ /*
+ * NOTE: We hold ipmp_lock across the unlink to prevent a race
+ * with an SIOCSLIFGROUPNAME request from an ill trying to
+ * join this group. Specifically: ills trying to join grab
+ * ipmp_lock and bump a "pending join" counter checked by
+ * ipmp_illgrp_unlink_grp(). During the unlink no new pending
+ * joins can occur (since we have ipmp_lock). Once we drop
+ * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not
+ * find the illgrp (since we unlinked it) and will return
+ * EAFNOSUPPORT. This will then take them back through the
+ * IPMP meta-interface plumbing logic in ifconfig, and thus
+ * back through I_PLINK above.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ err = ipmp_illgrp_unlink_grp(ill->ill_grp);
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (err);
+ default:
+ break;
+ }
+ return (0);
+}
+
+/*
* Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also
* atomically set/clear the muxids. Also complete the ioctl by acking or
* naking it. Note that the code is structured such that the link type,
@@ -9697,7 +9918,7 @@ ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
if (ipsq == NULL) {
ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
- NEW_OP, B_TRUE);
+ NEW_OP, B_FALSE);
if (ipsq == NULL) {
ill_refrele(ill);
return;
@@ -9728,6 +9949,11 @@ ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
err = EINVAL;
goto done;
}
+
+ if (IS_IPMP(ill) &&
+ (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
+ goto done;
+
ill->ill_arp_muxid = islink ? li->l_index : 0;
} else {
/*
@@ -9763,6 +9989,7 @@ static int
ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
struct linkblk *li, boolean_t doconsist)
{
+ int err = 0;
ill_t *ill;
queue_t *ipwq, *dwq;
const char *name;
@@ -9796,7 +10023,7 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
if (ipsq == NULL) {
ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
- NEW_OP, B_TRUE);
+ NEW_OP, B_FALSE);
if (ipsq == NULL)
return (EINPROGRESS);
entered_ipsq = B_TRUE;
@@ -9811,12 +10038,14 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
*/
if ((islink && ill->ill_ip_muxid != 0) ||
(!islink && ill->ill_arp_muxid != 0)) {
- if (entered_ipsq)
- ipsq_exit(ipsq);
- return (EINVAL);
+ err = EINVAL;
+ goto done;
}
}
+ if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
+ goto done;
+
/*
* As part of I_{P}LINKing, stash the number of downstream modules and
* the read queue of the module immediately below IP in the ill.
@@ -9853,11 +10082,11 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
ill_capability_reset(ill, B_FALSE);
}
ipsq_current_finish(ipsq);
-
+done:
if (entered_ipsq)
ipsq_exit(ipsq);
- return (0);
+ return (err);
}
/*
@@ -10124,8 +10353,9 @@ nak:
}
/* ip_wput hands off ARP IOCTL responses to us */
+/* ARGSUSED3 */
void
-ip_sioctl_iocack(queue_t *q, mblk_t *mp)
+ip_sioctl_iocack(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
struct arpreq *ar;
struct xarpreq *xar;
@@ -10136,7 +10366,6 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp)
struct iocblk *orig_iocp;
ill_t *ill;
conn_t *connp = NULL;
- uint_t ioc_id;
mblk_t *pending_mp;
int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE;
int *flagsp;
@@ -10146,6 +10375,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp)
int err;
ip_stack_t *ipst;
+ ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq));
ill = q->q_ptr;
ASSERT(ill != NULL);
ipst = ill->ill_ipst;
@@ -10185,10 +10415,14 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp)
iocp = (struct iocblk *)mp->b_rptr;
/*
- * Pick out the originating queue based on the ioc_id.
+ * Find the pending message; if we're exclusive, it'll be on our IPSQ.
+ * Otherwise, we can find it from our ioc_id.
*/
- ioc_id = iocp->ioc_id;
- pending_mp = ill_pending_mp_get(ill, &connp, ioc_id);
+ if (ipsq != NULL)
+ pending_mp = ipsq_pending_mp_get(ipsq, &connp);
+ else
+ pending_mp = ill_pending_mp_get(ill, &connp, iocp->ioc_id);
+
if (pending_mp == NULL) {
ASSERT(connp == NULL);
inet_freemsg(mp);
@@ -10271,7 +10505,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp)
ire_refrele(ire);
freemsg(mp);
ip_ioctl_finish(q, orig_ioc_mp,
- EINVAL, NO_COPYOUT, NULL);
+ EINVAL, NO_COPYOUT, ipsq);
return;
}
*flagsp |= ATF_COM;
@@ -10297,12 +10531,27 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp)
/* Ditch the internal IOCTL. */
freemsg(mp);
ire_refrele(ire);
- ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL);
+ ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq);
return;
}
}
/*
+ * If this was a failed AR_ENTRY_ADD or a successful AR_ENTRY_DELETE
+ * on the IPMP meta-interface, ensure any ARP entries added in
+ * ip_sioctl_arp() are deleted.
+ */
+ if (IS_IPMP(ill) &&
+ ((iocp->ioc_error != 0 && iocp->ioc_cmd == AR_ENTRY_ADD) ||
+ ((iocp->ioc_error == 0 && iocp->ioc_cmd == AR_ENTRY_DELETE)))) {
+ ipmp_illgrp_t *illg = ill->ill_grp;
+ ipmp_arpent_t *entp;
+
+ if ((entp = ipmp_illgrp_lookup_arpent(illg, &addr)) != NULL)
+ ipmp_illgrp_destroy_arpent(illg, entp);
+ }
+
+ /*
* Delete the corresponding IRE_CACHE if any.
* Reset the error if there was one (in case there was no entry
* in arp.)
@@ -10341,7 +10590,7 @@ errack:
if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) {
err = iocp->ioc_error;
freemsg(mp);
- ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL);
+ ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, ipsq);
return;
}
@@ -10355,7 +10604,7 @@ errack:
sizeof (xar->xarp_ha.sdl_data)) {
freemsg(mp);
ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT,
- NULL);
+ ipsq);
return;
}
}
@@ -10382,7 +10631,7 @@ errack:
/* Ditch the internal IOCTL. */
freemsg(mp);
/* Complete the original. */
- ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL);
+ ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq);
}
/*
@@ -10397,7 +10646,7 @@ errack:
* If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout)
* is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer.
*
- * Executed as a writer on the ill or ill group.
+ * Executed as a writer on the ill.
* So no lock is needed to traverse the ipif chain, or examine the
* phyint flags.
*/
@@ -10423,7 +10672,6 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
boolean_t found_sep = B_FALSE;
conn_t *connp;
zoneid_t zoneid;
- int orig_ifindex = 0;
ip_stack_t *ipst = CONNQ_TO_IPST(q);
ASSERT(q->q_next == NULL);
@@ -10513,61 +10761,10 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
if (ipsq == NULL)
return (EINPROGRESS);
- /*
- * If the interface is failed, inactive or offlined, look for a working
- * interface in the ill group and create the ipif there. If we can't
- * find a good interface, create the ipif anyway so that in.mpathd can
- * move it to the first repaired interface.
- */
- if ((ill->ill_phyint->phyint_flags &
- (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
- ill->ill_phyint->phyint_groupname_len != 0) {
- phyint_t *phyi;
- char *groupname = ill->ill_phyint->phyint_groupname;
-
- /*
- * We're looking for a working interface, but it doesn't matter
- * if it's up or down; so instead of following the group lists,
- * we look at each physical interface and compare the groupname.
- * We're only interested in interfaces with IPv4 (resp. IPv6)
- * plumbed when we're adding an IPv4 (resp. IPv6) ipif.
- * Otherwise we create the ipif on the failed interface.
- */
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- phyi = avl_first(&ipst->ips_phyint_g_list->
- phyint_list_avl_by_index);
- for (; phyi != NULL;
- phyi = avl_walk(&ipst->ips_phyint_g_list->
- phyint_list_avl_by_index,
- phyi, AVL_AFTER)) {
- if (phyi->phyint_groupname_len == 0)
- continue;
- ASSERT(phyi->phyint_groupname != NULL);
- if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 &&
- !(phyi->phyint_flags &
- (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
- (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) :
- (phyi->phyint_illv4 != NULL))) {
- break;
- }
- }
- rw_exit(&ipst->ips_ill_g_lock);
-
- if (phyi != NULL) {
- orig_ifindex = ill->ill_phyint->phyint_ifindex;
- ill = (ill->ill_isv6 ? phyi->phyint_illv6 :
- phyi->phyint_illv4);
- }
- }
-
- /*
- * We are now exclusive on the ipsq, so an ill move will be serialized
- * before or after us.
- */
+ /* We are now exclusive on the IPSQ */
ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(ill->ill_move_in_progress == B_FALSE);
- if (found_sep && orig_ifindex == 0) {
+ if (found_sep) {
/* Now see if there is an IPIF with this unit number. */
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
@@ -10580,14 +10777,11 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
/*
* We use IRE_LOCAL for lo0:1 etc. for "receive only" use
- * of lo0. We never come here when we plumb lo0:0. It
- * happens in ipif_lookup_on_name.
- * The specified unit number is ignored when we create the ipif on a
- * different interface. However, we save it in ipif_orig_ipifid below so
- * that the ipif fails back to the right position.
- */
- if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ?
- id : -1, IRE_LOCAL, B_TRUE)) == NULL) {
+ * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name()
+ * instead.
+ */
+ if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL,
+ B_TRUE, B_TRUE)) == NULL) {
err = ENOBUFS;
goto done;
}
@@ -10604,14 +10798,6 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
&ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr);
}
- /* Set ifindex and unit number for failback */
- if (err == 0 && orig_ifindex != 0) {
- ipif->ipif_orig_ifindex = orig_ifindex;
- if (found_sep) {
- ipif->ipif_orig_ipifid = id;
- }
- }
-
done:
ipsq_exit(ipsq);
return (err);
@@ -10672,7 +10858,6 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ill_delete(ill);
mutex_enter(&connp->conn_lock);
mutex_enter(&ill->ill_lock);
- ASSERT(ill->ill_group == NULL);
/* Are any references to this ill active */
if (ill_is_freeable(ill)) {
@@ -10693,14 +10878,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
}
}
- /*
- * We are exclusive on the ipsq, so an ill move will be serialized
- * before or after us.
- */
- ASSERT(ill->ill_move_in_progress == B_FALSE);
-
if (ipif->ipif_id == 0) {
-
ipsq_t *ipsq;
/* Find based on address */
@@ -10712,35 +10890,15 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
sin6 = (sin6_t *)sin;
/* We are a writer, so we should be able to lookup */
- ipif = ipif_lookup_addr_v6(&sin6->sin6_addr,
- ill, ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
- if (ipif == NULL) {
- /*
- * Maybe the address in on another interface in
- * the same IPMP group? We check this below.
- */
- ipif = ipif_lookup_addr_v6(&sin6->sin6_addr,
- NULL, ALL_ZONES, NULL, NULL, NULL, NULL,
- ipst);
- }
+ ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill,
+ ipst);
} else {
- ipaddr_t addr;
-
if (sin->sin_family != AF_INET)
return (EAFNOSUPPORT);
- addr = sin->sin_addr.s_addr;
/* We are a writer, so we should be able to lookup */
- ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL,
- NULL, NULL, NULL, ipst);
- if (ipif == NULL) {
- /*
- * Maybe the address in on another interface in
- * the same IPMP group? We check this below.
- */
- ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES,
- NULL, NULL, NULL, NULL, ipst);
- }
+ ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill,
+ ipst);
}
if (ipif == NULL) {
return (EADDRNOTAVAIL);
@@ -10750,32 +10908,11 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
* It is possible for a user to send an SIOCLIFREMOVEIF with
* lifr_name of the physical interface but with an ip address
* lifr_addr of a logical interface plumbed over it.
- * So update ipsq_current_ipif once ipif points to the
- * correct interface after doing ipif_lookup_addr().
+ * So update ipx_current_ipif now that ipif points to the
+ * correct one.
*/
ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
- ASSERT(ipsq != NULL);
-
- mutex_enter(&ipsq->ipsq_lock);
- ipsq->ipsq_current_ipif = ipif;
- mutex_exit(&ipsq->ipsq_lock);
-
- /*
- * When the address to be removed is hosted on a different
- * interface, we check if the interface is in the same IPMP
- * group as the specified one; if so we proceed with the
- * removal.
- * ill->ill_group is NULL when the ill is down, so we have to
- * compare the group names instead.
- */
- if (ipif->ipif_ill != ill &&
- (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 ||
- ill->ill_phyint->phyint_groupname_len == 0 ||
- mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname,
- ill->ill_phyint->phyint_groupname) != 0)) {
- ipif_refrele(ipif);
- return (EADDRNOTAVAIL);
- }
+ ipsq->ipsq_xop->ipx_current_ipif = ipif;
/* This is a writer */
ipif_refrele(ipif);
@@ -11072,7 +11209,7 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
if (need_dl_down)
ill_dl_down(ill);
if (need_arp_down)
- ipif_arp_down(ipif);
+ ipif_resolver_down(ipif);
return (err);
}
@@ -11272,9 +11409,9 @@ ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
if (need_dl_down)
ill_dl_down(ill);
-
if (need_arp_down)
- ipif_arp_down(ipif);
+ ipif_resolver_down(ipif);
+
return (err);
}
@@ -11323,144 +11460,8 @@ ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
}
/*
- * part of ipmp, make this func return the active/inactive state and
- * caller can set once atomically instead of multiple mutex_enter/mutex_exit
- */
-/*
- * This function either sets or clears the IFF_INACTIVE flag.
- *
- * As long as there are some addresses or multicast memberships on the
- * IPv4 or IPv6 interface of the "phyi" that does not belong in here, we
- * will consider it to be ACTIVE (clear IFF_INACTIVE) i.e the interface
- * will be used for outbound packets.
- *
- * Caller needs to verify the validity of setting IFF_INACTIVE.
- */
-static void
-phyint_inactive(phyint_t *phyi)
-{
- ill_t *ill_v4;
- ill_t *ill_v6;
- ipif_t *ipif;
- ilm_t *ilm;
-
- ill_v4 = phyi->phyint_illv4;
- ill_v6 = phyi->phyint_illv6;
-
- /*
- * No need for a lock while traversing the list since iam
- * a writer
- */
- if (ill_v4 != NULL) {
- ASSERT(IAM_WRITER_ILL(ill_v4));
- for (ipif = ill_v4->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) {
- mutex_enter(&phyi->phyint_lock);
- phyi->phyint_flags &= ~PHYI_INACTIVE;
- mutex_exit(&phyi->phyint_lock);
- return;
- }
- }
- for (ilm = ill_v4->ill_ilm; ilm != NULL;
- ilm = ilm->ilm_next) {
- if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) {
- mutex_enter(&phyi->phyint_lock);
- phyi->phyint_flags &= ~PHYI_INACTIVE;
- mutex_exit(&phyi->phyint_lock);
- return;
- }
- }
- }
- if (ill_v6 != NULL) {
- ill_v6 = phyi->phyint_illv6;
- for (ipif = ill_v6->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) {
- mutex_enter(&phyi->phyint_lock);
- phyi->phyint_flags &= ~PHYI_INACTIVE;
- mutex_exit(&phyi->phyint_lock);
- return;
- }
- }
- for (ilm = ill_v6->ill_ilm; ilm != NULL;
- ilm = ilm->ilm_next) {
- if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) {
- mutex_enter(&phyi->phyint_lock);
- phyi->phyint_flags &= ~PHYI_INACTIVE;
- mutex_exit(&phyi->phyint_lock);
- return;
- }
- }
- }
- mutex_enter(&phyi->phyint_lock);
- phyi->phyint_flags |= PHYI_INACTIVE;
- mutex_exit(&phyi->phyint_lock);
-}
-
-/*
- * This function is called only when the phyint flags change. Currently
- * called from ip_sioctl_flags. We re-do the broadcast nomination so
- * that we can select a good ill.
- */
-static void
-ip_redo_nomination(phyint_t *phyi)
-{
- ill_t *ill_v4;
-
- ill_v4 = phyi->phyint_illv4;
-
- if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
- ASSERT(IAM_WRITER_ILL(ill_v4));
- if (ill_v4->ill_group->illgrp_ill_count > 1)
- ill_nominate_bcast_rcv(ill_v4->ill_group);
- }
-}
-
-/*
- * Heuristic to check if ill is INACTIVE.
- * Checks if ill has an ipif with an usable ip address.
- *
- * Return values:
- * B_TRUE - ill is INACTIVE; has no usable ipif
- * B_FALSE - ill is not INACTIVE; ill has at least one usable ipif
- */
-static boolean_t
-ill_is_inactive(ill_t *ill)
-{
- ipif_t *ipif;
-
- /* Check whether it is in an IPMP group */
- if (ill->ill_phyint->phyint_groupname == NULL)
- return (B_FALSE);
-
- if (ill->ill_ipif_up_count == 0)
- return (B_TRUE);
-
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- uint64_t flags = ipif->ipif_flags;
-
- /*
- * This ipif is usable if it is IPIF_UP and not a
- * dedicated test address. A dedicated test address
- * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED
- * (note in particular that V6 test addresses are
- * link-local data addresses and thus are marked
- * IPIF_NOFAILOVER but not IPIF_DEPRECATED).
- */
- if ((flags & IPIF_UP) &&
- ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) !=
- (IPIF_DEPRECATED|IPIF_NOFAILOVER)))
- return (B_FALSE);
- }
- return (B_TRUE);
-}
-
-/*
- * Set interface flags.
- * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT,
- * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST,
- * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE.
+ * Set interface flags. Many flags require special handling (e.g.,
+ * bringing the interface down); see below for details.
*
* NOTE : We really don't enforce that ipif_id zero should be used
* for setting any flags other than IFF_LOGINT_FLAGS. This
@@ -11478,17 +11479,16 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
{
uint64_t turn_on;
uint64_t turn_off;
- int err;
+ int err = 0;
phyint_t *phyi;
ill_t *ill;
- uint64_t intf_flags;
+ uint64_t intf_flags, cantchange_flags;
boolean_t phyint_flags_modified = B_FALSE;
uint64_t flags;
struct ifreq *ifr;
struct lifreq *lifr;
boolean_t set_linklocal = B_FALSE;
boolean_t zero_source = B_FALSE;
- ip_stack_t *ipst;
ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
@@ -11497,11 +11497,10 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ill = ipif->ipif_ill;
phyi = ill->ill_phyint;
- ipst = ill->ill_ipst;
if (ipip->ipi_cmd_type == IF_CMD) {
ifr = (struct ifreq *)if_req;
- flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
+ flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
} else {
lifr = (struct lifreq *)if_req;
flags = lifr->lifr_flags;
@@ -11524,25 +11523,60 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
flags |= intf_flags & ~0xFFFF;
/*
- * First check which bits will change and then which will
- * go on and off
+ * Explicitly fail attempts to change flags that are always invalid on
+ * an IPMP meta-interface.
*/
- turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE;
- if (!turn_on)
+ if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID))
+ return (EINVAL);
+
+ /*
+ * Check which flags will change; silently ignore flags which userland
+ * is not allowed to control. (Because these flags may change between
+ * SIOCGLIFFLAGS and SIOCSLIFFLAGS, and that's outside of userland's
+ * control, we need to silently ignore them rather than fail.)
+ */
+ cantchange_flags = IFF_CANTCHANGE;
+ if (IS_IPMP(ill))
+ cantchange_flags |= IFF_IPMP_CANTCHANGE;
+
+ turn_on = (flags ^ intf_flags) & ~cantchange_flags;
+ if (turn_on == 0)
return (0); /* No change */
turn_off = intf_flags & turn_on;
turn_on ^= turn_off;
- err = 0;
/*
- * Don't allow any bits belonging to the logical interface
- * to be set or cleared on the replacement ipif that was
- * created temporarily during a MOVE.
+ * All test addresses must be IFF_DEPRECATED (to ensure source address
+ * selection avoids them) -- so force IFF_DEPRECATED on, and do not
+ * allow it to be turned off.
*/
- if (ipif->ipif_replace_zero &&
- ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) {
+ if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED &&
+ (turn_on|intf_flags) & IFF_NOFAILOVER)
return (EINVAL);
+
+ if (turn_on & IFF_NOFAILOVER) {
+ turn_on |= IFF_DEPRECATED;
+ flags |= IFF_DEPRECATED;
+ }
+
+ /*
+ * On underlying interfaces, only allow applications to manage test
+ * addresses -- otherwise, they may get confused when the address
+ * moves as part of being brought up. Likewise, prevent an
+ * application-managed test address from being converted to a data
+ * address. To prevent migration of administratively up addresses in
+ * the kernel, we don't allow them to be converted either.
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF;
+
+ if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER))
+ return (EINVAL);
+
+ if ((turn_off & IFF_NOFAILOVER) &&
+ (flags & (appflags | IFF_UP | IFF_DUPLICATE)))
+ return (EINVAL);
}
/*
@@ -11583,16 +11617,6 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
}
/*
- * ILL cannot be part of a usesrc group and and IPMP group at the
- * same time. No need to grab ill_g_usesrc_lock here, see
- * synchronization notes in ip.c
- */
- if (turn_on & PHYI_STANDBY &&
- ipif->ipif_ill->ill_usesrc_grp_next != NULL) {
- return (EINVAL);
- }
-
- /*
* If we modify physical interface flags, we'll potentially need to
* send up two routing socket messages for the changes (one for the
* IPv4 ill, and another for the IPv6 ill). Note that here.
@@ -11601,98 +11625,44 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
phyint_flags_modified = B_TRUE;
/*
- * If we are setting or clearing FAILED or STANDBY or OFFLINE,
- * we need to flush the IRE_CACHES belonging to this ill.
- * We handle this case here without doing the DOWN/UP dance
- * like it is done for other flags. If some other flags are
- * being turned on/off with FAILED/STANDBY/OFFLINE, the code
- * below will handle it by bringing it down and then
- * bringing it UP.
+ * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE
+ * (otherwise, we'd immediately use them, defeating standby). Also,
+ * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not
+ * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already
+ * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We
+ * also don't allow PHYI_STANDBY if VNI is enabled since its semantics
+ * will not be honored.
*/
- if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) {
- ill_t *ill_v4, *ill_v6;
-
- ill_v4 = phyi->phyint_illv4;
- ill_v6 = phyi->phyint_illv6;
-
+ if (turn_on & PHYI_STANDBY) {
/*
- * First set the INACTIVE flag if needed. Then delete the ires.
- * ire_add will atomically prevent creating new IRE_CACHEs
- * unless hidden flag is set.
- * PHYI_FAILED and PHYI_INACTIVE are exclusive
+ * No need to grab ill_g_usesrc_lock here; see the
+ * synchronization notes in ip.c.
*/
- if ((turn_on & PHYI_FAILED) &&
- ((intf_flags & PHYI_STANDBY) ||
- !ipst->ips_ipmp_enable_failback)) {
- /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */
- phyi->phyint_flags &= ~PHYI_INACTIVE;
- }
- if ((turn_off & PHYI_FAILED) &&
- ((intf_flags & PHYI_STANDBY) ||
- (!ipst->ips_ipmp_enable_failback &&
- ill_is_inactive(ill)))) {
- phyint_inactive(phyi);
- }
-
- if (turn_on & PHYI_STANDBY) {
- /*
- * We implicitly set INACTIVE only when STANDBY is set.
- * INACTIVE is also set on non-STANDBY phyint when user
- * disables FAILBACK using configuration file.
- * Do not allow STANDBY to be set on such INACTIVE
- * phyint
- */
- if (phyi->phyint_flags & PHYI_INACTIVE)
- return (EINVAL);
- if (!(phyi->phyint_flags & PHYI_FAILED))
- phyint_inactive(phyi);
- }
- if (turn_off & PHYI_STANDBY) {
- if (ipst->ips_ipmp_enable_failback) {
- /*
- * Reset PHYI_INACTIVE.
- */
- phyi->phyint_flags &= ~PHYI_INACTIVE;
- } else if (ill_is_inactive(ill) &&
- !(phyi->phyint_flags & PHYI_FAILED)) {
- /*
- * Need to set INACTIVE, when user sets
- * STANDBY on a non-STANDBY phyint and
- * later resets STANDBY
- */
- phyint_inactive(phyi);
- }
+ if (ill->ill_usesrc_grp_next != NULL ||
+ intf_flags & PHYI_INACTIVE)
+ return (EINVAL);
+ if (!(flags & PHYI_FAILED)) {
+ flags |= PHYI_INACTIVE;
+ turn_on |= PHYI_INACTIVE;
}
- /*
- * We should always send up a message so that the
- * daemons come to know of it. Note that the zeroth
- * interface can be down and the check below for IPIF_UP
- * will not make sense as we are actually setting
- * a phyint flag here. We assume that the ipif used
- * is always the zeroth ipif. (ip_rts_ifmsg does not
- * send up any message for non-zero ipifs).
- */
- phyint_flags_modified = B_TRUE;
+ }
- if (ill_v4 != NULL) {
- ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
- IRE_CACHE, ill_stq_cache_delete,
- (char *)ill_v4, ill_v4);
- illgrp_reset_schednext(ill_v4);
- }
- if (ill_v6 != NULL) {
- ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
- IRE_CACHE, ill_stq_cache_delete,
- (char *)ill_v6, ill_v6);
- illgrp_reset_schednext(ill_v6);
- }
+ if (turn_off & PHYI_STANDBY) {
+ flags &= ~PHYI_INACTIVE;
+ turn_off |= PHYI_INACTIVE;
}
/*
+ * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both
+ * would end up on.
+ */
+ if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
+ (PHYI_FAILED | PHYI_INACTIVE))
+ return (EINVAL);
+
+ /*
* If ILLF_ROUTER changes, we need to change the ip forwarding
- * status of the interface and, if the interface is part of an IPMP
- * group, all other interfaces that are part of the same IPMP
- * group.
+ * status of the interface.
*/
if ((turn_on | turn_off) & ILLF_ROUTER)
(void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0));
@@ -11718,33 +11688,31 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
mutex_exit(&ill->ill_phyint->phyint_lock);
/*
- * We do the broadcast and nomination here rather
- * than waiting for a FAILOVER/FAILBACK to happen. In
- * the case of FAILBACK from INACTIVE standby to the
- * interface that has been repaired, PHYI_FAILED has not
- * been cleared yet. If there are only two interfaces in
- * that group, all we have is a FAILED and INACTIVE
- * interface. If we do the nomination soon after a failback,
- * the broadcast nomination code would select the
- * INACTIVE interface for receiving broadcasts as FAILED is
- * not yet cleared. As we don't want STANDBY/INACTIVE to
- * receive broadcast packets, we need to redo nomination
- * when the FAILED is cleared here. Thus, in general we
- * always do the nomination here for FAILED, STANDBY
- * and OFFLINE.
+ * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the
+ * same to the kernel: if any of them has been set by
+ * userland, the interface cannot be used for data traffic.
*/
- if (((turn_on | turn_off) &
- (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) {
- ip_redo_nomination(phyi);
+ if ((turn_on|turn_off) &
+ (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
+ ASSERT(!IS_IPMP(ill));
+ /*
+ * It's possible the ill is part of an "anonymous"
+ * IPMP group rather than a real group. In that case,
+ * there are no other interfaces in the group and thus
+ * no need to call ipmp_phyint_refresh_active().
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_phyint_refresh_active(phyi);
}
+
if (phyint_flags_modified) {
if (phyi->phyint_illv4 != NULL) {
ip_rts_ifmsg(phyi->phyint_illv4->
- ill_ipif);
+ ill_ipif, RTSQ_DEFAULT);
}
if (phyi->phyint_illv6 != NULL) {
ip_rts_ifmsg(phyi->phyint_illv6->
- ill_ipif);
+ ill_ipif, RTSQ_DEFAULT);
}
}
return (0);
@@ -11785,15 +11753,17 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
}
/*
- * The only flag changes that we currently take specific action on
- * is IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL,
- * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and
- * IPIF_PREFERRED. This is done by bring the ipif down, changing
- * the flags and bringing it back up again.
+ * The only flag changes that we currently take specific action on are
+ * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP,
+ * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and
+ * IPIF_NOFAILOVER. This is done by bringing the ipif down, changing
+ * the flags and bringing it back up again. For IPIF_NOFAILOVER, the
+ * act of bringing it back up will trigger the address to be moved.
*/
if ((turn_on|turn_off) &
(IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP|
- ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) {
+ ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED|
+ IPIF_NOFAILOVER)) {
/*
* Taking this ipif down, make sure we have
* valid net and subnet bcast ire's for other
@@ -11822,9 +11792,8 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
{
ill_t *ill;
phyint_t *phyi;
- uint64_t turn_on;
- uint64_t turn_off;
- uint64_t intf_flags;
+ uint64_t turn_on, turn_off;
+ uint64_t intf_flags, cantchange_flags;
boolean_t phyint_flags_modified = B_FALSE;
int err = 0;
boolean_t set_linklocal = B_FALSE;
@@ -11839,12 +11808,15 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
phyi = ill->ill_phyint;
intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
- turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP);
+ cantchange_flags = IFF_CANTCHANGE | IFF_UP;
+ if (IS_IPMP(ill))
+ cantchange_flags |= IFF_IPMP_CANTCHANGE;
+ turn_on = (flags ^ intf_flags) & ~cantchange_flags;
turn_off = intf_flags & turn_on;
turn_on ^= turn_off;
- if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))
+ if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
phyint_flags_modified = B_TRUE;
/*
@@ -11870,9 +11842,6 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
mutex_exit(&ill->ill_lock);
mutex_exit(&phyi->phyint_lock);
- if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)))
- ip_redo_nomination(phyi);
-
if (set_linklocal)
(void) ipif_setlinklocal(ipif);
@@ -11881,12 +11850,29 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
else
ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
+ /*
+ * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to
+ * the kernel: if any of them has been set by userland, the interface
+ * cannot be used for data traffic.
+ */
+ if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
+ ASSERT(!IS_IPMP(ill));
+ /*
+ * It's possible the ill is part of an "anonymous" IPMP group
+ * rather than a real group. In that case, there are no other
+ * interfaces in the group and thus no need for us to call
+ * ipmp_phyint_refresh_active().
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_phyint_refresh_active(phyi);
+ }
+
if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) {
/*
* XXX ipif_up really does not know whether a phyint flags
* was modified or not. So, it sends up information on
* only one routing sockets message. As we don't bring up
- * the interface and also set STANDBY/FAILED simultaneously
+ * the interface and also set PHYI_ flags simultaneously
* it should be okay.
*/
err = ipif_up(ipif, q, mp);
@@ -11898,14 +11884,14 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
if (phyint_flags_modified) {
if (phyi->phyint_illv4 != NULL) {
ip_rts_ifmsg(phyi->phyint_illv4->
- ill_ipif);
+ ill_ipif, RTSQ_DEFAULT);
}
if (phyi->phyint_illv6 != NULL) {
ip_rts_ifmsg(phyi->phyint_illv6->
- ill_ipif);
+ ill_ipif, RTSQ_DEFAULT);
}
} else {
- ip_rts_ifmsg(ipif);
+ ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
}
/*
* Update the flags in SCTP's IPIF list, ipif_up() will do
@@ -12101,10 +12087,7 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
* broadcast address makes sense. If it does,
* there should be an IRE for it already.
* Don't match on ipif, only on the ill
- * since we are sharing these now. Don't use
- * MATCH_IRE_ILL_GROUP as we are looking for
- * the broadcast ire on this ill and each ill
- * in the group has its own broadcast ire.
+ * since we are sharing these now.
*/
ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST,
ipif, ALL_ZONES, NULL,
@@ -12302,9 +12285,16 @@ int
ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *if_req)
{
-
ip1dbg(("ip_sioctl_metric(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
+
+ /*
+ * Since no applications should ever be setting metrics on underlying
+ * interfaces, we explicitly fail to smoke 'em out.
+ */
+ if (IS_UNDER_IPMP(ipif->ipif_ill))
+ return (EINVAL);
+
/*
* Set interface metric. We don't use this for
* anything but we keep track of it in case it is
@@ -12332,6 +12322,7 @@ ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
/* Get interface metric. */
ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
+
if (ipip->ipi_cmd_type == IF_CMD) {
struct ifreq *ifr;
@@ -12766,13 +12757,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
nipif->ipif_state_flags |= IPIF_CHANGING;
}
- mutex_exit(&ill->ill_lock);
-
if (lir->lir_maxmtu != 0) {
ill->ill_max_mtu = lir->lir_maxmtu;
- ill->ill_mtu_userspecified = 1;
+ ill->ill_user_mtu = lir->lir_maxmtu;
mtu_walk = B_TRUE;
}
+ mutex_exit(&ill->ill_lock);
if (lir->lir_reachtime != 0)
ill->ill_reachable_time = lir->lir_reachtime;
@@ -12821,6 +12811,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ILL_UNMARK_CHANGING(ill);
mutex_exit(&ill->ill_lock);
+ /*
+ * Refresh IPMP meta-interface MTU if necessary.
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_illgrp_refresh_mtu(ill->ill_grp);
+
return (0);
}
@@ -13032,13 +13028,117 @@ ipif_assign_seqid(ipif_t *ipif)
}
/*
+ * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are
+ * administratively down (i.e., no DAD), of the same type, and locked. Note
+ * that the clone is complete -- including the seqid -- and the expectation is
+ * that the caller will either free or overwrite `sipif' before it's unlocked.
+ */
+static void
+ipif_clone(const ipif_t *sipif, ipif_t *dipif)
+{
+ ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock));
+ ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock));
+ ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
+ ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
+ ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type);
+ ASSERT(sipif->ipif_arp_del_mp == NULL);
+ ASSERT(dipif->ipif_arp_del_mp == NULL);
+ ASSERT(sipif->ipif_igmp_rpt == NULL);
+ ASSERT(dipif->ipif_igmp_rpt == NULL);
+ ASSERT(sipif->ipif_multicast_up == 0);
+ ASSERT(dipif->ipif_multicast_up == 0);
+ ASSERT(sipif->ipif_joined_allhosts == 0);
+ ASSERT(dipif->ipif_joined_allhosts == 0);
+
+ dipif->ipif_mtu = sipif->ipif_mtu;
+ dipif->ipif_flags = sipif->ipif_flags;
+ dipif->ipif_metric = sipif->ipif_metric;
+ dipif->ipif_zoneid = sipif->ipif_zoneid;
+ dipif->ipif_v6subnet = sipif->ipif_v6subnet;
+ dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr;
+ dipif->ipif_v6src_addr = sipif->ipif_v6src_addr;
+ dipif->ipif_v6net_mask = sipif->ipif_v6net_mask;
+ dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr;
+ dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr;
+
+ /*
+ * While dipif is down right now, it might've been up before. Since
+ * it's changing identity, its packet counters need to be reset.
+ */
+ dipif->ipif_ib_pkt_count = 0;
+ dipif->ipif_ob_pkt_count = 0;
+ dipif->ipif_fo_pkt_count = 0;
+
+ /*
+ * As per the comment atop the function, we assume that these sipif
+ * fields will be changed before sipif is unlocked.
+ */
+ dipif->ipif_seqid = sipif->ipif_seqid;
+ dipif->ipif_saved_ire_mp = sipif->ipif_saved_ire_mp;
+ dipif->ipif_saved_ire_cnt = sipif->ipif_saved_ire_cnt;
+ dipif->ipif_state_flags = sipif->ipif_state_flags;
+}
+
+/*
+ * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif'
+ * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin
+ * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then
+ * transfer the xop to `dipif'. Requires that all ipifs are administratively
+ * down (i.e., no DAD), of the same type, and unlocked.
+ */
+static void
+ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif)
+{
+ ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq;
+ int ipx_current_ioctl;
+
+ ASSERT(sipif != dipif);
+ ASSERT(sipif != virgipif);
+
+ /*
+ * Grab all of the locks that protect the ipif in a defined order.
+ */
+ GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
+ if (sipif > dipif) {
+ mutex_enter(&sipif->ipif_saved_ire_lock);
+ mutex_enter(&dipif->ipif_saved_ire_lock);
+ } else {
+ mutex_enter(&dipif->ipif_saved_ire_lock);
+ mutex_enter(&sipif->ipif_saved_ire_lock);
+ }
+
+ ipif_clone(sipif, dipif);
+ if (virgipif != NULL) {
+ ipif_clone(virgipif, sipif);
+ mi_free(virgipif);
+ }
+
+ mutex_exit(&sipif->ipif_saved_ire_lock);
+ mutex_exit(&dipif->ipif_saved_ire_lock);
+ RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
+
+ /*
+ * Transfer ownership of the current xop, if necessary.
+ */
+ if (ipsq->ipsq_xop->ipx_current_ipif == sipif) {
+ ASSERT(ipsq->ipsq_xop->ipx_pending_ipif == NULL);
+ ipx_current_ioctl = ipsq->ipsq_xop->ipx_current_ioctl;
+ ipsq_current_finish(ipsq);
+ ipsq_current_start(ipsq, dipif, ipx_current_ioctl);
+ }
+
+ if (virgipif == NULL)
+ mi_free(sipif);
+}
+
+/*
* Insert the ipif, so that the list of ipifs on the ill will be sorted
* with respect to ipif_id. Note that an ipif with an ipif_id of -1 will
* be inserted into the first space available in the list. The value of
* ipif_id will then be set to the appropriate value for its position.
*/
static int
-ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock)
+ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock)
{
ill_t *ill;
ipif_t *tipif;
@@ -13056,12 +13156,11 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock)
/*
* In the case of lo0:0 we already hold the ill_g_lock.
* ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
- * ipif_insert. Another such caller is ipif_move.
+ * ipif_insert.
*/
if (acquire_g_lock)
rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
- if (acquire_ill_lock)
- mutex_enter(&ill->ill_lock);
+ mutex_enter(&ill->ill_lock);
id = ipif->ipif_id;
tipifp = &(ill->ill_ipif);
if (id == -1) { /* need to find a real id */
@@ -13075,8 +13174,7 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock)
}
/* limit number of logical interfaces */
if (id >= ipst->ips_ip_addrs_per_if) {
- if (acquire_ill_lock)
- mutex_exit(&ill->ill_lock);
+ mutex_exit(&ill->ill_lock);
if (acquire_g_lock)
rw_exit(&ipst->ips_ill_g_lock);
return (-1);
@@ -13091,8 +13189,7 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock)
tipifp = &(tipif->ipif_next);
}
} else {
- if (acquire_ill_lock)
- mutex_exit(&ill->ill_lock);
+ mutex_exit(&ill->ill_lock);
if (acquire_g_lock)
rw_exit(&ipst->ips_ill_g_lock);
return (-1);
@@ -13102,25 +13199,22 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock)
ipif->ipif_next = tipif;
*tipifp = ipif;
- if (acquire_ill_lock)
- mutex_exit(&ill->ill_lock);
+ mutex_exit(&ill->ill_lock);
if (acquire_g_lock)
rw_exit(&ipst->ips_ill_g_lock);
+
return (0);
}
static void
-ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock)
+ipif_remove(ipif_t *ipif)
{
ipif_t **ipifp;
ill_t *ill = ipif->ipif_ill;
ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));
- if (acquire_ill_lock)
- mutex_enter(&ill->ill_lock);
- else
- ASSERT(MUTEX_HELD(&ill->ill_lock));
+ mutex_enter(&ill->ill_lock);
ipifp = &ill->ill_ipif;
for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
if (*ipifp == ipif) {
@@ -13128,9 +13222,7 @@ ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock)
break;
}
}
-
- if (acquire_ill_lock)
- mutex_exit(&ill->ill_lock);
+ mutex_exit(&ill->ill_lock);
}
/*
@@ -13149,10 +13241,12 @@ ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock)
* second DL_INFO_ACK comes in from the driver.
*/
static ipif_t *
-ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
+ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
+ boolean_t insert)
{
ipif_t *ipif;
- phyint_t *phyi;
+ phyint_t *phyi = ill->ill_phyint;
+ ip_stack_t *ipst = ill->ill_ipst;
ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
ill->ill_name, id, (void *)ill));
@@ -13175,23 +13269,61 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
ipif->ipif_refcnt = 0;
ipif->ipif_saved_ire_cnt = 0;
- if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) {
- mi_free(ipif);
- return (NULL);
+ if (insert) {
+ if (ipif_insert(ipif, ire_type != IRE_LOOPBACK) != 0) {
+ mi_free(ipif);
+ return (NULL);
+ }
+ /* -1 id should have been replaced by real id */
+ id = ipif->ipif_id;
+ ASSERT(id >= 0);
}
- /* -1 id should have been replaced by real id */
- id = ipif->ipif_id;
- ASSERT(id >= 0);
if (ill->ill_name[0] != '\0')
ipif_assign_seqid(ipif);
/*
- * Keep a copy of original id in ipif_orig_ipifid. Failback
- * will attempt to restore the original id. The SIOCSLIFOINDEX
- * ioctl sets ipif_orig_ipifid to zero.
+ * If this is ipif zero, configure ill/phyint-wide information.
+ * Defer most configuration until we're guaranteed we're attached.
*/
- ipif->ipif_orig_ipifid = id;
+ if (id == 0) {
+ if (ill->ill_mactype == SUNW_DL_IPMP) {
+ /*
+ * Set PHYI_IPMP and also set PHYI_FAILED since there
+ * are no active interfaces. Similarly, PHYI_RUNNING
+ * isn't set until the group has an active interface.
+ */
+ mutex_enter(&phyi->phyint_lock);
+ phyi->phyint_flags |= (PHYI_IPMP | PHYI_FAILED);
+ mutex_exit(&phyi->phyint_lock);
+
+ /*
+ * Create the illgrp (which must not exist yet because
+ * the zeroth ipif is created once per ill). However,
+ * do not link it to the ipmp_grp_t until I_PLINK
+ * is called; see ip_sioctl_plink_ipmp() for details.
+ */
+ if (ipmp_illgrp_create(ill) == NULL) {
+ if (insert) {
+ rw_enter(&ipst->ips_ill_g_lock,
+ RW_WRITER);
+ ipif_remove(ipif);
+ rw_exit(&ipst->ips_ill_g_lock);
+ }
+ mi_free(ipif);
+ return (NULL);
+ }
+ } else {
+ /*
+ * By default, PHYI_RUNNING is set when the zeroth
+ * ipif is created. For other ipifs, we don't touch
+ * it since DLPI notifications may have changed it.
+ */
+ mutex_enter(&phyi->phyint_lock);
+ phyi->phyint_flags |= PHYI_RUNNING;
+ mutex_exit(&phyi->phyint_lock);
+ }
+ }
/*
* We grab the ill_lock and phyint_lock to protect the flag changes.
@@ -13199,18 +13331,9 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
* ioctl completes and the IPIF_CHANGING flag is cleared.
*/
mutex_enter(&ill->ill_lock);
- mutex_enter(&ill->ill_phyint->phyint_lock);
- /*
- * Set the running flag when logical interface zero is created.
- * For subsequent logical interfaces, a DLPI link down
- * notification message may have cleared the running flag to
- * indicate the link is down, so we shouldn't just blindly set it.
- */
- if (id == 0)
- ill->ill_phyint->phyint_flags |= PHYI_RUNNING;
+ mutex_enter(&phyi->phyint_lock);
+
ipif->ipif_ire_type = ire_type;
- phyi = ill->ill_phyint;
- ipif->ipif_orig_ifindex = phyi->phyint_ifindex;
if (ipif->ipif_isv6) {
ill->ill_flags |= ILLF_IPV6;
@@ -13238,14 +13361,18 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
* Don't set the interface flags etc. now, will do it in
* ip_ll_subnet_defaults.
*/
- if (!initialize) {
- mutex_exit(&ill->ill_lock);
- mutex_exit(&ill->ill_phyint->phyint_lock);
- return (ipif);
- }
+ if (!initialize)
+ goto out;
+
ipif->ipif_mtu = ill->ill_max_mtu;
- if (ill->ill_bcast_addr_length != 0) {
+ /*
+ * NOTE: The IPMP meta-interface is special-cased because it starts
+ * with no underlying interfaces (and thus an unknown broadcast
+ * address length), but all interfaces that can be placed into an IPMP
+ * group are required to be broadcast-capable.
+ */
+ if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
/*
* Later detect lack of DLPI driver multicast
* capability by catching DL_ENABMULTI errors in
@@ -13269,8 +13396,7 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
ill->ill_flags |= ILLF_NOARP;
}
if (ill->ill_phys_addr_length == 0) {
- if (ill->ill_media &&
- ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) {
+ if (ill->ill_mactype == SUNW_DL_VNI) {
ipif->ipif_flags |= IPIF_NOXMIT;
phyi->phyint_flags |= PHYI_VIRTUAL;
} else {
@@ -13285,8 +13411,9 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
}
}
}
+out:
+ mutex_exit(&phyi->phyint_lock);
mutex_exit(&ill->ill_lock);
- mutex_exit(&ill->ill_phyint->phyint_lock);
return (ipif);
}
@@ -13300,34 +13427,49 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
* for details.
*/
void
-ipif_arp_down(ipif_t *ipif)
+ipif_resolver_down(ipif_t *ipif)
{
mblk_t *mp;
ill_t *ill = ipif->ipif_ill;
- ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
+ ip1dbg(("ipif_resolver_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
ASSERT(IAM_WRITER_IPIF(ipif));
+ if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))
+ return;
+
/* Delete the mapping for the local address */
mp = ipif->ipif_arp_del_mp;
if (mp != NULL) {
- ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
+ ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
*(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
putnext(ill->ill_rq, mp);
ipif->ipif_arp_del_mp = NULL;
}
/*
+ * Make IPMP aware of the deleted data address.
+ */
+ if (IS_IPMP(ill))
+ ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
+
+ /*
* If this is the last ipif that is going down and there are no
* duplicate addresses we may yet attempt to re-probe, then we need to
* clean up ARP completely.
*/
if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) {
+ /*
+ * If this was the last ipif on an IPMP interface, purge any
+ * IPMP ARP entries associated with it.
+ */
+ if (IS_IPMP(ill))
+ ipmp_illgrp_refresh_arpent(ill->ill_grp);
/* Send up AR_INTERFACE_DOWN message */
mp = ill->ill_arp_down_mp;
if (mp != NULL) {
- ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
+ ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
*(unsigned *)mp->b_rptr, ill->ill_name,
ipif->ipif_id));
putnext(ill->ill_rq, mp);
@@ -13337,7 +13479,7 @@ ipif_arp_down(ipif_t *ipif)
/* Tell ARP to delete the multicast mappings */
mp = ill->ill_arp_del_mapping_mp;
if (mp != NULL) {
- ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
+ ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
*(unsigned *)mp->b_rptr, ill->ill_name,
ipif->ipif_id));
putnext(ill->ill_rq, mp);
@@ -13377,6 +13519,13 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
return (0);
/*
+ * IPMP meta-interfaces don't have any inherent multicast mappings,
+ * and instead use the ones on the underlying interfaces.
+ */
+ if (IS_IPMP(ill))
+ return (0);
+
+ /*
* Delete the existing mapping from ARP. Normally ipif_down
* -> ipif_arp_down should send this up to ARP. The only
* reason we would find this when we are switching from
@@ -13473,26 +13622,23 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
}
/*
- * Get the resolver set up for a new interface address.
- * (Always called as writer.)
- * Called both for IPv4 and IPv6 interfaces,
- * though it only sets up the resolver for v6
- * if it's an xresolv interface (one using an external resolver).
- * Honors ILLF_NOARP.
- * The enumerated value res_act is used to tune the behavior.
- * If set to Res_act_initial, then we set up all the resolver
- * structures for a new interface. If set to Res_act_move, then
- * we just send an AR_ENTRY_ADD message up to ARP for IPv4
- * interfaces; this is called by ip_rput_dlpi_writer() to handle
- * asynchronous hardware address change notification. If set to
- * Res_act_defend, then we tell ARP that it needs to send a single
- * gratuitous message in defense of the address.
+ * Get the resolver set up for a new IP address. (Always called as writer.)
+ * Called both for IPv4 and IPv6 interfaces, though it only sets up the
+ * resolver for v6 if it's an ILLF_XRESOLV interface. Honors ILLF_NOARP.
+ *
+ * The enumerated value res_act tunes the behavior:
+ * * Res_act_initial: set up all the resolver structures for a new
+ * IP address.
+ * * Res_act_defend: tell ARP that it needs to send a single gratuitous
+ * ARP message in defense of the address.
+ * * Res_act_rebind: tell ARP to change the hardware address for an IP
+ * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif().
+ *
* Returns error on failure.
*/
int
ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
{
- caddr_t addr;
mblk_t *arp_up_mp = NULL;
mblk_t *arp_down_mp = NULL;
mblk_t *arp_add_mp = NULL;
@@ -13500,9 +13646,9 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
mblk_t *arp_add_mapping_mp = NULL;
mblk_t *arp_del_mapping_mp = NULL;
ill_t *ill = ipif->ipif_ill;
- uchar_t *area_p = NULL;
- uchar_t *ared_p = NULL;
int err = ENOMEM;
+ boolean_t added_ipif = B_FALSE;
+ boolean_t publish;
boolean_t was_dup;
ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
@@ -13540,11 +13686,7 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
* External resolver for IPv6
*/
ASSERT(res_act == Res_act_initial);
- if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
- addr = (caddr_t)&ipif->ipif_v6lcl_addr;
- area_p = (uchar_t *)&ip6_area_template;
- ared_p = (uchar_t *)&ip6_ared_template;
- }
+ publish = !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr);
} else {
/*
* IPv4 arp case. If the ARP stream has already started
@@ -13562,41 +13704,39 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
ill->ill_arp_bringup_pending = 1;
mutex_exit(&ill->ill_lock);
}
- if (ipif->ipif_lcl_addr != INADDR_ANY) {
- addr = (caddr_t)&ipif->ipif_lcl_addr;
- area_p = (uchar_t *)&ip_area_template;
- ared_p = (uchar_t *)&ip_ared_template;
+ publish = (ipif->ipif_lcl_addr != INADDR_ANY);
+ }
+
+ if (IS_IPMP(ill) && publish) {
+ /*
+ * If we're here via ipif_up(), then the ipif won't be bound
+ * yet -- add it to the group, which will bind it if possible.
+ * (We would add it in ipif_up(), but deleting on failure
+ * there is gruesome.) If we're here via ipmp_ill_bind_ipif(),
+ * then the ipif has already been added to the group and we
+ * just need to use the binding.
+ */
+ if (ipmp_ipif_bound_ill(ipif) == NULL) {
+ if (ipmp_illgrp_add_ipif(ill->ill_grp, ipif) == NULL) {
+ /*
+ * We couldn't bind the ipif to an ill yet,
+ * so we have nothing to publish.
+ */
+ publish = B_FALSE;
+ }
+ added_ipif = B_TRUE;
}
}
/*
* Add an entry for the local address in ARP only if it
- * is not UNNUMBERED and the address is not INADDR_ANY.
+ * is not UNNUMBERED and it is suitable for publishing.
*/
- if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) {
- area_t *area;
-
- /* Now ask ARP to publish our address. */
- arp_add_mp = ill_arp_alloc(ill, area_p, addr);
- if (arp_add_mp == NULL)
- goto failed;
- area = (area_t *)arp_add_mp->b_rptr;
- if (res_act != Res_act_initial) {
- /*
- * Copy the new hardware address and length into
- * arp_add_mp to be sent to ARP.
- */
- area->area_hw_addr_length = ill->ill_phys_addr_length;
- bcopy(ill->ill_phys_addr,
- ((char *)area + area->area_hw_addr_offset),
- area->area_hw_addr_length);
- }
-
- area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH |
- ACE_F_MYADDR;
-
+ if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && publish) {
if (res_act == Res_act_defend) {
- area->area_flags |= ACE_F_DEFEND;
+ arp_add_mp = ipif_area_alloc(ipif, ACE_F_DEFEND);
+ if (arp_add_mp == NULL)
+ goto failed;
/*
* If we're just defending our address now, then
* there's no need to set up ARP multicast mappings.
@@ -13605,17 +13745,18 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
goto done;
}
- if (res_act != Res_act_initial)
- goto arp_setup_multicast;
-
/*
- * Allocate an ARP deletion message so we know we can tell ARP
- * when the interface goes down.
+ * Allocate an ARP add message and an ARP delete message (the
+ * latter is saved for use when the address goes down).
*/
- arp_del_mp = ill_arp_alloc(ill, ared_p, addr);
- if (arp_del_mp == NULL)
+ if ((arp_add_mp = ipif_area_alloc(ipif, 0)) == NULL)
+ goto failed;
+
+ if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL)
goto failed;
+ if (res_act != Res_act_initial)
+ goto arp_setup_multicast;
} else {
if (res_act != Res_act_initial)
goto done;
@@ -13624,14 +13765,11 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
* Need to bring up ARP or setup multicast mapping only
* when the first interface is coming UP.
*/
- if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
- was_dup) {
+ if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0 || was_dup)
goto done;
- }
/*
- * Allocate an ARP down message (to be saved) and an ARP up
- * message.
+ * Allocate an ARP down message (to be saved) and an ARP up message.
*/
arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0);
if (arp_down_mp == NULL)
@@ -13648,33 +13786,21 @@ arp_setup_multicast:
/*
* Setup the multicast mappings. This function initializes
* ill_arp_del_mapping_mp also. This does not need to be done for
- * IPv6.
+ * IPv6, or for the IPMP interface (since it has no link-layer).
*/
- if (!ill->ill_isv6) {
+ if (!ill->ill_isv6 && !IS_IPMP(ill)) {
err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp);
if (err != 0)
goto failed;
ASSERT(ill->ill_arp_del_mapping_mp != NULL);
ASSERT(arp_add_mapping_mp != NULL);
}
-
done:
- if (arp_del_mp != NULL) {
- ASSERT(ipif->ipif_arp_del_mp == NULL);
- ipif->ipif_arp_del_mp = arp_del_mp;
- }
- if (arp_down_mp != NULL) {
- ASSERT(ill->ill_arp_down_mp == NULL);
- ill->ill_arp_down_mp = arp_down_mp;
- }
- if (arp_del_mapping_mp != NULL) {
- ASSERT(ill->ill_arp_del_mapping_mp == NULL);
- ill->ill_arp_del_mapping_mp = arp_del_mapping_mp;
- }
if (arp_up_mp != NULL) {
ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n",
ill->ill_name, ipif->ipif_id));
putnext(ill->ill_rq, arp_up_mp);
+ arp_up_mp = NULL;
}
if (arp_add_mp != NULL) {
ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n",
@@ -13686,6 +13812,7 @@ done:
if (!ill->ill_arp_extend)
ipif->ipif_addr_ready = 1;
putnext(ill->ill_rq, arp_add_mp);
+ arp_add_mp = NULL;
} else {
ipif->ipif_addr_ready = 1;
}
@@ -13693,29 +13820,40 @@ done:
ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n",
ill->ill_name, ipif->ipif_id));
putnext(ill->ill_rq, arp_add_mapping_mp);
+ arp_add_mapping_mp = NULL;
}
- if (res_act != Res_act_initial)
- return (0);
- if (ill->ill_flags & ILLF_NOARP)
- err = ill_arp_off(ill);
- else
- err = ill_arp_on(ill);
- if (err != 0) {
- ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err));
- freemsg(ipif->ipif_arp_del_mp);
- freemsg(ill->ill_arp_down_mp);
- freemsg(ill->ill_arp_del_mapping_mp);
- ipif->ipif_arp_del_mp = NULL;
- ill->ill_arp_down_mp = NULL;
- ill->ill_arp_del_mapping_mp = NULL;
- return (err);
+ if (res_act == Res_act_initial) {
+ if (ill->ill_flags & ILLF_NOARP)
+ err = ill_arp_off(ill);
+ else
+ err = ill_arp_on(ill);
+ if (err != 0) {
+ ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n",
+ err));
+ goto failed;
+ }
}
+
+ if (arp_del_mp != NULL) {
+ ASSERT(ipif->ipif_arp_del_mp == NULL);
+ ipif->ipif_arp_del_mp = arp_del_mp;
+ }
+ if (arp_down_mp != NULL) {
+ ASSERT(ill->ill_arp_down_mp == NULL);
+ ill->ill_arp_down_mp = arp_down_mp;
+ }
+ if (arp_del_mapping_mp != NULL) {
+ ASSERT(ill->ill_arp_del_mapping_mp == NULL);
+ ill->ill_arp_del_mapping_mp = arp_del_mapping_mp;
+ }
+
return ((ill->ill_ipif_up_count != 0 || was_dup ||
ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS);
-
failed:
ip1dbg(("ipif_resolver_up: FAILED\n"));
+ if (added_ipif)
+ ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
freemsg(arp_add_mp);
freemsg(arp_del_mp);
freemsg(arp_add_mapping_mp);
@@ -13734,13 +13872,12 @@ ipif_arp_start_dad(ipif_t *ipif)
{
ill_t *ill = ipif->ipif_ill;
mblk_t *arp_add_mp;
- area_t *area;
+ /* ACE_F_UNVERIFIED restarts DAD */
if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing ||
(ipif->ipif_flags & IPIF_UNNUMBERED) ||
ipif->ipif_lcl_addr == INADDR_ANY ||
- (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template,
- (char *)&ipif->ipif_lcl_addr)) == NULL) {
+ (arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) {
/*
* If we can't contact ARP for some reason, that's not really a
* problem. Just send out the routing socket notification that
@@ -13752,10 +13889,6 @@ ipif_arp_start_dad(ipif_t *ipif)
return;
}
- /* Setting the 'unverified' flag restarts DAD */
- area = (area_t *)arp_add_mp->b_rptr;
- area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR |
- ACE_F_UNVERIFIED;
putnext(ill->ill_rq, arp_add_mp);
}
@@ -13764,7 +13897,8 @@ ipif_ndp_start_dad(ipif_t *ipif)
{
nce_t *nce;
- nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE);
+ nce = ndp_lookup_v6(ipif->ipif_ill, B_TRUE, &ipif->ipif_v6lcl_addr,
+ B_FALSE);
if (nce == NULL)
return;
@@ -13805,7 +13939,7 @@ ill_restart_dad(ill_t *ill, boolean_t went_up)
*/
if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) ||
(!ill->ill_isv6 && !ill->ill_arp_extend)) {
- ip_rts_ifmsg(ill->ill_ipif);
+ ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
return;
}
@@ -13838,8 +13972,10 @@ ill_restart_dad(ill_t *ill, boolean_t went_up)
* we'll handle eventual routing socket
* notification via DAD completion.)
*/
- if (ipif == ill->ill_ipif)
- ip_rts_ifmsg(ill->ill_ipif);
+ if (ipif == ill->ill_ipif) {
+ ip_rts_ifmsg(ill->ill_ipif,
+ RTSQ_DEFAULT);
+ }
}
} else {
/*
@@ -13855,285 +13991,30 @@ ill_restart_dad(ill_t *ill, boolean_t went_up)
* If we've torn down links, then notify the user right away.
*/
if (!went_up)
- ip_rts_ifmsg(ill->ill_ipif);
-}
-
-/*
- * Wakeup all threads waiting to enter the ipsq, and sleeping
- * on any of the ills in this ipsq. The ill_lock of the ill
- * must be held so that waiters don't miss wakeups
- */
-static void
-ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock)
-{
- phyint_t *phyint;
-
- phyint = ipsq->ipsq_phyint_list;
- while (phyint != NULL) {
- if (phyint->phyint_illv4) {
- if (!caller_holds_lock)
- mutex_enter(&phyint->phyint_illv4->ill_lock);
- ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
- cv_broadcast(&phyint->phyint_illv4->ill_cv);
- if (!caller_holds_lock)
- mutex_exit(&phyint->phyint_illv4->ill_lock);
- }
- if (phyint->phyint_illv6) {
- if (!caller_holds_lock)
- mutex_enter(&phyint->phyint_illv6->ill_lock);
- ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
- cv_broadcast(&phyint->phyint_illv6->ill_cv);
- if (!caller_holds_lock)
- mutex_exit(&phyint->phyint_illv6->ill_lock);
- }
- phyint = phyint->phyint_ipsq_next;
- }
-}
-
-static ipsq_t *
-ipsq_create(char *groupname, ip_stack_t *ipst)
-{
- ipsq_t *ipsq;
-
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
- ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP);
- if (ipsq == NULL) {
- return (NULL);
- }
-
- if (groupname != NULL)
- (void) strcpy(ipsq->ipsq_name, groupname);
- else
- ipsq->ipsq_name[0] = '\0';
-
- mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL);
- ipsq->ipsq_flags |= IPSQ_GROUP;
- ipsq->ipsq_next = ipst->ips_ipsq_g_head;
- ipst->ips_ipsq_g_head = ipsq;
- ipsq->ipsq_ipst = ipst; /* No netstack_hold */
- return (ipsq);
-}
-
-/*
- * Return an ipsq correspoding to the groupname. If 'create' is true
- * allocate a new ipsq if one does not exist. Usually an ipsq is associated
- * uniquely with an IPMP group. However during IPMP groupname operations,
- * multiple IPMP groups may be associated with a single ipsq. But no
- * IPMP group can be associated with more than 1 ipsq at any time.
- * For example
- * Interfaces IPMP grpname ipsq ipsq_name ipsq_refs
- * hme1, hme2 mpk17-84 ipsq1 mpk17-84 2
- * hme3, hme4 mpk17-85 ipsq2 mpk17-85 2
- *
- * Now the command ifconfig hme3 group mpk17-84 results in the temporary
- * status shown below during the execution of the above command.
- * hme1, hme2, hme3, hme4 mpk17-84, mpk17-85 ipsq1 mpk17-84 4
- *
- * After the completion of the above groupname command we return to the stable
- * state shown below.
- * hme1, hme2, hme3 mpk17-84 ipsq1 mpk17-84 3
- * hme4 mpk17-85 ipsq2 mpk17-85 1
- *
- * Because of the above, we don't search based on the ipsq_name since that
- * would miss the correct ipsq during certain windows as shown above.
- * The ipsq_name is only used during split of an ipsq to return the ipsq to its
- * natural state.
- */
-static ipsq_t *
-ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq,
- ip_stack_t *ipst)
-{
- ipsq_t *ipsq;
- int group_len;
- phyint_t *phyint;
-
- ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
-
- group_len = strlen(groupname);
- ASSERT(group_len != 0);
- group_len++;
-
- for (ipsq = ipst->ips_ipsq_g_head;
- ipsq != NULL;
- ipsq = ipsq->ipsq_next) {
- /*
- * When an ipsq is being split, and ill_split_ipsq
- * calls this function, we exclude it from being considered.
- */
- if (ipsq == exclude_ipsq)
- continue;
-
- /*
- * Compare against the ipsq_name. The groupname change happens
- * in 2 phases. The 1st phase merges the from group into
- * the to group's ipsq, by calling ill_merge_groups and restarts
- * the ioctl. The 2nd phase then locates the ipsq again thru
- * ipsq_name. At this point the phyint_groupname has not been
- * updated.
- */
- if ((group_len == strlen(ipsq->ipsq_name) + 1) &&
- (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) {
- /*
- * Verify that an ipmp groupname is exactly
- * part of 1 ipsq and is not found in any other
- * ipsq.
- */
- ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, ipst) ==
- NULL);
- return (ipsq);
- }
-
- /*
- * Comparison against ipsq_name alone is not sufficient.
- * In the case when groups are currently being
- * merged, the ipsq could hold other IPMP groups temporarily.
- * so we walk the phyint list and compare against the
- * phyint_groupname as well.
- */
- phyint = ipsq->ipsq_phyint_list;
- while (phyint != NULL) {
- if ((group_len == phyint->phyint_groupname_len) &&
- (bcmp(phyint->phyint_groupname, groupname,
- group_len) == 0)) {
- /*
- * Verify that an ipmp groupname is exactly
- * part of 1 ipsq and is not found in any other
- * ipsq.
- */
- ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq,
- ipst) == NULL);
- return (ipsq);
- }
- phyint = phyint->phyint_ipsq_next;
- }
- }
- if (create)
- ipsq = ipsq_create(groupname, ipst);
- return (ipsq);
+ ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
}
static void
ipsq_delete(ipsq_t *ipsq)
{
- ipsq_t *nipsq;
- ipsq_t *pipsq = NULL;
- ip_stack_t *ipst = ipsq->ipsq_ipst;
-
- /*
- * We don't hold the ipsq lock, but we are sure no new
- * messages can land up, since the ipsq_refs is zero.
- * i.e. this ipsq is unnamed and no phyint or phyint group
- * is associated with this ipsq. (Lookups are based on ill_name
- * or phyint_groupname)
- */
- ASSERT(ipsq->ipsq_refs == 0);
- ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL);
- ASSERT(ipsq->ipsq_pending_mp == NULL);
- if (!(ipsq->ipsq_flags & IPSQ_GROUP)) {
- /*
- * This is not the ipsq of an IPMP group.
- */
- ipsq->ipsq_ipst = NULL;
- kmem_free(ipsq, sizeof (ipsq_t));
- return;
- }
-
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
-
- /*
- * Locate the ipsq before we can remove it from
- * the singly linked list of ipsq's.
- */
- for (nipsq = ipst->ips_ipsq_g_head; nipsq != NULL;
- nipsq = nipsq->ipsq_next) {
- if (nipsq == ipsq) {
- break;
- }
- pipsq = nipsq;
- }
-
- ASSERT(nipsq == ipsq);
+ ipxop_t *ipx = ipsq->ipsq_xop;
- /* unlink ipsq from the list */
- if (pipsq != NULL)
- pipsq->ipsq_next = ipsq->ipsq_next;
- else
- ipst->ips_ipsq_g_head = ipsq->ipsq_next;
ipsq->ipsq_ipst = NULL;
+ ASSERT(ipsq->ipsq_phyint == NULL);
+ ASSERT(ipsq->ipsq_xop != NULL);
+ ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL);
+ ASSERT(ipx->ipx_pending_mp == NULL);
kmem_free(ipsq, sizeof (ipsq_t));
- rw_exit(&ipst->ips_ill_g_lock);
-}
-
-static void
-ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp,
- queue_t *q)
-{
- ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock));
- ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL);
- ASSERT(old_ipsq->ipsq_pending_ipif == NULL);
- ASSERT(old_ipsq->ipsq_pending_mp == NULL);
- ASSERT(current_mp != NULL);
-
- ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl,
- NEW_OP, NULL);
-
- ASSERT(new_ipsq->ipsq_xopq_mptail != NULL &&
- new_ipsq->ipsq_xopq_mphead != NULL);
-
- /*
- * move from old ipsq to the new ipsq.
- */
- new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead;
- if (old_ipsq->ipsq_xopq_mphead != NULL)
- new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail;
-
- old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL;
}
-void
-ill_group_cleanup(ill_t *ill)
-{
- ill_t *ill_v4;
- ill_t *ill_v6;
- ipif_t *ipif;
-
- ill_v4 = ill->ill_phyint->phyint_illv4;
- ill_v6 = ill->ill_phyint->phyint_illv6;
-
- if (ill_v4 != NULL) {
- mutex_enter(&ill_v4->ill_lock);
- for (ipif = ill_v4->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- IPIF_UNMARK_MOVING(ipif);
- }
- ill_v4->ill_up_ipifs = B_FALSE;
- mutex_exit(&ill_v4->ill_lock);
- }
-
- if (ill_v6 != NULL) {
- mutex_enter(&ill_v6->ill_lock);
- for (ipif = ill_v6->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- IPIF_UNMARK_MOVING(ipif);
- }
- ill_v6->ill_up_ipifs = B_FALSE;
- mutex_exit(&ill_v6->ill_lock);
- }
-}
-/*
- * This function is called when an ill has had a change in its group status
- * to bring up all the ipifs that were up before the change.
- */
-int
-ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
+static int
+ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp)
{
+ int err;
ipif_t *ipif;
- ill_t *ill_v4;
- ill_t *ill_v6;
- ill_t *from_ill;
- int err = 0;
- ASSERT(IAM_WRITER_ILL(ill));
+ if (ill == NULL)
+ return (0);
/*
* Except for ipif_state_flags and ill_state_flags the other
@@ -14142,389 +14023,86 @@ ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
* even an ipif that was already down, in ill_down_ipifs. So we
* just blindly clear the IPIF_CHANGING flag here on all ipifs.
*/
- ill_v4 = ill->ill_phyint->phyint_illv4;
- ill_v6 = ill->ill_phyint->phyint_illv6;
- if (ill_v4 != NULL) {
- ill_v4->ill_up_ipifs = B_TRUE;
- for (ipif = ill_v4->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- mutex_enter(&ill_v4->ill_lock);
- ipif->ipif_state_flags &= ~IPIF_CHANGING;
- IPIF_UNMARK_MOVING(ipif);
- mutex_exit(&ill_v4->ill_lock);
- if (ipif->ipif_was_up) {
- if (!(ipif->ipif_flags & IPIF_UP))
- err = ipif_up(ipif, q, mp);
- ipif->ipif_was_up = B_FALSE;
- if (err != 0) {
- /*
- * Can there be any other error ?
- */
- ASSERT(err == EINPROGRESS);
- return (err);
- }
- }
- }
- mutex_enter(&ill_v4->ill_lock);
- ill_v4->ill_state_flags &= ~ILL_CHANGING;
- mutex_exit(&ill_v4->ill_lock);
- ill_v4->ill_up_ipifs = B_FALSE;
- if (ill_v4->ill_move_in_progress) {
- ASSERT(ill_v4->ill_move_peer != NULL);
- ill_v4->ill_move_in_progress = B_FALSE;
- from_ill = ill_v4->ill_move_peer;
- from_ill->ill_move_in_progress = B_FALSE;
- from_ill->ill_move_peer = NULL;
- mutex_enter(&from_ill->ill_lock);
- from_ill->ill_state_flags &= ~ILL_CHANGING;
- mutex_exit(&from_ill->ill_lock);
- if (ill_v6 == NULL) {
- if (from_ill->ill_phyint->phyint_flags &
- PHYI_STANDBY) {
- phyint_inactive(from_ill->ill_phyint);
- }
- if (ill_v4->ill_phyint->phyint_flags &
- PHYI_STANDBY) {
- phyint_inactive(ill_v4->ill_phyint);
- }
- }
- ill_v4->ill_move_peer = NULL;
- }
- }
+ ASSERT(IAM_WRITER_ILL(ill));
- if (ill_v6 != NULL) {
- ill_v6->ill_up_ipifs = B_TRUE;
- for (ipif = ill_v6->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- mutex_enter(&ill_v6->ill_lock);
- ipif->ipif_state_flags &= ~IPIF_CHANGING;
- IPIF_UNMARK_MOVING(ipif);
- mutex_exit(&ill_v6->ill_lock);
- if (ipif->ipif_was_up) {
- if (!(ipif->ipif_flags & IPIF_UP))
- err = ipif_up(ipif, q, mp);
- ipif->ipif_was_up = B_FALSE;
- if (err != 0) {
- /*
- * Can there be any other error ?
- */
- ASSERT(err == EINPROGRESS);
- return (err);
- }
- }
- }
- mutex_enter(&ill_v6->ill_lock);
- ill_v6->ill_state_flags &= ~ILL_CHANGING;
- mutex_exit(&ill_v6->ill_lock);
- ill_v6->ill_up_ipifs = B_FALSE;
- if (ill_v6->ill_move_in_progress) {
- ASSERT(ill_v6->ill_move_peer != NULL);
- ill_v6->ill_move_in_progress = B_FALSE;
- from_ill = ill_v6->ill_move_peer;
- from_ill->ill_move_in_progress = B_FALSE;
- from_ill->ill_move_peer = NULL;
- mutex_enter(&from_ill->ill_lock);
- from_ill->ill_state_flags &= ~ILL_CHANGING;
- mutex_exit(&from_ill->ill_lock);
- if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
- phyint_inactive(from_ill->ill_phyint);
- }
- if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) {
- phyint_inactive(ill_v6->ill_phyint);
+ ill->ill_up_ipifs = B_TRUE;
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
+ mutex_enter(&ill->ill_lock);
+ ipif->ipif_state_flags &= ~IPIF_CHANGING;
+ mutex_exit(&ill->ill_lock);
+ if (ipif->ipif_was_up) {
+ if (!(ipif->ipif_flags & IPIF_UP))
+ err = ipif_up(ipif, q, mp);
+ ipif->ipif_was_up = B_FALSE;
+ if (err != 0) {
+ ASSERT(err == EINPROGRESS);
+ return (err);
}
- ill_v6->ill_move_peer = NULL;
}
}
+ mutex_enter(&ill->ill_lock);
+ ill->ill_state_flags &= ~ILL_CHANGING;
+ mutex_exit(&ill->ill_lock);
+ ill->ill_up_ipifs = B_FALSE;
return (0);
}
/*
- * bring down all the approriate ipifs.
+ * This function is called to bring up all the ipifs that were up before
+ * bringing the ill down via ill_down_ipifs().
*/
-/* ARGSUSED */
-static void
-ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover)
+int
+ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
{
- ipif_t *ipif;
+ int err;
ASSERT(IAM_WRITER_ILL(ill));
- /*
- * Except for ipif_state_flags the other fields of the ipif/ill that
- * are modified below are protected implicitly since we are a writer
- */
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER))
- continue;
- /*
- * Don't bring down the LINK LOCAL addresses as they are tied
- * to physical interface and they don't move. Treat them as
- * IPIF_NOFAILOVER.
- */
- if (chk_nofailover && ill->ill_isv6 &&
- IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))
- continue;
- if (index == 0 || index == ipif->ipif_orig_ifindex) {
- /*
- * We go through the ipif_down logic even if the ipif
- * is already down, since routes can be added based
- * on down ipifs. Going through ipif_down once again
- * will delete any IREs created based on these routes.
- */
- if (ipif->ipif_flags & IPIF_UP)
- ipif->ipif_was_up = B_TRUE;
- /*
- * If called with chk_nofailover true ipif is moving.
- */
- mutex_enter(&ill->ill_lock);
- if (chk_nofailover) {
- ipif->ipif_state_flags |=
- IPIF_MOVING | IPIF_CHANGING;
- } else {
- ipif->ipif_state_flags |= IPIF_CHANGING;
- }
- mutex_exit(&ill->ill_lock);
- /*
- * Need to re-create net/subnet bcast ires if
- * they are dependent on ipif.
- */
- if (!ipif->ipif_isv6)
- ipif_check_bcast_ires(ipif);
- (void) ipif_logical_down(ipif, NULL, NULL);
- ipif_non_duplicate(ipif);
- ipif_down_tail(ipif);
- }
- }
-}
-
-#define IPSQ_INC_REF(ipsq, ipst) { \
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \
- (ipsq)->ipsq_refs++; \
-}
+ err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp);
+ if (err != 0)
+ return (err);
-#define IPSQ_DEC_REF(ipsq, ipst) { \
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \
- (ipsq)->ipsq_refs--; \
- if ((ipsq)->ipsq_refs == 0) \
- (ipsq)->ipsq_name[0] = '\0'; \
+ return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp));
}
/*
- * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to
- * new_ipsq.
+ * Bring down any IPIF_UP ipifs on ill.
*/
static void
-ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq, ip_stack_t *ipst)
+ill_down_ipifs(ill_t *ill)
{
- phyint_t *phyint;
- phyint_t *next_phyint;
-
- /*
- * To change the ipsq of an ill, we need to hold the ill_g_lock as
- * writer and the ill_lock of the ill in question. Also the dest
- * ipsq can't vanish while we hold the ill_g_lock as writer.
- */
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
-
- phyint = cur_ipsq->ipsq_phyint_list;
- cur_ipsq->ipsq_phyint_list = NULL;
- while (phyint != NULL) {
- next_phyint = phyint->phyint_ipsq_next;
- IPSQ_DEC_REF(cur_ipsq, ipst);
- phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list;
- new_ipsq->ipsq_phyint_list = phyint;
- IPSQ_INC_REF(new_ipsq, ipst);
- phyint->phyint_ipsq = new_ipsq;
- phyint = next_phyint;
- }
-}
-
-#define SPLIT_SUCCESS 0
-#define SPLIT_NOT_NEEDED 1
-#define SPLIT_FAILED 2
-
-int
-ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry,
- ip_stack_t *ipst)
-{
- ipsq_t *newipsq = NULL;
-
- /*
- * Assertions denote pre-requisites for changing the ipsq of
- * a phyint
- */
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
- /*
- * <ill-phyint> assocs can't change while ill_g_lock
- * is held as writer. See ill_phyint_reinit()
- */
- ASSERT(phyint->phyint_illv4 == NULL ||
- MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
- ASSERT(phyint->phyint_illv6 == NULL ||
- MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
-
- if ((phyint->phyint_groupname_len !=
- (strlen(cur_ipsq->ipsq_name) + 1) ||
- bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name,
- phyint->phyint_groupname_len) != 0)) {
- /*
- * Once we fail in creating a new ipsq due to memory shortage,
- * don't attempt to create new ipsq again, based on another
- * phyint, since we want all phyints belonging to an IPMP group
- * to be in the same ipsq even in the event of mem alloc fails.
- */
- newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry,
- cur_ipsq, ipst);
- if (newipsq == NULL) {
- /* Memory allocation failure */
- return (SPLIT_FAILED);
- } else {
- /* ipsq_refs protected by ill_g_lock (writer) */
- IPSQ_DEC_REF(cur_ipsq, ipst);
- phyint->phyint_ipsq = newipsq;
- phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list;
- newipsq->ipsq_phyint_list = phyint;
- IPSQ_INC_REF(newipsq, ipst);
- return (SPLIT_SUCCESS);
- }
- }
- return (SPLIT_NOT_NEEDED);
-}
+ ipif_t *ipif;
-/*
- * The ill locks of the phyint and the ill_g_lock (writer) must be held
- * to do this split
- */
-static int
-ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, ip_stack_t *ipst)
-{
- ipsq_t *newipsq;
+ ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
/*
- * <ill-phyint> assocs can't change while ill_g_lock
- * is held as writer. See ill_phyint_reinit()
+ * Except for ipif_state_flags the other fields of the ipif/ill that
+ * are modified below are protected implicitly since we are a writer
*/
-
- ASSERT(phyint->phyint_illv4 == NULL ||
- MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
- ASSERT(phyint->phyint_illv6 == NULL ||
- MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
-
- if (!ipsq_init((phyint->phyint_illv4 != NULL) ?
- phyint->phyint_illv4: phyint->phyint_illv6)) {
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
/*
- * ipsq_init failed due to no memory
- * caller will use the same ipsq
+ * We go through the ipif_down logic even if the ipif
+ * is already down, since routes can be added based
+ * on down ipifs. Going through ipif_down once again
+ * will delete any IREs created based on these routes.
*/
- return (SPLIT_FAILED);
- }
-
- /* ipsq_ref is protected by ill_g_lock (writer) */
- IPSQ_DEC_REF(cur_ipsq, ipst);
-
- /*
- * This is a new ipsq that is unknown to the world.
- * So we don't need to hold ipsq_lock,
- */
- newipsq = phyint->phyint_ipsq;
- newipsq->ipsq_writer = NULL;
- newipsq->ipsq_reentry_cnt--;
- ASSERT(newipsq->ipsq_reentry_cnt == 0);
-#ifdef DEBUG
- newipsq->ipsq_depth = 0;
-#endif
-
- return (SPLIT_SUCCESS);
-}
+ if (ipif->ipif_flags & IPIF_UP)
+ ipif->ipif_was_up = B_TRUE;
-/*
- * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to
- * ipsq's representing their individual groups or themselves. Return
- * whether split needs to be retried again later.
- */
-static boolean_t
-ill_split_ipsq(ipsq_t *cur_ipsq)
-{
- phyint_t *phyint;
- phyint_t *next_phyint;
- int error;
- boolean_t need_retry = B_FALSE;
- ip_stack_t *ipst = cur_ipsq->ipsq_ipst;
+ mutex_enter(&ill->ill_lock);
+ ipif->ipif_state_flags |= IPIF_CHANGING;
+ mutex_exit(&ill->ill_lock);
- phyint = cur_ipsq->ipsq_phyint_list;
- cur_ipsq->ipsq_phyint_list = NULL;
- while (phyint != NULL) {
- next_phyint = phyint->phyint_ipsq_next;
/*
- * 'created' will tell us whether the callee actually
- * created an ipsq. Lack of memory may force the callee
- * to return without creating an ipsq.
+ * Need to re-create net/subnet bcast ires if
+ * they are dependent on ipif.
*/
- if (phyint->phyint_groupname == NULL) {
- error = ill_split_to_own_ipsq(phyint, cur_ipsq, ipst);
- } else {
- error = ill_split_to_grp_ipsq(phyint, cur_ipsq,
- need_retry, ipst);
- }
-
- switch (error) {
- case SPLIT_FAILED:
- need_retry = B_TRUE;
- /* FALLTHRU */
- case SPLIT_NOT_NEEDED:
- /*
- * Keep it on the list.
- */
- phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list;
- cur_ipsq->ipsq_phyint_list = phyint;
- break;
- case SPLIT_SUCCESS:
- break;
- default:
- ASSERT(0);
- }
-
- phyint = next_phyint;
- }
- return (need_retry);
-}
-
-/*
- * given an ipsq 'ipsq' lock all ills associated with this ipsq.
- * and return the ills in the list. This list will be
- * needed to unlock all the ills later on by the caller.
- * The <ill-ipsq> associations could change between the
- * lock and unlock. Hence the unlock can't traverse the
- * ipsq to get the list of ills.
- */
-static int
-ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max)
-{
- int cnt = 0;
- phyint_t *phyint;
- ip_stack_t *ipst = ipsq->ipsq_ipst;
-
- /*
- * The caller holds ill_g_lock to ensure that the ill memberships
- * of the ipsq don't change
- */
- ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
-
- phyint = ipsq->ipsq_phyint_list;
- while (phyint != NULL) {
- if (phyint->phyint_illv4 != NULL) {
- ASSERT(cnt < list_max);
- list[cnt++] = phyint->phyint_illv4;
- }
- if (phyint->phyint_illv6 != NULL) {
- ASSERT(cnt < list_max);
- list[cnt++] = phyint->phyint_illv6;
- }
- phyint = phyint->phyint_ipsq_next;
+ if (!ipif->ipif_isv6)
+ ipif_check_bcast_ires(ipif);
+ (void) ipif_logical_down(ipif, NULL, NULL);
+ ipif_non_duplicate(ipif);
+ ipif_down_tail(ipif);
}
- ill_lock_ills(list, cnt);
- return (cnt);
}
void
@@ -14577,3504 +14155,251 @@ ill_unlock_ills(ill_t **list, int cnt)
}
/*
- * Merge all the ills from 1 ipsq group into another ipsq group.
- * The source ipsq group is specified by the ipsq associated with
- * 'from_ill'. The destination ipsq group is specified by the ipsq
- * associated with 'to_ill' or 'groupname' respectively.
- * Note that ipsq itself does not have a reference count mechanism
- * and functions don't look up an ipsq and pass it around. Instead
- * functions pass around an ill or groupname, and the ipsq is looked
- * up from the ill or groupname and the required operation performed
- * atomically with the lookup on the ipsq.
+ * Redo source address selection. This is called when a
+ * non-NOLOCAL/DEPRECATED/ANYCAST ipif comes up.
*/
-static int
-ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp,
- queue_t *q)
-{
- ipsq_t *old_ipsq;
- ipsq_t *new_ipsq;
- ill_t **ill_list;
- int cnt;
- size_t ill_list_size;
- boolean_t became_writer_on_new_sq = B_FALSE;
- ip_stack_t *ipst = from_ill->ill_ipst;
-
- ASSERT(to_ill == NULL || ipst == to_ill->ill_ipst);
- /* Exactly 1 of 'to_ill' and groupname can be specified. */
- ASSERT((to_ill != NULL) ^ (groupname != NULL));
-
- /*
- * Need to hold ill_g_lock as writer and also the ill_lock to
- * change the <ill-ipsq> assoc of an ill. Need to hold the
- * ipsq_lock to prevent new messages from landing on an ipsq.
- */
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
-
- old_ipsq = from_ill->ill_phyint->phyint_ipsq;
- if (groupname != NULL)
- new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL, ipst);
- else {
- new_ipsq = to_ill->ill_phyint->phyint_ipsq;
- }
-
- ASSERT(old_ipsq != NULL && new_ipsq != NULL);
-
- /*
- * both groups are on the same ipsq.
- */
- if (old_ipsq == new_ipsq) {
- rw_exit(&ipst->ips_ill_g_lock);
- return (0);
- }
-
- cnt = old_ipsq->ipsq_refs << 1;
- ill_list_size = cnt * sizeof (ill_t *);
- ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP);
- if (ill_list == NULL) {
- rw_exit(&ipst->ips_ill_g_lock);
- return (ENOMEM);
- }
- cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt);
-
- /* Need ipsq lock to enqueue messages on new ipsq or to become writer */
- mutex_enter(&new_ipsq->ipsq_lock);
- if ((new_ipsq->ipsq_writer == NULL &&
- new_ipsq->ipsq_current_ipif == NULL) ||
- (new_ipsq->ipsq_writer == curthread)) {
- new_ipsq->ipsq_writer = curthread;
- new_ipsq->ipsq_reentry_cnt++;
- became_writer_on_new_sq = B_TRUE;
- }
-
- /*
- * We are holding ill_g_lock as writer and all the ill locks of
- * the old ipsq. So the old_ipsq can't be looked up, and hence no new
- * message can land up on the old ipsq even though we don't hold the
- * ipsq_lock of the old_ipsq. Now move all messages to the newipsq.
- */
- ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q);
-
- /*
- * now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'.
- * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq>
- * assocs. till we release the ill_g_lock, and hence it can't vanish.
- */
- ill_merge_ipsq(old_ipsq, new_ipsq, ipst);
-
- /*
- * Mark the new ipsq as needing a split since it is currently
- * being shared by more than 1 IPMP group. The split will
- * occur at the end of ipsq_exit
- */
- new_ipsq->ipsq_split = B_TRUE;
-
- /* Now release all the locks */
- mutex_exit(&new_ipsq->ipsq_lock);
- ill_unlock_ills(ill_list, cnt);
- rw_exit(&ipst->ips_ill_g_lock);
-
- kmem_free(ill_list, ill_list_size);
-
- /*
- * If we succeeded in becoming writer on the new ipsq, then
- * drain the new ipsq and start processing all enqueued messages
- * including the current ioctl we are processing which is either
- * a set groupname or failover/failback.
- */
- if (became_writer_on_new_sq)
- ipsq_exit(new_ipsq);
-
- /*
- * syncq has been changed and all the messages have been moved.
- */
- mutex_enter(&old_ipsq->ipsq_lock);
- old_ipsq->ipsq_current_ipif = NULL;
- old_ipsq->ipsq_current_ioctl = 0;
- old_ipsq->ipsq_current_done = B_TRUE;
- mutex_exit(&old_ipsq->ipsq_lock);
- return (EINPROGRESS);
-}
-
-/*
- * Delete and add the loopback copy and non-loopback copy of
- * the BROADCAST ire corresponding to ill and addr. Used to
- * group broadcast ires together when ill becomes part of
- * a group.
- *
- * This function is also called when ill is leaving the group
- * so that the ires belonging to the group gets re-grouped.
- */
-static void
-ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr)
-{
- ire_t *ire, *nire, *nire_next, *ire_head = NULL;
- ire_t **ire_ptpn = &ire_head;
- ip_stack_t *ipst = ill->ill_ipst;
-
- /*
- * The loopback and non-loopback IREs are inserted in the order in which
- * they're found, on the basis that they are correctly ordered (loopback
- * first).
- */
- for (;;) {
- ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
- ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
- if (ire == NULL)
- break;
-
- /*
- * we are passing in KM_SLEEP because it is not easy to
- * go back to a sane state in case of memory failure.
- */
- nire = kmem_cache_alloc(ire_cache, KM_SLEEP);
- ASSERT(nire != NULL);
- bzero(nire, sizeof (ire_t));
- /*
- * Don't use ire_max_frag directly since we don't
- * hold on to 'ire' until we add the new ire 'nire' and
- * we don't want the new ire to have a dangling reference
- * to 'ire'. The ire_max_frag of a broadcast ire must
- * be in sync with the ipif_mtu of the associate ipif.
- * For eg. this happens as a result of SIOCSLIFNAME,
- * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE initiated by
- * the driver. A change in ire_max_frag triggered
- * as a result of path mtu discovery, or due to an
- * IP_IOC_IRE_ADVISE_NOREPLY from the transport or due to a
- * route change -mtu command does not apply to broadcast ires.
- *
- * XXX We need a recovery strategy here if ire_init fails
- */
- if (ire_init(nire,
- (uchar_t *)&ire->ire_addr,
- (uchar_t *)&ire->ire_mask,
- (uchar_t *)&ire->ire_src_addr,
- (uchar_t *)&ire->ire_gateway_addr,
- ire->ire_stq == NULL ? &ip_loopback_mtu :
- &ire->ire_ipif->ipif_mtu,
- ire->ire_nce,
- ire->ire_rfq,
- ire->ire_stq,
- ire->ire_type,
- ire->ire_ipif,
- ire->ire_cmask,
- ire->ire_phandle,
- ire->ire_ihandle,
- ire->ire_flags,
- &ire->ire_uinfo,
- NULL,
- NULL,
- ipst) == NULL) {
- cmn_err(CE_PANIC, "ire_init() failed");
- }
- ire_delete(ire);
- ire_refrele(ire);
-
- /*
- * The newly created IREs are inserted at the tail of the list
- * starting with ire_head. As we've just allocated them no one
- * knows about them so it's safe.
- */
- *ire_ptpn = nire;
- ire_ptpn = &nire->ire_next;
- }
-
- for (nire = ire_head; nire != NULL; nire = nire_next) {
- int error;
- ire_t *oire;
- /* unlink the IRE from our list before calling ire_add() */
- nire_next = nire->ire_next;
- nire->ire_next = NULL;
-
- /* ire_add adds the ire at the right place in the list */
- oire = nire;
- error = ire_add(&nire, NULL, NULL, NULL, B_FALSE);
- ASSERT(error == 0);
- ASSERT(oire == nire);
- ire_refrele(nire); /* Held in ire_add */
- }
-}
-
-/*
- * This function is usually called when an ill is inserted in
- * a group and all the ipifs are already UP. As all the ipifs
- * are already UP, the broadcast ires have already been created
- * and been inserted. But, ire_add_v4 would not have grouped properly.
- * We need to re-group for the benefit of ip_wput_ire which
- * expects BROADCAST ires to be grouped properly to avoid sending
- * more than one copy of the broadcast packet per group.
- *
- * NOTE : We don't check for ill_ipif_up_count to be non-zero here
- * because when ipif_up_done ends up calling this, ires have
- * already been added before illgrp_insert i.e before ill_group
- * has been initialized.
- */
-static void
-ill_group_bcast_for_xmit(ill_t *ill)
+void
+ill_update_source_selection(ill_t *ill)
{
- ill_group_t *illgrp;
ipif_t *ipif;
- ipaddr_t addr;
- ipaddr_t net_mask;
- ipaddr_t subnet_netmask;
- illgrp = ill->ill_group;
+ ASSERT(IAM_WRITER_ILL(ill));
/*
- * This function is called even when an ill is deleted from
- * the group. Hence, illgrp could be null.
+ * Underlying interfaces are only used for test traffic and thus
+ * should always send with their (deprecated) source addresses.
*/
- if (illgrp != NULL && illgrp->illgrp_ill_count == 1)
+ if (IS_UNDER_IPMP(ill))
return;
- /*
- * Delete all the BROADCAST ires matching this ill and add
- * them back. This time, ire_add_v4 should take care of
- * grouping them with others because ill is part of the
- * group.
- */
- ill_bcast_delete_and_add(ill, 0);
- ill_bcast_delete_and_add(ill, INADDR_BROADCAST);
-
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
-
- if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
- !(ipif->ipif_flags & IPIF_NOLOCAL)) {
- net_mask = ip_net_mask(ipif->ipif_lcl_addr);
- } else {
- net_mask = htonl(IN_CLASSA_NET);
- }
- addr = net_mask & ipif->ipif_subnet;
- ill_bcast_delete_and_add(ill, addr);
- ill_bcast_delete_and_add(ill, ~net_mask | addr);
-
- subnet_netmask = ipif->ipif_net_mask;
- addr = ipif->ipif_subnet;
- ill_bcast_delete_and_add(ill, addr);
- ill_bcast_delete_and_add(ill, ~subnet_netmask | addr);
- }
-}
-
-/*
- * This function is called from illgrp_delete when ill is being deleted
- * from the group.
- *
- * As ill is not there in the group anymore, any address belonging
- * to this ill should be cleared of IRE_MARK_NORECV.
- */
-static void
-ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr)
-{
- ire_t *ire;
- irb_t *irb;
- ip_stack_t *ipst = ill->ill_ipst;
-
- ASSERT(ill->ill_group == NULL);
-
- ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
- ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
-
- if (ire != NULL) {
- /*
- * IPMP and plumbing operations are serialized on the ipsq, so
- * no one will insert or delete a broadcast ire under our feet.
- */
- irb = ire->ire_bucket;
- rw_enter(&irb->irb_lock, RW_READER);
- ire_refrele(ire);
-
- for (; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_addr != addr)
- break;
- if (ire_to_ill(ire) != ill)
- continue;
-
- ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED));
- ire->ire_marks &= ~IRE_MARK_NORECV;
- }
- rw_exit(&irb->irb_lock);
- }
-}
-
-ire_t *
-irep_insert(ill_group_t *illgrp, ipaddr_t addr, ire_t *ire, ire_t ***pirep)
-{
- boolean_t first = B_TRUE;
- ire_t *clear_ire = NULL;
- ire_t *start_ire = NULL;
- uint64_t match_flags;
- uint64_t phyi_flags;
- boolean_t fallback = B_FALSE;
-
- /*
- * irb_lock must be held by the caller.
- * Get to the first ire matching the address and the
- * group. If the address does not match we are done
- * as we could not find the IRE. If the address matches
- * we should get to the first one matching the group.
- */
- while (ire != NULL) {
- if (ire->ire_addr != addr ||
- ire->ire_ipif->ipif_ill->ill_group == illgrp) {
- break;
- }
- ire = ire->ire_next;
- }
- match_flags = PHYI_FAILED | PHYI_INACTIVE;
- start_ire = ire;
-redo:
- while (ire != NULL && ire->ire_addr == addr &&
- ire->ire_ipif->ipif_ill->ill_group == illgrp) {
- /*
- * The first ire for any address within a group
- * should always be the one with IRE_MARK_NORECV cleared
- * so that ip_wput_ire can avoid searching for one.
- * Note down the insertion point which will be used
- * later.
- */
- if (first && (*pirep == NULL))
- *pirep = ire->ire_ptpn;
- /*
- * PHYI_FAILED is set when the interface fails.
- * This interface might have become good, but the
- * daemon has not yet detected. We should still
- * not receive on this. PHYI_OFFLINE should never
- * be picked as this has been offlined and soon
- * be removed.
- */
- phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags;
- if (phyi_flags & PHYI_OFFLINE) {
- ire->ire_marks |= IRE_MARK_NORECV;
- ire = ire->ire_next;
- continue;
- }
- if (phyi_flags & match_flags) {
- ire->ire_marks |= IRE_MARK_NORECV;
- ire = ire->ire_next;
- if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
- PHYI_INACTIVE) {
- fallback = B_TRUE;
- }
- continue;
- }
- if (first) {
- /*
- * We will move this to the front of the list later
- * on.
- */
- clear_ire = ire;
- ire->ire_marks &= ~IRE_MARK_NORECV;
- } else {
- ire->ire_marks |= IRE_MARK_NORECV;
- }
- first = B_FALSE;
- ire = ire->ire_next;
- }
- /*
- * If we never nominated anybody, try nominating at least
- * an INACTIVE, if we found one. Do it only once though.
- */
- if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) &&
- fallback) {
- match_flags = PHYI_FAILED;
- ire = start_ire;
- *pirep = NULL;
- goto redo;
- }
- return (clear_ire);
-}
-
-/*
- * This function must be called only after the broadcast ires
- * have been grouped together. For a given address addr, nominate
- * only one of the ires whose interface is not FAILED or OFFLINE.
- *
- * This is also called when an ipif goes down, so that we can nominate
- * a different ire with the same address for receiving.
- */
-static void
-ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr, ip_stack_t *ipst)
-{
- irb_t *irb;
- ire_t *ire;
- ire_t *ire1;
- ire_t *save_ire;
- ire_t **irep = NULL;
- ire_t *clear_ire = NULL;
- ire_t *new_lb_ire;
- ire_t *new_nlb_ire;
- boolean_t new_lb_ire_used = B_FALSE;
- boolean_t new_nlb_ire_used = B_FALSE;
- boolean_t refrele_lb_ire = B_FALSE;
- boolean_t refrele_nlb_ire = B_FALSE;
- uint_t max_frag;
-
- ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES,
- NULL, MATCH_IRE_TYPE, ipst);
- /*
- * We may not be able to find some ires if a previous
- * ire_create failed. This happens when an ipif goes
- * down and we are unable to create BROADCAST ires due
- * to memory failure. Thus, we have to check for NULL
- * below. This should handle the case for LOOPBACK,
- * POINTOPOINT and interfaces with some POINTOPOINT
- * logicals for which there are no BROADCAST ires.
- */
- if (ire == NULL)
- return;
- /*
- * Currently IRE_BROADCASTS are deleted when an ipif
- * goes down which runs exclusively. Thus, setting
- * IRE_MARK_RCVD should not race with ire_delete marking
- * IRE_MARK_CONDEMNED. We grab the lock below just to
- * be consistent with other parts of the code that walks
- * a given bucket.
- */
- save_ire = ire;
- irb = ire->ire_bucket;
- new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
- if (new_lb_ire == NULL) {
- ire_refrele(ire);
- return;
- }
- new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
- if (new_nlb_ire == NULL) {
- ire_refrele(ire);
- kmem_cache_free(ire_cache, new_lb_ire);
- return;
- }
- IRB_REFHOLD(irb);
- rw_enter(&irb->irb_lock, RW_WRITER);
- clear_ire = irep_insert(illgrp, addr, ire, &irep);
-
- /*
- * irep non-NULL indicates that we entered the while loop
- * above. If clear_ire is at the insertion point, we don't
- * have to do anything. clear_ire will be NULL if all the
- * interfaces are failed.
- *
- * We cannot unlink and reinsert the ire at the right place
- * in the list since there can be other walkers of this bucket.
- * Instead we delete and recreate the ire
- */
- if (clear_ire != NULL && irep != NULL && *irep != clear_ire) {
- ire_t *clear_ire_stq = NULL;
- ire_t *clr_ire = NULL;
- ire_t *ire_next = NULL;
-
- if (clear_ire->ire_stq == NULL)
- ire_next = clear_ire->ire_next;
-
- rw_exit(&irb->irb_lock);
-
- bzero(new_lb_ire, sizeof (ire_t));
- /* XXX We need a recovery strategy here. */
- if (ire_init(new_lb_ire,
- (uchar_t *)&clear_ire->ire_addr,
- (uchar_t *)&clear_ire->ire_mask,
- (uchar_t *)&clear_ire->ire_src_addr,
- (uchar_t *)&clear_ire->ire_gateway_addr,
- &clear_ire->ire_max_frag,
- NULL, /* let ire_nce_init derive the resolver info */
- clear_ire->ire_rfq,
- clear_ire->ire_stq,
- clear_ire->ire_type,
- clear_ire->ire_ipif,
- clear_ire->ire_cmask,
- clear_ire->ire_phandle,
- clear_ire->ire_ihandle,
- clear_ire->ire_flags,
- &clear_ire->ire_uinfo,
- NULL,
- NULL,
- ipst) == NULL)
- cmn_err(CE_PANIC, "ire_init() failed");
-
- refrele_lb_ire = B_TRUE;
-
- if (ire_next != NULL &&
- ire_next->ire_stq != NULL &&
- ire_next->ire_addr == clear_ire->ire_addr &&
- ire_next->ire_ipif->ipif_ill ==
- clear_ire->ire_ipif->ipif_ill) {
- clear_ire_stq = ire_next;
-
- bzero(new_nlb_ire, sizeof (ire_t));
- /* XXX We need a recovery strategy here. */
- if (ire_init(new_nlb_ire,
- (uchar_t *)&clear_ire_stq->ire_addr,
- (uchar_t *)&clear_ire_stq->ire_mask,
- (uchar_t *)&clear_ire_stq->ire_src_addr,
- (uchar_t *)&clear_ire_stq->ire_gateway_addr,
- &clear_ire_stq->ire_max_frag,
- NULL,
- clear_ire_stq->ire_rfq,
- clear_ire_stq->ire_stq,
- clear_ire_stq->ire_type,
- clear_ire_stq->ire_ipif,
- clear_ire_stq->ire_cmask,
- clear_ire_stq->ire_phandle,
- clear_ire_stq->ire_ihandle,
- clear_ire_stq->ire_flags,
- &clear_ire_stq->ire_uinfo,
- NULL,
- NULL,
- ipst) == NULL)
- cmn_err(CE_PANIC, "ire_init() failed");
-
- refrele_nlb_ire = B_TRUE;
- }
-
- rw_enter(&irb->irb_lock, RW_WRITER);
- /*
- * irb_lock was dropped across call to ire_init() due to
- * lock ordering issue with ipst->ips_ndp{4,6}->ndp_g_lock
- * mutex lock. Therefore irep could have changed. call
- * irep_insert() to get the new insertion point (irep) and
- * recheck all known conditions.
- */
- irep = NULL;
- clr_ire = irep_insert(illgrp, addr, save_ire, &irep);
- if ((irep != NULL) && (*irep != clear_ire) &&
- (clr_ire == clear_ire)) {
- if ((clear_ire_stq != NULL) &&
- (clr_ire->ire_next != clear_ire_stq))
- clear_ire_stq = NULL;
- /*
- * Delete the ire. We can't call ire_delete() since
- * we are holding the bucket lock. We can't release the
- * bucket lock since we can't allow irep to change.
- * So just mark it CONDEMNED.
- * The IRB_REFRELE will delete the ire from the list
- * and do the refrele.
- */
- clear_ire->ire_marks |= IRE_MARK_CONDEMNED;
- irb->irb_marks |= IRB_MARK_CONDEMNED;
-
- if (clear_ire_stq != NULL &&
- clear_ire_stq->ire_nce != NULL) {
- nce_fastpath_list_delete(
- clear_ire_stq->ire_nce);
- clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED;
- }
-
- /*
- * Also take care of other fields like ib/ob pkt count
- * etc. Need to dup them.
- * ditto in ill_bcast_delete_and_add
- */
-
- /* Set the max_frag before adding the ire */
- max_frag = *new_lb_ire->ire_max_fragp;
- new_lb_ire->ire_max_fragp = NULL;
- new_lb_ire->ire_max_frag = max_frag;
-
- /* Add the new ire's. Insert at *irep */
- new_lb_ire->ire_bucket = clear_ire->ire_bucket;
- ire1 = *irep;
- if (ire1 != NULL)
- ire1->ire_ptpn = &new_lb_ire->ire_next;
- new_lb_ire->ire_next = ire1;
- /* Link the new one in. */
- new_lb_ire->ire_ptpn = irep;
- membar_producer();
- *irep = new_lb_ire;
- new_lb_ire_used = B_TRUE;
- BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
- ire_stats_inserted);
- new_lb_ire->ire_bucket->irb_ire_cnt++;
- DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *),
- new_lb_ire->ire_ipif,
- (char *), "ire", (void *), new_lb_ire);
- new_lb_ire->ire_ipif->ipif_ire_cnt++;
-
- if (clear_ire_stq != NULL) {
- ill_t *ire_ill;
- /* Set the max_frag before adding the ire */
- max_frag = *new_nlb_ire->ire_max_fragp;
- new_nlb_ire->ire_max_fragp = NULL;
- new_nlb_ire->ire_max_frag = max_frag;
-
- new_nlb_ire->ire_bucket = clear_ire->ire_bucket;
- irep = &new_lb_ire->ire_next;
- /* Add the new ire. Insert at *irep */
- ire1 = *irep;
- if (ire1 != NULL)
- ire1->ire_ptpn = &new_nlb_ire->ire_next;
- new_nlb_ire->ire_next = ire1;
- /* Link the new one in. */
- new_nlb_ire->ire_ptpn = irep;
- membar_producer();
- *irep = new_nlb_ire;
- new_nlb_ire_used = B_TRUE;
- BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
- ire_stats_inserted);
- new_nlb_ire->ire_bucket->irb_ire_cnt++;
- DTRACE_PROBE3(ipif__incr__cnt,
- (ipif_t *), new_nlb_ire->ire_ipif,
- (char *), "ire", (void *), new_nlb_ire);
- new_nlb_ire->ire_ipif->ipif_ire_cnt++;
- DTRACE_PROBE3(ill__incr__cnt,
- (ill_t *), new_nlb_ire->ire_stq->q_ptr,
- (char *), "ire", (void *), new_nlb_ire);
- ire_ill = (ill_t *)new_nlb_ire->ire_stq->q_ptr;
- ire_ill->ill_ire_cnt++;
- }
- }
- }
- ire_refrele(save_ire);
- rw_exit(&irb->irb_lock);
- /*
- * Since we dropped the irb_lock across call to ire_init()
- * and rechecking known conditions, it is possible that
- * the checks might fail, therefore undo the work done by
- * ire_init() by calling ire_refrele() on the newly created ire.
- */
- if (!new_lb_ire_used) {
- if (refrele_lb_ire) {
- ire_refrele(new_lb_ire);
- } else {
- kmem_cache_free(ire_cache, new_lb_ire);
- }
- }
- if (!new_nlb_ire_used) {
- if (refrele_nlb_ire) {
- ire_refrele(new_nlb_ire);
- } else {
- kmem_cache_free(ire_cache, new_nlb_ire);
- }
- }
- IRB_REFRELE(irb);
-}
-
-/*
- * Whenever an ipif goes down we have to renominate a different
- * broadcast ire to receive. Whenever an ipif comes up, we need
- * to make sure that we have only one nominated to receive.
- */
-static void
-ipif_renominate_bcast(ipif_t *ipif)
-{
- ill_t *ill = ipif->ipif_ill;
- ipaddr_t subnet_addr;
- ipaddr_t net_addr;
- ipaddr_t net_mask = 0;
- ipaddr_t subnet_netmask;
- ipaddr_t addr;
- ill_group_t *illgrp;
- ip_stack_t *ipst = ill->ill_ipst;
-
- illgrp = ill->ill_group;
- /*
- * If this is the last ipif going down, it might take
- * the ill out of the group. In that case ipif_down ->
- * illgrp_delete takes care of doing the nomination.
- * ipif_down does not call for this case.
- */
- ASSERT(illgrp != NULL);
-
- /* There could not have been any ires associated with this */
- if (ipif->ipif_subnet == 0)
- return;
-
- ill_mark_bcast(illgrp, 0, ipst);
- ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst);
-
- if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
- !(ipif->ipif_flags & IPIF_NOLOCAL)) {
- net_mask = ip_net_mask(ipif->ipif_lcl_addr);
- } else {
- net_mask = htonl(IN_CLASSA_NET);
- }
- addr = net_mask & ipif->ipif_subnet;
- ill_mark_bcast(illgrp, addr, ipst);
-
- net_addr = ~net_mask | addr;
- ill_mark_bcast(illgrp, net_addr, ipst);
-
- subnet_netmask = ipif->ipif_net_mask;
- addr = ipif->ipif_subnet;
- ill_mark_bcast(illgrp, addr, ipst);
-
- subnet_addr = ~subnet_netmask | addr;
- ill_mark_bcast(illgrp, subnet_addr, ipst);
-}
-
-/*
- * Whenever we form or delete ill groups, we need to nominate one set of
- * BROADCAST ires for receiving in the group.
- *
- * 1) When ipif_up_done -> illgrp_insert calls this function, BROADCAST ires
- * have been added, but ill_ipif_up_count is 0. Thus, we don't assert
- * for ill_ipif_up_count to be non-zero. This is the only case where
- * ill_ipif_up_count is zero and we would still find the ires.
- *
- * 2) ip_sioctl_group_name/ifgrp_insert calls this function, at least one
- * ipif is UP and we just have to do the nomination.
- *
- * 3) When ill_handoff_responsibility calls us, some ill has been removed
- * from the group. So, we have to do the nomination.
- *
- * Because of (3), there could be just one ill in the group. But we have
- * to nominate still as IRE_MARK_NORECV may have been marked on this.
- * Thus, this function does not optimize when there is only one ill as
- * it is not correct for (3).
- */
-static void
-ill_nominate_bcast_rcv(ill_group_t *illgrp)
-{
- ill_t *ill;
- ipif_t *ipif;
- ipaddr_t subnet_addr;
- ipaddr_t prev_subnet_addr = 0;
- ipaddr_t net_addr;
- ipaddr_t prev_net_addr = 0;
- ipaddr_t net_mask = 0;
- ipaddr_t subnet_netmask;
- ipaddr_t addr;
- ip_stack_t *ipst;
-
- /*
- * When the last member is leaving, there is nothing to
- * nominate.
- */
- if (illgrp->illgrp_ill_count == 0) {
- ASSERT(illgrp->illgrp_ill == NULL);
- return;
- }
-
- ill = illgrp->illgrp_ill;
- ASSERT(!ill->ill_isv6);
- ipst = ill->ill_ipst;
- /*
- * We assume that ires with the same address and belonging to the
- * same group have been grouped together. Nominating a *single*
- * ill in the group for sending and receiving broadcast is done
- * by making sure that the first BROADCAST ire (which will be
- * the one returned by ire_ctable_lookup for ip_rput and the
- * one that will be used in ip_wput_ire) will be the one that
- * will not have IRE_MARK_NORECV set.
- *
- * 1) ip_rput checks and discards packets received on ires marked
- * with IRE_MARK_NORECV. Thus, we don't send up duplicate
- * broadcast packets. We need to clear IRE_MARK_NORECV on the
- * first ire in the group for every broadcast address in the group.
- * ip_rput will accept packets only on the first ire i.e only
- * one copy of the ill.
- *
- * 2) ip_wput_ire needs to send out just one copy of the broadcast
- * packet for the whole group. It needs to send out on the ill
- * whose ire has not been marked with IRE_MARK_NORECV. If it sends
- * on the one marked with IRE_MARK_NORECV, ip_rput will accept
- * the copy echoed back on other port where the ire is not marked
- * with IRE_MARK_NORECV.
- *
- * Note that we just need to have the first IRE either loopback or
- * non-loopback (either of them may not exist if ire_create failed
- * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will
- * always hit the first one and hence will always accept one copy.
- *
- * We have a broadcast ire per ill for all the unique prefixes
- * hosted on that ill. As we don't have a way of knowing the
- * unique prefixes on a given ill and hence in the whole group,
- * we just call ill_mark_bcast on all the prefixes that exist
- * in the group. For the common case of one prefix, the code
- * below optimizes by remembering the last address used for
- * marking. In the case of multiple prefixes, this will still
- * optimize depending on the order of prefixes.
- *
- * The only unique address across the whole group is 0.0.0.0 and
- * 255.255.255.255 and thus we call only once. ill_mark_bcast enables
- * the first ire in the bucket for receiving and disables the
- * others.
- */
- ill_mark_bcast(illgrp, 0, ipst);
- ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst);
- for (; ill != NULL; ill = ill->ill_group_next) {
-
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
-
- if (!(ipif->ipif_flags & IPIF_UP) ||
- ipif->ipif_subnet == 0) {
- continue;
- }
- if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
- !(ipif->ipif_flags & IPIF_NOLOCAL)) {
- net_mask = ip_net_mask(ipif->ipif_lcl_addr);
- } else {
- net_mask = htonl(IN_CLASSA_NET);
- }
- addr = net_mask & ipif->ipif_subnet;
- if (prev_net_addr == 0 || prev_net_addr != addr) {
- ill_mark_bcast(illgrp, addr, ipst);
- net_addr = ~net_mask | addr;
- ill_mark_bcast(illgrp, net_addr, ipst);
- }
- prev_net_addr = addr;
-
- subnet_netmask = ipif->ipif_net_mask;
- addr = ipif->ipif_subnet;
- if (prev_subnet_addr == 0 ||
- prev_subnet_addr != addr) {
- ill_mark_bcast(illgrp, addr, ipst);
- subnet_addr = ~subnet_netmask | addr;
- ill_mark_bcast(illgrp, subnet_addr, ipst);
- }
- prev_subnet_addr = addr;
- }
- }
-}
-
-/*
- * This function is called while forming ill groups.
- *
- * Currently, we handle only allmulti groups. We want to join
- * allmulti on only one of the ills in the groups. In future,
- * when we have link aggregation, we may have to join normal
- * multicast groups on multiple ills as switch does inbound load
- * balancing. Following are the functions that calls this
- * function :
- *
- * 1) ill_recover_multicast : Interface is coming back UP.
- * When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6
- * will call ill_recover_multicast to recover all the multicast
- * groups. We need to make sure that only one member is joined
- * in the ill group.
- *
- * 2) ip_addmulti/ip_addmulti_v6 : ill groups have already been formed.
- * Somebody is joining allmulti. We need to make sure that only one
- * member is joined in the group.
- *
- * 3) illgrp_insert : If allmulti has already joined, we need to make
- * sure that only one member is joined in the group.
- *
- * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving
- * allmulti who we have nominated. We need to pick some other ill.
- *
- * 5) illgrp_delete : The ill we nominated is leaving the group,
- * we need to pick a new ill to join the group.
- *
- * For (1), (2), (5) - we just have to check whether there is
- * a good ill joined in the group. If we could not find any ills
- * joined the group, we should join.
- *
- * For (4), the one that was nominated to receive, left the group.
- * There could be nobody joined in the group when this function is
- * called.
- *
- * For (3) - we need to explicitly check whether there are multiple
- * ills joined in the group.
- *
- * For simplicity, we don't differentiate any of the above cases. We
- * just leave the group if it is joined on any of them and join on
- * the first good ill.
- */
-int
-ill_nominate_mcast_rcv(ill_group_t *illgrp)
-{
- ilm_t *ilm;
- ill_t *ill;
- ill_t *fallback_inactive_ill = NULL;
- ill_t *fallback_failed_ill = NULL;
- int ret = 0;
-
- /*
- * Leave the allmulti on all the ills and start fresh.
- */
- for (ill = illgrp->illgrp_ill; ill != NULL;
- ill = ill->ill_group_next) {
- if (ill->ill_join_allmulti)
- ill_leave_allmulti(ill);
- }
-
- /*
- * Choose a good ill. Fallback to inactive or failed if
- * none available. We need to fallback to FAILED in the
- * case where we have 2 interfaces in a group - where
- * one of them is failed and another is a good one and
- * the good one (not marked inactive) is leaving the group.
- */
- for (ill = illgrp->illgrp_ill; ill != NULL; ill = ill->ill_group_next) {
- if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE)
- continue;
- if (ill->ill_phyint->phyint_flags & PHYI_FAILED) {
- fallback_failed_ill = ill;
- continue;
- }
- if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) {
- fallback_inactive_ill = ill;
- continue;
- }
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
- if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
- ret = ill_join_allmulti(ill);
- /*
- * ill_join_allmulti() can fail because of
- * memory failures so make sure we join at
- * least on one ill.
- */
- if (ill->ill_join_allmulti)
- return (0);
- }
- }
- }
- if (ret != 0) {
- /*
- * If we tried nominating above and failed to do so,
- * return error. We might have tried multiple times.
- * But, return the latest error.
- */
- return (ret);
- }
- if ((ill = fallback_inactive_ill) != NULL) {
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
- if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr))
- return (ill_join_allmulti(ill));
- }
- } else if ((ill = fallback_failed_ill) != NULL) {
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
- if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr))
- return (ill_join_allmulti(ill));
- }
- }
- return (0);
-}
-
-/*
- * This function is called from illgrp_delete after it is
- * deleted from the group to reschedule responsibilities
- * to a different ill.
- */
-static void
-ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp)
-{
- ilm_t *ilm;
- ipif_t *ipif;
- ipaddr_t subnet_addr;
- ipaddr_t net_addr;
- ipaddr_t net_mask = 0;
- ipaddr_t subnet_netmask;
- ipaddr_t addr;
- ip_stack_t *ipst = ill->ill_ipst;
-
- ASSERT(ill->ill_group == NULL);
- /*
- * Broadcast Responsibility:
- *
- * 1. If this ill has been nominated for receiving broadcast
- * packets, we need to find a new one. Before we find a new
- * one, we need to re-group the ires that are part of this new
- * group (assumed by ill_nominate_bcast_rcv). We do this by
- * calling ill_group_bcast_for_xmit(ill) which will do the right
- * thing for us.
- *
- * 2. If this ill was not nominated for receiving broadcast
- * packets, we need to clear the IRE_MARK_NORECV flag
- * so that we continue to send up broadcast packets.
- */
- if (!ill->ill_isv6) {
- /*
- * Case 1 above : No optimization here. Just redo the
- * nomination.
- */
- ill_group_bcast_for_xmit(ill);
- ill_nominate_bcast_rcv(illgrp);
-
- /*
- * Case 2 above : Lookup and clear IRE_MARK_NORECV.
- */
- ill_clear_bcast_mark(ill, 0);
- ill_clear_bcast_mark(ill, INADDR_BROADCAST);
-
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
-
- if (!(ipif->ipif_flags & IPIF_UP) ||
- ipif->ipif_subnet == 0) {
- continue;
- }
- if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
- !(ipif->ipif_flags & IPIF_NOLOCAL)) {
- net_mask = ip_net_mask(ipif->ipif_lcl_addr);
- } else {
- net_mask = htonl(IN_CLASSA_NET);
- }
- addr = net_mask & ipif->ipif_subnet;
- ill_clear_bcast_mark(ill, addr);
-
- net_addr = ~net_mask | addr;
- ill_clear_bcast_mark(ill, net_addr);
-
- subnet_netmask = ipif->ipif_net_mask;
- addr = ipif->ipif_subnet;
- ill_clear_bcast_mark(ill, addr);
-
- subnet_addr = ~subnet_netmask | addr;
- ill_clear_bcast_mark(ill, subnet_addr);
- }
- }
-
- /*
- * Multicast Responsibility.
- *
- * If we have joined allmulti on this one, find a new member
- * in the group to join allmulti. As this ill is already part
- * of allmulti, we don't have to join on this one.
- *
- * If we have not joined allmulti on this one, there is no
- * responsibility to handoff. But we need to take new
- * responsibility i.e, join allmulti on this one if we need
- * to.
- */
- if (ill->ill_join_allmulti) {
- (void) ill_nominate_mcast_rcv(illgrp);
- } else {
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
- if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
- (void) ill_join_allmulti(ill);
- break;
- }
- }
- }
-
- /*
- * We intentionally do the flushing of IRE_CACHES only matching
- * on the ill and not on groups. Note that we are already deleted
- * from the group.
- *
- * This will make sure that all IRE_CACHES whose stq is pointing
- * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get
- * deleted and IRE_CACHES that are not pointing at this ill will
- * be left alone.
- */
- ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
- illgrp_cache_delete, ill, ill);
-
- /*
- * Some conn may have cached one of the IREs deleted above. By removing
- * the ire reference, we clean up the extra reference to the ill held in
- * ire->ire_stq.
- */
- ipcl_walk(conn_cleanup_stale_ire, NULL, ipst);
-
- /*
- * Re-do source address selection for all the members in the
- * group, if they borrowed source address from one of the ipifs
- * in this ill.
- */
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if (ill->ill_isv6) {
- ipif_update_other_ipifs_v6(ipif, illgrp);
- } else {
- ipif_update_other_ipifs(ipif, illgrp);
- }
+ if (ill->ill_isv6)
+ ipif_recreate_interface_routes_v6(NULL, ipif);
+ else
+ ipif_recreate_interface_routes(NULL, ipif);
}
}
/*
- * Delete the ill from the group. The caller makes sure that it is
- * in a group and it okay to delete from the group. So, we always
- * delete here.
+ * Finish the group join started in ip_sioctl_groupname().
*/
+/* ARGSUSED */
static void
-illgrp_delete(ill_t *ill)
-{
- ill_group_t *illgrp;
- ill_group_t *tmpg;
- ill_t *tmp_ill;
- ip_stack_t *ipst = ill->ill_ipst;
-
- /*
- * Reset illgrp_ill_schednext if it was pointing at us.
- * We need to do this before we set ill_group to NULL.
- */
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
- mutex_enter(&ill->ill_lock);
-
- illgrp_reset_schednext(ill);
-
- illgrp = ill->ill_group;
-
- /* Delete the ill from illgrp. */
- if (illgrp->illgrp_ill == ill) {
- illgrp->illgrp_ill = ill->ill_group_next;
- } else {
- tmp_ill = illgrp->illgrp_ill;
- while (tmp_ill->ill_group_next != ill) {
- tmp_ill = tmp_ill->ill_group_next;
- ASSERT(tmp_ill != NULL);
- }
- tmp_ill->ill_group_next = ill->ill_group_next;
- }
- ill->ill_group = NULL;
- ill->ill_group_next = NULL;
-
- illgrp->illgrp_ill_count--;
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
-
- /*
- * As this ill is leaving the group, we need to hand off
- * the responsibilities to the other ills in the group, if
- * this ill had some responsibilities.
- */
-
- ill_handoff_responsibility(ill, illgrp);
-
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
-
- if (illgrp->illgrp_ill_count == 0) {
-
- ASSERT(illgrp->illgrp_ill == NULL);
- if (ill->ill_isv6) {
- if (illgrp == ipst->ips_illgrp_head_v6) {
- ipst->ips_illgrp_head_v6 = illgrp->illgrp_next;
- } else {
- tmpg = ipst->ips_illgrp_head_v6;
- while (tmpg->illgrp_next != illgrp) {
- tmpg = tmpg->illgrp_next;
- ASSERT(tmpg != NULL);
- }
- tmpg->illgrp_next = illgrp->illgrp_next;
- }
- } else {
- if (illgrp == ipst->ips_illgrp_head_v4) {
- ipst->ips_illgrp_head_v4 = illgrp->illgrp_next;
- } else {
- tmpg = ipst->ips_illgrp_head_v4;
- while (tmpg->illgrp_next != illgrp) {
- tmpg = tmpg->illgrp_next;
- ASSERT(tmpg != NULL);
- }
- tmpg->illgrp_next = illgrp->illgrp_next;
- }
- }
- mutex_destroy(&illgrp->illgrp_lock);
- mi_free(illgrp);
- }
- rw_exit(&ipst->ips_ill_g_lock);
-
- /*
- * Even though the ill is out of the group its not necessary
- * to set ipsq_split as TRUE as the ipifs could be down temporarily
- * We will split the ipsq when phyint_groupname is set to NULL.
- */
-
- /*
- * Send a routing sockets message if we are deleting from
- * groups with names.
- */
- if (ill->ill_phyint->phyint_groupname_len != 0)
- ip_rts_ifmsg(ill->ill_ipif);
-}
-
-/*
- * Re-do source address selection. This is normally called when
- * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST
- * ipif comes up.
- */
-void
-ill_update_source_selection(ill_t *ill)
+ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
{
- ipif_t *ipif;
-
- ASSERT(IAM_WRITER_ILL(ill));
-
- if (ill->ill_group != NULL)
- ill = ill->ill_group->illgrp_ill;
-
- for (; ill != NULL; ill = ill->ill_group_next) {
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (ill->ill_isv6)
- ipif_recreate_interface_routes_v6(NULL, ipif);
- else
- ipif_recreate_interface_routes(NULL, ipif);
- }
- }
-}
-
-/*
- * Insert ill in a group headed by illgrp_head. The caller can either
- * pass a groupname in which case we search for a group with the
- * same name to insert in or pass a group to insert in. This function
- * would only search groups with names.
- *
- * NOTE : The caller should make sure that there is at least one ipif
- * UP on this ill so that illgrp_scheduler can pick this ill
- * for outbound packets. If ill_ipif_up_count is zero, we have
- * already sent a DL_UNBIND to the driver and we don't want to
- * send anymore packets. We don't assert for ipif_up_count
- * to be greater than zero, because ipif_up_done wants to call
- * this function before bumping up the ipif_up_count. See
- * ipif_up_done() for details.
- */
-int
-illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname,
- ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up)
-{
- ill_group_t *illgrp;
- ill_t *prev_ill;
- phyint_t *phyi;
+ ill_t *ill = q->q_ptr;
+ phyint_t *phyi = ill->ill_phyint;
+ ipmp_grp_t *grp = phyi->phyint_grp;
ip_stack_t *ipst = ill->ill_ipst;
- ASSERT(ill->ill_group == NULL);
-
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
- mutex_enter(&ill->ill_lock);
-
- if (groupname != NULL) {
- /*
- * Look for a group with a matching groupname to insert.
- */
- for (illgrp = *illgrp_head; illgrp != NULL;
- illgrp = illgrp->illgrp_next) {
-
- ill_t *tmp_ill;
-
- /*
- * If we have an ill_group_t in the list which has
- * no ill_t assigned then we must be in the process of
- * removing this group. We skip this as illgrp_delete()
- * will remove it from the list.
- */
- if ((tmp_ill = illgrp->illgrp_ill) == NULL) {
- ASSERT(illgrp->illgrp_ill_count == 0);
- continue;
- }
-
- ASSERT(tmp_ill->ill_phyint != NULL);
- phyi = tmp_ill->ill_phyint;
- /*
- * Look at groups which has names only.
- */
- if (phyi->phyint_groupname_len == 0)
- continue;
- /*
- * Names are stored in the phyint common to both
- * IPv4 and IPv6.
- */
- if (mi_strcmp(phyi->phyint_groupname,
- groupname) == 0) {
- break;
- }
- }
- } else {
- /*
- * If the caller passes in a NULL "grp_to_insert", we
- * allocate one below and insert this singleton.
- */
- illgrp = grp_to_insert;
- }
-
- ill->ill_group_next = NULL;
-
- if (illgrp == NULL) {
- illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t));
- if (illgrp == NULL) {
- return (ENOMEM);
- }
- illgrp->illgrp_next = *illgrp_head;
- *illgrp_head = illgrp;
- illgrp->illgrp_ill = ill;
- illgrp->illgrp_ill_count = 1;
- ill->ill_group = illgrp;
- /*
- * Used in illgrp_scheduler to protect multiple threads
- * from traversing the list.
- */
- mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0);
- } else {
- ASSERT(ill->ill_net_type ==
- illgrp->illgrp_ill->ill_net_type);
- ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type);
-
- /* Insert ill at tail of this group */
- prev_ill = illgrp->illgrp_ill;
- while (prev_ill->ill_group_next != NULL)
- prev_ill = prev_ill->ill_group_next;
- prev_ill->ill_group_next = ill;
- ill->ill_group = illgrp;
- illgrp->illgrp_ill_count++;
- /*
- * Inherit group properties. Currently only forwarding
- * is the property we try to keep the same with all the
- * ills. When there are more, we will abstract this into
- * a function.
- */
- ill->ill_flags &= ~ILLF_ROUTER;
- ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER);
- }
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
-
- /*
- * 1) When ipif_up_done() calls this function, ipif_up_count
- * may be zero as it has not yet been bumped. But the ires
- * have already been added. So, we do the nomination here
- * itself. But, when ip_sioctl_groupname calls this, it checks
- * for ill_ipif_up_count != 0. Thus we don't check for
- * ill_ipif_up_count here while nominating broadcast ires for
- * receive.
- *
- * 2) Similarly, we need to call ill_group_bcast_for_xmit here
- * to group them properly as ire_add() has already happened
- * in the ipif_up_done() case. For ip_sioctl_groupname/ifgrp_insert
- * case, we need to do it here anyway.
- */
- if (!ill->ill_isv6) {
- ill_group_bcast_for_xmit(ill);
- ill_nominate_bcast_rcv(illgrp);
- }
-
- if (!ipif_is_coming_up) {
- /*
- * When ipif_up_done() calls this function, the multicast
- * groups have not been joined yet. So, there is no point in
- * nomination. ill_join_allmulti() will handle groups when
- * ill_recover_multicast() is called from ipif_up_done() later.
- */
- (void) ill_nominate_mcast_rcv(illgrp);
- /*
- * ipif_up_done calls ill_update_source_selection
- * anyway. Moreover, we don't want to re-create
- * interface routes while ipif_up_done() still has reference
- * to them. Refer to ipif_up_done() for more details.
- */
- ill_update_source_selection(ill);
- }
-
- /*
- * Send a routing sockets message if we are inserting into
- * groups with names.
- */
- if (groupname != NULL)
- ip_rts_ifmsg(ill->ill_ipif);
- return (0);
-}
-
-/*
- * Return the first phyint matching the groupname. There could
- * be more than one when there are ill groups.
- *
- * If 'usable' is set, then we exclude ones that are marked with any of
- * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE).
- * Needs work: called only from ip_sioctl_groupname and from the ipmp/netinfo
- * emulation of ipmp.
- */
-phyint_t *
-phyint_lookup_group(char *groupname, boolean_t usable, ip_stack_t *ipst)
-{
- phyint_t *phyi;
-
- ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
- /*
- * Group names are stored in the phyint - a common structure
- * to both IPv4 and IPv6.
- */
- phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
- for (; phyi != NULL;
- phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
- phyi, AVL_AFTER)) {
- if (phyi->phyint_groupname_len == 0)
- continue;
- /*
- * Skip the ones that should not be used since the callers
- * sometime use this for sending packets.
- */
- if (usable && (phyi->phyint_flags &
- (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE)))
- continue;
+ /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */
+ ASSERT(!IS_IPMP(ill) && grp != NULL);
+ ASSERT(IAM_WRITER_IPSQ(ipsq));
- ASSERT(phyi->phyint_groupname != NULL);
- if (mi_strcmp(groupname, phyi->phyint_groupname) == 0)
- return (phyi);
+ if (phyi->phyint_illv4 != NULL) {
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ VERIFY(grp->gr_pendv4-- > 0);
+ rw_exit(&ipst->ips_ipmp_lock);
+ ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4);
}
- return (NULL);
-}
-
-
-/*
- * Return the first usable phyint matching the group index. By 'usable'
- * we exclude ones that are marked ununsable with any of
- * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE).
- *
- * Used only for the ipmp/netinfo emulation of ipmp.
- */
-phyint_t *
-phyint_lookup_group_ifindex(uint_t group_ifindex, ip_stack_t *ipst)
-{
- phyint_t *phyi;
-
- ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
-
- if (!ipst->ips_ipmp_hook_emulation)
- return (NULL);
-
- /*
- * Group indicies are stored in the phyint - a common structure
- * to both IPv4 and IPv6.
- */
- phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
- for (; phyi != NULL;
- phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
- phyi, AVL_AFTER)) {
- /* Ignore the ones that do not have a group */
- if (phyi->phyint_groupname_len == 0)
- continue;
-
- ASSERT(phyi->phyint_group_ifindex != 0);
- /*
- * Skip the ones that should not be used since the callers
- * sometime use this for sending packets.
- */
- if (phyi->phyint_flags &
- (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE))
- continue;
- if (phyi->phyint_group_ifindex == group_ifindex)
- return (phyi);
+ if (phyi->phyint_illv6 != NULL) {
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ VERIFY(grp->gr_pendv6-- > 0);
+ rw_exit(&ipst->ips_ipmp_lock);
+ ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6);
}
- return (NULL);
+ freemsg(mp);
}
/*
- * MT notes on creation and deletion of IPMP groups
- *
- * Creation and deletion of IPMP groups introduce the need to merge or
- * split the associated serialization objects i.e the ipsq's. Normally all
- * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled
- * an ill-pair(v4, v6) i.e. phyint would map to a single ipsq. However during
- * the execution of the SIOCSLIFGROUPNAME command the picture changes. There
- * is a need to change the <ill-ipsq> association and we have to operate on both
- * the source and destination IPMP groups. For eg. attempting to set the
- * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to
- * handle 2 IPMP groups and 2 ipsqs. All the ills belonging to either of the
- * source or destination IPMP group are mapped to a single ipsq for executing
- * the SIOCSLIFGROUPNAME command. This is termed as a merge of the ipsq's.
- * The <ill-ipsq> mapping is restored back to normal at a later point. This is
- * termed as a split of the ipsq. The converse of the merge i.e. a split of the
- * ipsq happens while unwinding from ipsq_exit. If at least 1 set groupname
- * occurred on the ipsq, then the ipsq_split flag is set. This indicates the
- * ipsq has to be examined for redoing the <ill-ipsq> associations.
- *
- * In the above example the ioctl handling code locates the current ipsq of hme0
- * which is ipsq(mpk17-84). It then enters the above ipsq immediately or
- * eventually (after queueing the ioctl in ipsq(mpk17-84)). Then it locates
- * the destination ipsq which is ipsq(mpk17-85) and merges the source ipsq into
- * the destination ipsq. If the destination ipsq is not busy, it also enters
- * the destination ipsq exclusively. Now the actual groupname setting operation
- * can proceed. If the destination ipsq is busy, the operation is enqueued
- * on the destination (merged) ipsq and will be handled in the unwind from
- * ipsq_exit.
- *
- * To prevent other threads accessing the ill while the group name change is
- * in progres, we bring down the ipifs which also removes the ill from the
- * group. The group is changed in phyint and when the first ipif on the ill
- * is brought up, the ill is inserted into the right IPMP group by
- * illgrp_insert.
+ * Process an SIOCSLIFGROUPNAME request.
*/
/* ARGSUSED */
int
ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *ifreq)
{
- int i;
- char *tmp;
- int namelen;
- ill_t *ill = ipif->ipif_ill;
- ill_t *ill_v4, *ill_v6;
- int err = 0;
- phyint_t *phyi;
- phyint_t *phyi_tmp;
- struct lifreq *lifr;
- mblk_t *mp1;
- char *groupname;
- ipsq_t *ipsq;
+ struct lifreq *lifr = ifreq;
+ ill_t *ill = ipif->ipif_ill;
ip_stack_t *ipst = ill->ill_ipst;
-
- ASSERT(IAM_WRITER_IPIF(ipif));
-
- /* Existance verified in ip_wput_nondata */
- mp1 = mp->b_cont->b_cont;
- lifr = (struct lifreq *)mp1->b_rptr;
- groupname = lifr->lifr_groupname;
-
- if (ipif->ipif_id != 0)
- return (EINVAL);
-
- phyi = ill->ill_phyint;
- ASSERT(phyi != NULL);
-
- if (phyi->phyint_flags & PHYI_VIRTUAL)
- return (EINVAL);
-
- tmp = groupname;
- for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++)
- ;
-
- if (i == LIFNAMSIZ) {
- /* no null termination */
- return (EINVAL);
- }
+ phyint_t *phyi = ill->ill_phyint;
+ ipmp_grp_t *grp = phyi->phyint_grp;
+ mblk_t *ipsq_mp;
+ int err = 0;
/*
- * Calculate the namelen exclusive of the null
- * termination character.
+ * Note that phyint_grp can only change here, where we're exclusive.
*/
- namelen = tmp - groupname;
-
- ill_v4 = phyi->phyint_illv4;
- ill_v6 = phyi->phyint_illv6;
+ ASSERT(IAM_WRITER_ILL(ill));
- /*
- * ILL cannot be part of a usesrc group and and IPMP group at the
- * same time. No need to grab the ill_g_usesrc_lock here, see
- * synchronization notes in ip.c
- */
- if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) {
+ if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL ||
+ (phyi->phyint_flags & PHYI_VIRTUAL))
return (EINVAL);
- }
-
- /*
- * mark the ill as changing.
- * this should queue all new requests on the syncq.
- */
- GRAB_ILL_LOCKS(ill_v4, ill_v6);
-
- if (ill_v4 != NULL)
- ill_v4->ill_state_flags |= ILL_CHANGING;
- if (ill_v6 != NULL)
- ill_v6->ill_state_flags |= ILL_CHANGING;
- RELEASE_ILL_LOCKS(ill_v4, ill_v6);
-
- if (namelen == 0) {
- /*
- * Null string means remove this interface from the
- * existing group.
- */
- if (phyi->phyint_groupname_len == 0) {
- /*
- * Never was in a group.
- */
- err = 0;
- goto done;
- }
-
- /*
- * IPv4 or IPv6 may be temporarily out of the group when all
- * the ipifs are down. Thus, we need to check for ill_group to
- * be non-NULL.
- */
- if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
- ill_down_ipifs(ill_v4, mp, 0, B_FALSE);
- mutex_enter(&ill_v4->ill_lock);
- if (!ill_is_quiescent(ill_v4)) {
- /*
- * ipsq_pending_mp_add will not fail since
- * connp is NULL
- */
- (void) ipsq_pending_mp_add(NULL,
- ill_v4->ill_ipif, q, mp, ILL_DOWN);
- mutex_exit(&ill_v4->ill_lock);
- err = EINPROGRESS;
- goto done;
- }
- mutex_exit(&ill_v4->ill_lock);
- }
-
- if (ill_v6 != NULL && ill_v6->ill_group != NULL) {
- ill_down_ipifs(ill_v6, mp, 0, B_FALSE);
- mutex_enter(&ill_v6->ill_lock);
- if (!ill_is_quiescent(ill_v6)) {
- (void) ipsq_pending_mp_add(NULL,
- ill_v6->ill_ipif, q, mp, ILL_DOWN);
- mutex_exit(&ill_v6->ill_lock);
- err = EINPROGRESS;
- goto done;
- }
- mutex_exit(&ill_v6->ill_lock);
- }
-
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
- GRAB_ILL_LOCKS(ill_v4, ill_v6);
- mutex_enter(&phyi->phyint_lock);
- ASSERT(phyi->phyint_groupname != NULL);
- mi_free(phyi->phyint_groupname);
- phyi->phyint_groupname = NULL;
- phyi->phyint_groupname_len = 0;
-
- /* Restore the ifindex used to be the per interface one */
- phyi->phyint_group_ifindex = 0;
- phyi->phyint_hook_ifindex = phyi->phyint_ifindex;
- mutex_exit(&phyi->phyint_lock);
- RELEASE_ILL_LOCKS(ill_v4, ill_v6);
- rw_exit(&ipst->ips_ill_g_lock);
- err = ill_up_ipifs(ill, q, mp);
- /*
- * set the split flag so that the ipsq can be split
- */
- mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
- phyi->phyint_ipsq->ipsq_split = B_TRUE;
- mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
+ lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0';
- } else {
- if (phyi->phyint_groupname_len != 0) {
- ASSERT(phyi->phyint_groupname != NULL);
- /* Are we inserting in the same group ? */
- if (mi_strcmp(groupname,
- phyi->phyint_groupname) == 0) {
- err = 0;
- goto done;
- }
- }
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- /*
- * Merge ipsq for the group's.
- * This check is here as multiple groups/ills might be
- * sharing the same ipsq.
- * If we have to merege than the operation is restarted
- * on the new ipsq.
- */
- ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL, ipst);
- if (phyi->phyint_ipsq != ipsq) {
- rw_exit(&ipst->ips_ill_g_lock);
- err = ill_merge_groups(ill, NULL, groupname, mp, q);
- goto done;
- }
- /*
- * Running exclusive on new ipsq.
- */
-
- ASSERT(ipsq != NULL);
- ASSERT(ipsq->ipsq_writer == curthread);
-
- /*
- * Check whether the ill_type and ill_net_type matches before
- * we allocate any memory so that the cleanup is easier.
- *
- * We can't group dissimilar ones as we can't load spread
- * packets across the group because of potential link-level
- * header differences.
- */
- phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst);
- if (phyi_tmp != NULL) {
- if ((ill_v4 != NULL &&
- phyi_tmp->phyint_illv4 != NULL) &&
- ((ill_v4->ill_net_type !=
- phyi_tmp->phyint_illv4->ill_net_type) ||
- (ill_v4->ill_type !=
- phyi_tmp->phyint_illv4->ill_type))) {
- mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
- phyi->phyint_ipsq->ipsq_split = B_TRUE;
- mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- return (EINVAL);
- }
- if ((ill_v6 != NULL &&
- phyi_tmp->phyint_illv6 != NULL) &&
- ((ill_v6->ill_net_type !=
- phyi_tmp->phyint_illv6->ill_net_type) ||
- (ill_v6->ill_type !=
- phyi_tmp->phyint_illv6->ill_type))) {
- mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
- phyi->phyint_ipsq->ipsq_split = B_TRUE;
- mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- return (EINVAL);
- }
- }
-
- rw_exit(&ipst->ips_ill_g_lock);
-
- /*
- * bring down all v4 ipifs.
- */
- if (ill_v4 != NULL) {
- ill_down_ipifs(ill_v4, mp, 0, B_FALSE);
- }
-
- /*
- * bring down all v6 ipifs.
- */
- if (ill_v6 != NULL) {
- ill_down_ipifs(ill_v6, mp, 0, B_FALSE);
- }
-
- /*
- * make sure all ipifs are down and there are no active
- * references. Call to ipsq_pending_mp_add will not fail
- * since connp is NULL.
- */
- if (ill_v4 != NULL) {
- mutex_enter(&ill_v4->ill_lock);
- if (!ill_is_quiescent(ill_v4)) {
- (void) ipsq_pending_mp_add(NULL,
- ill_v4->ill_ipif, q, mp, ILL_DOWN);
- mutex_exit(&ill_v4->ill_lock);
- err = EINPROGRESS;
- goto done;
- }
- mutex_exit(&ill_v4->ill_lock);
- }
-
- if (ill_v6 != NULL) {
- mutex_enter(&ill_v6->ill_lock);
- if (!ill_is_quiescent(ill_v6)) {
- (void) ipsq_pending_mp_add(NULL,
- ill_v6->ill_ipif, q, mp, ILL_DOWN);
- mutex_exit(&ill_v6->ill_lock);
- err = EINPROGRESS;
- goto done;
- }
- mutex_exit(&ill_v6->ill_lock);
- }
-
- /*
- * allocate including space for null terminator
- * before we insert.
- */
- tmp = (char *)mi_alloc(namelen + 1, BPRI_MED);
- if (tmp == NULL)
- return (ENOMEM);
-
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
- GRAB_ILL_LOCKS(ill_v4, ill_v6);
- mutex_enter(&phyi->phyint_lock);
- if (phyi->phyint_groupname_len != 0) {
- ASSERT(phyi->phyint_groupname != NULL);
- mi_free(phyi->phyint_groupname);
- }
-
- /*
- * setup the new group name.
- */
- phyi->phyint_groupname = tmp;
- bcopy(groupname, phyi->phyint_groupname, namelen + 1);
- phyi->phyint_groupname_len = namelen + 1;
-
- if (ipst->ips_ipmp_hook_emulation) {
- /*
- * If the group already exists we use the existing
- * group_ifindex, otherwise we pick a new index here.
- */
- if (phyi_tmp != NULL) {
- phyi->phyint_group_ifindex =
- phyi_tmp->phyint_group_ifindex;
- } else {
- /* XXX We need a recovery strategy here. */
- if (!ip_assign_ifindex(
- &phyi->phyint_group_ifindex, ipst))
- cmn_err(CE_PANIC,
- "ip_assign_ifindex() failed");
- }
- }
- /*
- * Select whether the netinfo and hook use the per-interface
- * or per-group ifindex.
- */
- if (ipst->ips_ipmp_hook_emulation)
- phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex;
- else
- phyi->phyint_hook_ifindex = phyi->phyint_ifindex;
-
- if (ipst->ips_ipmp_hook_emulation &&
- phyi_tmp != NULL) {
- /* First phyint in group - group PLUMB event */
- ill_nic_event_plumb(ill, B_TRUE);
- }
- mutex_exit(&phyi->phyint_lock);
- RELEASE_ILL_LOCKS(ill_v4, ill_v6);
- rw_exit(&ipst->ips_ill_g_lock);
-
- err = ill_up_ipifs(ill, q, mp);
- }
-
-done:
/*
- * normally ILL_CHANGING is cleared in ill_up_ipifs.
+ * If the name hasn't changed, there's nothing to do.
*/
- if (err != EINPROGRESS) {
- GRAB_ILL_LOCKS(ill_v4, ill_v6);
- if (ill_v4 != NULL)
- ill_v4->ill_state_flags &= ~ILL_CHANGING;
- if (ill_v6 != NULL)
- ill_v6->ill_state_flags &= ~ILL_CHANGING;
- RELEASE_ILL_LOCKS(ill_v4, ill_v6);
- }
- return (err);
-}
+ if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0)
+ goto unlock;
-/* ARGSUSED */
-int
-ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
- mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
-{
- ill_t *ill;
- phyint_t *phyi;
- struct lifreq *lifr;
- mblk_t *mp1;
-
- /* Existence verified in ip_wput_nondata */
- mp1 = mp->b_cont->b_cont;
- lifr = (struct lifreq *)mp1->b_rptr;
- ill = ipif->ipif_ill;
- phyi = ill->ill_phyint;
-
- lifr->lifr_groupname[0] = '\0';
/*
- * ill_group may be null if all the interfaces
- * are down. But still, the phyint should always
- * hold the name.
- */
- if (phyi->phyint_groupname_len != 0) {
- bcopy(phyi->phyint_groupname, lifr->lifr_groupname,
- phyi->phyint_groupname_len);
- }
-
- return (0);
-}
-
-
-typedef struct conn_move_s {
- ill_t *cm_from_ill;
- ill_t *cm_to_ill;
- int cm_ifindex;
-} conn_move_t;
-
-/*
- * ipcl_walk function for moving conn_multicast_ill for a given ill.
- */
-static void
-conn_move(conn_t *connp, caddr_t arg)
-{
- conn_move_t *connm;
- int ifindex;
- int i;
- ill_t *from_ill;
- ill_t *to_ill;
- ilg_t *ilg;
- ilm_t *ret_ilm;
-
- connm = (conn_move_t *)arg;
- ifindex = connm->cm_ifindex;
- from_ill = connm->cm_from_ill;
- to_ill = connm->cm_to_ill;
-
- /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. */
-
- /* All multicast fields protected by conn_lock */
- mutex_enter(&connp->conn_lock);
- ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill);
- if ((connp->conn_outgoing_ill == from_ill) &&
- (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) {
- connp->conn_outgoing_ill = to_ill;
- connp->conn_incoming_ill = to_ill;
- }
-
- /* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */
-
- if ((connp->conn_multicast_ill == from_ill) &&
- (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) {
- connp->conn_multicast_ill = connm->cm_to_ill;
- }
-
- /*
- * Change the ilg_ill to point to the new one. This assumes
- * ilm_move_v6 has moved the ilms to new_ill and the driver
- * has been told to receive packets on this interface.
- * ilm_move_v6 FAILBACKS all the ilms successfully always.
- * But when doing a FAILOVER, it might fail with ENOMEM and so
- * some ilms may not have moved. We check to see whether
- * the ilms have moved to to_ill. We can't check on from_ill
- * as in the process of moving, we could have split an ilm
- * in to two - which has the same orig_ifindex and v6group.
+ * Handle requests to rename an IPMP meta-interface.
*
- * For IPv4, ilg_ipif moves implicitly. The code below really
- * does not do anything for IPv4 as ilg_ill is NULL for IPv4.
- */
- for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
- ilg = &connp->conn_ilg[i];
- if ((ilg->ilg_ill == from_ill) &&
- (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) {
- /* ifindex != 0 indicates failback */
- if (ifindex != 0) {
- connp->conn_ilg[i].ilg_ill = to_ill;
- continue;
- }
-
- mutex_enter(&to_ill->ill_lock);
- ret_ilm = ilm_lookup_ill_index_v6(to_ill,
- &ilg->ilg_v6group, ilg->ilg_orig_ifindex,
- connp->conn_zoneid);
- mutex_exit(&to_ill->ill_lock);
-
- if (ret_ilm != NULL)
- connp->conn_ilg[i].ilg_ill = to_ill;
- }
+ * Note that creation of the IPMP meta-interface is handled in
+ * userland through the standard plumbing sequence. As part of
+ * plumbing the IPMP meta-interface, its initial groupname is set to
+ * the name of the interface (see ipif_set_values_tail()).
+ */
+ if (IS_IPMP(ill)) {
+ err = ipmp_grp_rename(grp, lifr->lifr_groupname);
+ goto unlock;
}
- mutex_exit(&connp->conn_lock);
-}
-
-static void
-conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex)
-{
- conn_move_t connm;
- ip_stack_t *ipst = from_ill->ill_ipst;
-
- connm.cm_from_ill = from_ill;
- connm.cm_to_ill = to_ill;
- connm.cm_ifindex = ifindex;
-
- ipcl_walk(conn_move, (caddr_t)&connm, ipst);
-}
-
-/*
- * ilm has been moved from from_ill to to_ill.
- * Send DL_DISABMULTI_REQ to ill and DL_ENABMULTI_REQ on to_ill.
- * appropriately.
- *
- * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because
- * the code there de-references ipif_ill to get the ill to
- * send multicast requests. It does not work as ipif is on its
- * move and already moved when this function is called.
- * Thus, we need to use from_ill and to_ill send down multicast
- * requests.
- */
-static void
-ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill)
-{
- ipif_t *ipif;
- ilm_t *ilm;
/*
- * See whether we need to send down DL_ENABMULTI_REQ on
- * to_ill as ilm has just been added.
+ * Handle requests to add or remove an IP interface from a group.
*/
- ASSERT(IAM_WRITER_ILL(to_ill));
- ASSERT(IAM_WRITER_ILL(from_ill));
-
- ILM_WALKER_HOLD(to_ill);
- for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
-
- if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED))
- continue;
- /*
- * no locks held, ill/ipif cannot dissappear as long
- * as we are writer.
- */
- ipif = to_ill->ill_ipif;
+ if (lifr->lifr_groupname[0] != '\0') { /* add */
/*
- * No need to hold any lock as we are the writer and this
- * can only be changed by a writer.
+ * Moves are handled by first removing the interface from
+ * its existing group, and then adding it to another group.
+ * So, fail if it's already in a group.
*/
- ilm->ilm_is_new = B_FALSE;
-
- if (to_ill->ill_net_type != IRE_IF_RESOLVER ||
- ipif->ipif_flags & IPIF_POINTOPOINT) {
- ip1dbg(("ilm_send_multicast_reqs: to_ill not "
- "resolver\n"));
- continue; /* Must be IRE_IF_NORESOLVER */
- }
-
- if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
- ip1dbg(("ilm_send_multicast_reqs: "
- "to_ill MULTI_BCAST\n"));
- goto from;
+ if (IS_UNDER_IPMP(ill)) {
+ err = EALREADY;
+ goto unlock;
}
- if (to_ill->ill_isv6)
- mld_joingroup(ilm);
- else
- igmp_joingroup(ilm);
-
- if (to_ill->ill_ipif_up_count == 0) {
- /*
- * Nobody there. All multicast addresses will be
- * re-joined when we get the DL_BIND_ACK bringing the
- * interface up.
- */
- ilm->ilm_notify_driver = B_FALSE;
- ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n"));
- goto from;
+ grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst);
+ if (grp == NULL) {
+ err = ENOENT;
+ goto unlock;
}
/*
- * For allmulti address, we want to join on only one interface.
- * Checking for ilm_numentries_v6 is not correct as you may
- * find an ilm with zero address on to_ill, but we may not
- * have nominated to_ill for receiving. Thus, if we have
- * nominated from_ill (ill_join_allmulti is set), nominate
- * only if to_ill is not already nominated (to_ill normally
- * should not have been nominated if "from_ill" has already
- * been nominated. As we don't prevent failovers from happening
- * across groups, we don't assert).
+ * Check if the phyint and its ills are suitable for
+ * inclusion into the group.
*/
- if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
- /*
- * There is no need to hold ill locks as we are
- * writer on both ills and when ill_join_allmulti()
- * is called the thread is always a writer.
- */
- if (from_ill->ill_join_allmulti &&
- !to_ill->ill_join_allmulti) {
- (void) ill_join_allmulti(to_ill);
- }
- } else if (ilm->ilm_notify_driver) {
-
- /*
- * This is a newly moved ilm so we need to tell the
- * driver about the new group. There can be more than
- * one ilm's for the same group in the list each with a
- * different orig_ifindex. We have to inform the driver
- * once. In ilm_move_v[4,6] we only set the flag
- * ilm_notify_driver for the first ilm.
- */
-
- (void) ip_ll_send_enabmulti_req(to_ill,
- &ilm->ilm_v6addr);
- }
-
- ilm->ilm_notify_driver = B_FALSE;
+ if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0)
+ goto unlock;
/*
- * See whether we need to send down DL_DISABMULTI_REQ on
- * from_ill as ilm has just been removed.
+ * Checks pass; join the group, and enqueue the remaining
+ * illgrp joins for when we've become part of the group xop
+ * and are exclusive across its IPSQs. Since qwriter_ip()
+ * requires an mblk_t to scribble on, and since `mp' will be
+ * freed as part of completing the ioctl, allocate another.
*/
-from:
- ipif = from_ill->ill_ipif;
- if (from_ill->ill_net_type != IRE_IF_RESOLVER ||
- ipif->ipif_flags & IPIF_POINTOPOINT) {
- ip1dbg(("ilm_send_multicast_reqs: "
- "from_ill not resolver\n"));
- continue; /* Must be IRE_IF_NORESOLVER */
- }
-
- if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
- ip1dbg(("ilm_send_multicast_reqs: "
- "from_ill MULTI_BCAST\n"));
- continue;
- }
-
- if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
- if (from_ill->ill_join_allmulti)
- ill_leave_allmulti(from_ill);
- } else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) {
- (void) ip_ll_send_disabmulti_req(from_ill,
- &ilm->ilm_v6addr);
+ if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) {
+ err = ENOMEM;
+ goto unlock;
}
- }
- ILM_WALKER_RELE(to_ill);
-}
-
-/*
- * This function is called when all multicast memberships needs
- * to be moved from "from_ill" to "to_ill" for IPv6. This function is
- * called only once unlike the IPv4 counterpart where it is called after
- * every logical interface is moved. The reason is due to multicast
- * memberships are joined using an interface address in IPv4 while in
- * IPv6, interface index is used.
- */
-static void
-ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex)
-{
- ilm_t *ilm;
- ilm_t *ilm_next;
- ilm_t *new_ilm;
- ilm_t **ilmp;
- int count;
- char buf[INET6_ADDRSTRLEN];
- in6_addr_t ipv6_snm = ipv6_solicited_node_mcast;
- ip_stack_t *ipst = from_ill->ill_ipst;
-
- ASSERT(MUTEX_HELD(&to_ill->ill_lock));
- ASSERT(MUTEX_HELD(&from_ill->ill_lock));
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
- if (ifindex == 0) {
/*
- * Form the solicited node mcast address which is used later.
+ * Before we drop ipmp_lock, bump gr_pend* to ensure that the
+ * IPMP meta-interface ills needed by `phyi' cannot go away
+ * before ip_join_illgrps() is called back. See the comments
+ * in ip_sioctl_plink_ipmp() for more.
*/
- ipif_t *ipif;
-
- ipif = from_ill->ill_ipif;
- ASSERT(ipif->ipif_id == 0);
-
- ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];
- }
-
- ilmp = &from_ill->ill_ilm;
- for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) {
- ilm_next = ilm->ilm_next;
-
- if (ilm->ilm_flags & ILM_DELETED) {
- ilmp = &ilm->ilm_next;
- continue;
- }
+ if (phyi->phyint_illv4 != NULL)
+ grp->gr_pendv4++;
+ if (phyi->phyint_illv6 != NULL)
+ grp->gr_pendv6++;
- new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr,
- ilm->ilm_orig_ifindex, ilm->ilm_zoneid);
- ASSERT(ilm->ilm_orig_ifindex != 0);
- if (ilm->ilm_orig_ifindex == ifindex) {
- /*
- * We are failing back multicast memberships.
- * If the same ilm exists in to_ill, it means somebody
- * has joined the same group there e.g. ff02::1
- * is joined within the kernel when the interfaces
- * came UP.
- */
- ASSERT(ilm->ilm_ipif == NULL);
- if (new_ilm != NULL) {
- new_ilm->ilm_refcnt += ilm->ilm_refcnt;
- if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
- !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
- new_ilm->ilm_is_new = B_TRUE;
- }
- } else {
- /*
- * check if we can just move the ilm
- */
- if (from_ill->ill_ilm_walker_cnt != 0) {
- /*
- * We have walkers we cannot move
- * the ilm, so allocate a new ilm,
- * this (old) ilm will be marked
- * ILM_DELETED at the end of the loop
- * and will be freed when the
- * last walker exits.
- */
- new_ilm = (ilm_t *)mi_zalloc
- (sizeof (ilm_t));
- if (new_ilm == NULL) {
- ip0dbg(("ilm_move_v6: "
- "FAILBACK of IPv6"
- " multicast address %s : "
- "from %s to"
- " %s failed : ENOMEM \n",
- inet_ntop(AF_INET6,
- &ilm->ilm_v6addr, buf,
- sizeof (buf)),
- from_ill->ill_name,
- to_ill->ill_name));
-
- ilmp = &ilm->ilm_next;
- continue;
- }
- *new_ilm = *ilm;
- /*
- * we don't want new_ilm linked to
- * ilm's filter list.
- */
- new_ilm->ilm_filter = NULL;
- } else {
- /*
- * No walkers we can move the ilm.
- * lets take it out of the list.
- */
- *ilmp = ilm->ilm_next;
- ilm->ilm_next = NULL;
- DTRACE_PROBE3(ill__decr__cnt,
- (ill_t *), from_ill,
- (char *), "ilm", (void *), ilm);
- ASSERT(from_ill->ill_ilm_cnt > 0);
- from_ill->ill_ilm_cnt--;
-
- new_ilm = ilm;
- }
+ rw_exit(&ipst->ips_ipmp_lock);
- /*
- * if this is the first ilm for the group
- * set ilm_notify_driver so that we notify the
- * driver in ilm_send_multicast_reqs.
- */
- if (ilm_lookup_ill_v6(to_ill,
- &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
- new_ilm->ilm_notify_driver = B_TRUE;
-
- DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill,
- (char *), "ilm", (void *), new_ilm);
- new_ilm->ilm_ill = to_ill;
- to_ill->ill_ilm_cnt++;
-
- /* Add to the to_ill's list */
- new_ilm->ilm_next = to_ill->ill_ilm;
- to_ill->ill_ilm = new_ilm;
- /*
- * set the flag so that mld_joingroup is
- * called in ilm_send_multicast_reqs().
- */
- new_ilm->ilm_is_new = B_TRUE;
- }
- goto bottom;
- } else if (ifindex != 0) {
- /*
- * If this is FAILBACK (ifindex != 0) and the ifindex
- * has not matched above, look at the next ilm.
- */
- ilmp = &ilm->ilm_next;
- continue;
- }
- /*
- * If we are here, it means ifindex is 0. Failover
- * everything.
- *
- * We need to handle solicited node mcast address
- * and all_nodes mcast address differently as they
- * are joined witin the kenrel (ipif_multicast_up)
- * and potentially from the userland. We are called
- * after the ipifs of from_ill has been moved.
- * If we still find ilms on ill with solicited node
- * mcast address or all_nodes mcast address, it must
- * belong to the UP interface that has not moved e.g.
- * ipif_id 0 with the link local prefix does not move.
- * We join this on the new ill accounting for all the
- * userland memberships so that applications don't
- * see any failure.
- *
- * We need to make sure that we account only for the
- * solicited node and all node multicast addresses
- * that was brought UP on these. In the case of
- * a failover from A to B, we might have ilms belonging
- * to A (ilm_orig_ifindex pointing at A) on B accounting
- * for the membership from the userland. If we are failing
- * over from B to C now, we will find the ones belonging
- * to A on B. These don't account for the ill_ipif_up_count.
- * They just move from B to C. The check below on
- * ilm_orig_ifindex ensures that.
- */
- if ((ilm->ilm_orig_ifindex ==
- from_ill->ill_phyint->phyint_ifindex) &&
- (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) ||
- IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast,
- &ilm->ilm_v6addr))) {
- ASSERT(ilm->ilm_refcnt > 0);
- count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count;
- /*
- * For indentation reasons, we are not using a
- * "else" here.
- */
- if (count == 0) {
- ilmp = &ilm->ilm_next;
- continue;
- }
- ilm->ilm_refcnt -= count;
- if (new_ilm != NULL) {
- /*
- * Can find one with the same
- * ilm_orig_ifindex, if we are failing
- * over to a STANDBY. This happens
- * when somebody wants to join a group
- * on a STANDBY interface and we
- * internally join on a different one.
- * If we had joined on from_ill then, a
- * failover now will find a new ilm
- * with this index.
- */
- ip1dbg(("ilm_move_v6: FAILOVER, found"
- " new ilm on %s, group address %s\n",
- to_ill->ill_name,
- inet_ntop(AF_INET6,
- &ilm->ilm_v6addr, buf,
- sizeof (buf))));
- new_ilm->ilm_refcnt += count;
- if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
- !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
- new_ilm->ilm_is_new = B_TRUE;
- }
- } else {
- new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t));
- if (new_ilm == NULL) {
- ip0dbg(("ilm_move_v6: FAILOVER of IPv6"
- " multicast address %s : from %s to"
- " %s failed : ENOMEM \n",
- inet_ntop(AF_INET6,
- &ilm->ilm_v6addr, buf,
- sizeof (buf)), from_ill->ill_name,
- to_ill->ill_name));
- ilmp = &ilm->ilm_next;
- continue;
- }
- *new_ilm = *ilm;
- new_ilm->ilm_filter = NULL;
- new_ilm->ilm_refcnt = count;
- new_ilm->ilm_timer = INFINITY;
- new_ilm->ilm_rtx.rtx_timer = INFINITY;
- new_ilm->ilm_is_new = B_TRUE;
- /*
- * If the to_ill has not joined this
- * group we need to tell the driver in
- * ill_send_multicast_reqs.
- */
- if (ilm_lookup_ill_v6(to_ill,
- &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
- new_ilm->ilm_notify_driver = B_TRUE;
-
- new_ilm->ilm_ill = to_ill;
- DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill,
- (char *), "ilm", (void *), new_ilm);
- to_ill->ill_ilm_cnt++;
-
- /* Add to the to_ill's list */
- new_ilm->ilm_next = to_ill->ill_ilm;
- to_ill->ill_ilm = new_ilm;
- ASSERT(new_ilm->ilm_ipif == NULL);
- }
- if (ilm->ilm_refcnt == 0) {
- goto bottom;
- } else {
- new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
- CLEAR_SLIST(new_ilm->ilm_filter);
- ilmp = &ilm->ilm_next;
- }
- continue;
- } else {
- /*
- * ifindex = 0 means, move everything pointing at
- * from_ill. We are doing this becuase ill has
- * either FAILED or became INACTIVE.
- *
- * As we would like to move things later back to
- * from_ill, we want to retain the identity of this
- * ilm. Thus, we don't blindly increment the reference
- * count on the ilms matching the address alone. We
- * need to match on the ilm_orig_index also. new_ilm
- * was obtained by matching ilm_orig_index also.
- */
- if (new_ilm != NULL) {
- /*
- * This is possible only if a previous restore
- * was incomplete i.e restore to
- * ilm_orig_ifindex left some ilms because
- * of some failures. Thus when we are failing
- * again, we might find our old friends there.
- */
- ip1dbg(("ilm_move_v6: FAILOVER, found new ilm"
- " on %s, group address %s\n",
- to_ill->ill_name,
- inet_ntop(AF_INET6,
- &ilm->ilm_v6addr, buf,
- sizeof (buf))));
- new_ilm->ilm_refcnt += ilm->ilm_refcnt;
- if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
- !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
- new_ilm->ilm_is_new = B_TRUE;
- }
- } else {
- if (from_ill->ill_ilm_walker_cnt != 0) {
- new_ilm = (ilm_t *)
- mi_zalloc(sizeof (ilm_t));
- if (new_ilm == NULL) {
- ip0dbg(("ilm_move_v6: "
- "FAILOVER of IPv6"
- " multicast address %s : "
- "from %s to"
- " %s failed : ENOMEM \n",
- inet_ntop(AF_INET6,
- &ilm->ilm_v6addr, buf,
- sizeof (buf)),
- from_ill->ill_name,
- to_ill->ill_name));
-
- ilmp = &ilm->ilm_next;
- continue;
- }
- *new_ilm = *ilm;
- new_ilm->ilm_filter = NULL;
- } else {
- *ilmp = ilm->ilm_next;
- DTRACE_PROBE3(ill__decr__cnt,
- (ill_t *), from_ill,
- (char *), "ilm", (void *), ilm);
- ASSERT(from_ill->ill_ilm_cnt > 0);
- from_ill->ill_ilm_cnt--;
-
- new_ilm = ilm;
- }
- /*
- * If the to_ill has not joined this
- * group we need to tell the driver in
- * ill_send_multicast_reqs.
- */
- if (ilm_lookup_ill_v6(to_ill,
- &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
- new_ilm->ilm_notify_driver = B_TRUE;
-
- /* Add to the to_ill's list */
- new_ilm->ilm_next = to_ill->ill_ilm;
- to_ill->ill_ilm = new_ilm;
- ASSERT(ilm->ilm_ipif == NULL);
- new_ilm->ilm_ill = to_ill;
- DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill,
- (char *), "ilm", (void *), new_ilm);
- to_ill->ill_ilm_cnt++;
- new_ilm->ilm_is_new = B_TRUE;
- }
-
- }
-
-bottom:
- /*
- * Revert multicast filter state to (EXCLUDE, NULL).
- * new_ilm->ilm_is_new should already be set if needed.
- */
- new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
- CLEAR_SLIST(new_ilm->ilm_filter);
+ ipmp_phyint_join_grp(phyi, grp);
+ ill_refhold(ill);
+ qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps,
+ SWITCH_OP, B_FALSE);
+ return (0);
+ } else {
/*
- * We allocated/got a new ilm, free the old one.
+ * Request to remove the interface from a group. If the
+ * interface is not in a group, this trivially succeeds.
*/
- if (new_ilm != ilm) {
- if (from_ill->ill_ilm_walker_cnt == 0) {
- *ilmp = ilm->ilm_next;
-
- ASSERT(ilm->ilm_ipif == NULL); /* ipv6 */
- DTRACE_PROBE3(ill__decr__cnt, (ill_t *),
- from_ill, (char *), "ilm", (void *), ilm);
- ASSERT(from_ill->ill_ilm_cnt > 0);
- from_ill->ill_ilm_cnt--;
-
- ilm_inactive(ilm); /* frees this ilm */
-
- } else {
- ilm->ilm_flags |= ILM_DELETED;
- from_ill->ill_ilm_cleanup_reqd = 1;
- ilmp = &ilm->ilm_next;
- }
- }
+ rw_exit(&ipst->ips_ipmp_lock);
+ if (IS_UNDER_IPMP(ill))
+ ipmp_phyint_leave_grp(phyi);
+ return (0);
}
+unlock:
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (err);
}
/*
- * Move all the multicast memberships to to_ill. Called when
- * an ipif moves from "from_ill" to "to_ill". This function is slightly
- * different from IPv6 counterpart as multicast memberships are associated
- * with ills in IPv6. This function is called after every ipif is moved
- * unlike IPv6, where it is moved only once.
+ * Process an SIOCGLIFBINDING request.
*/
-static void
-ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif)
-{
- ilm_t *ilm;
- ilm_t *ilm_next;
- ilm_t *new_ilm;
- ilm_t **ilmp;
- ip_stack_t *ipst = from_ill->ill_ipst;
-
- ASSERT(MUTEX_HELD(&to_ill->ill_lock));
- ASSERT(MUTEX_HELD(&from_ill->ill_lock));
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
-
- ilmp = &from_ill->ill_ilm;
- for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) {
- ilm_next = ilm->ilm_next;
-
- if (ilm->ilm_flags & ILM_DELETED) {
- ilmp = &ilm->ilm_next;
- continue;
- }
-
- ASSERT(ilm->ilm_ipif != NULL);
-
- if (ilm->ilm_ipif != ipif) {
- ilmp = &ilm->ilm_next;
- continue;
- }
-
- if (V4_PART_OF_V6(ilm->ilm_v6addr) ==
- htonl(INADDR_ALLHOSTS_GROUP)) {
- new_ilm = ilm_lookup_ipif(ipif,
- V4_PART_OF_V6(ilm->ilm_v6addr));
- if (new_ilm != NULL) {
- new_ilm->ilm_refcnt += ilm->ilm_refcnt;
- /*
- * We still need to deal with the from_ill.
- */
- new_ilm->ilm_is_new = B_TRUE;
- new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
- CLEAR_SLIST(new_ilm->ilm_filter);
- ASSERT(ilm->ilm_ipif == ipif);
- ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0);
- if (from_ill->ill_ilm_walker_cnt == 0) {
- DTRACE_PROBE3(ill__decr__cnt,
- (ill_t *), from_ill,
- (char *), "ilm", (void *), ilm);
- ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0);
- }
- goto delete_ilm;
- }
- /*
- * If we could not find one e.g. ipif is
- * still down on to_ill, we add this ilm
- * on ill_new to preserve the reference
- * count.
- */
- }
- /*
- * When ipifs move, ilms always move with it
- * to the NEW ill. Thus we should never be
- * able to find ilm till we really move it here.
- */
- ASSERT(ilm_lookup_ipif(ipif,
- V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL);
-
- if (from_ill->ill_ilm_walker_cnt != 0) {
- new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t));
- if (new_ilm == NULL) {
- char buf[INET6_ADDRSTRLEN];
- ip0dbg(("ilm_move_v4: FAILBACK of IPv4"
- " multicast address %s : "
- "from %s to"
- " %s failed : ENOMEM \n",
- inet_ntop(AF_INET,
- &ilm->ilm_v6addr, buf,
- sizeof (buf)),
- from_ill->ill_name,
- to_ill->ill_name));
-
- ilmp = &ilm->ilm_next;
- continue;
- }
- *new_ilm = *ilm;
- DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ipif,
- (char *), "ilm", (void *), ilm);
- new_ilm->ilm_ipif->ipif_ilm_cnt++;
- /* We don't want new_ilm linked to ilm's filter list */
- new_ilm->ilm_filter = NULL;
- } else {
- /* Remove from the list */
- *ilmp = ilm->ilm_next;
- new_ilm = ilm;
- }
-
- /*
- * If we have never joined this group on the to_ill
- * make sure we tell the driver.
- */
- if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr,
- ALL_ZONES) == NULL)
- new_ilm->ilm_notify_driver = B_TRUE;
-
- /* Add to the to_ill's list */
- new_ilm->ilm_next = to_ill->ill_ilm;
- to_ill->ill_ilm = new_ilm;
- new_ilm->ilm_is_new = B_TRUE;
-
- /*
- * Revert multicast filter state to (EXCLUDE, NULL)
- */
- new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
- CLEAR_SLIST(new_ilm->ilm_filter);
-
- /*
- * Delete only if we have allocated a new ilm.
- */
- if (new_ilm != ilm) {
-delete_ilm:
- if (from_ill->ill_ilm_walker_cnt == 0) {
- /* Remove from the list */
- *ilmp = ilm->ilm_next;
- ilm->ilm_next = NULL;
- DTRACE_PROBE3(ipif__decr__cnt,
- (ipif_t *), ilm->ilm_ipif,
- (char *), "ilm", (void *), ilm);
- ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0);
- ilm->ilm_ipif->ipif_ilm_cnt--;
- ilm_inactive(ilm);
- } else {
- ilm->ilm_flags |= ILM_DELETED;
- from_ill->ill_ilm_cleanup_reqd = 1;
- ilmp = &ilm->ilm_next;
- }
- }
- }
-}
-
-static uint_t
-ipif_get_id(ill_t *ill, uint_t id)
-{
- uint_t unit;
- ipif_t *tipif;
- boolean_t found = B_FALSE;
- ip_stack_t *ipst = ill->ill_ipst;
-
- /*
- * During failback, we want to go back to the same id
- * instead of the smallest id so that the original
- * configuration is maintained. id is non-zero in that
- * case.
- */
- if (id != 0) {
- /*
- * While failing back, if we still have an ipif with
- * MAX_ADDRS_PER_IF, it means this will be replaced
- * as soon as we return from this function. It was
- * to set to MAX_ADDRS_PER_IF by the caller so that
- * we can choose the smallest id. Thus we return zero
- * in that case ignoring the hint.
- */
- if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF)
- return (0);
- for (tipif = ill->ill_ipif; tipif != NULL;
- tipif = tipif->ipif_next) {
- if (tipif->ipif_id == id) {
- found = B_TRUE;
- break;
- }
- }
- /*
- * If somebody already plumbed another logical
- * with the same id, we won't be able to find it.
- */
- if (!found)
- return (id);
- }
- for (unit = 0; unit <= ipst->ips_ip_addrs_per_if; unit++) {
- found = B_FALSE;
- for (tipif = ill->ill_ipif; tipif != NULL;
- tipif = tipif->ipif_next) {
- if (tipif->ipif_id == unit) {
- found = B_TRUE;
- break;
- }
- }
- if (!found)
- break;
- }
- return (unit);
-}
-
/* ARGSUSED */
-static int
-ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp,
- ipif_t **rep_ipif_ptr)
+int
+ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
+ ip_ioctl_cmd_t *ipip, void *ifreq)
{
- ill_t *from_ill;
- ipif_t *rep_ipif;
- uint_t unit;
- int err = 0;
- ipif_t *to_ipif;
- struct iocblk *iocp;
- boolean_t failback_cmd;
- boolean_t remove_ipif;
- int rc;
- ip_stack_t *ipst;
-
- ASSERT(IAM_WRITER_ILL(to_ill));
- ASSERT(IAM_WRITER_IPIF(ipif));
-
- iocp = (struct iocblk *)mp->b_rptr;
- failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK);
- remove_ipif = B_FALSE;
-
- from_ill = ipif->ipif_ill;
- ipst = from_ill->ill_ipst;
-
- ASSERT(MUTEX_HELD(&to_ill->ill_lock));
- ASSERT(MUTEX_HELD(&from_ill->ill_lock));
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
-
- /*
- * Don't move LINK LOCAL addresses as they are tied to
- * physical interface.
- */
- if (from_ill->ill_isv6 &&
- IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) {
- ipif->ipif_was_up = B_FALSE;
- IPIF_UNMARK_MOVING(ipif);
- return (0);
- }
-
- /*
- * We set the ipif_id to maximum so that the search for
- * ipif_id will pick the lowest number i.e 0 in the
- * following 2 cases :
- *
- * 1) We have a replacement ipif at the head of to_ill.
- * We can't remove it yet as we can exceed ip_addrs_per_if
- * on to_ill and hence the MOVE might fail. We want to
- * remove it only if we could move the ipif. Thus, by
- * setting it to the MAX value, we make the search in
- * ipif_get_id return the zeroth id.
- *
- * 2) When DR pulls out the NIC and re-plumbs the interface,
- * we might just have a zero address plumbed on the ipif
- * with zero id in the case of IPv4. We remove that while
- * doing the failback. We want to remove it only if we
- * could move the ipif. Thus, by setting it to the MAX
- * value, we make the search in ipif_get_id return the
- * zeroth id.
- *
- * Both (1) and (2) are done only when when we are moving
- * an ipif (either due to failover/failback) which originally
- * belonged to this interface i.e the ipif_orig_ifindex is
- * the same as to_ill's ifindex. This is needed so that
- * FAILOVER from A -> B ( A failed) followed by FAILOVER
- * from B -> A (B is being removed from the group) and
- * FAILBACK from A -> B restores the original configuration.
- * Without the check for orig_ifindex, the second FAILOVER
- * could make the ipif belonging to B replace the A's zeroth
- * ipif and the subsequent failback re-creating the replacement
- * ipif again.
- *
- * NOTE : We created the replacement ipif when we did a
- * FAILOVER (See below). We could check for FAILBACK and
- * then look for replacement ipif to be removed. But we don't
- * want to do that because we wan't to allow the possibility
- * of a FAILOVER from A -> B (which creates the replacement ipif),
- * followed by a *FAILOVER* from B -> A instead of a FAILBACK
- * from B -> A.
- */
- to_ipif = to_ill->ill_ipif;
- if ((to_ill->ill_phyint->phyint_ifindex ==
- ipif->ipif_orig_ifindex) &&
- to_ipif->ipif_replace_zero) {
- ASSERT(to_ipif->ipif_id == 0);
- remove_ipif = B_TRUE;
- to_ipif->ipif_id = MAX_ADDRS_PER_IF;
- }
- /*
- * Find the lowest logical unit number on the to_ill.
- * If we are failing back, try to get the original id
- * rather than the lowest one so that the original
- * configuration is maintained.
- *
- * XXX need a better scheme for this.
- */
- if (failback_cmd) {
- unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid);
- } else {
- unit = ipif_get_id(to_ill, 0);
- }
-
- /* Reset back to zero in case we fail below */
- if (to_ipif->ipif_id == MAX_ADDRS_PER_IF)
- to_ipif->ipif_id = 0;
+ ill_t *bound_ill;
+ struct lifreq *lifr = ifreq;
- if (unit == ipst->ips_ip_addrs_per_if) {
- ipif->ipif_was_up = B_FALSE;
- IPIF_UNMARK_MOVING(ipif);
+ if (!IS_IPMP(ipif->ipif_ill))
return (EINVAL);
- }
-
- /*
- * ipif is ready to move from "from_ill" to "to_ill".
- *
- * 1) If we are moving ipif with id zero, create a
- * replacement ipif for this ipif on from_ill. If this fails
- * fail the MOVE operation.
- *
- * 2) Remove the replacement ipif on to_ill if any.
- * We could remove the replacement ipif when we are moving
- * the ipif with id zero. But what if somebody already
- * unplumbed it ? Thus we always remove it if it is present.
- * We want to do it only if we are sure we are going to
- * move the ipif to to_ill which is why there are no
- * returns due to error till ipif is linked to to_ill.
- * Note that the first ipif that we failback will always
- * be zero if it is present.
- */
- if (ipif->ipif_id == 0) {
- ipaddr_t inaddr_any = INADDR_ANY;
- rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED);
- if (rep_ipif == NULL) {
- ipif->ipif_was_up = B_FALSE;
- IPIF_UNMARK_MOVING(ipif);
- return (ENOMEM);
- }
- *rep_ipif = ipif_zero;
- /*
- * Before we put the ipif on the list, store the addresses
- * as mapped addresses as some of the ioctls e.g SIOCGIFADDR
- * assumes so. This logic is not any different from what
- * ipif_allocate does.
- */
- IN6_IPADDR_TO_V4MAPPED(inaddr_any,
- &rep_ipif->ipif_v6lcl_addr);
- IN6_IPADDR_TO_V4MAPPED(inaddr_any,
- &rep_ipif->ipif_v6src_addr);
- IN6_IPADDR_TO_V4MAPPED(inaddr_any,
- &rep_ipif->ipif_v6subnet);
- IN6_IPADDR_TO_V4MAPPED(inaddr_any,
- &rep_ipif->ipif_v6net_mask);
- IN6_IPADDR_TO_V4MAPPED(inaddr_any,
- &rep_ipif->ipif_v6brd_addr);
- IN6_IPADDR_TO_V4MAPPED(inaddr_any,
- &rep_ipif->ipif_v6pp_dst_addr);
- /*
- * We mark IPIF_NOFAILOVER so that this can never
- * move.
- */
- rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER;
- rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE;
- rep_ipif->ipif_replace_zero = B_TRUE;
- mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL,
- MUTEX_DEFAULT, NULL);
- rep_ipif->ipif_id = 0;
- rep_ipif->ipif_ire_type = ipif->ipif_ire_type;
- rep_ipif->ipif_ill = from_ill;
- rep_ipif->ipif_orig_ifindex =
- from_ill->ill_phyint->phyint_ifindex;
- /* Insert at head */
- rep_ipif->ipif_next = from_ill->ill_ipif;
- from_ill->ill_ipif = rep_ipif;
- /*
- * We don't really care to let apps know about
- * this interface.
- */
- }
-
- if (remove_ipif) {
- /*
- * We set to a max value above for this case to get
- * id zero. ASSERT that we did get one.
- */
- ASSERT((to_ipif->ipif_id == 0) && (unit == 0));
- rep_ipif = to_ipif;
- to_ill->ill_ipif = rep_ipif->ipif_next;
- rep_ipif->ipif_next = NULL;
- /*
- * If some apps scanned and find this interface,
- * it is time to let them know, so that they can
- * delete it.
- */
-
- *rep_ipif_ptr = rep_ipif;
- }
-
- /* Get it out of the ILL interface list. */
- ipif_remove(ipif, B_FALSE);
-
- /* Assign the new ill */
- ipif->ipif_ill = to_ill;
- ipif->ipif_id = unit;
- /* id has already been checked */
- rc = ipif_insert(ipif, B_FALSE, B_FALSE);
- ASSERT(rc == 0);
- /* Let SCTP update its list */
- sctp_move_ipif(ipif, from_ill, to_ill);
- /*
- * Handle the failover and failback of ipif_t between
- * ill_t that have differing maximum mtu values.
- */
- if (ipif->ipif_mtu > to_ill->ill_max_mtu) {
- if (ipif->ipif_saved_mtu == 0) {
- /*
- * As this ipif_t is moving to an ill_t
- * that has a lower ill_max_mtu, its
- * ipif_mtu needs to be saved so it can
- * be restored during failback or during
- * failover to an ill_t which has a
- * higher ill_max_mtu.
- */
- ipif->ipif_saved_mtu = ipif->ipif_mtu;
- ipif->ipif_mtu = to_ill->ill_max_mtu;
- } else {
- /*
- * The ipif_t is, once again, moving to
- * an ill_t that has a lower maximum mtu
- * value.
- */
- ipif->ipif_mtu = to_ill->ill_max_mtu;
- }
- } else if (ipif->ipif_mtu < to_ill->ill_max_mtu &&
- ipif->ipif_saved_mtu != 0) {
- /*
- * The mtu of this ipif_t had to be reduced
- * during an earlier failover; this is an
- * opportunity for it to be increased (either as
- * part of another failover or a failback).
- */
- if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) {
- ipif->ipif_mtu = ipif->ipif_saved_mtu;
- ipif->ipif_saved_mtu = 0;
- } else {
- ipif->ipif_mtu = to_ill->ill_max_mtu;
- }
+ if ((bound_ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) {
+ lifr->lifr_binding[0] = '\0';
+ return (0);
}
- /*
- * We preserve all the other fields of the ipif including
- * ipif_saved_ire_mp. The routes that are saved here will
- * be recreated on the new interface and back on the old
- * interface when we move back.
- */
- ASSERT(ipif->ipif_arp_del_mp == NULL);
-
- return (err);
-}
-
-static int
-ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp,
- int ifindex, ipif_t **rep_ipif_ptr)
-{
- ipif_t *mipif;
- ipif_t *ipif_next;
- int err;
-
- /*
- * We don't really try to MOVE back things if some of the
- * operations fail. The daemon will take care of moving again
- * later on.
- */
- for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) {
- ipif_next = mipif->ipif_next;
- if (!(mipif->ipif_flags & IPIF_NOFAILOVER) &&
- (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) {
-
- err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr);
-
- /*
- * When the MOVE fails, it is the job of the
- * application to take care of this properly
- * i.e try again if it is ENOMEM.
- */
- if (mipif->ipif_ill != from_ill) {
- /*
- * ipif has moved.
- *
- * Move the multicast memberships associated
- * with this ipif to the new ill. For IPv6, we
- * do it once after all the ipifs are moved
- * (in ill_move) as they are not associated
- * with ipifs.
- *
- * We need to move the ilms as the ipif has
- * already been moved to a new ill even
- * in the case of errors. Neither
- * ilm_free(ipif) will find the ilm
- * when somebody unplumbs this ipif nor
- * ilm_delete(ilm) will be able to find the
- * ilm, if we don't move now.
- */
- if (!from_ill->ill_isv6)
- ilm_move_v4(from_ill, to_ill, mipif);
- }
-
- if (err != 0)
- return (err);
- }
- }
+ (void) strlcpy(lifr->lifr_binding, bound_ill->ill_name, LIFNAMSIZ);
+ ill_refrele(bound_ill);
return (0);
}
-static int
-ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp)
-{
- int ifindex;
- int err;
- struct iocblk *iocp;
- ipif_t *ipif;
- ipif_t *rep_ipif_ptr = NULL;
- ipif_t *from_ipif = NULL;
- boolean_t check_rep_if = B_FALSE;
- ip_stack_t *ipst = from_ill->ill_ipst;
-
- iocp = (struct iocblk *)mp->b_rptr;
- if (iocp->ioc_cmd == SIOCLIFFAILOVER) {
- /*
- * Move everything pointing at from_ill to to_ill.
- * We acheive this by passing in 0 as ifindex.
- */
- ifindex = 0;
- } else {
- /*
- * Move everything pointing at from_ill whose original
- * ifindex of connp, ipif, ilm points at to_ill->ill_index.
- * We acheive this by passing in ifindex rather than 0.
- * Multicast vifs, ilgs move implicitly because ipifs move.
- */
- ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK);
- ifindex = to_ill->ill_phyint->phyint_ifindex;
- }
-
- /*
- * Determine if there is at least one ipif that would move from
- * 'from_ill' to 'to_ill'. If so, it is possible that the replacement
- * ipif (if it exists) on the to_ill would be consumed as a result of
- * the move, in which case we need to quiesce the replacement ipif also.
- */
- for (from_ipif = from_ill->ill_ipif; from_ipif != NULL;
- from_ipif = from_ipif->ipif_next) {
- if (((ifindex == 0) ||
- (ifindex == from_ipif->ipif_orig_ifindex)) &&
- !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) {
- check_rep_if = B_TRUE;
- break;
- }
- }
-
- ill_down_ipifs(from_ill, mp, ifindex, B_TRUE);
-
- GRAB_ILL_LOCKS(from_ill, to_ill);
- if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) {
- (void) ipsq_pending_mp_add(NULL, ipif, q,
- mp, ILL_MOVE_OK);
- RELEASE_ILL_LOCKS(from_ill, to_ill);
- return (EINPROGRESS);
- }
-
- /* Check if the replacement ipif is quiescent to delete */
- if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif,
- (iocp->ioc_cmd == SIOCLIFFAILBACK))) {
- to_ill->ill_ipif->ipif_state_flags |=
- IPIF_MOVING | IPIF_CHANGING;
- if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) {
- (void) ipsq_pending_mp_add(NULL, ipif, q,
- mp, ILL_MOVE_OK);
- RELEASE_ILL_LOCKS(from_ill, to_ill);
- return (EINPROGRESS);
- }
- }
- RELEASE_ILL_LOCKS(from_ill, to_ill);
-
- ASSERT(!MUTEX_HELD(&to_ill->ill_lock));
- rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
- GRAB_ILL_LOCKS(from_ill, to_ill);
- err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr);
-
- /* ilm_move is done inside ipif_move for IPv4 */
- if (err == 0 && from_ill->ill_isv6)
- ilm_move_v6(from_ill, to_ill, ifindex);
-
- RELEASE_ILL_LOCKS(from_ill, to_ill);
- rw_exit(&ipst->ips_ill_g_lock);
-
- /*
- * send rts messages and multicast messages.
- */
- if (rep_ipif_ptr != NULL) {
- if (rep_ipif_ptr->ipif_recovery_id != 0) {
- (void) untimeout(rep_ipif_ptr->ipif_recovery_id);
- rep_ipif_ptr->ipif_recovery_id = 0;
- }
- ip_rts_ifmsg(rep_ipif_ptr);
- ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr);
-#ifdef DEBUG
- ipif_trace_cleanup(rep_ipif_ptr);
-#endif
- mi_free(rep_ipif_ptr);
- }
-
- conn_move_ill(from_ill, to_ill, ifindex);
-
- return (err);
-}
-
/*
- * Used to extract arguments for FAILOVER/FAILBACK ioctls.
- * Also checks for the validity of the arguments.
- * Note: We are already exclusive inside the from group.
- * It is upto the caller to release refcnt on the to_ill's.
+ * Process an SIOCGLIFGROUPNAME request.
*/
-static int
-ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4,
- ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6)
+/* ARGSUSED */
+int
+ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
+ ip_ioctl_cmd_t *ipip, void *ifreq)
{
- int dst_index;
- ipif_t *ipif_v4, *ipif_v6;
- struct lifreq *lifr;
- mblk_t *mp1;
- boolean_t exists;
- sin_t *sin;
- int err = 0;
- ip_stack_t *ipst;
+ ipmp_grp_t *grp;
+ struct lifreq *lifr = ifreq;
+ ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
- if (CONN_Q(q))
- ipst = CONNQ_TO_IPST(q);
+ rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+ if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL)
+ lifr->lifr_groupname[0] = '\0';
else
- ipst = ILLQ_TO_IPST(q);
-
- if ((mp1 = mp->b_cont) == NULL)
- return (EPROTO);
-
- if ((mp1 = mp1->b_cont) == NULL)
- return (EPROTO);
-
- lifr = (struct lifreq *)mp1->b_rptr;
- sin = (sin_t *)&lifr->lifr_addr;
-
- /*
- * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6
- * specific operations.
- */
- if (sin->sin_family != AF_UNSPEC)
- return (EINVAL);
-
- /*
- * Get ipif with id 0. We are writer on the from ill. So we can pass
- * NULLs for the last 4 args and we know the lookup won't fail
- * with EINPROGRESS.
- */
- ipif_v4 = ipif_lookup_on_name(lifr->lifr_name,
- mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE,
- ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
- ipif_v6 = ipif_lookup_on_name(lifr->lifr_name,
- mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE,
- ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
-
- if (ipif_v4 == NULL && ipif_v6 == NULL)
- return (ENXIO);
-
- if (ipif_v4 != NULL) {
- ASSERT(ipif_v4->ipif_refcnt != 0);
- if (ipif_v4->ipif_id != 0) {
- err = EINVAL;
- goto done;
- }
-
- ASSERT(IAM_WRITER_IPIF(ipif_v4));
- *ill_from_v4 = ipif_v4->ipif_ill;
- }
-
- if (ipif_v6 != NULL) {
- ASSERT(ipif_v6->ipif_refcnt != 0);
- if (ipif_v6->ipif_id != 0) {
- err = EINVAL;
- goto done;
- }
-
- ASSERT(IAM_WRITER_IPIF(ipif_v6));
- *ill_from_v6 = ipif_v6->ipif_ill;
- }
-
- err = 0;
- dst_index = lifr->lifr_movetoindex;
- *ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE,
- q, mp, ip_process_ioctl, &err, ipst);
- if (err != 0) {
- /*
- * A move may be in progress, EINPROGRESS looking up the "to"
- * ill means changes already done to the "from" ipsq need to
- * be undone to avoid potential deadlocks.
- *
- * ENXIO will usually be because there is only v6 on the ill,
- * that's not treated as an error unless an ENXIO is also
- * seen when looking up the v6 "to" ill.
- *
- * If EINPROGRESS, the mp has been enqueued and can not be
- * used to look up the v6 "to" ill, but a preemptive clean
- * up of changes to the v6 "from" ipsq is done.
- */
- if (err == EINPROGRESS) {
- if (*ill_from_v4 != NULL) {
- ill_t *from_ill;
- ipsq_t *from_ipsq;
-
- from_ill = ipif_v4->ipif_ill;
- from_ipsq = from_ill->ill_phyint->phyint_ipsq;
-
- mutex_enter(&from_ipsq->ipsq_lock);
- from_ipsq->ipsq_current_ipif = NULL;
- mutex_exit(&from_ipsq->ipsq_lock);
- }
- if (*ill_from_v6 != NULL) {
- ill_t *from_ill;
- ipsq_t *from_ipsq;
-
- from_ill = ipif_v6->ipif_ill;
- from_ipsq = from_ill->ill_phyint->phyint_ipsq;
-
- mutex_enter(&from_ipsq->ipsq_lock);
- from_ipsq->ipsq_current_ipif = NULL;
- mutex_exit(&from_ipsq->ipsq_lock);
- }
- goto done;
- }
- ASSERT(err == ENXIO);
- err = 0;
- }
-
- *ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE,
- q, mp, ip_process_ioctl, &err, ipst);
- if (err != 0) {
- /*
- * A move may be in progress, EINPROGRESS looking up the "to"
- * ill means changes already done to the "from" ipsq need to
- * be undone to avoid potential deadlocks.
- */
- if (err == EINPROGRESS) {
- if (*ill_from_v6 != NULL) {
- ill_t *from_ill;
- ipsq_t *from_ipsq;
-
- from_ill = ipif_v6->ipif_ill;
- from_ipsq = from_ill->ill_phyint->phyint_ipsq;
-
- mutex_enter(&from_ipsq->ipsq_lock);
- from_ipsq->ipsq_current_ipif = NULL;
- mutex_exit(&from_ipsq->ipsq_lock);
- }
- goto done;
- }
- ASSERT(err == ENXIO);
-
- /* Both v4 and v6 lookup failed */
- if (*ill_to_v4 == NULL) {
- err = ENXIO;
- goto done;
- }
- err = 0;
- }
-
- /*
- * If we have something to MOVE i.e "from" not NULL,
- * "to" should be non-NULL.
- */
- if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) ||
- (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) {
- err = EINVAL;
- }
-
-done:
- if (ipif_v4 != NULL)
- ipif_refrele(ipif_v4);
- if (ipif_v6 != NULL)
- ipif_refrele(ipif_v6);
- return (err);
+ (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ);
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (0);
}
/*
- * FAILOVER and FAILBACK are modelled as MOVE operations.
- *
- * We don't check whether the MOVE is within the same group or
- * not, because this ioctl can be used as a generic mechanism
- * to failover from interface A to B, though things will function
- * only if they are really part of the same group. Moreover,
- * all ipifs may be down and hence temporarily out of the group.
- *
- * ipif's that need to be moved are first brought down; V4 ipifs are brought
- * down first and then V6. For each we wait for the ipif's to become quiescent.
- * Bringing down the ipifs ensures that all ires pointing to these ipifs's
- * have been deleted and there are no active references. Once quiescent the
- * ipif's are moved and brought up on the new ill.
- *
- * Normally the source ill and destination ill belong to the same IPMP group
- * and hence the same ipsq_t. In the event they don't belong to the same
- * same group the two ipsq's are first merged into one ipsq - that of the
- * to_ill. The multicast memberships on the source and destination ill cannot
- * change during the move operation since multicast joins/leaves also have to
- * execute on the same ipsq and are hence serialized.
+ * Process an SIOCGLIFGROUPINFO request.
*/
/* ARGSUSED */
int
-ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
- ip_ioctl_cmd_t *ipip, void *ifreq)
+ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
+ ip_ioctl_cmd_t *ipip, void *dummy)
{
- ill_t *ill_to_v4 = NULL;
- ill_t *ill_to_v6 = NULL;
- ill_t *ill_from_v4 = NULL;
- ill_t *ill_from_v6 = NULL;
- int err = 0;
-
- /*
- * setup from and to ill's, we can get EINPROGRESS only for
- * to_ill's.
- */
- err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6,
- &ill_to_v4, &ill_to_v6);
-
- if (err != 0) {
- ip0dbg(("ip_sioctl_move: extract args failed\n"));
- goto done;
- }
-
- /*
- * nothing to do.
- */
- if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) {
- goto done;
- }
-
- /*
- * nothing to do.
- */
- if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) {
- goto done;
- }
-
- /*
- * Mark the ill as changing.
- * ILL_CHANGING flag is cleared when the ipif's are brought up
- * in ill_up_ipifs in case of error they are cleared below.
- */
-
- GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
- if (ill_from_v4 != NULL)
- ill_from_v4->ill_state_flags |= ILL_CHANGING;
- if (ill_from_v6 != NULL)
- ill_from_v6->ill_state_flags |= ILL_CHANGING;
- RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);
-
- /*
- * Make sure that both src and dst are
- * in the same syncq group. If not make it happen.
- * We are not holding any locks because we are the writer
- * on the from_ipsq and we will hold locks in ill_merge_groups
- * to protect to_ipsq against changing.
- */
- if (ill_from_v4 != NULL) {
- if (ill_from_v4->ill_phyint->phyint_ipsq !=
- ill_to_v4->ill_phyint->phyint_ipsq) {
- err = ill_merge_groups(ill_from_v4, ill_to_v4,
- NULL, mp, q);
- goto err_ret;
-
- }
- ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock));
- } else {
-
- if (ill_from_v6->ill_phyint->phyint_ipsq !=
- ill_to_v6->ill_phyint->phyint_ipsq) {
- err = ill_merge_groups(ill_from_v6, ill_to_v6,
- NULL, mp, q);
- goto err_ret;
-
- }
- ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock));
- }
-
- /*
- * Now that the ipsq's have been merged and we are the writer
- * lets mark to_ill as changing as well.
- */
-
- GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
- if (ill_to_v4 != NULL)
- ill_to_v4->ill_state_flags |= ILL_CHANGING;
- if (ill_to_v6 != NULL)
- ill_to_v6->ill_state_flags |= ILL_CHANGING;
- RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
-
- /*
- * Its ok for us to proceed with the move even if
- * ill_pending_mp is non null on one of the from ill's as the reply
- * should not be looking at the ipif, it should only care about the
- * ill itself.
- */
-
- /*
- * lets move ipv4 first.
- */
- if (ill_from_v4 != NULL) {
- ASSERT(IAM_WRITER_ILL(ill_to_v4));
- ill_from_v4->ill_move_in_progress = B_TRUE;
- ill_to_v4->ill_move_in_progress = B_TRUE;
- ill_to_v4->ill_move_peer = ill_from_v4;
- ill_from_v4->ill_move_peer = ill_to_v4;
- err = ill_move(ill_from_v4, ill_to_v4, q, mp);
- }
-
- /*
- * Now lets move ipv6.
- */
- if (err == 0 && ill_from_v6 != NULL) {
- ASSERT(IAM_WRITER_ILL(ill_to_v6));
- ill_from_v6->ill_move_in_progress = B_TRUE;
- ill_to_v6->ill_move_in_progress = B_TRUE;
- ill_to_v6->ill_move_peer = ill_from_v6;
- ill_from_v6->ill_move_peer = ill_to_v6;
- err = ill_move(ill_from_v6, ill_to_v6, q, mp);
- }
-
-err_ret:
- /*
- * EINPROGRESS means we are waiting for the ipif's that need to be
- * moved to become quiescent.
- */
- if (err == EINPROGRESS) {
- goto done;
- }
-
- /*
- * if err is set ill_up_ipifs will not be called
- * lets clear the flags.
- */
-
- GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
- GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
- /*
- * Some of the clearing may be redundant. But it is simple
- * not making any extra checks.
- */
- if (ill_from_v6 != NULL) {
- ill_from_v6->ill_move_in_progress = B_FALSE;
- ill_from_v6->ill_move_peer = NULL;
- ill_from_v6->ill_state_flags &= ~ILL_CHANGING;
- }
- if (ill_from_v4 != NULL) {
- ill_from_v4->ill_move_in_progress = B_FALSE;
- ill_from_v4->ill_move_peer = NULL;
- ill_from_v4->ill_state_flags &= ~ILL_CHANGING;
- }
- if (ill_to_v6 != NULL) {
- ill_to_v6->ill_move_in_progress = B_FALSE;
- ill_to_v6->ill_move_peer = NULL;
- ill_to_v6->ill_state_flags &= ~ILL_CHANGING;
- }
- if (ill_to_v4 != NULL) {
- ill_to_v4->ill_move_in_progress = B_FALSE;
- ill_to_v4->ill_move_peer = NULL;
- ill_to_v4->ill_state_flags &= ~ILL_CHANGING;
- }
-
- /*
- * Check for setting INACTIVE, if STANDBY is set and FAILED is not set.
- * Do this always to maintain proper state i.e even in case of errors.
- * As phyint_inactive looks at both v4 and v6 interfaces,
- * we need not call on both v4 and v6 interfaces.
- */
- if (ill_from_v4 != NULL) {
- if ((ill_from_v4->ill_phyint->phyint_flags &
- (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
- phyint_inactive(ill_from_v4->ill_phyint);
- }
- } else if (ill_from_v6 != NULL) {
- if ((ill_from_v6->ill_phyint->phyint_flags &
- (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
- phyint_inactive(ill_from_v6->ill_phyint);
- }
- }
-
- if (ill_to_v4 != NULL) {
- if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) {
- ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
- }
- } else if (ill_to_v6 != NULL) {
- if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) {
- ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
- }
- }
-
- RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
- RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);
-
-no_err:
- /*
- * lets bring the interfaces up on the to_ill.
- */
- if (err == 0) {
- err = ill_up_ipifs(ill_to_v4 == NULL ? ill_to_v6:ill_to_v4,
- q, mp);
- }
-
- if (err == 0) {
- if (ill_from_v4 != NULL && ill_to_v4 != NULL)
- ilm_send_multicast_reqs(ill_from_v4, ill_to_v4);
+ lifgroupinfo_t *lifgr;
+ ipmp_grp_t *grp;
+ ip_stack_t *ipst = CONNQ_TO_IPST(q);
- if (ill_from_v6 != NULL && ill_to_v6 != NULL)
- ilm_send_multicast_reqs(ill_from_v6, ill_to_v6);
- }
-done:
+ /* ip_wput_nondata() verified mp->b_cont->b_cont */
+ lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
+ lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';
- if (ill_to_v4 != NULL) {
- ill_refrele(ill_to_v4);
- }
- if (ill_to_v6 != NULL) {
- ill_refrele(ill_to_v6);
+ rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+ if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (ENOENT);
}
-
- return (err);
+ ipmp_grp_info(grp, lifgr);
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (0);
}
static void
@@ -18167,10 +14492,9 @@ ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
* we only wait for the ACK of the DL_UNBIND_REQ.
*/
mutex_enter(&ill->ill_lock);
- if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
- (prim == DL_UNBIND_REQ)) {
+ if (!(ill->ill_state_flags & ILL_CONDEMNED) || (prim == DL_UNBIND_REQ))
ill->ill_dlpi_pending = prim;
- }
+
mutex_exit(&ill->ill_lock);
putnext(ill->ill_wq, mp);
}
@@ -18324,6 +14648,7 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
{
mblk_t *mp;
ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
+ ipxop_t *ipx = ipsq->ipsq_xop;
ASSERT(IAM_WRITER_IPSQ(ipsq));
mutex_enter(&ill->ill_lock);
@@ -18336,12 +14661,11 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
if ((mp = ill->ill_dlpi_deferred) == NULL) {
ill->ill_dlpi_pending = DL_PRIM_INVAL;
-
- mutex_enter(&ipsq->ipsq_lock);
- if (ipsq->ipsq_current_done)
- ipsq->ipsq_current_ipif = NULL;
- mutex_exit(&ipsq->ipsq_lock);
-
+ if (ipx->ipx_current_done) {
+ mutex_enter(&ipx->ipx_lock);
+ ipx->ipx_current_ipif = NULL;
+ mutex_exit(&ipx->ipx_lock);
+ }
cv_signal(&ill->ill_cv);
mutex_exit(&ill->ill_lock);
return;
@@ -18379,7 +14703,7 @@ conn_delete_ire(conn_t *connp, caddr_t arg)
}
/*
- * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number
+ * Some operations (e.g., ipif_down()) conditionally delete a number
* of IREs. Those IREs may have been previously cached in the conn structure.
* This ipcl_walk() walker function releases all references to such IREs based
* on the condemned flag.
@@ -18403,7 +14727,6 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
/*
* Take down a specific interface, but don't lose any information about it.
- * Also delete interface from its interface group (ifgrp).
* (Always called as writer.)
* This function goes through the down sequence even if the interface is
* already down. There are 2 reasons.
@@ -18501,7 +14824,7 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
* For eg. bind, and route operations (Eg. route add / delete) cannot return
* failure if the ipif is currently undergoing an exclusive operation, and
* hence pass the flag. The mblk is then enqueued in the ipsq and the operation
- * is restarted by ipsq_exit() when the currently exclusive ioctl completes.
+ * is restarted by ipsq_exit() when the current exclusive operation completes.
* The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The
* lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
* change while the ill_lock is held. Before dropping the ill_lock we acquire
@@ -18522,7 +14845,6 @@ int
ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
{
ill_t *ill = ipif->ipif_ill;
- phyint_t *phyi;
conn_t *connp;
boolean_t success;
boolean_t ipif_was_up = B_FALSE;
@@ -18569,20 +14891,7 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
}
/*
- * Before we delete the ill from the group (if any), we need
- * to make sure that we delete all the routes dependent on
- * this and also any ipifs dependent on this ipif for
- * source address. We need to do before we delete from
- * the group because
- *
- * 1) ipif_down_delete_ire de-references ill->ill_group.
- *
- * 2) ipif_update_other_ipifs needs to walk the whole group
- * for re-doing source address selection. Note that
- * ipif_select_source[_v6] called from
- * ipif_update_other_ipifs[_v6] will not pick this ipif
- * because we have already marked down here i.e cleared
- * IPIF_UP.
+ * Delete all IREs pointing at this ipif or its source address.
*/
if (ipif->ipif_isv6) {
ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES,
@@ -18592,6 +14901,17 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
ipst);
}
+ if (ipif_was_up && ill->ill_ipif_up_count == 0) {
+ /*
+ * Since the interface is now down, it may have just become
+ * inactive. Note that this needs to be done even when
+ * ill_logical_down is set, or ARP entries will not get correctly
+ * restored when the interface comes back up.
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_ill_refresh_active(ill);
+ }
+
/*
* Cleaning up the conn_ire_cache or conns must be done only after the
* ires have been deleted above. Otherwise a thread could end up
@@ -18609,53 +14929,9 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
* entries for such ipifs.
*/
if (ipif->ipif_isv6)
- ipif_update_other_ipifs_v6(ipif, ill->ill_group);
+ ipif_update_other_ipifs_v6(ipif);
else
- ipif_update_other_ipifs(ipif, ill->ill_group);
-
- if (ipif_was_up) {
- /*
- * Check whether it is last ipif to leave this group.
- * If this is the last ipif to leave, we should remove
- * this ill from the group as ipif_select_source will not
- * be able to find any useful ipifs if this ill is selected
- * for load balancing.
- *
- * For nameless groups, we should call ifgrp_delete if this
- * belongs to some group. As this ipif is going down, we may
- * need to reconstruct groups.
- */
- phyi = ill->ill_phyint;
- /*
- * If the phyint_groupname_len is 0, it may or may not
- * be in the nameless group. If the phyint_groupname_len is
- * not 0, then this ill should be part of some group.
- * As we always insert this ill in the group if
- * phyint_groupname_len is not zero when the first ipif
- * comes up (in ipif_up_done), it should be in a group
- * when the namelen is not 0.
- *
- * NOTE : When we delete the ill from the group,it will
- * blow away all the IRE_CACHES pointing either at this ipif or
- * ill_wq (illgrp_cache_delete does this). Thus, no IRES
- * should be pointing at this ill.
- */
- ASSERT(phyi->phyint_groupname_len == 0 ||
- (phyi->phyint_groupname != NULL && ill->ill_group != NULL));
-
- if (phyi->phyint_groupname_len != 0) {
- if (ill->ill_ipif_up_count == 0)
- illgrp_delete(ill);
- }
-
- /*
- * If we have deleted some of the broadcast ires associated
- * with this ipif, we need to re-nominate somebody else if
- * the ires that we deleted were the nominated ones.
- */
- if (ill->ill_group != NULL && !ill->ill_isv6)
- ipif_renominate_bcast(ipif);
- }
+ ipif_update_other_ipifs(ipif);
/*
* neighbor-discovery or arp entries for this interface.
@@ -18734,17 +15010,12 @@ ipif_down_tail(ipif_t *ipif)
ill->ill_logical_down = 0;
/*
- * Have to be after removing the routes in ipif_down_delete_ire.
+ * Has to be after removing the routes in ipif_down_delete_ire.
*/
- if (ipif->ipif_isv6) {
- if (ill->ill_flags & ILLF_XRESOLV)
- ipif_arp_down(ipif);
- } else {
- ipif_arp_down(ipif);
- }
+ ipif_resolver_down(ipif);
- ip_rts_ifmsg(ipif);
- ip_rts_newaddrmsg(RTM_DELETE, 0, ipif);
+ ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
+ ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT);
}
/*
@@ -18804,39 +15075,11 @@ static void
ipif_down_delete_ire(ire_t *ire, char *ipif_arg)
{
ipif_t *ipif = (ipif_t *)ipif_arg;
- ill_t *ire_ill;
- ill_t *ipif_ill;
ASSERT(IAM_WRITER_IPIF(ipif));
if (ire->ire_ipif == NULL)
return;
- /*
- * For IPv4, we derive source addresses for an IRE from ipif's
- * belonging to the same IPMP group as the IRE's outgoing
- * interface. If an IRE's outgoing interface isn't in the
- * same IPMP group as a particular ipif, then that ipif
- * couldn't have been used as a source address for this IRE.
- *
- * For IPv6, source addresses are only restricted to the IPMP group
- * if the IRE is for a link-local address or a multicast address.
- * Otherwise, source addresses for an IRE can be chosen from
- * interfaces other than the the outgoing interface for that IRE.
- *
- * For source address selection details, see ipif_select_source()
- * and ipif_select_source_v6().
- */
- if (ire->ire_ipversion == IPV4_VERSION ||
- IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) ||
- IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) {
- ire_ill = ire->ire_ipif->ipif_ill;
- ipif_ill = ipif->ipif_ill;
-
- if (ire_ill->ill_group != ipif_ill->ill_group) {
- return;
- }
- }
-
if (ire->ire_ipif != ipif) {
/*
* Look for a matching source address.
@@ -18875,83 +15118,53 @@ void
ill_ipif_cache_delete(ire_t *ire, char *ill_arg)
{
ill_t *ill = (ill_t *)ill_arg;
- ill_t *ipif_ill;
ASSERT(IAM_WRITER_ILL(ill));
- /*
- * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
- * Hence this should be IRE_CACHE.
- */
ASSERT(ire->ire_type == IRE_CACHE);
/*
- * We are called for IRE_CACHES whose ire_ipif matches ill.
- * We are only interested in IRE_CACHES that has borrowed
- * the source address from ill_arg e.g. ipif_up_done[_v6]
- * for which we need to look at ire_ipif->ipif_ill match
- * with ill.
+ * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches
+ * ill, but we only want to delete the IRE if ire_ipif matches.
*/
ASSERT(ire->ire_ipif != NULL);
- ipif_ill = ire->ire_ipif->ipif_ill;
- if (ipif_ill == ill || (ill->ill_group != NULL &&
- ipif_ill->ill_group == ill->ill_group)) {
+ if (ill == ire->ire_ipif->ipif_ill)
ire_delete(ire);
- }
}
/*
- * Delete all the ire whose stq references ill_arg.
+ * Delete all the IREs whose ire_stq's reference `ill_arg'. IPMP uses this
+ * instead of ill_ipif_cache_delete() because ire_ipif->ipif_ill references
+ * the IPMP ill.
*/
-static void
+void
ill_stq_cache_delete(ire_t *ire, char *ill_arg)
{
ill_t *ill = (ill_t *)ill_arg;
- ill_t *ire_ill;
ASSERT(IAM_WRITER_ILL(ill));
- /*
- * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
- * Hence this should be IRE_CACHE.
- */
ASSERT(ire->ire_type == IRE_CACHE);
/*
- * We are called for IRE_CACHES whose ire_stq and ire_ipif
- * matches ill. We are only interested in IRE_CACHES that
- * has ire_stq->q_ptr pointing at ill_arg. Thus we do the
- * filtering here.
+ * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches
+ * ill, but we only want to delete the IRE if ire_stq matches.
*/
- ire_ill = (ill_t *)ire->ire_stq->q_ptr;
-
- if (ire_ill == ill)
+ if (ire->ire_stq->q_ptr == ill_arg)
ire_delete(ire);
}
/*
- * This is called when an ill leaves the group. We want to delete
- * all IRE_CACHES whose stq is pointing at ill_wq or ire_ipif is
- * pointing at ill.
+ * Delete all broadcast IREs with a source address on `ill_arg'.
*/
static void
-illgrp_cache_delete(ire_t *ire, char *ill_arg)
+ill_broadcast_delete(ire_t *ire, char *ill_arg)
{
- ill_t *ill = (ill_t *)ill_arg;
+ ill_t *ill = (ill_t *)ill_arg;
ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(ill->ill_group == NULL);
- /*
- * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
- * Hence this should be IRE_CACHE.
- */
- ASSERT(ire->ire_type == IRE_CACHE);
- /*
- * We are called for IRE_CACHES whose ire_stq and ire_ipif
- * matches ill. We are interested in both.
- */
- ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) ||
- (ire->ire_ipif->ipif_ill == ill));
+ ASSERT(ire->ire_type == IRE_BROADCAST);
- ire_delete(ire);
+ if (ire->ire_ipif->ipif_ill == ill)
+ ire_delete(ire);
}
/*
@@ -18997,13 +15210,12 @@ ipif_free(ipif_t *ipif)
rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
/* Remove pointers to this ill in the multicast routing tables */
reset_mrt_vif_ipif(ipif);
+ /* If necessary, clear the cached source ipif rotor. */
+ if (ipif->ipif_ill->ill_src_ipif == ipif)
+ ipif->ipif_ill->ill_src_ipif = NULL;
rw_exit(&ipst->ips_ill_g_lock);
}
-/*
- * Warning: this is not the only function that calls mi_free on an ipif_t. See
- * also ill_move().
- */
static void
ipif_free_tail(ipif_t *ipif)
{
@@ -19036,7 +15248,7 @@ ipif_free_tail(ipif_t *ipif)
sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);
/* Get it out of the ILL interface list. */
- ipif_remove(ipif, B_TRUE);
+ ipif_remove(ipif);
rw_exit(&ipst->ips_ill_g_lock);
mutex_destroy(&ipif->ipif_saved_ire_lock);
@@ -19208,8 +15420,10 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
} else if (IPIF_CAN_WAIT(ipif, q)) {
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ill->ill_lock);
ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
RELEASE_CONN_LOCK(q);
ill_refrele(ill);
@@ -19244,7 +15458,7 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
ire_type = IRE_LOOPBACK;
else
ire_type = IRE_LOCAL;
- ipif = ipif_allocate(ill, id, ire_type, B_TRUE);
+ ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE);
if (ipif != NULL)
ipif_refhold_locked(ipif);
else if (error != NULL)
@@ -19342,65 +15556,62 @@ ill_mtu_change(ire_t *ire, char *ill_arg)
void
ipif_multicast_up(ipif_t *ipif)
{
- int err, index;
+ int err;
ill_t *ill;
ASSERT(IAM_WRITER_IPIF(ipif));
ill = ipif->ipif_ill;
- index = ill->ill_phyint->phyint_ifindex;
ip1dbg(("ipif_multicast_up\n"));
if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up)
return;
if (ipif->ipif_isv6) {
+ in6_addr_t v6allmc = ipv6_all_hosts_mcast;
+ in6_addr_t v6solmc = ipv6_solicited_node_mcast;
+
+ v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];
+
if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr))
return;
- /* Join the all hosts multicast address */
ip1dbg(("ipif_multicast_up - addmulti\n"));
+
/*
- * Passing B_TRUE means we have to join the multicast
- * membership on this interface even though this is
- * FAILED. If we join on a different one in the group,
- * we will not be able to delete the membership later
- * as we currently don't track where we join when we
- * join within the kernel unlike applications where
- * we have ilg/ilg_orig_index. See ip_addmulti_v6
- * for more on this.
+ * Join the all hosts multicast address. We skip this for
+ * underlying IPMP interfaces since they should be invisible.
*/
- err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index,
- ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
- if (err != 0) {
- ip0dbg(("ipif_multicast_up: "
- "all_hosts_mcast failed %d\n",
- err));
- return;
+ if (!IS_UNDER_IPMP(ill)) {
+ err = ip_addmulti_v6(&v6allmc, ill, ipif->ipif_zoneid,
+ ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
+ if (err != 0) {
+ ip0dbg(("ipif_multicast_up: "
+ "all_hosts_mcast failed %d\n", err));
+ return;
+ }
+ ipif->ipif_joined_allhosts = 1;
}
+
/*
* Enable multicast for the solicited node multicast address
*/
if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
- in6_addr_t ipv6_multi = ipv6_solicited_node_mcast;
-
- ipv6_multi.s6_addr32[3] |=
- ipif->ipif_v6lcl_addr.s6_addr32[3];
-
- err = ip_addmulti_v6(&ipv6_multi, ill, index,
- ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE,
- NULL);
+ err = ip_addmulti_v6(&v6solmc, ill, ipif->ipif_zoneid,
+ ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
if (err != 0) {
ip0dbg(("ipif_multicast_up: solicited MC"
" failed %d\n", err));
- (void) ip_delmulti_v6(&ipv6_all_hosts_mcast,
- ill, ill->ill_phyint->phyint_ifindex,
- ipif->ipif_zoneid, B_TRUE, B_TRUE);
+ if (ipif->ipif_joined_allhosts) {
+ (void) ip_delmulti_v6(&v6allmc, ill,
+ ipif->ipif_zoneid, B_TRUE, B_TRUE);
+ ipif->ipif_joined_allhosts = 0;
+ }
return;
}
}
} else {
- if (ipif->ipif_lcl_addr == INADDR_ANY)
+ if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill))
return;
/* Join the all hosts multicast address */
@@ -19420,7 +15631,7 @@ ipif_multicast_up(ipif_t *ipif)
* (Explicit memberships are blown away in ill_leave_multicast() when the
* ill is brought down.)
*/
-static void
+void
ipif_multicast_down(ipif_t *ipif)
{
int err;
@@ -19444,19 +15655,18 @@ ipif_multicast_down(ipif_t *ipif)
}
/*
- * Leave the all hosts multicast address. Similar to ip_addmulti_v6,
- * we should look for ilms on this ill rather than the ones that have
- * been failed over here. They are here temporarily. As
- * ipif_multicast_up has joined on this ill, we should delete only
- * from this ill.
+ * Leave the all-hosts multicast address.
*/
- err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill,
- ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid,
- B_TRUE, B_TRUE);
- if (err != 0) {
- ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n",
- err));
+ if (ipif->ipif_joined_allhosts) {
+ err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill,
+ ipif->ipif_zoneid, B_TRUE, B_TRUE);
+ if (err != 0) {
+ ip0dbg(("ipif_multicast_down: all_hosts_mcast "
+ "failed %d\n", err));
+ }
+ ipif->ipif_joined_allhosts = 0;
}
+
/*
* Disable multicast for the solicited node multicast address
*/
@@ -19467,9 +15677,7 @@ ipif_multicast_down(ipif_t *ipif)
ipif->ipif_v6lcl_addr.s6_addr32[3];
err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill,
- ipif->ipif_ill->ill_phyint->phyint_ifindex,
ipif->ipif_zoneid, B_TRUE, B_TRUE);
-
if (err != 0) {
ip0dbg(("ipif_multicast_down: sol MC failed %d\n",
err));
@@ -19683,9 +15891,8 @@ ipif_set_default(ipif_t *ipif)
* Return 0 if this address can be used as local address without causing
* duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address
* is already up on a different ill, and EADDRINUSE if it's up on the same ill.
- * Special checks are needed to allow the same IPv6 link-local address
- * on different ills.
- * TODO: allowing the same site-local address on different ill's.
+ * Note that the same IPv6 link-local or site-local address is allowed as
+ * long as the ills are not on the same link.
*/
int
ip_addr_availability_check(ipif_t *new_ipif)
@@ -19717,30 +15924,26 @@ ip_addr_availability_check(ipif_t *new_ipif)
ipif = ipif->ipif_next) {
if ((ipif == new_ipif) ||
!(ipif->ipif_flags & IPIF_UP) ||
- (ipif->ipif_flags & IPIF_UNNUMBERED))
+ (ipif->ipif_flags & IPIF_UNNUMBERED) ||
+ !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
+ &our_v6addr))
continue;
- if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
- &our_v6addr)) {
- if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
- new_ipif->ipif_flags |= IPIF_UNNUMBERED;
- else if (ipif->ipif_flags & IPIF_POINTOPOINT)
- ipif->ipif_flags |= IPIF_UNNUMBERED;
- else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) &&
- new_ipif->ipif_ill != ill)
- continue;
- else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) &&
- new_ipif->ipif_ill != ill)
- continue;
- else if (new_ipif->ipif_zoneid !=
- ipif->ipif_zoneid &&
- ipif->ipif_zoneid != ALL_ZONES &&
- IS_LOOPBACK(ill))
- continue;
- else if (new_ipif->ipif_ill == ill)
- return (EADDRINUSE);
- else
- return (EADDRNOTAVAIL);
- }
+
+ if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
+ new_ipif->ipif_flags |= IPIF_UNNUMBERED;
+ else if (ipif->ipif_flags & IPIF_POINTOPOINT)
+ ipif->ipif_flags |= IPIF_UNNUMBERED;
+ else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) ||
+ IN6_IS_ADDR_SITELOCAL(&our_v6addr)) &&
+ !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill))
+ continue;
+ else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid &&
+ ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill))
+ continue;
+ else if (new_ipif->ipif_ill == ill)
+ return (EADDRINUSE);
+ else
+ return (EADDRNOTAVAIL);
}
}
@@ -19753,13 +15956,15 @@ ip_addr_availability_check(ipif_t *new_ipif)
* When the routine returns EINPROGRESS then mp has been consumed and
* the ioctl will be acked from ip_rput_dlpi.
*/
-static int
+int
ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
{
- ill_t *ill = ipif->ipif_ill;
- boolean_t isv6 = ipif->ipif_isv6;
- int err = 0;
- boolean_t success;
+ ill_t *ill = ipif->ipif_ill;
+ boolean_t isv6 = ipif->ipif_isv6;
+ int err = 0;
+ boolean_t success;
+ uint_t ipif_orig_id;
+ ip_stack_t *ipst = ill->ill_ipst;
ASSERT(IAM_WRITER_IPIF(ipif));
@@ -19769,6 +15974,123 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
if (ipif->ipif_flags & IPIF_UP)
return (EALREADY);
+ /*
+ * If this is a request to bring up a data address on an interface
+ * under IPMP, then move the address to its IPMP meta-interface and
+ * try to bring it up. One complication is that the zeroth ipif for
+ * an ill is special, in that every ill always has one, and that code
+ * throughout IP dereferences ill->ill_ipif without holding any locks.
+ */
+ if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) &&
+ (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) {
+ ipif_t *stubipif = NULL, *moveipif = NULL;
+ ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);
+
+ /*
+ * The ipif being brought up should be quiesced. If it's not,
+ * something has gone amiss and we need to bail out. (If it's
+ * quiesced, we know it will remain so via IPIF_CHANGING.)
+ */
+ mutex_enter(&ill->ill_lock);
+ if (!ipif_is_quiescent(ipif)) {
+ mutex_exit(&ill->ill_lock);
+ return (EINVAL);
+ }
+ mutex_exit(&ill->ill_lock);
+
+ /*
+ * If we're going to need to allocate ipifs, do it prior
+ * to starting the move (and grabbing locks).
+ */
+ if (ipif->ipif_id == 0) {
+ moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
+ B_FALSE);
+ stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
+ B_FALSE);
+ if (moveipif == NULL || stubipif == NULL) {
+ mi_free(moveipif);
+ mi_free(stubipif);
+ return (ENOMEM);
+ }
+ }
+
+ /*
+ * Grab or transfer the ipif to move. During the move, keep
+ * ill_g_lock held to prevent any ill walker threads from
+ * seeing things in an inconsistent state.
+ */
+ rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
+ if (ipif->ipif_id != 0) {
+ ipif_remove(ipif);
+ } else {
+ ipif_transfer(ipif, moveipif, stubipif);
+ ipif = moveipif;
+ }
+
+ /*
+ * Place the ipif on the IPMP ill. If the zeroth ipif on
+ * the IPMP ill is a stub (0.0.0.0 down address) then we
+ * replace that one. Otherwise, pick the next available slot.
+ */
+ ipif->ipif_ill = ipmp_ill;
+ ipif_orig_id = ipif->ipif_id;
+
+ if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) {
+ ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL);
+ ipif = ipmp_ill->ill_ipif;
+ } else {
+ ipif->ipif_id = -1;
+ if (ipif_insert(ipif, B_FALSE) != 0) {
+ /*
+ * No more available ipif_id's -- put it back
+ * on the original ill and fail the operation.
+ * Since we're writer on the ill, we can be
+ * sure our old slot is still available.
+ */
+ ipif->ipif_id = ipif_orig_id;
+ ipif->ipif_ill = ill;
+ if (ipif_orig_id == 0) {
+ ipif_transfer(ipif, ill->ill_ipif,
+ NULL);
+ } else {
+ VERIFY(ipif_insert(ipif, B_FALSE) == 0);
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+ return (ENOMEM);
+ }
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+
+ /*
+ * Tell SCTP that the ipif has moved. Note that even if we
+ * had to allocate a new ipif, the original sequence id was
+ * preserved and therefore SCTP won't know.
+ */
+ sctp_move_ipif(ipif, ill, ipmp_ill);
+
+ /*
+ * If the ipif being brought up was on slot zero, then we
+ * first need to bring up the placeholder we stuck there. In
+ * ip_rput_dlpi_writer(), ip_arp_done(), or the recursive call
+ * to ipif_up() itself, if we successfully bring up the
+ * placeholder, we'll check ill_move_ipif and bring it up too.
+ */
+ if (ipif_orig_id == 0) {
+ ASSERT(ill->ill_move_ipif == NULL);
+ ill->ill_move_ipif = ipif;
+ if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0)
+ ASSERT(ill->ill_move_ipif == NULL);
+ if (err != EINPROGRESS)
+ ill->ill_move_ipif = NULL;
+ return (err);
+ }
+
+ /*
+ * Bring it up on the IPMP ill.
+ */
+ return (ipif_up(ipif, q, mp));
+ }
+
/* Skip arp/ndp for any loopback interface. */
if (ill->ill_wq != NULL) {
conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
@@ -19798,7 +16120,6 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
*/
ASSERT(connp != NULL || !CONN_Q(q));
- ASSERT(ipsq->ipsq_pending_mp == NULL);
if (connp != NULL)
mutex_enter(&connp->conn_lock);
mutex_enter(&ill->ill_lock);
@@ -19810,27 +16131,25 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
return (EINTR);
/*
- * Crank up IPv6 neighbor discovery
- * Unlike ARP, this should complete when
- * ipif_ndp_up returns. However, for
- * ILLF_XRESOLV interfaces we also send a
- * AR_INTERFACE_UP to the external resolver.
- * That ioctl will complete in ip_rput.
+ * Crank up the resolver. For IPv6, this cranks up the
+ * external resolver if one is configured, but even if an
+ * external resolver isn't configured, it must be called to
+ * reset DAD state. For IPv6, if an external resolver is not
+ * being used, ipif_resolver_up() will never return
+ * EINPROGRESS, so we can always call ipif_ndp_up() here.
+ * Note that if an external resolver is being used, there's no
+ * need to call ipif_ndp_up() since it will do nothing.
*/
- if (isv6) {
- err = ipif_ndp_up(ipif);
- if (err != 0) {
- if (err != EINPROGRESS)
- mp = ipsq_pending_mp_get(ipsq, &connp);
- return (err);
- }
- }
- /* Now, ARP */
err = ipif_resolver_up(ipif, Res_act_initial);
if (err == EINPROGRESS) {
- /* We will complete it in ip_arp_done */
+ /* We will complete it in ip_arp_done() */
return (err);
}
+
+ if (isv6 && err == 0)
+ err = ipif_ndp_up(ipif, B_TRUE);
+
+ ASSERT(err != EINPROGRESS);
mp = ipsq_pending_mp_get(ipsq, &connp);
ASSERT(mp != NULL);
if (err != 0)
@@ -19843,7 +16162,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
ipif->ipif_addr_ready = 1;
}
- return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
+
+ err = isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif);
+ if (err == 0 && ill->ill_move_ipif != NULL) {
+ ipif = ill->ill_move_ipif;
+ ill->ill_move_ipif = NULL;
+ return (ipif_up(ipif, q, mp));
+ }
+ return (err);
}
/*
@@ -19939,13 +16265,6 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
return (EINPROGRESS);
bad:
ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));
- /*
- * We don't have to check for possible removal from illgrp
- * as we have not yet inserted in illgrp. For groups
- * without names, this ipif is still not UP and hence
- * this could not have possibly had any influence in forming
- * groups.
- */
freemsg(bind_mp);
freemsg(unbind_mp);
@@ -19974,12 +16293,10 @@ ipif_up_done(ipif_t *ipif)
ipif_t *tmp_ipif;
boolean_t flush_ire_cache = B_TRUE;
int err = 0;
- phyint_t *phyi;
ire_t **ipif_saved_irep = NULL;
int ipif_saved_ire_cnt;
int cnt;
boolean_t src_ipif_held = B_FALSE;
- boolean_t ire_added = B_FALSE;
boolean_t loopback = B_FALSE;
ip_stack_t *ipst = ill->ill_ipst;
@@ -20010,7 +16327,7 @@ ipif_up_done(ipif_t *ipif)
break;
}
if (flush_ire_cache)
- ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE,
+ ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill);
/*
@@ -20044,7 +16361,9 @@ ipif_up_done(ipif_t *ipif)
ipif->ipif_ire_type = IRE_LOCAL;
}
- if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) {
+ if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) ||
+ ((ipif->ipif_flags & IPIF_DEPRECATED) &&
+ !(ipif->ipif_flags & IPIF_NOFAILOVER))) {
/*
* Can't use our source address. Select a different
* source address for the IRE_INTERFACE and IRE_LOCAL
@@ -20189,11 +16508,9 @@ ipif_up_done(ipif_t *ipif)
}
/*
- * Need to atomically check for ip_addr_availablity_check
- * under ip_addr_avail_lock, and if it fails got bad, and remove
- * from group also.The ill_g_lock is grabbed as reader
- * just to make sure no new ills or new ipifs are being added
- * to the system while we are checking the uniqueness of addresses.
+ * Need to atomically check for IP address availability under
+ * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new
+ * ills or new ipifs can be added while we are checking availability.
*/
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
mutex_enter(&ipst->ips_ip_addr_avail_lock);
@@ -20227,13 +16544,6 @@ ipif_up_done(ipif_t *ipif)
/*
* Add in all newly created IREs. ire_create_bcast() has
* already checked for duplicates of the IRE_BROADCAST type.
- * We want to add before we call ifgrp_insert which wants
- * to know whether IRE_IF_RESOLVER exists or not.
- *
- * NOTE : We refrele the ire though we may branch to "bad"
- * later on where we do ire_delete. This is okay
- * because nobody can delete it as we are running
- * exclusively.
*/
for (irep1 = irep; irep1 > ire_array; ) {
irep1--;
@@ -20243,44 +16553,6 @@ ipif_up_done(ipif_t *ipif)
*/
(void) ire_add(irep1, NULL, NULL, NULL, B_FALSE);
}
- ire_added = B_TRUE;
- /*
- * Form groups if possible.
- *
- * If we are supposed to be in a ill_group with a name, insert it
- * now as we know that at least one ipif is UP. Otherwise form
- * nameless groups.
- *
- * If ip_enable_group_ifs is set and ipif address is not 0, insert
- * this ipif into the appropriate interface group, or create a
- * new one. If this is already in a nameless group, we try to form
- * a bigger group looking at other ills potentially sharing this
- * ipif's prefix.
- */
- phyi = ill->ill_phyint;
- if (phyi->phyint_groupname_len != 0) {
- ASSERT(phyi->phyint_groupname != NULL);
- if (ill->ill_ipif_up_count == 1) {
- ASSERT(ill->ill_group == NULL);
- err = illgrp_insert(&ipst->ips_illgrp_head_v4, ill,
- phyi->phyint_groupname, NULL, B_TRUE);
- if (err != 0) {
- ip1dbg(("ipif_up_done: illgrp allocation "
- "failed, error %d\n", err));
- goto bad;
- }
- }
- ASSERT(ill->ill_group != NULL);
- }
-
- /*
- * When this is part of group, we need to make sure that
- * any broadcast ires created because of this ipif coming
- * UP gets marked/cleared with IRE_MARK_NORECV appropriately
- * so that we don't receive duplicate broadcast packets.
- */
- if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0)
- ipif_renominate_bcast(ipif);
/* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */
ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt;
@@ -20331,19 +16603,30 @@ ipif_up_done(ipif_t *ipif)
*/
ill_recover_multicast(ill);
}
- /* Join the allhosts multicast address */
- ipif_multicast_up(ipif);
- if (!loopback) {
+ if (ill->ill_ipif_up_count == 1) {
+ /*
+ * Since the interface is now up, it may now be active.
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_ill_refresh_active(ill);
+
/*
- * See whether anybody else would benefit from the
- * new ipif that we added. We call this always rather
- * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST
- * ipif is for the benefit of illgrp_insert (done above)
- * which does not do source address selection as it does
- * not want to re-create interface routes that we are
- * having reference to it here.
+ * If this is an IPMP interface, we may now be able to
+ * establish ARP entries.
*/
+ if (IS_IPMP(ill))
+ ipmp_illgrp_refresh_arpent(ill->ill_grp);
+ }
+
+ /* Join the allhosts multicast address */
+ ipif_multicast_up(ipif);
+
+ /*
+ * See if anybody else would benefit from our new ipif.
+ */
+ if (!loopback &&
+ !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) {
ill_update_source_selection(ill);
}
@@ -20386,27 +16669,11 @@ ipif_up_done(ipif_t *ipif)
bad:
ip1dbg(("ipif_up_done: FAILED \n"));
- /*
- * We don't have to bother removing from ill groups because
- *
- * 1) For groups with names, we insert only when the first ipif
- * comes up. In that case if it fails, it will not be in any
- * group. So, we need not try to remove for that case.
- *
- * 2) For groups without names, either we tried to insert ipif_ill
- * in a group as singleton or found some other group to become
- * a bigger group. For the former, if it fails we don't have
- * anything to do as ipif_ill is not in the group and for the
- * latter, there are no failures in illgrp_insert/illgrp_delete
- * (ENOMEM can't occur for this. Check ifgrp_insert).
- */
+
while (irep > ire_array) {
irep--;
- if (*irep != NULL) {
+ if (*irep != NULL)
ire_delete(*irep);
- if (ire_added)
- ire_refrele(*irep);
- }
}
(void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
@@ -20417,7 +16684,7 @@ bad:
if (src_ipif_held)
ipif_refrele(src_ipif);
- ipif_arp_down(ipif);
+ ipif_resolver_down(ipif);
return (err);
}
@@ -20493,119 +16760,6 @@ ill_arp_on(ill_t *ill)
}
/*
- * Called after either deleting ill from the group or when setting
- * FAILED or STANDBY on the interface.
- */
-static void
-illgrp_reset_schednext(ill_t *ill)
-{
- ill_group_t *illgrp;
- ill_t *save_ill;
-
- ASSERT(IAM_WRITER_ILL(ill));
- /*
- * When called from illgrp_delete, ill_group will be non-NULL.
- * But when called from ip_sioctl_flags, it could be NULL if
- * somebody is setting FAILED/INACTIVE on some interface which
- * is not part of a group.
- */
- illgrp = ill->ill_group;
- if (illgrp == NULL)
- return;
- if (illgrp->illgrp_ill_schednext != ill)
- return;
-
- illgrp->illgrp_ill_schednext = NULL;
- save_ill = ill;
- /*
- * Choose a good ill to be the next one for
- * outbound traffic. As the flags FAILED/STANDBY is
- * not yet marked when called from ip_sioctl_flags,
- * we check for ill separately.
- */
- for (ill = illgrp->illgrp_ill; ill != NULL;
- ill = ill->ill_group_next) {
- if ((ill != save_ill) &&
- !(ill->ill_phyint->phyint_flags &
- (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) {
- illgrp->illgrp_ill_schednext = ill;
- return;
- }
- }
-}
-
-/*
- * Given an ill, find the next ill in the group to be scheduled.
- * (This should be called by ip_newroute() before ire_create().)
- * The passed in ill may be pulled out of the group, after we have picked
- * up a different outgoing ill from the same group. However ire add will
- * atomically check this.
- */
-ill_t *
-illgrp_scheduler(ill_t *ill)
-{
- ill_t *retill;
- ill_group_t *illgrp;
- int illcnt;
- int i;
- uint64_t flags;
- ip_stack_t *ipst = ill->ill_ipst;
-
- /*
- * We don't use a lock to check for the ill_group. If this ill
- * is currently being inserted we may end up just returning this
- * ill itself. That is ok.
- */
- if (ill->ill_group == NULL) {
- ill_refhold(ill);
- return (ill);
- }
-
- /*
- * Grab the ill_g_lock as reader to make sure we are dealing with
- * a set of stable ills. No ill can be added or deleted or change
- * group while we hold the reader lock.
- */
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- if ((illgrp = ill->ill_group) == NULL) {
- rw_exit(&ipst->ips_ill_g_lock);
- ill_refhold(ill);
- return (ill);
- }
-
- illcnt = illgrp->illgrp_ill_count;
- mutex_enter(&illgrp->illgrp_lock);
- retill = illgrp->illgrp_ill_schednext;
-
- if (retill == NULL)
- retill = illgrp->illgrp_ill;
-
- /*
- * We do a circular search beginning at illgrp_ill_schednext
- * or illgrp_ill. We don't check the flags against the ill lock
- * since it can change anytime. The ire creation will be atomic
- * and will fail if the ill is FAILED or OFFLINE.
- */
- for (i = 0; i < illcnt; i++) {
- flags = retill->ill_phyint->phyint_flags;
-
- if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
- ILL_CAN_LOOKUP(retill)) {
- illgrp->illgrp_ill_schednext = retill->ill_group_next;
- ill_refhold(retill);
- break;
- }
- retill = retill->ill_group_next;
- if (retill == NULL)
- retill = illgrp->illgrp_ill;
- }
- mutex_exit(&illgrp->illgrp_lock);
- rw_exit(&ipst->ips_ill_g_lock);
-
- return (i == illcnt ? NULL : retill);
-}
-
-/*
* Checks for availability of a usable source address (if there is one) when the
* destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note
* this selection is done regardless of the destination.
@@ -20654,11 +16808,26 @@ ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid)
}
/*
- * Determine the best source address given a destination address and an ill.
- * Prefers non-deprecated over deprecated but will return a deprecated
- * address if there is no other choice. If there is a usable source address
- * on the interface pointed to by ill_usesrc_ifindex then that is given
- * first preference.
+ * IP source address type, sorted from worst to best. For a given type,
+ * always prefer IP addresses on the same subnet. All-zones addresses are
+ * suboptimal because they pose problems with unlabeled destinations.
+ */
+typedef enum {
+ IPIF_NONE, /* no usable ipif found; initial best_type in ipif_select_source() */
+ IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */
+ IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */
+ IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */
+ IPIF_SAMENET_ALLZONES, /* allzones and same subnet */
+ IPIF_DIFFNET, /* normal and different subnet */
+ IPIF_SAMENET /* normal and same subnet; best rating, search stops here */
+} ipif_type_t;
+
+/*
+ * Pick the optimal ipif on `ill' for sending to destination `dst' from zone
+ * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t
+ * enumeration, and return the highest-rated ipif. If there's a tie, we pick
+ * the first one, unless IPMP is used in which case we round-robin among them;
+ * see below for more.
*
* Returns NULL if there is no suitable source address for the ill.
* This only occurs when there is no valid source address for the ill.
@@ -20666,17 +16835,13 @@ ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid)
ipif_t *
ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
{
- ipif_t *ipif;
- ipif_t *ipif_dep = NULL; /* Fallback to deprecated */
- ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE];
- int index = 0;
- boolean_t wrapped = B_FALSE;
- boolean_t same_subnet_only = B_FALSE;
- boolean_t ipif_same_found, ipif_other_found;
- boolean_t specific_found;
- ill_t *till, *usill = NULL;
+ ill_t *usill = NULL;
+ ill_t *ipmp_ill = NULL;
+ ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif;
+ ipif_type_t type, best_type;
tsol_tpc_t *src_rhtp, *dst_rhtp;
- ip_stack_t *ipst = ill->ill_ipst;
+ ip_stack_t *ipst = ill->ill_ipst;
+ boolean_t samenet;
if (ill->ill_usesrc_ifindex != 0) {
usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex,
@@ -20688,6 +16853,17 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
}
/*
+ * Test addresses should never be used for source address selection,
+ * so if we were passed one, switch to the IPMP meta-interface.
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL)
+ ill = ipmp_ill; /* Select source from IPMP ill */
+ else
+ return (NULL);
+ }
+
+ /*
* If we're dealing with an unlabeled destination on a labeled system,
* make sure that we ignore source addresses that are incompatible with
* the destination's default label. That destination's default label
@@ -20705,7 +16881,7 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
}
/*
- * Holds the ill_g_lock as reader. This makes sure that no ipif/ill
+ * Hold the ill_g_lock as reader. This makes sure that no ipif/ill
* can be deleted. But an ipif/ill can get CONDEMNED any time.
* After selecting the right ipif, under ill_lock make sure ipif is
* not condemned, and increment refcnt. If ipif is CONDEMNED,
@@ -20713,190 +16889,117 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
* but not under a lock.
*/
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
-
retry:
- till = ill;
- ipif_arr[0] = NULL;
+ /*
+ * For source address selection, we treat the ipif list as circular
+ * and continue until we get back to where we started. This allows
+ * IPMP to vary source address selection (which improves inbound load
+ * spreading) by caching its last ending point and starting from
+ * there. NOTE: we don't have to worry about ill_src_ipif changing
+ * ills since that can't happen on the IPMP ill.
+ */
+ start_ipif = ill->ill_ipif;
+ if (IS_IPMP(ill) && ill->ill_src_ipif != NULL)
+ start_ipif = ill->ill_src_ipif;
- if (till->ill_group != NULL)
- till = till->ill_group->illgrp_ill;
+ ipif = start_ipif;
+ best_ipif = NULL;
+ best_type = IPIF_NONE;
+ do {
+ if ((next_ipif = ipif->ipif_next) == NULL)
+ next_ipif = ill->ill_ipif;
- /*
- * Choose one good source address from each ill across the group.
- * If possible choose a source address in the same subnet as
- * the destination address.
- *
- * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE
- * This is okay because of the following.
- *
- * If PHYI_FAILED is set and we still have non-deprecated
- * addresses, it means the addresses have not yet been
- * failed over to a different interface. We potentially
- * select them to create IRE_CACHES, which will be later
- * flushed when the addresses move over.
- *
- * If PHYI_INACTIVE is set and we still have non-deprecated
- * addresses, it means either the user has configured them
- * or PHYI_INACTIVE has not been cleared after the addresses
- * been moved over. For the former, in.mpathd does a failover
- * when the interface becomes INACTIVE and hence we should
- * not find them. Once INACTIVE is set, we don't allow them
- * to create logical interfaces anymore. For the latter, a
- * flush will happen when INACTIVE is cleared which will
- * flush the IRE_CACHES.
- *
- * If PHYI_OFFLINE is set, all the addresses will be failed
- * over soon. We potentially select them to create IRE_CACHEs,
- * which will be later flushed when the addresses move over.
- *
- * NOTE : As ipif_select_source is called to borrow source address
- * for an ipif that is part of a group, source address selection
- * will be re-done whenever the group changes i.e either an
- * insertion/deletion in the group.
- *
- * Fill ipif_arr[] with source addresses, using these rules:
- *
- * 1. At most one source address from a given ill ends up
- * in ipif_arr[] -- that is, at most one of the ipif's
- * associated with a given ill ends up in ipif_arr[].
- *
- * 2. If there is at least one non-deprecated ipif in the
- * IPMP group with a source address on the same subnet as
- * our destination, then fill ipif_arr[] only with
- * source addresses on the same subnet as our destination.
- * Note that because of (1), only the first
- * non-deprecated ipif found with a source address
- * matching the destination ends up in ipif_arr[].
- *
- * 3. Otherwise, fill ipif_arr[] with non-deprecated source
- * addresses not in the same subnet as our destination.
- * Again, because of (1), only the first off-subnet source
- * address will be chosen.
- *
- * 4. If there are no non-deprecated ipifs, then just use
- * the source address associated with the last deprecated
- * one we find that happens to be on the same subnet,
- * otherwise the first one not in the same subnet.
- */
- specific_found = B_FALSE;
- for (; till != NULL; till = till->ill_group_next) {
- ipif_same_found = B_FALSE;
- ipif_other_found = B_FALSE;
- for (ipif = till->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (!IPIF_CAN_LOOKUP(ipif))
- continue;
- /* Always skip NOLOCAL and ANYCAST interfaces */
- if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
- continue;
- if (!(ipif->ipif_flags & IPIF_UP) ||
- !ipif->ipif_addr_ready)
- continue;
- if (ipif->ipif_zoneid != zoneid &&
- ipif->ipif_zoneid != ALL_ZONES)
- continue;
- /*
- * Interfaces with 0.0.0.0 address are allowed to be UP,
- * but are not valid as source addresses.
- */
- if (ipif->ipif_lcl_addr == INADDR_ANY)
- continue;
+ if (!IPIF_CAN_LOOKUP(ipif))
+ continue;
+ /* Always skip NOLOCAL and ANYCAST interfaces */
+ if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
+ continue;
+ if (!(ipif->ipif_flags & IPIF_UP) || !ipif->ipif_addr_ready)
+ continue;
+ if (ipif->ipif_zoneid != zoneid &&
+ ipif->ipif_zoneid != ALL_ZONES)
+ continue;
- /*
- * Check compatibility of local address for
- * destination's default label if we're on a labeled
- * system. Incompatible addresses can't be used at
- * all.
- */
- if (dst_rhtp != NULL) {
- boolean_t incompat;
+ /*
+ * Interfaces with 0.0.0.0 address are allowed to be UP, but
+ * are not valid as source addresses.
+ */
+ if (ipif->ipif_lcl_addr == INADDR_ANY)
+ continue;
- src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
- IPV4_VERSION, B_FALSE);
- if (src_rhtp == NULL)
- continue;
- incompat =
- src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
- src_rhtp->tpc_tp.tp_doi !=
- dst_rhtp->tpc_tp.tp_doi ||
- (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
- &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
- !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
- src_rhtp->tpc_tp.tp_sl_set_cipso));
- TPC_RELE(src_rhtp);
- if (incompat)
- continue;
- }
+ /*
+ * Check compatibility of local address for destination's
+ * default label if we're on a labeled system. Incompatible
+ * addresses can't be used at all.
+ */
+ if (dst_rhtp != NULL) {
+ boolean_t incompat;
- /*
- * We prefer not to use all all-zones addresses, if we
- * can avoid it, as they pose problems with unlabeled
- * destinations.
- */
- if (ipif->ipif_zoneid != ALL_ZONES) {
- if (!specific_found &&
- (!same_subnet_only ||
- (ipif->ipif_net_mask & dst) ==
- ipif->ipif_subnet)) {
- index = 0;
- specific_found = B_TRUE;
- ipif_other_found = B_FALSE;
- }
- } else {
- if (specific_found)
- continue;
- }
- if (ipif->ipif_flags & IPIF_DEPRECATED) {
- if (ipif_dep == NULL ||
- (ipif->ipif_net_mask & dst) ==
- ipif->ipif_subnet)
- ipif_dep = ipif;
+ src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
+ IPV4_VERSION, B_FALSE);
+ if (src_rhtp == NULL)
+ continue;
+ incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
+ src_rhtp->tpc_tp.tp_doi !=
+ dst_rhtp->tpc_tp.tp_doi ||
+ (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
+ &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
+ !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
+ src_rhtp->tpc_tp.tp_sl_set_cipso));
+ TPC_RELE(src_rhtp);
+ if (incompat)
continue;
- }
- if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) {
- /* found a source address in the same subnet */
- if (!same_subnet_only) {
- same_subnet_only = B_TRUE;
- index = 0;
- }
- ipif_same_found = B_TRUE;
- } else {
- if (same_subnet_only || ipif_other_found)
- continue;
- ipif_other_found = B_TRUE;
- }
- ipif_arr[index++] = ipif;
- if (index == MAX_IPIF_SELECT_SOURCE) {
- wrapped = B_TRUE;
- index = 0;
- }
- if (ipif_same_found)
- break;
}
- }
- if (ipif_arr[0] == NULL) {
- ipif = ipif_dep;
- } else {
- if (wrapped)
- index = MAX_IPIF_SELECT_SOURCE;
- ipif = ipif_arr[ipif_rand(ipst) % index];
- ASSERT(ipif != NULL);
- }
+ samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet);
- if (ipif != NULL) {
+ if (ipif->ipif_flags & IPIF_DEPRECATED) {
+ type = samenet ? IPIF_SAMENET_DEPRECATED :
+ IPIF_DIFFNET_DEPRECATED;
+ } else if (ipif->ipif_zoneid == ALL_ZONES) {
+ type = samenet ? IPIF_SAMENET_ALLZONES :
+ IPIF_DIFFNET_ALLZONES;
+ } else {
+ type = samenet ? IPIF_SAMENET : IPIF_DIFFNET;
+ }
+
+ if (type > best_type) {
+ best_type = type;
+ best_ipif = ipif;
+ if (best_type == IPIF_SAMENET)
+ break; /* can't get better */
+ }
+ } while ((ipif = next_ipif) != start_ipif);
+
+ if ((ipif = best_ipif) != NULL) {
mutex_enter(&ipif->ipif_ill->ill_lock);
if (!IPIF_CAN_LOOKUP(ipif)) {
mutex_exit(&ipif->ipif_ill->ill_lock);
goto retry;
}
ipif_refhold_locked(ipif);
+
+ /*
+ * For IPMP, update the source ipif rotor to the next ipif,
+ * provided we can look it up. (We must not use it if it's
+ * IPIF_CONDEMNED since we may have grabbed ill_g_lock after
+ * ipif_free() checked ill_src_ipif.)
+ */
+ if (IS_IPMP(ill) && ipif != NULL) {
+ next_ipif = ipif->ipif_next;
+ if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif))
+ ill->ill_src_ipif = next_ipif;
+ else
+ ill->ill_src_ipif = NULL;
+ }
mutex_exit(&ipif->ipif_ill->ill_lock);
}
rw_exit(&ipst->ips_ill_g_lock);
if (usill != NULL)
ill_refrele(usill);
+ if (ipmp_ill != NULL)
+ ill_refrele(ipmp_ill);
if (dst_rhtp != NULL)
TPC_RELE(dst_rhtp);
@@ -20929,8 +17032,7 @@ retry:
* ipif_update_other_ipifs calls us.
*
* If old_ipif is NULL, just redo the source address selection
- * if needed. This happens when illgrp_insert or ipif_up_done
- * calls us.
+ * if needed. This happens when ipif_up_done calls us.
*/
static void
ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif)
@@ -21064,49 +17166,31 @@ ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif)
/*
* This old_ipif is going away.
*
- * Determine if any other ipif's is using our address as
+ * Determine if any other ipif's are using our address as
* ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or
* IPIF_DEPRECATED).
* Find the IRE_INTERFACE for such ipifs and recreate them
* to use a different source address following the rules in
* ipif_up_done.
- *
- * This function takes an illgrp as an argument so that illgrp_delete
- * can call this to update source address even after deleting the
- * old_ipif->ipif_ill from the ill group.
*/
static void
-ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp)
+ipif_update_other_ipifs(ipif_t *old_ipif)
{
- ipif_t *ipif;
- ill_t *ill;
+ ipif_t *ipif;
+ ill_t *ill;
char buf[INET6_ADDRSTRLEN];
ASSERT(IAM_WRITER_IPIF(old_ipif));
- ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif));
ill = old_ipif->ipif_ill;
- ip1dbg(("ipif_update_other_ipifs(%s, %s)\n",
- ill->ill_name,
- inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr,
- buf, sizeof (buf))));
- /*
- * If this part of a group, look at all ills as ipif_select_source
- * borrows source address across all the ills in the group.
- */
- if (illgrp != NULL)
- ill = illgrp->illgrp_ill;
-
- for (; ill != NULL; ill = ill->ill_group_next) {
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
-
- if (ipif == old_ipif)
- continue;
+ ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", ill->ill_name,
+ inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, buf, sizeof (buf))));
- ipif_recreate_interface_routes(old_ipif, ipif);
- }
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
+ if (ipif == old_ipif)
+ continue;
+ ipif_recreate_interface_routes(old_ipif, ipif);
}
}
@@ -21117,8 +17201,7 @@ if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
{
/*
* ill_phyint_reinit merged the v4 and v6 into a single
- * ipsq. Could also have become part of a ipmp group in the
- * process, and we might not have been able to complete the
+ * ipsq. We might not have been able to complete the
* operation in ipif_set_values, if we could not become
* exclusive. If so restart it here.
*/
@@ -21171,6 +17254,48 @@ ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
}
/*
+ * Refresh all IRE_BROADCAST entries associated with `ill' to ensure the
+ * minimum (but complete) set exist. This is necessary when adding or
+ * removing an interface to/from an IPMP group, since interfaces in an
+ * IPMP group use the IRE_BROADCAST entries for the IPMP group (whenever
+ * its test address subnets overlap with IPMP data addresses). It's also
+ * used to refresh the IRE_BROADCAST entries associated with the IPMP
+ * interface when the nominated broadcast interface changes.
+ */
+void
+ill_refresh_bcast(ill_t *ill)
+{
+ ire_t *ire_array[12]; /* max ipif_create_bcast_ires() can create */
+ ire_t **irep;
+ ipif_t *ipif;
+
+ ASSERT(!ill->ill_isv6); /* only valid for IPv4 ills */
+ ASSERT(IAM_WRITER_ILL(ill)); /* caller must be writer on the ill */
+
+ /*
+ * Remove any old broadcast IREs.
+ * Only IRE_BROADCAST entries matching this ill are walked, and each
+ * one is handed to ill_broadcast_delete().
+ */
+ ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_BROADCAST,
+ ill_broadcast_delete, ill, ill);
+
+ /*
+ * Create new ones for any ipifs that are up and broadcast-capable.
+ * ipif_create_bcast_ires() returns a pointer to the next free slot in
+ * ire_array, so the new entries are walked from the last one created
+ * back to the first. The return value of ire_add() is ignored; an
+ * entry is only released if *irep is still non-NULL afterwards
+ * (presumably ire_add() clears it on failure -- TODO confirm).
+ */
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
+ if ((ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST)) !=
+ (IPIF_UP|IPIF_BROADCAST))
+ continue;
+
+ irep = ipif_create_bcast_ires(ipif, ire_array);
+ while (irep-- > ire_array) {
+ (void) ire_add(irep, NULL, NULL, NULL, B_FALSE);
+ if (*irep != NULL)
+ ire_refrele(*irep);
+ }
+ }
+}
+
+/*
* Create any IRE_BROADCAST entries for `ipif', and store those entries in
* `irep'. Returns a pointer to the next free `irep' entry (just like
* ire_check_and_create_bcast()).
@@ -21433,10 +17558,33 @@ ipif_check_bcast_ires(ipif_t *test_ipif)
/*
* Walk through all the ipifs that will be affected by the dying IREs,
- * and recreate the IREs as necessary.
+ * and recreate the IREs as necessary. Note that all interfaces in an
+ * IPMP illgrp share the same broadcast IREs, and thus the entire
+ * illgrp must be walked, starting with the IPMP meta-interface (so
+ * that broadcast IREs end up on it whenever possible).
*/
+ if (IS_UNDER_IPMP(ill))
+ ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);
+
irep = ill_create_bcast(ill, test_ipif, bireinfo, irep);
+ if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
+ ipmp_illgrp_t *illg = ill->ill_grp;
+
+ ill = list_head(&illg->ig_if);
+ for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
+ for (i = 0; i < BCAST_COUNT; i++) {
+ if (bireinfo[i].bi_willdie &&
+ !bireinfo[i].bi_haverep)
+ break;
+ }
+ if (i == BCAST_COUNT)
+ break;
+
+ irep = ill_create_bcast(ill, test_ipif, bireinfo, irep);
+ }
+ }
+
/*
* Scan through the set of broadcast IREs and see if there are any
* that we need to replace that have not yet been replaced. If so,
@@ -21528,7 +17676,7 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
/*
* If there's another ill already with the requested name, ensure
- * that it's of the same type. Otherwise, ill_phyint_reinit() will
+ * that it's of the same type. Otherwise, ill_phyint_reinit() will
* fuse together two unrelated ills, which will cause chaos.
*/
ipst = ill->ill_ipst;
@@ -21620,8 +17768,7 @@ ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
{
/*
* ill_phyint_reinit merged the v4 and v6 into a single
- * ipsq. Could also have become part of a ipmp group in the
- * process, and we might not have been able to complete the
+ * ipsq. We might not have been able to complete the
* slifname in ipif_set_values, if we could not become
* exclusive. If so restart it here
*/
@@ -21665,85 +17812,6 @@ ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
return (ipif);
}
-typedef struct conn_change_s {
- uint_t cc_old_ifindex;
- uint_t cc_new_ifindex;
-} conn_change_t;
-
-/*
- * ipcl_walk function for changing interface index.
- */
-static void
-conn_change_ifindex(conn_t *connp, caddr_t arg)
-{
- conn_change_t *connc;
- uint_t old_ifindex;
- uint_t new_ifindex;
- int i;
- ilg_t *ilg;
-
- connc = (conn_change_t *)arg;
- old_ifindex = connc->cc_old_ifindex;
- new_ifindex = connc->cc_new_ifindex;
-
- if (connp->conn_orig_bound_ifindex == old_ifindex)
- connp->conn_orig_bound_ifindex = new_ifindex;
-
- if (connp->conn_orig_multicast_ifindex == old_ifindex)
- connp->conn_orig_multicast_ifindex = new_ifindex;
-
- for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
- ilg = &connp->conn_ilg[i];
- if (ilg->ilg_orig_ifindex == old_ifindex)
- ilg->ilg_orig_ifindex = new_ifindex;
- }
-}
-
-/*
- * Walk all the ipifs and ilms on this ill and change the orig_ifindex
- * to new_index if it matches the old_index.
- *
- * Failovers typically happen within a group of ills. But somebody
- * can remove an ill from the group after a failover happened. If
- * we are setting the ifindex after this, we potentially need to
- * look at all the ills rather than just the ones in the group.
- * We cut down the work by looking at matching ill_net_types
- * and ill_types as we could not possibly grouped them together.
- */
-static void
-ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc)
-{
- ill_t *ill;
- ipif_t *ipif;
- uint_t old_ifindex;
- uint_t new_ifindex;
- ilm_t *ilm;
- ill_walk_context_t ctx;
- ip_stack_t *ipst = ill_orig->ill_ipst;
-
- old_ifindex = connc->cc_old_ifindex;
- new_ifindex = connc->cc_new_ifindex;
-
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- ill = ILL_START_WALK_ALL(&ctx, ipst);
- for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if ((ill_orig->ill_net_type != ill->ill_net_type) ||
- (ill_orig->ill_type != ill->ill_type)) {
- continue;
- }
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (ipif->ipif_orig_ifindex == old_ifindex)
- ipif->ipif_orig_ifindex = new_ifindex;
- }
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
- if (ilm->ilm_orig_ifindex == old_ifindex)
- ilm->ilm_orig_ifindex = new_ifindex;
- }
- }
- rw_exit(&ipst->ips_ill_g_lock);
-}
-
/*
* We first need to ensure that the new index is unique, and
* then carry the change across both v4 and v6 ill representation
@@ -21755,13 +17823,10 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *ifreq)
{
ill_t *ill;
- ill_t *ill_other;
phyint_t *phyi;
- int old_index;
- conn_change_t connc;
struct ifreq *ifr = (struct ifreq *)ifreq;
struct lifreq *lifr = (struct lifreq *)ifreq;
- uint_t index;
+ uint_t old_index, index;
ill_t *ill_v4;
ill_t *ill_v6;
ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
@@ -21773,31 +17838,15 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
/*
* Only allow on physical interface. Also, index zero is illegal.
- *
- * Need to check for PHYI_FAILED and PHYI_INACTIVE
- *
- * 1) If PHYI_FAILED is set, a failover could have happened which
- * implies a possible failback might have to happen. As failback
- * depends on the old index, we should fail setting the index.
- *
- * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that
- * any addresses or multicast memberships are failed over to
- * a non-STANDBY interface. As failback depends on the old
- * index, we should fail setting the index for this case also.
- *
- * 3) If PHYI_OFFLINE is set, a possible failover has happened.
- * Be consistent with PHYI_FAILED and fail the ioctl.
*/
ill = ipif->ipif_ill;
phyi = ill->ill_phyint;
- if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) ||
- ipif->ipif_id != 0 || index == 0) {
+ if (ipif->ipif_id != 0 || index == 0) {
return (EINVAL);
}
- old_index = phyi->phyint_ifindex;
/* If the index is not changing, no work to do */
- if (old_index == index)
+ if (phyi->phyint_ifindex == index)
return (0);
/*
@@ -21816,31 +17865,17 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
return (EBUSY);
}
- /*
- * The new index is unused. Set it in the phyint.
- * Locate the other ill so that we can send a routing
- * sockets message.
- */
- if (ill->ill_isv6) {
- ill_other = phyi->phyint_illv4;
- } else {
- ill_other = phyi->phyint_illv6;
- }
-
+ /* The new index is unused. Set it in the phyint. */
+ old_index = phyi->phyint_ifindex;
phyi->phyint_ifindex = index;
/* Update SCTP's ILL list */
sctp_ill_reindex(ill, old_index);
- connc.cc_old_ifindex = old_index;
- connc.cc_new_ifindex = index;
- ip_change_ifindex(ill, &connc);
- ipcl_walk(conn_change_ifindex, (caddr_t)&connc, ipst);
-
/* Send the routing sockets message */
- ip_rts_ifmsg(ipif);
- if (ill_other != NULL)
- ip_rts_ifmsg(ill_other->ill_ipif);
+ ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
+ if (ILL_OTHER(ill))
+ ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT);
return (0);
}
@@ -22038,6 +18073,45 @@ ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
B_TRUE));
}
+/*
+ * Return the number of addresses on `ill' with one or more of the values
+ * in `set' set and all of the values in `clear' clear.
+ */
+static uint_t
+ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear)
+{
+ ipif_t *ipif;
+ uint_t cnt = 0;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
+ if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear))
+ cnt++;
+
+ return (cnt);
+}
+
+/*
+ * Return the number of migratable addresses on `ill' that are under
+ * application control.
+ */
+uint_t
+ill_appaddr_cnt(const ill_t *ill)
+{
+ return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF,
+ IPIF_NOFAILOVER));
+}
+
+/*
+ * Return the number of point-to-point addresses on `ill'.
+ */
+uint_t
+ill_ptpaddr_cnt(const ill_t *ill)
+{
+ return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0));
+}
+
/* ARGSUSED */
int
ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
@@ -22158,7 +18232,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
int err = 0, ret;
uint_t ifindex;
- phyint_t *us_phyint, *us_cli_phyint;
ipsq_t *ipsq = NULL;
ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
@@ -22167,19 +18240,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ASSERT(CONN_Q(q));
isv6 = (Q_TO_CONN(q))->conn_af_isv6;
- us_cli_phyint = usesrc_cli_ill->ill_phyint;
-
- ASSERT(us_cli_phyint != NULL);
-
- /*
- * If the client ILL is being used for IPMP, abort.
- * Note, this can be done before ipsq_try_enter since we are already
- * exclusive on this ILL
- */
- if ((us_cli_phyint->phyint_groupname != NULL) ||
- (us_cli_phyint->phyint_flags & PHYI_STANDBY)) {
- return (EINVAL);
- }
ifindex = lifr->lifr_index;
if (ifindex == 0) {
@@ -22198,15 +18258,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
return (err);
}
- /*
- * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP
- * group nor can either of the interfaces be used for standy. So
- * to guarantee mutual exclusion with ip_sioctl_flags (which sets
- * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname)
- * we need to be exclusive on the ipsq belonging to the usesrc_ill.
- * We are already exlusive on this ipsq i.e ipsq corresponding to
- * the usesrc_cli_ill
- */
ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
NEW_OP, B_TRUE);
if (ipsq == NULL) {
@@ -22215,11 +18266,19 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
goto done;
}
- /* Check if the usesrc_ill is used for IPMP */
- us_phyint = usesrc_ill->ill_phyint;
- if ((us_phyint->phyint_groupname != NULL) ||
- (us_phyint->phyint_flags & PHYI_STANDBY)) {
- err = EINVAL;
+ /* USESRC isn't currently supported with IPMP */
+ if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) {
+ err = ENOTSUP;
+ goto done;
+ }
+
+ /*
+ * USESRC isn't compatible with the STANDBY flag. (STANDBY is only
+ * used by IPMP underlying interfaces, but someone might think it's
+ * more general and try to use it independently with VNI.)
+ */
+ if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
+ err = ENOTSUP;
goto done;
}
@@ -22372,79 +18431,45 @@ ill_phyint_compare_name(const void *name_ptr, const void *phyip)
return (-1);
return (0);
}
+
/*
- * This function is called from ill_delete when the ill is being
- * unplumbed. We remove the reference from the phyint and we also
- * free the phyint when there are no more references to it.
+ * This function is called on the unplumb path via ill_glist_delete() when
+ * there are no ills left on the phyint and thus the phyint can be freed.
*/
static void
-ill_phyint_free(ill_t *ill)
+phyint_free(phyint_t *phyi)
{
- phyint_t *phyi;
- phyint_t *next_phyint;
- ipsq_t *cur_ipsq;
- ip_stack_t *ipst = ill->ill_ipst;
+ ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
- ASSERT(ill->ill_phyint != NULL);
+ ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL);
- ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
- phyi = ill->ill_phyint;
- ill->ill_phyint = NULL;
/*
- * ill_init allocates a phyint always to store the copy
- * of flags relevant to phyint. At that point in time, we could
- * not assign the name and hence phyint_illv4/v6 could not be
- * initialized. Later in ipif_set_values, we assign the name to
- * the ill, at which point in time we assign phyint_illv4/v6.
- * Thus we don't rely on phyint_illv6 to be initialized always.
+ * If this phyint was an IPMP meta-interface, blow away the group.
+ * This is safe to do because all of the illgrps have already been
+ * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us.
+ * If we're cleaning up as a result of failed initialization,
+ * phyint_grp may be NULL.
*/
- if (ill->ill_flags & ILLF_IPV6) {
- phyi->phyint_illv6 = NULL;
- } else {
- phyi->phyint_illv4 = NULL;
+ if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) {
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ ipmp_grp_destroy(phyi->phyint_grp);
+ phyi->phyint_grp = NULL;
+ rw_exit(&ipst->ips_ipmp_lock);
}
- /*
- * ipif_down removes it from the group when the last ipif goes
- * down.
- */
- ASSERT(ill->ill_group == NULL);
-
- if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL)
- return;
/*
- * Make sure this phyint was put in the list.
+ * If this interface was under IPMP, take it out of the group.
*/
- if (phyi->phyint_ifindex > 0) {
- avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
- phyi);
- avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
- phyi);
- }
+ if (phyi->phyint_grp != NULL)
+ ipmp_phyint_leave_grp(phyi);
+
/*
- * remove phyint from the ipsq list.
+ * Delete the phyint and disassociate its ipsq. The ipsq itself
+ * will be freed in ipsq_exit().
*/
- cur_ipsq = phyi->phyint_ipsq;
- if (phyi == cur_ipsq->ipsq_phyint_list) {
- cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next;
- } else {
- next_phyint = cur_ipsq->ipsq_phyint_list;
- while (next_phyint != NULL) {
- if (next_phyint->phyint_ipsq_next == phyi) {
- next_phyint->phyint_ipsq_next =
- phyi->phyint_ipsq_next;
- break;
- }
- next_phyint = next_phyint->phyint_ipsq_next;
- }
- ASSERT(next_phyint != NULL);
- }
- IPSQ_DEC_REF(cur_ipsq, ipst);
+ phyi->phyint_ipsq->ipsq_phyint = NULL;
+ phyi->phyint_name[0] = '\0';
- if (phyi->phyint_groupname_len != 0) {
- ASSERT(phyi->phyint_groupname != NULL);
- mi_free(phyi->phyint_groupname);
- }
mi_free(phyi);
}
@@ -22464,7 +18489,6 @@ ill_phyint_reinit(ill_t *ill)
phyint_t *phyi;
avl_index_t where = 0;
ill_t *ill_other = NULL;
- ipsq_t *ipsq;
ip_stack_t *ipst = ill->ill_ipst;
ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
@@ -22476,6 +18500,11 @@ ill_phyint_reinit(ill_t *ill)
phyi_old->phyint_illv4 == NULL));
ASSERT(phyi_old->phyint_ifindex == 0);
+ /*
+ * Now that our ill has a name, set it in the phyint.
+ */
+ (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ);
+
phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
ill->ill_name, &where);
@@ -22497,8 +18526,7 @@ ill_phyint_reinit(ill_t *ill)
* we are initializing IPv4.
*/
if (phyi != NULL) {
- ill_other = (isv6) ? phyi->phyint_illv4 :
- phyi->phyint_illv6;
+ ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6;
ASSERT(ill_other->ill_phyint != NULL);
ASSERT((isv6 && !ill_other->ill_isv6) ||
(!isv6 && ill_other->ill_isv6));
@@ -22517,26 +18545,15 @@ ill_phyint_reinit(ill_t *ill)
ASSERT(phyi->phyint_illv4 == NULL);
phyi->phyint_illv4 = ill;
}
- /*
- * This is a new ill, currently undergoing SLIFNAME
- * So we could not have joined an IPMP group until now.
- */
- ASSERT(phyi_old->phyint_ipsq_next == NULL &&
- phyi_old->phyint_groupname == NULL);
/*
- * This phyi_old is going away. Decref ipsq_refs and
- * assert it is zero. The ipsq itself will be freed in
- * ipsq_exit
+ * Delete the old phyint and make its ipsq eligible
+ * to be freed in ipsq_exit().
*/
- ipsq = phyi_old->phyint_ipsq;
- IPSQ_DEC_REF(ipsq, ipst);
- ASSERT(ipsq->ipsq_refs == 0);
- /* Get the singleton phyint out of the ipsq list */
- ASSERT(phyi_old->phyint_ipsq_next == NULL);
- ipsq->ipsq_phyint_list = NULL;
phyi_old->phyint_illv4 = NULL;
phyi_old->phyint_illv6 = NULL;
+ phyi_old->phyint_ipsq->ipsq_phyint = NULL;
+ phyi_old->phyint_name[0] = '\0';
mi_free(phyi_old);
} else {
mutex_enter(&ill->ill_lock);
@@ -22551,9 +18568,6 @@ ill_phyint_reinit(ill_t *ill)
if (!phyint_assign_ifindex(phyi, ipst))
cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");
- /* No IPMP group yet, thus the hook uses the ifindex */
- phyi->phyint_hook_ifindex = phyi->phyint_ifindex;
-
avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
(void *)phyi, where);
@@ -22571,13 +18585,6 @@ ill_phyint_reinit(ill_t *ill)
ill->ill_phyint = phyi;
/*
- * Keep the index on ipif_orig_index to be used by FAILOVER.
- * We do this here as when the first ipif was allocated,
- * ipif_allocate does not know the right interface index.
- */
-
- ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex;
- /*
* Now that the phyint's ifindex has been assigned, complete the
* remaining
*/
@@ -22606,45 +18613,14 @@ ill_phyint_reinit(ill_t *ill)
*/
if (ill->ill_name_length <= 2 ||
ill->ill_name[0] != 'l' || ill->ill_name[1] != 'o') {
- /*
- * Generate nic plumb event for ill_name even if
- * ipmp_hook_emulation is set. That avoids generating events
- * for the ill_names should ipmp_hook_emulation be turned on
- * later.
- */
- ill_nic_event_plumb(ill, B_FALSE);
+ ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name,
+ ill->ill_name_length);
}
RELEASE_ILL_LOCKS(ill, ill_other);
mutex_exit(&phyi->phyint_lock);
}
/*
- * Allocate a NE_PLUMB nic info event and store in the ill.
- * If 'group' is set we do it for the group name, otherwise the ill name.
- * It will be sent when we leave the ipsq.
- */
-void
-ill_nic_event_plumb(ill_t *ill, boolean_t group)
-{
- phyint_t *phyi = ill->ill_phyint;
- char *name;
- int namelen;
-
- ASSERT(MUTEX_HELD(&ill->ill_lock));
-
- if (group) {
- ASSERT(phyi->phyint_groupname_len != 0);
- namelen = phyi->phyint_groupname_len;
- name = phyi->phyint_groupname;
- } else {
- namelen = ill->ill_name_length;
- name = ill->ill_name;
- }
-
- ill_nic_event_dispatch(ill, 0, NE_PLUMB, name, namelen);
-}
-
-/*
* Notify any downstream modules of the name of this interface.
* An M_IOCTL is used even though we don't expect a successful reply.
* Any reply message from the driver (presumably an M_IOCNAK) will
@@ -22686,8 +18662,9 @@ ip_ifname_notify(ill_t *ill, queue_t *q)
static int
ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
{
- int err;
+ int err;
ip_stack_t *ipst = ill->ill_ipst;
+ phyint_t *phyi = ill->ill_phyint;
/* Set the obsolete NDD per-interface forwarding name. */
err = ill_set_ndd_name(ill);
@@ -22696,6 +18673,34 @@ ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
err);
}
+ /*
+ * Now that ill_name is set, the configuration for the IPMP
+ * meta-interface can be performed.
+ */
+ if (IS_IPMP(ill)) {
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ /*
+ * If phyi->phyint_grp is NULL, then this is the first IPMP
+ * meta-interface and we need to create the IPMP group.
+ */
+ if (phyi->phyint_grp == NULL) {
+ /*
+ * If someone has renamed another IPMP group to have
+ * the same name as our interface, bail.
+ */
+ if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) {
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (EEXIST);
+ }
+ phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi);
+ if (phyi->phyint_grp == NULL) {
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (ENOMEM);
+ }
+ }
+ rw_exit(&ipst->ips_ipmp_lock);
+ }
+
/* Tell downstream modules where they are. */
ip_ifname_notify(ill, q);
@@ -22966,10 +18971,10 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
/*
* If ill_phyint_reinit() changed our ipsq, then start on the new ipsq.
*/
- if (ipsq->ipsq_current_ipif == NULL)
+ if (ipsq->ipsq_xop->ipx_current_ipif == NULL)
ipsq_current_start(ipsq, ipif, SIOCSLIFNAME);
else
- ASSERT(ipsq->ipsq_current_ipif == ipif);
+ ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif);
error = ipif_set_values_tail(ill, ipif, mp, q);
ipsq_exit(ipsq);
@@ -22986,18 +18991,8 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
void
ipif_init(ip_stack_t *ipst)
{
- hrtime_t hrt;
int i;
- /*
- * Can't call drv_getparm here as it is too early in the boot.
- * As we use ipif_src_random just for picking a different
- * source address everytime, this need not be really random.
- */
- hrt = gethrtime();
- ipst->ips_ipif_src_random =
- ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff);
-
for (i = 0; i < MAX_G_HEADS; i++) {
ipst->ips_ill_g_heads[i].ill_g_list_head =
(ill_if_t *)&ipst->ips_ill_g_heads[i];
@@ -23023,7 +19018,11 @@ ipif_init(ip_stack_t *ipst)
* match is found to take care of such rare network configurations like -
* le0: 129.146.1.1/16
* le1: 129.146.2.2/24
- * It is used only by SO_DONTROUTE at the moment.
+ *
+ * This is used by SO_DONTROUTE and IP_NEXTHOP. Since neither of those is
+ * supported on underlying interfaces in an IPMP group, underlying interfaces
+ * are ignored when looking up a match. (If we didn't ignore them, we'd
+ * risk using a test address as a source for outgoing traffic.)
*/
ipif_t *
ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
@@ -23038,6 +19037,8 @@ ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
+ if (IS_UNDER_IPMP(ill))
+ continue;
mutex_enter(&ill->ill_lock);
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
@@ -23660,30 +19661,76 @@ ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa,
* Knows about IEEE 802 and IEEE EUI-64 mappings.
*/
static boolean_t
-ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
+ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
char *addr;
- if (phys_length != ETHERADDRL)
+ if (ill->ill_phys_addr_length != ETHERADDRL)
return (B_FALSE);
/* Form EUI-64 like address */
addr = (char *)&v6addr->s6_addr32[2];
- bcopy((char *)phys_addr, addr, 3);
+ bcopy(ill->ill_phys_addr, addr, 3);
addr[0] ^= 0x2; /* Toggle Universal/Local bit */
addr[3] = (char)0xff;
addr[4] = (char)0xfe;
- bcopy((char *)phys_addr + 3, addr + 5, 3);
+ bcopy(ill->ill_phys_addr + 3, addr + 5, 3);
return (B_TRUE);
}
/* ARGSUSED */
static boolean_t
-ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
+ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
return (B_FALSE);
}
+typedef struct ipmp_ifcookie {
+ uint32_t ic_hostid;
+ char ic_ifname[LIFNAMSIZ];
+ char ic_zonename[ZONENAME_MAX];
+} ipmp_ifcookie_t;
+
+/*
+ * Construct a pseudo-random interface ID for the IPMP interface that's both
+ * predictable and (almost) guaranteed to be unique.
+ */
+static boolean_t
+ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr)
+{
+ zone_t *zp;
+ uint8_t *addr;
+ uchar_t hash[16];
+ ulong_t hostid;
+ MD5_CTX ctx;
+ ipmp_ifcookie_t ic = { 0 };
+
+ ASSERT(IS_IPMP(ill));
+
+ (void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
+ ic.ic_hostid = htonl((uint32_t)hostid);
+
+ (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ);
+
+ if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) {
+ (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX);
+ zone_rele(zp);
+ }
+
+ MD5Init(&ctx);
+ MD5Update(&ctx, &ic, sizeof (ic));
+ MD5Final(hash, &ctx);
+
+ /*
+ * Map the hash to an interface ID per the basic approach in RFC3041.
+ */
+ addr = &v6addr->s6_addr8[8];
+ bcopy(hash + 8, addr, sizeof (uint64_t));
+ addr[0] &= ~0x2; /* set local bit */
+
+ return (B_TRUE);
+}
+
/* ARGSUSED */
static boolean_t
ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
@@ -23739,14 +19786,14 @@ ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
* Derive IPoIB interface id from the link layer address.
*/
static boolean_t
-ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
+ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
char *addr;
- if (phys_length != 20)
+ if (ill->ill_phys_addr_length != 20)
return (B_FALSE);
addr = (char *)&v6addr->s6_addr32[2];
- bcopy(phys_addr + 12, addr, 8);
+ bcopy(ill->ill_phys_addr + 12, addr, 8);
/*
* In IBA 1.1 timeframe, some vendors erroneously set the u/l bit
* in the globally assigned EUI-64 GUID to 1, in violation of IEEE
@@ -23863,6 +19910,7 @@ ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
*ipifp = NULL;
return (B_FALSE);
}
+
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
if (!IPIF_CAN_LOOKUP(ipif))
continue;
@@ -23897,71 +19945,9 @@ ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
}
/*
- * Same as ipif_lookup_zoneid() but looks at all the ills in the same group.
- */
-boolean_t
-ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
-{
- ill_t *illg;
- ip_stack_t *ipst = ill->ill_ipst;
-
- /*
- * We look at the passed-in ill first without grabbing ill_g_lock.
- */
- if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) {
- return (B_TRUE);
- }
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- if (ill->ill_group == NULL) {
- /* ill not in a group */
- rw_exit(&ipst->ips_ill_g_lock);
- return (B_FALSE);
- }
-
- /*
- * There's no ipif in the zone on ill, however ill is part of an IPMP
- * group. We need to look for an ipif in the zone on all the ills in the
- * group.
- */
- illg = ill->ill_group->illgrp_ill;
- do {
- /*
- * We don't call ipif_lookup_zoneid() on ill as we already know
- * that it's not there.
- */
- if (illg != ill &&
- ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) {
- break;
- }
- } while ((illg = illg->ill_group_next) != NULL);
- rw_exit(&ipst->ips_ill_g_lock);
- return (illg != NULL);
-}
-
-/*
- * Check if this ill is only being used to send ICMP probes for IPMP
- */
-boolean_t
-ill_is_probeonly(ill_t *ill)
-{
- /*
- * Check if the interface is FAILED, or INACTIVE
- */
- if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE))
- return (B_TRUE);
-
- return (B_FALSE);
-}
-
-/*
* Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id)
* If a pointer to an ipif_t is returned then the caller will need to do
* an ill_refrele().
- *
- * If there is no real interface which matches the ifindex, then it looks
- * for a group that has a matching index. In the case of a group match the
- * lifidx must be zero. We don't need emulate the logical interfaces
- * since IP Filter's use of netinfo doesn't use that.
*/
ipif_t *
ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
@@ -23972,18 +19958,8 @@ ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL,
ipst);
-
- if (ill == NULL) {
- /* Fallback to group names only if hook_emulation set */
- if (!ipst->ips_ipmp_hook_emulation)
- return (NULL);
-
- if (lifidx != 0)
- return (NULL);
- ill = ill_group_lookup_on_ifindex(ifindex, isv6, ipst);
- if (ill == NULL)
- return (NULL);
- }
+ if (ill == NULL)
+ return (NULL);
mutex_enter(&ill->ill_lock);
if (ill->ill_state_flags & ILL_CONDEMNED) {
@@ -24059,7 +20035,7 @@ ill_set_phys_addr(ill_t *ill, mblk_t *mp)
* If we can quiesce the ill, then set the address. If not, then
* ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail().
*/
- ill_down_ipifs(ill, NULL, 0, B_FALSE);
+ ill_down_ipifs(ill);
mutex_enter(&ill->ill_lock);
if (!ill_is_quiescent(ill)) {
/* call cannot fail since `conn_t *' argument is NULL */
@@ -24283,10 +20259,7 @@ ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event,
if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL)
goto fail;
- if (event == NE_UNPLUMB)
- info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
- else
- info->hnei_event.hne_nic = ill->ill_phyint->phyint_hook_ifindex;
+ info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
info->hnei_event.hne_lif = lif;
info->hnei_event.hne_event = event;
info->hnei_event.hne_protocol = ill->ill_isv6 ?
@@ -24323,8 +20296,8 @@ fail:
void
ipif_up_notify(ipif_t *ipif)
{
- ip_rts_ifmsg(ipif);
- ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
+ ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
+ ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
sctp_update_ipif(ipif, SCTP_IPIF_UP);
ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
NE_LIF_UP, NULL, 0);
diff --git a/usr/src/uts/common/inet/ip/ip_ire.c b/usr/src/uts/common/inet/ip/ip_ire.c
index 405cb653d5..52a7e74806 100644
--- a/usr/src/uts/common/inet/ip/ip_ire.c
+++ b/usr/src/uts/common/inet/ip/ip_ire.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -31,6 +31,7 @@
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
+#include <sys/strsun.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/policy.h>
@@ -61,7 +62,6 @@
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
-#include <sys/kmem.h>
#include <inet/tcp.h>
#include <inet/ipclassifier.h>
#include <sys/zone.h>
@@ -220,11 +220,6 @@ struct kmem_cache *rt_entry_cache;
* IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is
* to be ignored when walking the ires using ire_next.
*
- * IRE_MARK_HIDDEN signifies that the ire is a special ire typically for the
- * benefit of in.mpathd which needs to probe interfaces for failures. Normal
- * applications should not be seeing this ire and hence this ire is ignored
- * in most cases in the search using ire_next.
- *
* Zones note:
* Walking IREs within a given zone also walks certain ires in other
* zones. This is done intentionally. IRE walks with a specified
@@ -1235,10 +1230,9 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
{
irb_t *irb;
boolean_t drop = B_FALSE;
- /* LINTED : set but not used in function */
boolean_t mctl_present;
mblk_t *first_mp = NULL;
- mblk_t *save_mp = NULL;
+ mblk_t *data_mp = NULL;
ire_t *dst_ire;
ipha_t *ipha;
ip6_t *ip6h;
@@ -1258,27 +1252,16 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
* we resolve an IPv6 address with an IPv4 ire
* or vice versa.
*/
+ EXTRACT_PKT_MP(mp, first_mp, mctl_present);
+ data_mp = mp;
+ mp = first_mp;
if (ire->ire_ipversion == IPV4_VERSION) {
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
- ipha = (ipha_t *)mp->b_rptr;
- save_mp = mp;
- mp = first_mp;
-
+ ipha = (ipha_t *)data_mp->b_rptr;
dst_ire = ire_cache_lookup(ipha->ipha_dst,
ire->ire_zoneid, MBLK_GETLABEL(mp), ipst);
} else {
ASSERT(ire->ire_ipversion == IPV6_VERSION);
- /*
- * Get a pointer to the beginning of the IPv6 header.
- * Ignore leading IPsec control mblks.
- */
- first_mp = mp;
- if (mp->b_datap->db_type == M_CTL) {
- mp = mp->b_cont;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- save_mp = mp;
- mp = first_mp;
+ ip6h = (ip6_t *)data_mp->b_rptr;
dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst,
ire->ire_zoneid, MBLK_GETLABEL(mp), ipst);
}
@@ -1330,10 +1313,8 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
* is over: we just drop the packet.
*/
if (ire->ire_flags & RTF_MULTIRT) {
- if (save_mp) {
- save_mp->b_prev = NULL;
- save_mp->b_next = NULL;
- }
+ data_mp->b_prev = NULL;
+ data_mp->b_next = NULL;
MULTIRT_DEBUG_UNTAG(mp);
freemsg(mp);
} else {
@@ -1355,9 +1336,31 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
(CONN_Q(q) ? Q_TO_CONN(q) : NULL),
ire->ire_zoneid, ipst);
} else {
+ int minlen = sizeof (ip6i_t) + IPV6_HDR_LEN;
+
ASSERT(ire->ire_ipversion == IPV6_VERSION);
- ip_newroute_v6(q, mp, &ip6h->ip6_dst, NULL,
- NULL, ire->ire_zoneid, ipst);
+
+ /*
+ * If necessary, skip over the ip6i_t to find
+ * the header with the actual source address.
+ */
+ if (ip6h->ip6_nxt == IPPROTO_RAW) {
+ if (MBLKL(data_mp) < minlen &&
+ pullupmsg(data_mp, -1) == 0) {
+ ip1dbg(("ire_add_then_send: "
+ "cannot pullupmsg ip6i\n"));
+ if (mctl_present)
+ freeb(first_mp);
+ ire_refrele(ire);
+ return;
+ }
+ ASSERT(MBLKL(data_mp) >= IPV6_HDR_LEN);
+ ip6h = (ip6_t *)(data_mp->b_rptr +
+ sizeof (ip6i_t));
+ }
+ ip_newroute_v6(q, mp, &ip6h->ip6_dst,
+ &ip6h->ip6_src, NULL, ire->ire_zoneid,
+ ipst);
}
}
@@ -1680,7 +1683,9 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep,
{
ire_t *ire;
uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ boolean_t prefer;
+ ill_t *ill = ipif->ipif_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
/*
* No broadcast IREs for the LOOPBACK interface
@@ -1690,21 +1695,26 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep,
(ipif->ipif_flags & IPIF_NOXMIT))
return (irep);
- /* If this would be a duplicate, don't bother. */
+ /*
+ * If this new IRE would be a duplicate, only prefer it if one of
+ * the following is true:
+ *
+ * 1. The existing one has IPIF_DEPRECATED|IPIF_NOLOCAL|IPIF_ANYCAST
+ * set and the new one has all of those clear.
+ *
+ * 2. The existing one corresponds to an underlying ILL in an IPMP
+ * group and the new one corresponds to an IPMP group interface.
+ */
if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif,
ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) {
- /*
- * We look for non-deprecated (and non-anycast, non-nolocal)
- * ipifs as the best choice. ipifs with check_flags matching
- * (deprecated, etc) are used only if non-deprecated ipifs
- * are not available. if the existing ire's ipif is deprecated
- * and the new ipif is non-deprecated, switch to the new ipif
- */
- if ((!(ire->ire_ipif->ipif_flags & check_flags)) ||
- (ipif->ipif_flags & check_flags)) {
+ prefer = ((ire->ire_ipif->ipif_flags & check_flags) &&
+ !(ipif->ipif_flags & check_flags)) ||
+ (IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && IS_IPMP(ill));
+ if (!prefer) {
ire_refrele(ire);
return (irep);
}
+
/*
* Bcast ires exist in pairs. Both have to be deleted,
* Since we are exclusive we can make the above assertion.
@@ -1716,10 +1726,7 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep,
ire_delete(ire);
ire_refrele(ire);
}
-
- irep = ire_create_bcast(ipif, addr, irep);
-
- return (irep);
+ return (ire_create_bcast(ipif, addr, irep));
}
uint_t ip_loopback_mtu = IP_LOOPBACK_MTU;
@@ -1733,6 +1740,22 @@ ire_t **
ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep)
{
ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ ill_t *ill = ipif->ipif_ill;
+
+ ASSERT(IAM_WRITER_IPIF(ipif));
+
+ if (IS_IPMP(ill)) {
+ /*
+ * Broadcast IREs for the IPMP meta-interface use the
+ * nominated broadcast interface to send and receive packets.
+ * If there's no nominated interface, send the packets down to
+ * the IPMP stub driver, which will discard them. If the
+ * nominated broadcast interface changes, ill_refresh_bcast()
+ * will refresh the broadcast IREs.
+ */
+ if ((ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
+ ill = ipif->ipif_ill;
+ }
*irep++ = ire_create(
(uchar_t *)&addr, /* dest addr */
@@ -1741,8 +1764,8 @@ ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep)
NULL, /* no gateway */
&ipif->ipif_mtu, /* max frag */
NULL, /* no src nce */
- ipif->ipif_rq, /* recv-from queue */
- ipif->ipif_wq, /* send-to queue */
+ ill->ill_rq, /* recv-from queue */
+ ill->ill_wq, /* send-to queue */
IRE_BROADCAST,
ipif,
0,
@@ -1761,7 +1784,7 @@ ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep)
NULL, /* no gateway */
&ip_loopback_mtu, /* max frag size */
NULL, /* no src_nce */
- ipif->ipif_rq, /* recv-from queue */
+ ill->ill_rq, /* recv-from queue */
NULL, /* no send-to queue */
IRE_BROADCAST, /* Needed for fanout in wput */
ipif,
@@ -2049,32 +2072,23 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
{
ill_t *ire_stq_ill = NULL;
ill_t *ire_ipif_ill = NULL;
- ill_group_t *ire_ill_group = NULL;
ASSERT(match_flags != 0 || zoneid != ALL_ZONES);
/*
- * MATCH_IRE_ILL/MATCH_IRE_ILL_GROUP : We match both on ill
- * pointed by ire_stq and ire_ipif. Only in the case of
- * IRE_CACHEs can ire_stq and ire_ipif be pointing to
- * different ills. But we want to keep this function generic
- * enough for future use. So, we always try to match on both.
- * The only caller of this function ire_walk_ill_tables, will
- * call "func" after we return from this function. We expect
- * "func" to do the right filtering of ires in this case.
- *
- * NOTE : In the case of MATCH_IRE_ILL_GROUP, groups
- * pointed by ire_stq and ire_ipif should always be the same.
- * So, we just match on only one of them.
+ * MATCH_IRE_ILL: We match both on ill pointed by ire_stq and
+ * ire_ipif. Only in the case of IRE_CACHEs can ire_stq and
+ * ire_ipif be pointing to different ills. But we want to keep
+ * this function generic enough for future use. So, we always
+ * try to match on both. The only caller of this function
+ * ire_walk_ill_tables, will call "func" after we return from
+ * this function. We expect "func" to do the right filtering
+ * of ires in this case.
*/
- if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) {
+ if (match_flags & MATCH_IRE_ILL) {
if (ire->ire_stq != NULL)
- ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr;
+ ire_stq_ill = ire->ire_stq->q_ptr;
if (ire->ire_ipif != NULL)
ire_ipif_ill = ire->ire_ipif->ipif_ill;
- if (ire_stq_ill != NULL)
- ire_ill_group = ire_stq_ill->ill_group;
- if ((ire_ill_group == NULL) && (ire_ipif_ill != NULL))
- ire_ill_group = ire_ipif_ill->ill_group;
}
if (zoneid != ALL_ZONES) {
@@ -2115,7 +2129,7 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
ipif_t *src_ipif;
src_ipif =
ipif_select_source_v6(ire_stq_ill,
- &ire->ire_addr_v6, RESTRICT_TO_NONE,
+ &ire->ire_addr_v6, B_FALSE,
IPV6_PREFER_SRC_DEFAULT,
zoneid);
if (src_ipif != NULL) {
@@ -2143,9 +2157,9 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
ire_t *rire;
ire_match_flags |= MATCH_IRE_TYPE;
- if (ire->ire_ipif != NULL) {
- ire_match_flags |= MATCH_IRE_ILL_GROUP;
- }
+ if (ire->ire_ipif != NULL)
+ ire_match_flags |= MATCH_IRE_ILL;
+
if (ire->ire_ipversion == IPV4_VERSION) {
rire = ire_route_lookup(ire->ire_gateway_addr,
0, 0, IRE_INTERFACE, ire->ire_ipif, NULL,
@@ -2169,11 +2183,8 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
if (((!(match_flags & MATCH_IRE_TYPE)) ||
(ire->ire_type & ire_type)) &&
((!(match_flags & MATCH_IRE_ILL)) ||
- (ire_stq_ill == ill || ire_ipif_ill == ill)) &&
- ((!(match_flags & MATCH_IRE_ILL_GROUP)) ||
- (ire_stq_ill == ill) || (ire_ipif_ill == ill) ||
- (ire_ill_group != NULL &&
- ire_ill_group == ill->ill_group))) {
+ (ire_stq_ill == ill || ire_ipif_ill == ill ||
+ ire_ipif_ill != NULL && IS_IN_SAME_ILLGRP(ire_ipif_ill, ill)))) {
return (B_TRUE);
}
return (B_FALSE);
@@ -2221,8 +2232,7 @@ ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func,
boolean_t ret;
struct rtfuncarg rtfarg;
- ASSERT((!(match_flags & (MATCH_IRE_ILL |
- MATCH_IRE_ILL_GROUP))) || (ill != NULL));
+ ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL));
ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0));
/*
* Optimize by not looking at the forwarding table if there
@@ -2399,32 +2409,26 @@ ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp,
}
/*
- * IPMP flag settings happen without taking the exclusive route
- * in ip_sioctl_flags. So we need to make an atomic check here
- * for FAILED/OFFLINE/INACTIVE flags or if it has hit the
- * FAILBACK=no case.
+ * Don't allow IREs to be created on changing ills. Also, since
+ * IPMP flags can be set on an ill without quiescing it, if we're not
+ * a writer on stq_ill, check that the flags still allow IRE creation.
*/
if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) {
if (stq_ill->ill_state_flags & ILL_CHANGING) {
ill = stq_ill;
error = EAGAIN;
- } else if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) ||
- (ill_is_probeonly(stq_ill) &&
- !(ire->ire_marks & IRE_MARK_HIDDEN))) {
- error = EINVAL;
+ } else if (IS_UNDER_IPMP(stq_ill)) {
+ mutex_enter(&stq_ill->ill_phyint->phyint_lock);
+ if (!ipmp_ill_is_active(stq_ill) &&
+ !(ire->ire_marks & IRE_MARK_TESTHIDDEN)) {
+ error = EINVAL;
+ }
+ mutex_exit(&stq_ill->ill_phyint->phyint_lock);
}
- goto done;
+ if (error != 0)
+ goto done;
}
- /*
- * We don't check for OFFLINE/FAILED in this case because
- * the source address selection logic (ipif_select_source)
- * may still select a source address from such an ill. The
- * assumption is that these addresses will be moved by in.mpathd
- * soon. (i.e. this is a race). However link local addresses
- * will not move and hence ipif_select_source_v6 tries to avoid
- * FAILED ills. Please see ipif_select_source_v6 for more info
- */
if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) &&
(ipif_ill->ill_state_flags & ILL_CHANGING)) {
ill = ipif_ill;
@@ -2444,8 +2448,10 @@ done:
if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) {
ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
ire_atomic_end(irb_ptr, ire);
ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
mutex_exit(&ipsq->ipsq_lock);
error = EINPROGRESS;
} else if (error != 0) {
@@ -2502,39 +2508,7 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func,
ire = ire1;
}
if (ire->ire_stq != NULL)
- stq_ill = (ill_t *)ire->ire_stq->q_ptr;
-
- if (ire->ire_type == IRE_CACHE) {
- /*
- * If this interface is FAILED, or INACTIVE or has hit
- * the FAILBACK=no case, we create IRE_CACHES marked
- * HIDDEN for some special cases e.g. bind to
- * IPIF_NOFAILOVER address etc. So, if this interface
- * is FAILED/INACTIVE/hit FAILBACK=no case, and we are
- * not creating hidden ires, we should not allow that.
- * This happens because the state of the interface
- * changed while we were waiting in ARP. If this is the
- * daemon sending probes, the next probe will create
- * HIDDEN ires and we will create an ire then. This
- * cannot happen with NDP currently because IRE is
- * never queued in NDP. But it can happen in the
- * future when we have external resolvers with IPv6.
- * If the interface gets marked with OFFLINE while we
- * are waiting in ARP, don't add the ire.
- */
- if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) ||
- (ill_is_probeonly(stq_ill) &&
- !(ire->ire_marks & IRE_MARK_HIDDEN))) {
- /*
- * We don't know whether it is a valid ipif or not.
- * unless we do the check below. So, set it to NULL.
- */
- ire->ire_ipif = NULL;
- ire_delete(ire);
- *irep = NULL;
- return (EINVAL);
- }
- }
+ stq_ill = ire->ire_stq->q_ptr;
if (stq_ill != NULL && ire->ire_type == IRE_CACHE &&
stq_ill->ill_net_type == IRE_IF_RESOLVER) {
@@ -2573,12 +2547,12 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func,
rw_exit(&ipst->ips_ill_g_lock);
if (ipif == NULL ||
(ipif->ipif_isv6 &&
+ !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) &&
!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
&ipif->ipif_v6src_addr)) ||
(!ipif->ipif_isv6 &&
ire->ire_src_addr != ipif->ipif_src_addr) ||
ire->ire_zoneid != ipif->ipif_zoneid) {
-
if (ipif != NULL)
ipif_refrele(ipif);
ire->ire_ipif = NULL;
@@ -2587,20 +2561,7 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func,
return (EINVAL);
}
-
ASSERT(ill != NULL);
- /*
- * If this group was dismantled while this packets was
- * queued in ARP, don't add it here.
- */
- if (ire->ire_ipif->ipif_ill->ill_group != ill->ill_group) {
- /* We don't want ire_inactive bump stats for this */
- ipif_refrele(ipif);
- ire->ire_ipif = NULL;
- ire_delete(ire);
- *irep = NULL;
- return (EINVAL);
- }
/*
* Since we didn't attach label security attributes to the
@@ -2677,6 +2638,16 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
boolean_t need_refrele = B_FALSE;
nce_t *nce;
ip_stack_t *ipst = ire->ire_ipst;
+ uint_t marks = 0;
+
+ /*
+ * IREs with source addresses hosted on interfaces that are under IPMP
+ * should be hidden so that applications don't accidentally end up
+ * sending packets with test addresses as their source addresses, or
+ * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here.
+ */
+ if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill))
+ marks |= IRE_MARK_TESTHIDDEN;
if (ire->ire_ipif != NULL)
ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
@@ -2691,10 +2662,15 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
case IRE_HOST:
ire->ire_mask = IP_HOST_MASK;
ire->ire_masklen = IP_ABITS;
+ ire->ire_marks |= marks;
if ((ire->ire_flags & RTF_SETSRC) == 0)
ire->ire_src_addr = 0;
break;
case IRE_CACHE:
+ ire->ire_mask = IP_HOST_MASK;
+ ire->ire_masklen = IP_ABITS;
+ ire->ire_marks |= marks;
+ break;
case IRE_BROADCAST:
case IRE_LOCAL:
case IRE_LOOPBACK:
@@ -2702,15 +2678,14 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
ire->ire_masklen = IP_ABITS;
break;
case IRE_PREFIX:
- if ((ire->ire_flags & RTF_SETSRC) == 0)
- ire->ire_src_addr = 0;
- break;
case IRE_DEFAULT:
+ ire->ire_marks |= marks;
if ((ire->ire_flags & RTF_SETSRC) == 0)
ire->ire_src_addr = 0;
break;
case IRE_IF_RESOLVER:
case IRE_IF_NORESOLVER:
+ ire->ire_marks |= marks;
break;
default:
ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n",
@@ -2796,19 +2771,13 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
*/
flags |= MATCH_IRE_IPIF;
/*
- * If we are creating hidden ires, make sure we search on
- * this ill (MATCH_IRE_ILL) and a hidden ire,
- * while we are searching for duplicates below. Otherwise we
- * could potentially find an IRE on some other interface
- * and it may not be a IRE marked with IRE_MARK_HIDDEN. We
- * shouldn't do this as this will lead to an infinite loop
- * (if we get to ip_wput again) eventually we need an hidden
- * ire for this packet to go out. MATCH_IRE_ILL is explicitly
- * done below.
+ * If we are creating a hidden IRE, make sure we search for
+ * hidden IREs when searching for duplicates below.
+ * Otherwise, we might find an IRE on some other interface
+ * that's not marked hidden.
*/
- if (ire->ire_type == IRE_CACHE &&
- (ire->ire_marks & IRE_MARK_HIDDEN))
- flags |= (MATCH_IRE_MARK_HIDDEN);
+ if (ire->ire_marks & IRE_MARK_TESTHIDDEN)
+ flags |= MATCH_IRE_MARK_TESTHIDDEN;
}
if ((ire->ire_type & IRE_CACHETABLE) == 0) {
irb_ptr = ire_get_bucket(ire);
@@ -2927,7 +2896,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
* avoid a lookup in the caller again. If the callers
* don't want to use it, they need to do a REFRELE.
*/
- ip1dbg(("found dup ire existing %p new %p",
+ ip1dbg(("found dup ire existing %p new %p\n",
(void *)ire1, (void *)ire));
IRE_REFHOLD(ire1);
ire_atomic_end(irb_ptr, ire);
@@ -2948,6 +2917,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
return (0);
}
}
+
if (ire->ire_type & IRE_CACHE) {
ASSERT(ire->ire_stq != NULL);
nce = ndp_lookup_v4(ire_to_ill(ire),
@@ -2999,17 +2969,9 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
}
/*
* Make it easy for ip_wput_ire() to hit multiple broadcast ires by
- * grouping identical addresses together on the hash chain. We also
- * don't want to send multiple copies out if there are two ills part
- * of the same group. Thus we group the ires with same addr and same
- * ill group together so that ip_wput_ire can easily skip all the
- * ires with same addr and same group after sending the first copy.
- * We do this only for IRE_BROADCASTs as ip_wput_ire is currently
- * interested in such groupings only for broadcasts.
- *
- * NOTE : If the interfaces are brought up first and then grouped,
- * illgrp_insert will handle it. We come here when the interfaces
- * are already in group and we are bringing them UP.
+ * grouping identical addresses together on the hash chain. We do
+ * this only for IRE_BROADCASTs as ip_wput_ire is currently interested
+ * in such groupings only for broadcasts.
*
* Find the first entry that matches ire_addr. *irep will be null
* if no match.
@@ -3023,29 +2985,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
if (ire->ire_type == IRE_BROADCAST && *irep != NULL) {
/*
* We found some ire (i.e *irep) with a matching addr. We
- * want to group ires with same addr and same ill group
- * together.
- *
- * First get to the entry that matches our address and
- * ill group i.e stop as soon as we find the first ire
- * matching the ill group and address. If there is only
- * an address match, we should walk and look for some
- * group match. These are some of the possible scenarios :
- *
- * 1) There are no groups at all i.e all ire's ill_group
- * are NULL. In that case we will essentially group
- * all the ires with the same addr together. Same as
- * the "else" block of this "if".
- *
- * 2) There are some groups and this ire's ill_group is
- * NULL. In this case, we will first find the group
- * that matches the address and a NULL group. Then
- * we will insert the ire at the end of that group.
- *
- * 3) There are some groups and this ires's ill_group is
- * non-NULL. In this case we will first find the group
- * that matches the address and the ill_group. Then
- * we will insert the ire at the end of that group.
+ * want to group ires with same addr.
*/
for (;;) {
ire1 = *irep;
@@ -3053,8 +2993,8 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
(ire1->ire_next->ire_addr != ire->ire_addr) ||
(ire1->ire_type != IRE_BROADCAST) ||
(ire1->ire_flags & RTF_MULTIRT) ||
- (ire1->ire_ipif->ipif_ill->ill_group ==
- ire->ire_ipif->ipif_ill->ill_group))
+ (ire1->ire_ipif->ipif_ill->ill_grp ==
+ ire->ire_ipif->ipif_ill->ill_grp))
break;
irep = &ire1->ire_next;
}
@@ -3071,18 +3011,14 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
/*
* Either we have hit the end of the list or the address
- * did not match or the group *matched*. If we found
- * a match on the group, skip to the end of the group.
+ * did not match.
*/
while (*irep != NULL) {
ire1 = *irep;
if ((ire1->ire_addr != ire->ire_addr) ||
- (ire1->ire_type != IRE_BROADCAST) ||
- (ire1->ire_ipif->ipif_ill->ill_group !=
- ire->ire_ipif->ipif_ill->ill_group))
+ (ire1->ire_type != IRE_BROADCAST))
break;
- if (ire1->ire_ipif->ipif_ill->ill_group == NULL &&
- ire1->ire_ipif == ire->ire_ipif) {
+ if (ire1->ire_ipif == ire->ire_ipif) {
irep = &ire1->ire_next;
break;
}
@@ -3611,15 +3547,14 @@ ire_inactive(ire_t *ire)
* The ipif that is associated with an ire is ire->ire_ipif and
* hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call
* ipif_ill_refrele_tail. Usually stq_ill is null or the same as
- * ire->ire_ipif->ipif_ill. So nothing more needs to be done. Only
- * in the case of IRE_CACHES when IPMP is used, stq_ill can be
- * different. If this is different from ire->ire_ipif->ipif_ill and
- * if the ill_ire_cnt on the stq_ill also has dropped to zero, we call
+ * ire->ire_ipif->ipif_ill. So nothing more needs to be done.
+ * However, for VNI or IPMP IRE entries, stq_ill can be different.
+ * If this is different from ire->ire_ipif->ipif_ill and if the
+ * ill_ire_cnt on the stq_ill also has dropped to zero, we call
* ipif_ill_refrele_tail on the stq_ill.
*/
-
if (ire->ire_stq != NULL)
- stq_ill = (ill_t *)ire->ire_stq->q_ptr;
+ stq_ill = ire->ire_stq->q_ptr;
if (stq_ill == NULL || stq_ill == ill) {
/* Optimize the most common case */
@@ -3881,26 +3816,27 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
{
ill_t *ire_ill = NULL, *dst_ill;
ill_t *ipif_ill = NULL;
- ill_group_t *ire_ill_group = NULL;
- ill_group_t *ipif_ill_group = NULL;
ASSERT(ire->ire_ipversion == IPV4_VERSION);
ASSERT((ire->ire_addr & ~ire->ire_mask) == 0);
- ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) ||
+ ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
(ipif != NULL && !ipif->ipif_isv6));
ASSERT(!(match_flags & MATCH_IRE_WQ) || wq != NULL);
/*
- * HIDDEN cache entries have to be looked up specifically with
- * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set
- * when the interface is FAILED or INACTIVE. In that case,
- * any IRE_CACHES that exists should be marked with
- * IRE_MARK_HIDDEN. So, we don't really need to match below
- * for IRE_MARK_HIDDEN. But we do so for consistency.
+ * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it
+ * is in fact hidden, to ensure the caller gets the right one. One
+ * exception: if the caller passed MATCH_IRE_IHANDLE, then they
+ * already know the identity of the given IRE_INTERFACE entry and
+ * there's no point trying to hide it from them.
*/
- if (!(match_flags & MATCH_IRE_MARK_HIDDEN) &&
- (ire->ire_marks & IRE_MARK_HIDDEN))
- return (B_FALSE);
+ if (ire->ire_marks & IRE_MARK_TESTHIDDEN) {
+ if (match_flags & MATCH_IRE_IHANDLE)
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
+
+ if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN))
+ return (B_FALSE);
+ }
/*
* MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option
@@ -3994,19 +3930,18 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
}
/*
- * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that
- * somebody wants to send out on a particular interface which
- * is given by ire_stq and hence use ire_stq to derive the ill
- * value. ire_ipif for IRE_CACHES is just the means of getting
- * a source address i.e ire_src_addr = ire->ire_ipif->ipif_src_addr.
- * ire_to_ill does the right thing for this.
+ * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to
+ * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means
+ * of getting a source address -- i.e., ire_src_addr ==
+ * ire->ire_ipif->ipif_src_addr). ire_to_ill() handles this.
+ *
+ * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group.
+ * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for
+ * IPMP test traffic), then the ill must match exactly.
*/
- if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) {
+ if (match_flags & MATCH_IRE_ILL) {
ire_ill = ire_to_ill(ire);
- if (ire_ill != NULL)
- ire_ill_group = ire_ill->ill_group;
ipif_ill = ipif->ipif_ill;
- ipif_ill_group = ipif_ill->ill_group;
}
if ((ire->ire_addr == (addr & mask)) &&
@@ -4018,24 +3953,21 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
(ire->ire_src_addr == ipif->ipif_src_addr)) &&
((!(match_flags & MATCH_IRE_IPIF)) ||
(ire->ire_ipif == ipif)) &&
- ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) ||
- (ire->ire_type != IRE_CACHE ||
- ire->ire_marks & IRE_MARK_HIDDEN)) &&
+ ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) ||
+ (ire->ire_marks & IRE_MARK_TESTHIDDEN)) &&
((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) ||
(ire->ire_type != IRE_CACHE ||
ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) &&
- ((!(match_flags & MATCH_IRE_ILL)) ||
- (ire_ill == ipif_ill)) &&
((!(match_flags & MATCH_IRE_WQ)) ||
(ire->ire_stq == wq)) &&
+ ((!(match_flags & MATCH_IRE_ILL)) ||
+ (ire_ill == ipif_ill ||
+ (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) &&
+ ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) &&
((!(match_flags & MATCH_IRE_IHANDLE)) ||
(ire->ire_ihandle == ihandle)) &&
((!(match_flags & MATCH_IRE_MASK)) ||
(ire->ire_mask == mask)) &&
- ((!(match_flags & MATCH_IRE_ILL_GROUP)) ||
- (ire_ill == ipif_ill) ||
- (ire_ill_group != NULL &&
- ire_ill_group == ipif_ill_group)) &&
((!(match_flags & MATCH_IRE_SECATTR)) ||
(!is_system_labeled()) ||
(tsol_ire_match_gwattr(ire, tsl) == 0))) {
@@ -4060,8 +3992,7 @@ ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
* ire_match_args() will dereference ipif if MATCH_IRE_SRC or
* MATCH_IRE_ILL is set.
*/
- if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
- (ipif == NULL))
+ if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
return (NULL);
/*
@@ -4142,14 +4073,15 @@ ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif,
/*
* Check whether the IRE_LOCAL and the IRE potentially used to transmit
- * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are part of
- * the same ill group.
+ * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are identical
+ * or part of the same illgrp. (In the IPMP case, usually the two IREs
+ * will both belong to the IPMP ill, but exceptions are possible -- e.g.
+ * if IPMP test addresses are on their own subnet.)
*/
boolean_t
-ire_local_same_ill_group(ire_t *ire_local, ire_t *xmit_ire)
+ire_local_same_lan(ire_t *ire_local, ire_t *xmit_ire)
{
- ill_t *recv_ill, *xmit_ill;
- ill_group_t *recv_group, *xmit_group;
+ ill_t *recv_ill, *xmit_ill;
ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK));
ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE));
@@ -4160,20 +4092,11 @@ ire_local_same_ill_group(ire_t *ire_local, ire_t *xmit_ire)
ASSERT(recv_ill != NULL);
ASSERT(xmit_ill != NULL);
- if (recv_ill == xmit_ill)
- return (B_TRUE);
-
- recv_group = recv_ill->ill_group;
- xmit_group = xmit_ill->ill_group;
-
- if (recv_group != NULL && recv_group == xmit_group)
- return (B_TRUE);
-
- return (B_FALSE);
+ return (IS_ON_SAME_LAN(recv_ill, xmit_ill));
}
/*
- * Check if the IRE_LOCAL uses the same ill (group) as another route would use.
+ * Check if the IRE_LOCAL uses the same ill as another route would use.
* If there is no alternate route, or the alternate is a REJECT or BLACKHOLE,
* then we don't allow this IRE_LOCAL to be used.
*/
@@ -4183,17 +4106,16 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr,
{
ire_t *alt_ire;
boolean_t rval;
+ int flags;
+
+ flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE;
if (ire_local->ire_ipversion == IPV4_VERSION) {
alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL,
- NULL, zoneid, 0, tsl,
- MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_RJ_BHOLE, ipst);
+ NULL, zoneid, 0, tsl, flags, ipst);
} else {
- alt_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL,
- 0, NULL, NULL, zoneid, 0, tsl,
- MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_RJ_BHOLE, ipst);
+ alt_ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL,
+ NULL, zoneid, 0, tsl, flags, ipst);
}
if (alt_ire == NULL)
@@ -4203,16 +4125,14 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr,
ire_refrele(alt_ire);
return (B_FALSE);
}
- rval = ire_local_same_ill_group(ire_local, alt_ire);
+ rval = ire_local_same_lan(ire_local, alt_ire);
ire_refrele(alt_ire);
return (rval);
}
/*
- * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers
- * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get
- * to the hidden ones.
+ * Lookup cache
*
* In general the zoneid has to match (where ALL_ZONES match all of them).
* But for IRE_LOCAL we also need to handle the case where L2 should
@@ -4220,8 +4140,7 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr,
* Ethernet drivers nor Ethernet hardware loops back packets sent to their
* own MAC address. This loopback is needed when the normal
* routes (ignoring IREs with different zoneids) would send out the packet on
- * the same ill (or ill group) as the ill with which this IRE_LOCAL is
- * associated.
+ * the same ill as the ill with which this IRE_LOCAL is associated.
*
* Earlier versions of this code always matched an IRE_LOCAL independently of
* the zoneid. We preserve that earlier behavior when
@@ -4239,7 +4158,7 @@ ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl,
rw_enter(&irb_ptr->irb_lock, RW_READER);
for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
if (ire->ire_marks & (IRE_MARK_CONDEMNED |
- IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) {
+ IRE_MARK_TESTHIDDEN | IRE_MARK_PRIVATE_ADDR)) {
continue;
}
if (ire->ire_addr == addr) {
@@ -4284,7 +4203,7 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst)
ire_t *ire;
/*
- * Lets look for an ire in the cachetable whose
+ * Look for an ire in the cachetable whose
* ire_addr matches the destination.
* Since we are being called by forwarding fastpath
* no need to check for Trusted Solaris label.
@@ -4293,8 +4212,8 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst)
dst, ipst->ips_ip_cache_table_size)];
rw_enter(&irb_ptr->irb_lock, RW_READER);
for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_marks & (IRE_MARK_CONDEMNED |
- IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) {
+ if (ire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN |
+ IRE_MARK_PRIVATE_ADDR)) {
continue;
}
if (ire->ire_addr == dst) {
@@ -4307,7 +4226,6 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst)
return (NULL);
}
-
/*
* Locate the interface ire that is tied to the cache ire 'cire' via
* cire->ire_ihandle.
@@ -4333,13 +4251,8 @@ ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire)
* because the ihandle refers to an ipif which can be in only one zone.
*/
match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
- /*
- * ip_newroute calls ire_ftable_lookup with MATCH_IRE_ILL only
- * for on-link hosts. We should never be here for onlink.
- * Thus, use MATCH_IRE_ILL_GROUP.
- */
if (pire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
/*
* We know that the mask of the interface ire equals cire->ire_cmask.
* (When ip_newroute() created 'cire' for the gateway it set its
@@ -4376,7 +4289,7 @@ ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire)
*/
match_flags = MATCH_IRE_TYPE;
if (pire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL_GROUP;
+ match_flags |= MATCH_IRE_ILL;
ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET,
pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
if (ire == NULL)
@@ -4411,7 +4324,16 @@ ire_t *
ipif_to_ire(const ipif_t *ipif)
{
ire_t *ire;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK;
+
+ /*
+ * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN
+ * so that they aren't accidentally returned. However, if the
+ * caller's ipif is on an ill under IPMP, there's no need to hide 'em.
+ */
+ if (IS_UNDER_IPMP(ipif->ipif_ill))
+ match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
ASSERT(!ipif->ipif_isv6);
if (ipif->ipif_ire_type == IRE_LOOPBACK) {
@@ -4421,13 +4343,12 @@ ipif_to_ire(const ipif_t *ipif)
} else if (ipif->ipif_flags & IPIF_POINTOPOINT) {
/* In this case we need to lookup destination address. */
ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0,
- IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL,
- (MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK), ipst);
+ IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, match_flags,
+ ipst);
} else {
ire = ire_ftable_lookup(ipif->ipif_subnet,
ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL,
- ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
- MATCH_IRE_MASK), ipst);
+ ALL_ZONES, 0, NULL, match_flags, ipst);
}
return (ire);
}
@@ -4811,7 +4732,7 @@ ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst)
continue;
if (cire->ire_addr != dst)
continue;
- if (cire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))
+ if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN))
continue;
unres_cnt--;
}
@@ -4983,7 +4904,7 @@ ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
continue;
if (cire->ire_marks &
(IRE_MARK_CONDEMNED |
- IRE_MARK_HIDDEN))
+ IRE_MARK_TESTHIDDEN))
continue;
if (cire->ire_gw_secattr != NULL &&
@@ -5186,7 +5107,7 @@ ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
continue;
if (cire->ire_marks &
(IRE_MARK_CONDEMNED |
- IRE_MARK_HIDDEN))
+ IRE_MARK_TESTHIDDEN))
continue;
if (cire->ire_gw_secattr != NULL &&
@@ -5401,7 +5322,7 @@ ire_trace_cleanup(const ire_t *ire)
* invoked when the mblk containing fake_ire is freed.
*/
void
-ire_arpresolve(ire_t *in_ire, ill_t *dst_ill)
+ire_arpresolve(ire_t *in_ire)
{
areq_t *areq;
ipaddr_t *addrp;
@@ -5409,8 +5330,13 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill)
ire_t *ire, *buf;
size_t bufsize;
frtn_t *frtnp;
- ill_t *ill;
- ip_stack_t *ipst = dst_ill->ill_ipst;
+ ill_t *dst_ill;
+ ip_stack_t *ipst;
+
+ ASSERT(in_ire->ire_nce != NULL);
+
+ dst_ill = ire_to_ill(in_ire);
+ ipst = dst_ill->ill_ipst;
/*
* Construct message chain for the resolver
@@ -5431,16 +5357,16 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill)
*/
/*
- * We use esballoc to allocate the second part(the ire_t size mblk)
- * of the message chain depicted above. THis mblk will be freed
- * by arp when there is a timeout, and otherwise passed to IP
- * and IP will * free it after processing the ARP response.
+ * We use esballoc to allocate the second part (IRE_MBLK)
+ * of the message chain depicted above. This mblk will be freed
+ * by arp when there is a timeout, and otherwise passed to IP
+ * and IP will free it after processing the ARP response.
*/
bufsize = sizeof (ire_t) + sizeof (frtn_t);
buf = kmem_alloc(bufsize, KM_NOSLEEP);
if (buf == NULL) {
- ip1dbg(("ire_arpresolver:alloc buffer failed\n "));
+ ip1dbg(("ire_arpresolve: alloc buffer failed\n"));
return;
}
frtnp = (frtn_t *)(buf + 1);
@@ -5448,16 +5374,15 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill)
frtnp->free_func = ire_freemblk;
ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp);
-
if (ire_mp == NULL) {
ip1dbg(("ire_arpresolve: esballoc failed\n"));
kmem_free(buf, bufsize);
return;
}
- ASSERT(in_ire->ire_nce != NULL);
+
areq_mp = copyb(dst_ill->ill_resolver_mp);
if (areq_mp == NULL) {
- kmem_free(buf, bufsize);
+ freemsg(ire_mp);
return;
}
@@ -5473,9 +5398,8 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill)
ire->ire_ipif_seqid = in_ire->ire_ipif_seqid;
ire->ire_ipif_ifindex = in_ire->ire_ipif_ifindex;
ire->ire_ipif = in_ire->ire_ipif;
- ire->ire_stq = in_ire->ire_stq;
- ill = ire_to_ill(ire);
- ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex;
+ ire->ire_stq = dst_ill->ill_wq;
+ ire->ire_stq_ifindex = dst_ill->ill_phyint->phyint_ifindex;
ire->ire_zoneid = in_ire->ire_zoneid;
ire->ire_stackid = ipst->ips_netstack->netstack_stackid;
ire->ire_ipst = ipst;
@@ -5528,7 +5452,6 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill)
* Note that the ARP/IP merge should replace the functionality by providing
* direct function calls to clean up unresolved entries in ire/nce lists.
*/
-
void
ire_freemblk(ire_t *ire_mp)
{
@@ -5738,9 +5661,8 @@ retry_nce:
* is marked as ND_REACHABLE at this point.
* This nce does not undergo any further state changes,
* and exists as long as the interface is plumbed.
- * Note: we do the ire_nce assignment here for IRE_BROADCAST
- * because some functions like ill_mark_bcast() inline the
- * ire_add functionality.
+ * Note: the assignment of ire_nce here is a historical
+ * artifact of old code that used to inline ire_add().
*/
ire->ire_nce = nce;
/*
@@ -5772,8 +5694,7 @@ ip4_ctable_lookup_impl(ire_ctable_args_t *margs)
ire_t *ire;
ip_stack_t *ipst = margs->ict_ipst;
- if ((margs->ict_flags &
- (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
+ if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) &&
(margs->ict_ipif == NULL)) {
return (NULL);
}
@@ -5802,10 +5723,7 @@ ip4_ctable_lookup_impl(ire_ctable_args_t *margs)
/*
* This function locates IRE_CACHE entries which were added by the
* ire_forward() path. We can fully specify the IRE we are looking for by
- * providing the ipif_t AND the ire_stq. This is different to MATCH_IRE_ILL
- * which uses the ipif_ill. This is inadequate with IPMP groups where
- * illgrp_scheduler() may have been used to select an ill from the group for
- * the outgoing interface.
+ * providing the ipif (MATCH_IRE_IPIF) *and* the stq (MATCH_IRE_WQ).
*/
ire_t *
ire_arpresolve_lookup(ipaddr_t addr, ipaddr_t gw, ipif_t *ipif,
diff --git a/usr/src/uts/common/inet/ip/ip_mroute.c b/usr/src/uts/common/inet/ip/ip_mroute.c
index ac14adf00d..1a3df02418 100644
--- a/usr/src/uts/common/inet/ip/ip_mroute.c
+++ b/usr/src/uts/common/inet/ip/ip_mroute.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -2037,6 +2037,7 @@ static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
struct mfc *rt)
{
+ ill_t *vill;
vifi_t vifi;
struct vif *vifp;
ipaddr_t dst = ipha->ipha_dst;
@@ -2102,25 +2103,21 @@ ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
}
/*
* Don't forward if it didn't arrive from the parent vif for its
- * origin. But do match on the groups as we nominate only one
- * ill in the group for receiving allmulti packets.
+ * origin.
*/
- if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill &&
- (ill->ill_group == NULL ||
- ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_group !=
- ill->ill_group)) ||
+ vill = ipst->ips_vifs[vifi].v_ipif->ipif_ill;
+ if ((vill != ill && !IS_IN_SAME_ILLGRP(vill, ill)) ||
(ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
/* Came in the wrong interface */
ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
"numvifs %d ill %s viftable ill %s\n",
(int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
- ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
+ vill->ill_name));
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
"ip_mdq: arrived wrong if, vifi %d ill "
"%s viftable ill %s\n",
- (int)vifi, ill->ill_name,
- ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
+ (int)vifi, ill->ill_name, vill->ill_name);
}
ipst->ips_mrtstat->mrts_wrong_if++;
rt->mfc_wrong_if++;
@@ -3047,7 +3044,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp)
dst = ipha->ipha_dst;
ipif = vifp->v_ipif;
- mutex_enter(&ipif->ipif_ill->ill_lock);
if (ilm_lookup_ipif(ipif, dst) != NULL) {
/*
* The packet is not yet reassembled, thus we need to
@@ -3057,7 +3053,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp)
mblk_t *mp_loop;
ire_t *ire;
- mutex_exit(&ipif->ipif_ill->ill_lock);
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1,
SL_TRACE,
@@ -3082,8 +3077,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp)
}
if (ire != NULL)
ire_refrele(ire);
- } else {
- mutex_exit(&ipif->ipif_ill->ill_lock);
}
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
diff --git a/usr/src/uts/common/inet/ip/ip_multi.c b/usr/src/uts/common/inet/ip/ip_multi.c
index f3c95ae362..cbea9be165 100644
--- a/usr/src/uts/common/inet/ip/ip_multi.c
+++ b/usr/src/uts/common/inet/ip/ip_multi.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -68,12 +68,10 @@ static void ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode,
static ilm_t *ilm_add_v6(ipif_t *ipif, const in6_addr_t *group,
ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist,
- int orig_ifindex, zoneid_t zoneid);
+ zoneid_t zoneid);
static void ilm_delete(ilm_t *ilm);
static int ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *group);
static int ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *group);
-static ilg_t *ilg_lookup_ill_index_v6(conn_t *connp,
- const in6_addr_t *v6group, int index);
static ilg_t *ilg_lookup_ipif(conn_t *connp, ipaddr_t group,
ipif_t *ipif);
static int ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif,
@@ -91,25 +89,21 @@ static int ip_opt_delete_group_excl(conn_t *connp, ipaddr_t group,
static int ip_opt_delete_group_excl_v6(conn_t *connp,
const in6_addr_t *v6group, ill_t *ill, mcast_record_t fmode,
const in6_addr_t *v6src);
+static void ill_ilm_walker_hold(ill_t *ill);
+static void ill_ilm_walker_rele(ill_t *ill);
/*
* MT notes:
*
* Multicast joins operate on both the ilg and ilm structures. Multiple
* threads operating on an conn (socket) trying to do multicast joins
- * need to synchronize when operating on the ilg. Multiple threads
+ * need to synchronize when operating on the ilg. Multiple threads
* potentially operating on different conn (socket endpoints) trying to
* do multicast joins could eventually end up trying to manipulate the
- * ilm simulatenously and need to synchronize on the access to the ilm.
- * Both are amenable to standard Solaris MT techniques, but it would be
- * complex to handle a failover or failback which needs to manipulate
- * ilg/ilms if an applications can also simultaenously join/leave
- * multicast groups. Hence multicast join/leave also go through the ipsq_t
+ * ilm simultaneously and need to synchronize access to the ilm. Currently,
+ * this is done by synchronizing join/leave via per-phyint ipsq_t
* serialization.
*
- * Multicast joins and leaves are single-threaded per phyint/IPMP group
- * using the ipsq serialization mechanism.
- *
* An ilm is an IP data structure used to track multicast join/leave.
* An ilm is associated with a <multicast group, ipif> tuple in IPv4 and
* with just <multicast group> in IPv6. ilm_refcnt is the number of ilg's
@@ -211,12 +205,13 @@ conn_ilg_reap(conn_t *connp)
* Returns a pointer to the next available ilg in conn_ilg. Allocs more
* buffers in size of ILG_ALLOC_CHUNK ilgs when needed, and updates conn's
* ilg tracking fields appropriately (conn_ilg_inuse reflects usage of the
- * returned ilg). Returns NULL on failure (ENOMEM).
+ * returned ilg). Returns NULL on failure, in which case `*errp' will be
+ * filled in with the reason.
*
* Assumes connp->conn_lock is held.
*/
static ilg_t *
-conn_ilg_alloc(conn_t *connp)
+conn_ilg_alloc(conn_t *connp, int *errp)
{
ilg_t *new, *ret;
int curcnt;
@@ -224,10 +219,21 @@ conn_ilg_alloc(conn_t *connp)
ASSERT(MUTEX_HELD(&connp->conn_lock));
ASSERT(connp->conn_ilg_inuse <= connp->conn_ilg_allocated);
+ /*
+ * If CONN_CLOSING is set, conn_ilg cleanup has begun and we must not
+ * create any ilgs.
+ */
+ if (connp->conn_state_flags & CONN_CLOSING) {
+ *errp = EINVAL;
+ return (NULL);
+ }
+
if (connp->conn_ilg == NULL) {
connp->conn_ilg = GETSTRUCT(ilg_t, ILG_ALLOC_CHUNK);
- if (connp->conn_ilg == NULL)
+ if (connp->conn_ilg == NULL) {
+ *errp = ENOMEM;
return (NULL);
+ }
connp->conn_ilg_allocated = ILG_ALLOC_CHUNK;
connp->conn_ilg_inuse = 0;
}
@@ -241,12 +247,15 @@ conn_ilg_alloc(conn_t *connp)
* ilg_delete_all() will have to be changed when
* this logic is changed.
*/
+ *errp = EBUSY;
return (NULL);
}
curcnt = connp->conn_ilg_allocated;
new = GETSTRUCT(ilg_t, curcnt + ILG_ALLOC_CHUNK);
- if (new == NULL)
+ if (new == NULL) {
+ *errp = ENOMEM;
return (NULL);
+ }
bcopy(connp->conn_ilg, new, sizeof (ilg_t) * curcnt);
mi_free((char *)connp->conn_ilg);
connp->conn_ilg = new;
@@ -378,42 +387,6 @@ ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, slist_t *flist)
}
}
-/*
- * If the given interface has failed, choose a new one to join on so
- * that we continue to receive packets. ilg_orig_ifindex remembers
- * what the application used to join on so that we know the ilg to
- * delete even though we change the ill here. Callers will store the
- * ilg returned from this function in ilg_ill. Thus when we receive
- * a packet on ilg_ill, conn_wantpacket_v6 will deliver the packets.
- *
- * This function must be called as writer so we can walk the group
- * list and examine flags without holding a lock.
- */
-ill_t *
-ip_choose_multi_ill(ill_t *ill, const in6_addr_t *grp)
-{
- ill_t *till;
- ill_group_t *illgrp = ill->ill_group;
-
- ASSERT(IAM_WRITER_ILL(ill));
-
- if (IN6_IS_ADDR_UNSPECIFIED(grp) || illgrp == NULL)
- return (ill);
-
- if ((ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE)) == 0)
- return (ill);
-
- till = illgrp->illgrp_ill;
- while (till != NULL &&
- (till->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE))) {
- till = till->ill_group_next;
- }
- if (till != NULL)
- return (till);
-
- return (ill);
-}
-
static int
ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist,
boolean_t isv6)
@@ -560,8 +533,7 @@ ilm_update_del(ilm_t *ilm, boolean_t isv6)
}
/*
- * INADDR_ANY means all multicast addresses. This is only used
- * by the multicast router.
+ * INADDR_ANY means all multicast addresses.
* INADDR_ANY is stored as IPv6 unspecified addr.
*/
int
@@ -578,40 +550,31 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat,
if (!CLASSD(group) && group != INADDR_ANY)
return (EINVAL);
+ if (IS_UNDER_IPMP(ill))
+ return (EINVAL);
+
/*
- * INADDR_ANY is represented as the IPv6 unspecifed addr.
+ * INADDR_ANY is represented as the IPv6 unspecified addr.
*/
if (group == INADDR_ANY)
v6group = ipv6_all_zeros;
else
IN6_IPADDR_TO_V4MAPPED(group, &v6group);
- mutex_enter(&ill->ill_lock);
ilm = ilm_lookup_ipif(ipif, group);
- mutex_exit(&ill->ill_lock);
/*
* Since we are writer, we know the ilm_flags itself cannot
* change at this point, and ilm_lookup_ipif would not have
* returned a DELETED ilm. However, the data path can free
- * ilm->next via ilm_walker_cleanup() so we can safely
+ * ilm->ilm_next via ilm_walker_cleanup() so we can safely
* access anything in ilm except ilm_next (for safe access to
- * ilm_next we'd have to take the ill_lock).
+ * ilm_next we'd have to take the ill_lock).
*/
if (ilm != NULL)
return (ilm_update_add(ilm, ilgstat, ilg_flist, B_FALSE));
- /*
- * ilms are associated with ipifs in IPv4. It moves with the
- * ipif if the ipif moves to a new ill when the interface
- * fails. Thus we really don't check whether the ipif_ill
- * has failed like in IPv6. If it has FAILED the ipif
- * will move (daemon will move it) and hence the ilm, if the
- * ipif is not IPIF_NOFAILOVER. For the IPIF_NOFAILOVER ipifs,
- * we continue to receive in the same place even if the
- * interface fails.
- */
ilm = ilm_add_v6(ipif, &v6group, ilgstat, ilg_fmode, ilg_flist,
- ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid);
+ ipif->ipif_zoneid);
if (ilm == NULL)
return (ENOMEM);
@@ -623,10 +586,7 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat,
*/
if (ilm_numentries_v6(ill, &v6group) > 1)
return (0);
- if (ill->ill_group == NULL)
- ret = ill_join_allmulti(ill);
- else
- ret = ill_nominate_mcast_rcv(ill->ill_group);
+ ret = ill_join_allmulti(ill);
if (ret != 0)
ilm_delete(ilm);
return (ret);
@@ -646,12 +606,8 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat,
/*
* The unspecified address means all multicast addresses.
- * This is only used by the multicast router.
*
- * ill identifies the interface to join on; it may not match the
- * interface requested by the application of a failover has taken
- * place. orig_ifindex always identifies the interface requested
- * by the app.
+ * ill identifies the interface to join on.
*
* ilgstat tells us if there's an ilg associated with this join,
* and if so, if it's a new ilg or a change to an existing one.
@@ -659,9 +615,8 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat,
* the ilg (and will be EXCLUDE {NULL} in the case of no ilg).
*/
int
-ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex,
- zoneid_t zoneid, ilg_stat_t ilgstat, mcast_record_t ilg_fmode,
- slist_t *ilg_flist)
+ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid,
+ ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist)
{
ilm_t *ilm;
int ret;
@@ -673,37 +628,20 @@ ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex,
return (EINVAL);
}
+ if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_MC_SOLICITEDNODE(v6group))
+ return (EINVAL);
+
/*
- * An ilm is uniquely identified by the tuple of (group, ill,
- * orig_ill). group is the multicast group address, ill is
- * the interface on which it is currently joined, and orig_ill
- * is the interface on which the application requested the
- * join. orig_ill and ill are the same unless orig_ill has
- * failed over.
- *
- * Both orig_ill and ill are required, which means we may have
- * 2 ilms on an ill for the same group, but with different
- * orig_ills. These must be kept separate, so that when failback
- * occurs, the appropriate ilms are moved back to their orig_ill
- * without disrupting memberships on the ill to which they had
- * been moved.
- *
- * In order to track orig_ill, we store orig_ifindex in the
- * ilm and ilg.
+ * An ilm is uniquely identified by the tuple of (group, ill) where
+ * `group' is the multicast group address, and `ill' is the interface
+ * on which it is currently joined.
*/
- mutex_enter(&ill->ill_lock);
- ilm = ilm_lookup_ill_index_v6(ill, v6group, orig_ifindex, zoneid);
- mutex_exit(&ill->ill_lock);
+ ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid);
if (ilm != NULL)
return (ilm_update_add(ilm, ilgstat, ilg_flist, B_TRUE));
- /*
- * We need to remember where the application really wanted
- * to join. This will be used later if we want to failback
- * to the original interface.
- */
ilm = ilm_add_v6(ill->ill_ipif, v6group, ilgstat, ilg_fmode,
- ilg_flist, orig_ifindex, zoneid);
+ ilg_flist, zoneid);
if (ilm == NULL)
return (ENOMEM);
@@ -715,11 +653,7 @@ ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex,
*/
if (ilm_numentries_v6(ill, v6group) > 1)
return (0);
- if (ill->ill_group == NULL)
- ret = ill_join_allmulti(ill);
- else
- ret = ill_nominate_mcast_rcv(ill->ill_group);
-
+ ret = ill_join_allmulti(ill);
if (ret != 0)
ilm_delete(ilm);
return (ret);
@@ -756,6 +690,14 @@ ip_ll_send_enabmulti_req(ill_t *ill, const in6_addr_t *v6groupp)
ASSERT(IAM_WRITER_ILL(ill));
/*
+ * If we're on the IPMP ill, use the nominated multicast interface to
+ * send and receive DLPI messages, if one exists. (If none exists,
+ * there are no usable interfaces and thus nothing to do.)
+ */
+ if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
+ return (0);
+
+ /*
* Create a AR_ENTRY_SQUERY message with a dl_enabmulti_req tacked
* on.
*/
@@ -842,9 +784,8 @@ ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *v6groupp)
}
/*
- * INADDR_ANY means all multicast addresses. This is only used
- * by the multicast router.
- * INADDR_ANY is stored as the IPv6 unspecifed addr.
+ * INADDR_ANY means all multicast addresses.
+ * INADDR_ANY is stored as the IPv6 unspecified addr.
*/
int
ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving)
@@ -859,7 +800,7 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving)
return (EINVAL);
/*
- * INADDR_ANY is represented as the IPv6 unspecifed addr.
+ * INADDR_ANY is represented as the IPv6 unspecified addr.
*/
if (group == INADDR_ANY)
v6group = ipv6_all_zeros;
@@ -870,9 +811,7 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving)
* Look for a match on the ipif.
* (IP_DROP_MEMBERSHIP specifies an ipif using an IP address).
*/
- mutex_enter(&ill->ill_lock);
ilm = ilm_lookup_ipif(ipif, group);
- mutex_exit(&ill->ill_lock);
if (ilm == NULL)
return (ENOENT);
@@ -897,11 +836,9 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving)
return (0);
/* If we never joined, then don't leave. */
- if (ill->ill_join_allmulti) {
+ if (ill->ill_join_allmulti)
ill_leave_allmulti(ill);
- if (ill->ill_group != NULL)
- (void) ill_nominate_mcast_rcv(ill->ill_group);
- }
+
return (0);
}
@@ -921,11 +858,10 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving)
/*
* The unspecified address means all multicast addresses.
- * This is only used by the multicast router.
*/
int
-ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex,
- zoneid_t zoneid, boolean_t no_ilg, boolean_t leaving)
+ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid,
+ boolean_t no_ilg, boolean_t leaving)
{
ipif_t *ipif;
ilm_t *ilm;
@@ -938,25 +874,8 @@ ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex,
/*
* Look for a match on the ill.
- * (IPV6_LEAVE_GROUP specifies an ill using an ifindex).
- *
- * Similar to ip_addmulti_v6, we should always look using
- * the orig_ifindex.
- *
- * 1) If orig_ifindex is different from ill's ifindex
- * we should have an ilm with orig_ifindex created in
- * ip_addmulti_v6. We should delete that here.
- *
- * 2) If orig_ifindex is same as ill's ifindex, we should
- * not delete the ilm that is temporarily here because of
- * a FAILOVER. Those ilms will have a ilm_orig_ifindex
- * different from ill's ifindex.
- *
- * Thus, always lookup using orig_ifindex.
*/
- mutex_enter(&ill->ill_lock);
- ilm = ilm_lookup_ill_index_v6(ill, v6group, orig_ifindex, zoneid);
- mutex_exit(&ill->ill_lock);
+ ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid);
if (ilm == NULL)
return (ENOENT);
@@ -985,11 +904,9 @@ ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex,
return (0);
/* If we never joined, then don't leave. */
- if (ill->ill_join_allmulti) {
+ if (ill->ill_join_allmulti)
ill_leave_allmulti(ill);
- if (ill->ill_group != NULL)
- (void) ill_nominate_mcast_rcv(ill->ill_group);
- }
+
return (0);
}
@@ -1020,6 +937,13 @@ ip_ll_send_disabmulti_req(ill_t *ill, const in6_addr_t *v6groupp)
uint32_t addrlen, addroff;
ASSERT(IAM_WRITER_ILL(ill));
+
+ /*
+ * See comment in ip_ll_send_enabmulti_req().
+ */
+ if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
+ return (0);
+
/*
* Create a AR_ENTRY_SQUERY message with a dl_disabmulti_req tacked
* on.
@@ -1099,16 +1023,16 @@ ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *v6group)
}
/*
- * Make the driver pass up all multicast packets
- *
- * With ill groups, the caller makes sure that there is only
- * one ill joining the allmulti group.
+ * Make the driver pass up all multicast packets. NOTE: to keep callers
+ * IPMP-unaware, if an IPMP ill is passed in, the ill_join_allmulti flag is
+ * set on it (rather than the cast ill).
*/
int
ill_join_allmulti(ill_t *ill)
{
mblk_t *promiscon_mp, *promiscoff_mp;
uint32_t addrlen, addroff;
+ ill_t *join_ill = ill;
ASSERT(IAM_WRITER_ILL(ill));
@@ -1120,7 +1044,13 @@ ill_join_allmulti(ill_t *ill)
return (0);
}
- ASSERT(!ill->ill_join_allmulti);
+ /*
+ * See comment in ip_ll_send_enabmulti_req().
+ */
+ if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
+ return (0);
+
+ ASSERT(!join_ill->ill_join_allmulti);
/*
* Create a DL_PROMISCON_REQ message and send it directly to the DLPI
@@ -1144,20 +1074,18 @@ ill_join_allmulti(ill_t *ill)
ill_dlpi_send(ill, promiscon_mp);
}
- ill->ill_join_allmulti = B_TRUE;
+ join_ill->ill_join_allmulti = B_TRUE;
return (0);
}
/*
* Make the driver stop passing up all multicast packets
- *
- * With ill groups, we need to nominate some other ill as
- * this ipif->ipif_ill is leaving the group.
*/
void
ill_leave_allmulti(ill_t *ill)
{
- mblk_t *promiscoff_mp = ill->ill_promiscoff_mp;
+ mblk_t *promiscoff_mp;
+ ill_t *leave_ill = ill;
ASSERT(IAM_WRITER_ILL(ill));
@@ -1169,7 +1097,13 @@ ill_leave_allmulti(ill_t *ill)
return;
}
- ASSERT(ill->ill_join_allmulti);
+ /*
+ * See comment in ip_ll_send_enabmulti_req().
+ */
+ if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
+ return;
+
+ ASSERT(leave_ill->ill_join_allmulti);
/*
* Create a DL_PROMISCOFF_REQ message and send it directly to
@@ -1179,12 +1113,13 @@ ill_leave_allmulti(ill_t *ill)
*/
if ((ill->ill_net_type == IRE_IF_RESOLVER) &&
!(ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST)) {
+ promiscoff_mp = ill->ill_promiscoff_mp;
ASSERT(promiscoff_mp != NULL);
ill->ill_promiscoff_mp = NULL;
ill_dlpi_send(ill, promiscoff_mp);
}
- ill->ill_join_allmulti = B_FALSE;
+ leave_ill->ill_join_allmulti = B_FALSE;
}
static ill_t *
@@ -1213,22 +1148,35 @@ int
ip_join_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst)
{
ill_t *ill;
- int ret;
+ int ret = 0;
if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL)
return (ENODEV);
+
+ /*
+ * The ip_addmulti*() functions won't allow IPMP underlying interfaces
+ * to join allmulti since only the nominated underlying interface in
+ * the group should receive multicast. We silently succeed to avoid
+ * having to teach IPobs (currently the only caller of this routine)
+ * to ignore failures in this case.
+ */
+ if (IS_UNDER_IPMP(ill))
+ goto out;
+
if (isv6) {
- ret = ip_addmulti_v6(&ipv6_all_zeros, ill, ifindex,
- ill->ill_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
+ ret = ip_addmulti_v6(&ipv6_all_zeros, ill, ill->ill_zoneid,
+ ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
} else {
ret = ip_addmulti(INADDR_ANY, ill->ill_ipif, ILGSTAT_NONE,
MODE_IS_EXCLUDE, NULL);
}
ill->ill_ipallmulti_cnt++;
+out:
ipsq_exit(ill->ill_phyint->phyint_ipsq);
return (ret);
}
+
int
ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst)
{
@@ -1236,14 +1184,17 @@ ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst)
if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL)
return (ENODEV);
- ASSERT(ill->ill_ipallmulti_cnt != 0);
- if (isv6) {
- (void) ip_delmulti_v6(&ipv6_all_zeros, ill, ifindex,
- ill->ill_zoneid, B_TRUE, B_TRUE);
- } else {
- (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, B_TRUE);
+
+ if (ill->ill_ipallmulti_cnt > 0) {
+ if (isv6) {
+ (void) ip_delmulti_v6(&ipv6_all_zeros, ill,
+ ill->ill_zoneid, B_TRUE, B_TRUE);
+ } else {
+ (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE,
+ B_TRUE);
+ }
+ ill->ill_ipallmulti_cnt--;
}
- ill->ill_ipallmulti_cnt--;
ipsq_exit(ill->ill_phyint->phyint_ipsq);
return (0);
}
@@ -1260,8 +1211,7 @@ ip_purge_allmulti(ill_t *ill)
for (; ill->ill_ipallmulti_cnt > 0; ill->ill_ipallmulti_cnt--) {
if (ill->ill_isv6) {
(void) ip_delmulti_v6(&ipv6_all_zeros, ill,
- ill->ill_phyint->phyint_ifindex, ill->ill_zoneid,
- B_TRUE, B_TRUE);
+ ill->ill_zoneid, B_TRUE, B_TRUE);
} else {
(void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE,
B_TRUE);
@@ -1539,13 +1489,14 @@ void
ill_recover_multicast(ill_t *ill)
{
ilm_t *ilm;
+ ipif_t *ipif = ill->ill_ipif;
char addrbuf[INET6_ADDRSTRLEN];
ASSERT(IAM_WRITER_ILL(ill));
ill->ill_need_recover_multicast = 0;
- ILM_WALKER_HOLD(ill);
+ ill_ilm_walker_hold(ill);
for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
/*
* Check how many ipif's that have members in this group -
@@ -1553,47 +1504,45 @@ ill_recover_multicast(ill_t *ill)
* in the list.
*/
if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 &&
- ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm)
+ ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE,
+ ALL_ZONES) != ilm) {
continue;
- ip1dbg(("ill_recover_multicast: %s\n",
- inet_ntop(AF_INET6, &ilm->ilm_v6addr, addrbuf,
- sizeof (addrbuf))));
+ }
+
+ ip1dbg(("ill_recover_multicast: %s\n", inet_ntop(AF_INET6,
+ &ilm->ilm_v6addr, addrbuf, sizeof (addrbuf))));
+
if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
- if (ill->ill_group == NULL) {
- (void) ill_join_allmulti(ill);
- } else {
- /*
- * We don't want to join on this ill,
- * if somebody else in the group has
- * already been nominated.
- */
- (void) ill_nominate_mcast_rcv(ill->ill_group);
- }
+ (void) ill_join_allmulti(ill);
} else {
- (void) ip_ll_addmulti_v6(ill->ill_ipif,
- &ilm->ilm_v6addr);
+ if (ill->ill_isv6)
+ mld_joingroup(ilm);
+ else
+ igmp_joingroup(ilm);
+
+ (void) ip_ll_addmulti_v6(ipif, &ilm->ilm_v6addr);
}
}
- ILM_WALKER_RELE(ill);
+ ill_ilm_walker_rele(ill);
+
}
/*
* The opposite of ill_recover_multicast() -- leaves all multicast groups
- * that were explicitly joined. Note that both these functions could be
- * disposed of if we enhanced ARP to allow us to handle DL_DISABMULTI_REQ
- * and DL_ENABMULTI_REQ messages when an interface is down.
+ * that were explicitly joined.
*/
void
ill_leave_multicast(ill_t *ill)
{
ilm_t *ilm;
+ ipif_t *ipif = ill->ill_ipif;
char addrbuf[INET6_ADDRSTRLEN];
ASSERT(IAM_WRITER_ILL(ill));
ill->ill_need_recover_multicast = 1;
- ILM_WALKER_HOLD(ill);
+ ill_ilm_walker_hold(ill);
for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
/*
* Check how many ipif's that have members in this group -
@@ -1601,25 +1550,26 @@ ill_leave_multicast(ill_t *ill)
* in the list.
*/
if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 &&
- ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm)
+ ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE,
+ ALL_ZONES) != ilm) {
continue;
- ip1dbg(("ill_leave_multicast: %s\n",
- inet_ntop(AF_INET6, &ilm->ilm_v6addr, addrbuf,
- sizeof (addrbuf))));
+ }
+
+ ip1dbg(("ill_leave_multicast: %s\n", inet_ntop(AF_INET6,
+ &ilm->ilm_v6addr, addrbuf, sizeof (addrbuf))));
+
if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
ill_leave_allmulti(ill);
- /*
- * If we were part of an IPMP group, then
- * ill_handoff_responsibility() has already
- * nominated a new member (so we don't).
- */
- ASSERT(ill->ill_group == NULL);
} else {
- (void) ip_ll_delmulti_v6(ill->ill_ipif,
- &ilm->ilm_v6addr);
+ if (ill->ill_isv6)
+ mld_leavegroup(ilm);
+ else
+ igmp_leavegroup(ilm);
+
+ (void) ip_ll_delmulti_v6(ipif, &ilm->ilm_v6addr);
}
}
- ILM_WALKER_RELE(ill);
+ ill_ilm_walker_rele(ill);
}
/* Find an ilm for matching the ill */
@@ -1628,91 +1578,79 @@ ilm_lookup_ill(ill_t *ill, ipaddr_t group, zoneid_t zoneid)
{
in6_addr_t v6group;
- ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock));
/*
- * INADDR_ANY is represented as the IPv6 unspecifed addr.
+ * INADDR_ANY is represented as the IPv6 unspecified addr.
*/
if (group == INADDR_ANY)
v6group = ipv6_all_zeros;
else
IN6_IPADDR_TO_V4MAPPED(group, &v6group);
- return (ilm_lookup_ill_v6(ill, &v6group, zoneid));
+ return (ilm_lookup_ill_v6(ill, &v6group, B_TRUE, zoneid));
}
/*
- * Find an ilm for matching the ill. All the ilm lookup functions
- * ignore ILM_DELETED ilms. These have been logically deleted, and
- * igmp and linklayer disable multicast have been done. Only mi_free
- * yet to be done. Still there in the list due to ilm_walkers. The
- * last walker will release it.
+ * Find an ilm for address `v6group' on `ill' and zone `zoneid' (which may be
+ * ALL_ZONES). In general, if `ill' is in an IPMP group, we will match
+ * against any ill in the group. However, if `restrict_solicited' is set,
+ * then specifically for IPv6 solicited-node multicast, the match will be
+ * restricted to the specified `ill'.
*/
ilm_t *
-ilm_lookup_ill_v6(ill_t *ill, const in6_addr_t *v6group, zoneid_t zoneid)
+ilm_lookup_ill_v6(ill_t *ill, const in6_addr_t *v6group,
+ boolean_t restrict_solicited, zoneid_t zoneid)
{
ilm_t *ilm;
+ ilm_walker_t ilw;
+ boolean_t restrict_ill = B_FALSE;
- ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock));
+ /*
+ * In general, underlying interfaces cannot have multicast memberships
+ * and thus lookups always match across the illgrp. However, we must
+ * allow IPv6 solicited-node multicast memberships on underlying
+ * interfaces, and thus an IPMP meta-interface and one of its
+ * underlying ills may have the same solicited-node multicast address.
+ * In that case, we need to restrict the lookup to the requested ill.
+ * However, we may receive packets on an underlying interface that
+ * are for the corresponding IPMP interface's solicited-node multicast
+ * address, and thus in that case we need to match across the group --
+ * hence the unfortunate `restrict_solicited' argument.
+ */
+ if (IN6_IS_ADDR_MC_SOLICITEDNODE(v6group) && restrict_solicited)
+ restrict_ill = (IS_IPMP(ill) || IS_UNDER_IPMP(ill));
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
- if (ilm->ilm_flags & ILM_DELETED)
+ ilm = ilm_walker_start(&ilw, ill);
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
+ if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group))
continue;
- if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) &&
- (zoneid == ALL_ZONES || zoneid == ilm->ilm_zoneid))
- return (ilm);
- }
- return (NULL);
-}
-
-ilm_t *
-ilm_lookup_ill_index_v6(ill_t *ill, const in6_addr_t *v6group, int index,
- zoneid_t zoneid)
-{
- ilm_t *ilm;
-
- ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock));
-
- for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
- if (ilm->ilm_flags & ILM_DELETED)
+ if (zoneid != ALL_ZONES && zoneid != ilm->ilm_zoneid)
continue;
- if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) &&
- (zoneid == ALL_ZONES || zoneid == ilm->ilm_zoneid) &&
- ilm->ilm_orig_ifindex == index) {
- return (ilm);
+ if (!restrict_ill || ill == (ill->ill_isv6 ?
+ ilm->ilm_ill : ilm->ilm_ipif->ipif_ill)) {
+ break;
}
}
- return (NULL);
+ ilm_walker_finish(&ilw);
+ return (ilm);
}
-
/*
- * Found an ilm for the ipif. Only needed for IPv4 which does
+ * Find an ilm for the ipif. Only needed for IPv4 which does
* ipif specific socket options.
*/
ilm_t *
ilm_lookup_ipif(ipif_t *ipif, ipaddr_t group)
{
- ill_t *ill = ipif->ipif_ill;
- ilm_t *ilm;
- in6_addr_t v6group;
-
- ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock));
- /*
- * INADDR_ANY is represented as the IPv6 unspecifed addr.
- */
- if (group == INADDR_ANY)
- v6group = ipv6_all_zeros;
- else
- IN6_IPADDR_TO_V4MAPPED(group, &v6group);
+ ilm_t *ilm;
+ ilm_walker_t ilw;
- for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
- if (ilm->ilm_flags & ILM_DELETED)
- continue;
- if (ilm->ilm_ipif == ipif &&
- IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, &v6group))
- return (ilm);
+ ilm = ilm_walker_start(&ilw, ipif->ipif_ill);
+ for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
+ if (ilm->ilm_ipif == ipif && ilm->ilm_addr == group)
+ break;
}
- return (NULL);
+ ilm_walker_finish(&ilw);
+ return (ilm);
}
/*
@@ -1739,8 +1677,7 @@ ilm_numentries_v6(ill_t *ill, const in6_addr_t *v6group)
/* Caller guarantees that the group is not already on the list */
static ilm_t *
ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat,
- mcast_record_t ilg_fmode, slist_t *ilg_flist, int orig_ifindex,
- zoneid_t zoneid)
+ mcast_record_t ilg_fmode, slist_t *ilg_flist, zoneid_t zoneid)
{
ill_t *ill = ipif->ipif_ill;
ilm_t *ilm;
@@ -1783,19 +1720,10 @@ ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat,
(char *), "ilm", (void *), ilm);
ipif->ipif_ilm_cnt++;
}
+
ASSERT(ill->ill_ipst);
ilm->ilm_ipst = ill->ill_ipst; /* No netstack_hold */
- /*
- * After this if ilm moves to a new ill, we don't change
- * the ilm_orig_ifindex. Thus, if ill_index != ilm_orig_ifindex,
- * it has been moved. Indexes don't match even when the application
- * wants to join on a FAILED/INACTIVE interface because we choose
- * a new interface to join in. This is considered as an implicit
- * move.
- */
- ilm->ilm_orig_ifindex = orig_ifindex;
-
ASSERT(!(ipif->ipif_state_flags & IPIF_CONDEMNED));
ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED));
@@ -1969,6 +1897,108 @@ ilm_delete(ilm_t *ilm)
}
}
+/*
+ * Increment the ILM walker count for `ill'.  The count is modified with
+ * ill_lock held.  ill_ilm_walker_rele() is the matching decrement.
+ */
+static void
+ill_ilm_walker_hold(ill_t *ill)
+{
+ mutex_enter(&ill->ill_lock);
+ ill->ill_ilm_walker_cnt++;
+ mutex_exit(&ill->ill_lock);
+}
+
+/*
+ * Decrement the ILM walker count for `ill'.  If this was the last walker
+ * (the count reaches zero) and ill_ilm_cleanup_reqd is set, the deferred
+ * cleanup is run via ilm_walker_cleanup(), which is entered with ill_lock
+ * held and drops it; otherwise ill_lock is dropped here.
+ */
+static void
+ill_ilm_walker_rele(ill_t *ill)
+{
+ mutex_enter(&ill->ill_lock);
+ ill->ill_ilm_walker_cnt--;
+ if (ill->ill_ilm_walker_cnt == 0 && ill->ill_ilm_cleanup_reqd)
+ ilm_walker_cleanup(ill); /* drops ill_lock */
+ else
+ mutex_exit(&ill->ill_lock);
+}
+
+/*
+ * Start walking the ILMs associated with `ill'; the first ILM in the walk
+ * (if any) is returned. State associated with the walk is stored in `ilw'.
+ * Note that walks associated with interfaces under IPMP also walk the ILMs
+ * on the associated IPMP interface; this is handled transparently to callers
+ * via ilm_walker_step(). (Usually with IPMP all ILMs will be on the IPMP
+ * interface; the only exception is to support IPv6 test addresses, which
+ * require ILMs for their associated solicited-node multicast addresses.)
+ *
+ * A walker hold is taken on `ill' and, when `ill' is under IPMP and
+ * ipmp_ill_hold_ipmp_ill() returns an IPMP ill, on that IPMP ill as well.
+ * The walk begins on the IPMP ill when it has any ILMs, and on `ill'
+ * otherwise.  Every call must be paired with ilm_walker_finish(), which
+ * drops these holds.
+ */
+ilm_t *
+ilm_walker_start(ilm_walker_t *ilw, ill_t *ill)
+{
+ ilw->ilw_ill = ill;
+ if (IS_UNDER_IPMP(ill))
+ ilw->ilw_ipmp_ill = ipmp_ill_hold_ipmp_ill(ill);
+ else
+ ilw->ilw_ipmp_ill = NULL;
+
+ ill_ilm_walker_hold(ill);
+ if (ilw->ilw_ipmp_ill != NULL)
+ ill_ilm_walker_hold(ilw->ilw_ipmp_ill);
+
+ if (ilw->ilw_ipmp_ill != NULL && ilw->ilw_ipmp_ill->ill_ilm != NULL)
+ ilw->ilw_walk_ill = ilw->ilw_ipmp_ill;
+ else
+ ilw->ilw_walk_ill = ilw->ilw_ill;
+
+ return (ilm_walker_step(ilw, NULL));
+}
+
+/*
+ * Helper function for ilm_walker_step() that returns the next ILM
+ * associated with `ilw', regardless of whether it's deleted.
+ *
+ * A NULL `ilm' returns the head of the list on the ill currently being
+ * walked.  Otherwise the next ILM on that list is returned; when the list
+ * on the IPMP ill is exhausted, the walk moves on to the list of the ill
+ * the walk was started on.  Returns NULL when there is nothing left.
+ */
+static ilm_t *
+ilm_walker_step_all(ilm_walker_t *ilw, ilm_t *ilm)
+{
+ if (ilm == NULL)
+ return (ilw->ilw_walk_ill->ill_ilm);
+
+ if (ilm->ilm_next != NULL)
+ return (ilm->ilm_next);
+
+ if (ilw->ilw_ipmp_ill != NULL && IS_IPMP(ilw->ilw_walk_ill)) {
+ ilw->ilw_walk_ill = ilw->ilw_ill;
+ /*
+ * It's possible that ilw_ill left the group during our walk,
+ * so we can't ASSERT() that it's under IPMP. Callers that
+ * care will be writer on the IPSQ anyway.
+ */
+ return (ilw->ilw_walk_ill->ill_ilm);
+ }
+ return (NULL);
+}
+
+/*
+ * Step to the next ILM associated with `ilw', skipping any ILMs flagged
+ * ILM_DELETED.  Passing a NULL `ilm' yields the first such ILM of the walk.
+ * Returns NULL once the walk is exhausted.
+ */
+ilm_t *
+ilm_walker_step(ilm_walker_t *ilw, ilm_t *ilm)
+{
+ while ((ilm = ilm_walker_step_all(ilw, ilm)) != NULL) {
+ if (!(ilm->ilm_flags & ILM_DELETED))
+ break;
+ }
+ return (ilm);
+}
+
+/*
+ * Finish the ILM walk associated with `ilw': drop the walker holds taken by
+ * ilm_walker_start() (plus the reference on the IPMP ill, if one was held)
+ * and clear the walker state so a stale walker cannot be reused by mistake.
+ */
+void
+ilm_walker_finish(ilm_walker_t *ilw)
+{
+ ill_ilm_walker_rele(ilw->ilw_ill);
+ if (ilw->ilw_ipmp_ill != NULL) {
+ ill_ilm_walker_rele(ilw->ilw_ipmp_ill);
+ ill_refrele(ilw->ilw_ipmp_ill);
+ }
+ bzero(ilw, sizeof (*ilw)); /* zero the walker, not the local pointer */
+}
/*
* Looks up the appropriate ipif given a v4 multicast group and interface
@@ -2256,16 +2286,15 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf,
* didn't find an ilg, there's nothing to do.
*/
if (!leave_grp)
- ilg = conn_ilg_alloc(connp);
+ ilg = conn_ilg_alloc(connp, &err);
if (leave_grp || ilg == NULL) {
mutex_exit(&connp->conn_lock);
- return (leave_grp ? 0 : ENOMEM);
+ return (leave_grp ? 0 : err);
}
ilgstat = ILGSTAT_NEW;
IN6_IPADDR_TO_V4MAPPED(grp, &ilg->ilg_v6group);
ilg->ilg_ipif = ipif;
ilg->ilg_ill = NULL;
- ilg->ilg_orig_ifindex = 0;
} else if (leave_grp) {
ilg_delete(connp, ilg, NULL);
mutex_exit(&connp->conn_lock);
@@ -2389,7 +2418,7 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf,
const struct in6_addr *grp, ill_t *ill)
{
ilg_t *ilg;
- int i, orig_ifindex, orig_fmode, new_fmode, err;
+ int i, orig_fmode, new_fmode, err;
slist_t *orig_filter = NULL;
slist_t *new_filter = NULL;
struct sockaddr_storage *sl;
@@ -2409,65 +2438,31 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf,
ASSERT(IAM_WRITER_ILL(ill));
- /*
- * Use the ifindex to do the lookup. We can't use the ill
- * directly because ilg_ill could point to a different ill
- * if things have moved.
- */
- orig_ifindex = ill->ill_phyint->phyint_ifindex;
-
mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ill_index_v6(connp, grp, orig_ifindex);
+ ilg = ilg_lookup_ill_v6(connp, grp, ill);
if (ilg == NULL) {
/*
* if the request was actually to leave, and we
* didn't find an ilg, there's nothing to do.
*/
if (!leave_grp)
- ilg = conn_ilg_alloc(connp);
+ ilg = conn_ilg_alloc(connp, &err);
if (leave_grp || ilg == NULL) {
mutex_exit(&connp->conn_lock);
- return (leave_grp ? 0 : ENOMEM);
+ return (leave_grp ? 0 : err);
}
ilgstat = ILGSTAT_NEW;
ilg->ilg_v6group = *grp;
ilg->ilg_ipif = NULL;
- /*
- * Choose our target ill to join on. This might be
- * different from the ill we've been given if it's
- * currently down and part of a group.
- *
- * new ill is not refheld; we are writer.
- */
- ill = ip_choose_multi_ill(ill, grp);
- ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED));
ilg->ilg_ill = ill;
- /*
- * Remember the index that we joined on, so that we can
- * successfully delete them later on and also search for
- * duplicates if the application wants to join again.
- */
- ilg->ilg_orig_ifindex = orig_ifindex;
} else if (leave_grp) {
- /*
- * Use the ilg's current ill for the deletion,
- * we might have failed over.
- */
- ill = ilg->ilg_ill;
ilg_delete(connp, ilg, NULL);
mutex_exit(&connp->conn_lock);
- (void) ip_delmulti_v6(grp, ill, orig_ifindex,
- connp->conn_zoneid, B_FALSE, B_TRUE);
+ (void) ip_delmulti_v6(grp, ill, connp->conn_zoneid, B_FALSE,
+ B_TRUE);
return (0);
} else {
ilgstat = ILGSTAT_CHANGE;
- /*
- * The current ill might be different from the one we were
- * asked to join on (if failover has occurred); we should
- * join on the ill stored in the ilg. The original ill
- * is noted in ilg_orig_ifindex, which matched our request.
- */
- ill = ilg->ilg_ill;
/* preserve existing state in case ip_addmulti() fails */
orig_fmode = ilg->ilg_fmode;
if (ilg->ilg_filter == NULL) {
@@ -2531,8 +2526,8 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf,
mutex_exit(&connp->conn_lock);
- err = ip_addmulti_v6(grp, ill, orig_ifindex, connp->conn_zoneid,
- ilgstat, new_fmode, new_filter);
+ err = ip_addmulti_v6(grp, ill, connp->conn_zoneid, ilgstat, new_fmode,
+ new_filter);
if (err != 0) {
/*
* Restore the original filter state, or delete the
@@ -2541,7 +2536,7 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf,
* conn_lock.
*/
mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ill_index_v6(connp, grp, orig_ifindex);
+ ilg = ilg_lookup_ill_v6(connp, grp, ill);
ASSERT(ilg != NULL);
if (ilgstat == ILGSTAT_NEW) {
ilg_delete(connp, ilg, NULL);
@@ -3043,20 +3038,12 @@ ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group,
ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src)
{
ilg_t *ilg;
- ill_t *ilg_ill;
- uint_t ilg_orig_ifindex;
boolean_t leaving = B_TRUE;
ASSERT(IAM_WRITER_ILL(ill));
- /*
- * Use the index that we originally used to join. We can't
- * use the ill directly because ilg_ill could point to
- * a new ill if things have moved.
- */
mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ill_index_v6(connp, v6group,
- ill->ill_phyint->phyint_ifindex);
+ ilg = ilg_lookup_ill_v6(connp, v6group, ill);
if ((ilg == NULL) || (ilg->ilg_flags & ILG_DELETED)) {
mutex_exit(&connp->conn_lock);
return (EADDRNOTAVAIL);
@@ -3087,12 +3074,10 @@ ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group,
leaving = B_FALSE;
}
- ilg_ill = ilg->ilg_ill;
- ilg_orig_ifindex = ilg->ilg_orig_ifindex;
ilg_delete(connp, ilg, v6src);
mutex_exit(&connp->conn_lock);
- (void) ip_delmulti_v6(v6group, ilg_ill, ilg_orig_ifindex,
- connp->conn_zoneid, B_FALSE, leaving);
+ (void) ip_delmulti_v6(v6group, ill, connp->conn_zoneid, B_FALSE,
+ leaving);
return (0);
}
@@ -3345,10 +3330,10 @@ ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, mcast_record_t fmode,
if (ilg == NULL) {
ilgstat = ILGSTAT_NEW;
- if ((ilg = conn_ilg_alloc(connp)) == NULL) {
+ if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) {
mutex_exit(&connp->conn_lock);
l_free(new_filter);
- return (ENOMEM);
+ return (error);
}
if (src != INADDR_ANY) {
ilg->ilg_filter = l_alloc();
@@ -3369,7 +3354,6 @@ ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, mcast_record_t fmode,
}
ilg->ilg_ipif = ipif;
ilg->ilg_ill = NULL;
- ilg->ilg_orig_ifindex = 0;
ilg->ilg_fmode = fmode;
} else {
int index;
@@ -3437,7 +3421,6 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
mcast_record_t fmode, const in6_addr_t *v6src)
{
int error = 0;
- int orig_ifindex;
ilg_t *ilg;
ilg_stat_t ilgstat;
slist_t *new_filter = NULL;
@@ -3456,13 +3439,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
*/
mutex_enter(&connp->conn_lock);
- /*
- * Use the ifindex to do the lookup. We can't use the ill
- * directly because ilg_ill could point to a different ill if
- * things have moved.
- */
- orig_ifindex = ill->ill_phyint->phyint_ifindex;
- ilg = ilg_lookup_ill_index_v6(connp, v6group, orig_ifindex);
+ ilg = ilg_lookup_ill_v6(connp, v6group, ill);
/*
* Depending on the option we're handling, may or may not be okay
@@ -3501,10 +3478,10 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
}
if (ilg == NULL) {
- if ((ilg = conn_ilg_alloc(connp)) == NULL) {
+ if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) {
mutex_exit(&connp->conn_lock);
l_free(new_filter);
- return (ENOMEM);
+ return (error);
}
if (!IN6_IS_ADDR_UNSPECIFIED(v6src)) {
ilg->ilg_filter = l_alloc();
@@ -3521,22 +3498,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
ilg->ilg_v6group = *v6group;
ilg->ilg_fmode = fmode;
ilg->ilg_ipif = NULL;
- /*
- * Choose our target ill to join on. This might be different
- * from the ill we've been given if it's currently down and
- * part of a group.
- *
- * new ill is not refheld; we are writer.
- */
- ill = ip_choose_multi_ill(ill, v6group);
- ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED));
ilg->ilg_ill = ill;
- /*
- * Remember the orig_ifindex that we joined on, so that we
- * can successfully delete them later on and also search
- * for duplicates if the application wants to join again.
- */
- ilg->ilg_orig_ifindex = orig_ifindex;
} else {
int index;
if (ilg->ilg_fmode != fmode || IN6_IS_ADDR_UNSPECIFIED(v6src)) {
@@ -3560,13 +3522,6 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
ilgstat = ILGSTAT_CHANGE;
index = ilg->ilg_filter->sl_numsrc++;
ilg->ilg_filter->sl_addr[index] = *v6src;
- /*
- * The current ill might be different from the one we were
- * asked to join on (if failover has occurred); we should
- * join on the ill stored in the ilg. The original ill
- * is noted in ilg_orig_ifindex, which matched our request.
- */
- ill = ilg->ilg_ill;
}
/*
@@ -3584,8 +3539,8 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
* info for the ill, which involves looking at the status of
* all the ilgs associated with this group/interface pair.
*/
- error = ip_addmulti_v6(v6group, ill, orig_ifindex, connp->conn_zoneid,
- ilgstat, new_fmode, new_filter);
+ error = ip_addmulti_v6(v6group, ill, connp->conn_zoneid, ilgstat,
+ new_fmode, new_filter);
if (error != 0) {
/*
* But because we waited, we have to undo the ilg update
@@ -3595,7 +3550,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
in6_addr_t delsrc =
(ilgstat == ILGSTAT_NEW) ? ipv6_all_zeros : *v6src;
mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ill_index_v6(connp, v6group, orig_ifindex);
+ ilg = ilg_lookup_ill_v6(connp, v6group, ill);
ASSERT(ilg != NULL);
ilg_delete(connp, ilg, &delsrc);
mutex_exit(&connp->conn_lock);
@@ -3639,7 +3594,7 @@ ilg_lookup_ill_withsrc(conn_t *connp, ipaddr_t group, ipaddr_t src, ill_t *ill)
ASSERT(ilg->ilg_ill == NULL);
ilg_ill = ipif->ipif_ill;
ASSERT(!ilg_ill->ill_isv6);
- if (ilg_ill == ill &&
+ if (IS_ON_SAME_LAN(ilg_ill, ill) &&
IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, &v6group)) {
if (SLIST_IS_EMPTY(ilg->ilg_filter)) {
/* no source filter, so this is a match */
@@ -3692,7 +3647,7 @@ ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group,
continue;
ASSERT(ilg->ilg_ipif == NULL);
ASSERT(ilg_ill->ill_isv6);
- if (ilg_ill == ill &&
+ if (IS_ON_SAME_LAN(ilg_ill, ill) &&
IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) {
if (SLIST_IS_EMPTY(ilg->ilg_filter)) {
/* no source filter, so this is a match */
@@ -3724,35 +3679,6 @@ ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group,
}
/*
- * Get the ilg whose ilg_orig_ifindex is associated with ifindex.
- * This is useful when the interface fails and we have moved
- * to a new ill, but still would like to locate using the index
- * that we originally used to join. Used only for IPv6 currently.
- */
-static ilg_t *
-ilg_lookup_ill_index_v6(conn_t *connp, const in6_addr_t *v6group, int ifindex)
-{
- ilg_t *ilg;
- int i;
-
- ASSERT(MUTEX_HELD(&connp->conn_lock));
- for (i = 0; i < connp->conn_ilg_inuse; i++) {
- ilg = &connp->conn_ilg[i];
- if (ilg->ilg_ill == NULL ||
- (ilg->ilg_flags & ILG_DELETED) != 0)
- continue;
- /* ilg_ipif is NULL for V6 */
- ASSERT(ilg->ilg_ipif == NULL);
- ASSERT(ilg->ilg_orig_ifindex != 0);
- if (IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group) &&
- ilg->ilg_orig_ifindex == ifindex) {
- return (ilg);
- }
- }
- return (NULL);
-}
-
-/*
* Find an IPv6 ilg matching group and ill
*/
ilg_t *
@@ -3863,32 +3789,28 @@ ilg_delete_all(conn_t *connp)
in6_addr_t v6group;
boolean_t success;
ipsq_t *ipsq;
- int orig_ifindex;
mutex_enter(&connp->conn_lock);
retry:
ILG_WALKER_HOLD(connp);
- for (i = connp->conn_ilg_inuse - 1; i >= 0; ) {
+ for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
ilg = &connp->conn_ilg[i];
/*
* Since this walk is not atomic (we drop the
* conn_lock and wait in ipsq_enter) we need
* to check for the ILG_DELETED flag.
*/
- if (ilg->ilg_flags & ILG_DELETED) {
- /* Go to the next ilg */
- i--;
+ if (ilg->ilg_flags & ILG_DELETED)
continue;
- }
- v6group = ilg->ilg_v6group;
- if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
+ if (IN6_IS_ADDR_V4MAPPED(&ilg->ilg_v6group)) {
ipif = ilg->ilg_ipif;
ill = ipif->ipif_ill;
} else {
ipif = NULL;
ill = ilg->ilg_ill;
}
+
/*
* We may not be able to refhold the ill if the ill/ipif
* is changing. But we need to make sure that the ill will
@@ -3897,11 +3819,9 @@ retry:
* in which case the unplumb thread will handle the cleanup,
* and we move on to the next ilg.
*/
- if (!ill_waiter_inc(ill)) {
- /* Go to the next ilg */
- i--;
+ if (!ill_waiter_inc(ill))
continue;
- }
+
mutex_exit(&connp->conn_lock);
/*
* To prevent deadlock between ill close which waits inside
@@ -3916,51 +3836,31 @@ retry:
ipsq = ill->ill_phyint->phyint_ipsq;
ill_waiter_dcr(ill);
mutex_enter(&connp->conn_lock);
- if (!success) {
- /* Go to the next ilg */
- i--;
+ if (!success)
continue;
- }
/*
- * Make sure that nothing has changed under. For eg.
- * a failover/failback can change ilg_ill while we were
- * waiting to become exclusive above
+ * Move on if the ilg was deleted while conn_lock was dropped.
*/
- if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
- ipif = ilg->ilg_ipif;
- ill = ipif->ipif_ill;
- } else {
- ipif = NULL;
- ill = ilg->ilg_ill;
- }
- if (!IAM_WRITER_ILL(ill) || (ilg->ilg_flags & ILG_DELETED)) {
- /*
- * The ilg has changed under us probably due
- * to a failover or unplumb. Retry on the same ilg.
- */
+ if (ilg->ilg_flags & ILG_DELETED) {
mutex_exit(&connp->conn_lock);
ipsq_exit(ipsq);
mutex_enter(&connp->conn_lock);
continue;
}
v6group = ilg->ilg_v6group;
- orig_ifindex = ilg->ilg_orig_ifindex;
ilg_delete(connp, ilg, NULL);
mutex_exit(&connp->conn_lock);
- if (ipif != NULL)
+ if (ipif != NULL) {
(void) ip_delmulti(V4_PART_OF_V6(v6group), ipif,
B_FALSE, B_TRUE);
-
- else
- (void) ip_delmulti_v6(&v6group, ill, orig_ifindex,
+ } else {
+ (void) ip_delmulti_v6(&v6group, ill,
connp->conn_zoneid, B_FALSE, B_TRUE);
-
+ }
ipsq_exit(ipsq);
mutex_enter(&connp->conn_lock);
- /* Go to the next ilg */
- i--;
}
ILG_WALKER_RELE(connp);
@@ -4063,7 +3963,6 @@ conn_delete_ill(conn_t *connp, caddr_t arg)
int i;
char group_buf[INET6_ADDRSTRLEN];
in6_addr_t v6group;
- int orig_ifindex;
ilg_t *ilg;
/*
@@ -4097,11 +3996,10 @@ conn_delete_ill(conn_t *connp, caddr_t arg)
ill->ill_name));
v6group = ilg->ilg_v6group;
- orig_ifindex = ilg->ilg_orig_ifindex;
ilg_delete(connp, ilg, NULL);
mutex_exit(&connp->conn_lock);
- (void) ip_delmulti_v6(&v6group, ill, orig_ifindex,
+ (void) ip_delmulti_v6(&v6group, ill,
connp->conn_zoneid, B_FALSE, B_TRUE);
mutex_enter(&connp->conn_lock);
}
@@ -4115,7 +4013,6 @@ conn_delete_ill(conn_t *connp, caddr_t arg)
if (connp->conn_multicast_ill == ill) {
/* Revert to late binding */
connp->conn_multicast_ill = NULL;
- connp->conn_orig_multicast_ifindex = 0;
}
mutex_exit(&connp->conn_lock);
}
diff --git a/usr/src/uts/common/inet/ip/ip_ndp.c b/usr/src/uts/common/inet/ip/ip_ndp.c
index b53897cefe..895cc74bd2 100644
--- a/usr/src/uts/common/inet/ip/ip_ndp.c
+++ b/usr/src/uts/common/inet/ip/ip_ndp.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -83,8 +83,9 @@ static boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
static void nce_ire_delete(nce_t *nce);
static void nce_ire_delete1(ire_t *ire, char *nce_arg);
static void nce_set_ll(nce_t *nce, uchar_t *ll_addr);
-static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
-static nce_t *nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
+static nce_t *nce_lookup_addr(ill_t *, boolean_t, const in6_addr_t *,
+ nce_t *);
+static nce_t *nce_lookup_mapping(ill_t *, const in6_addr_t *);
static void nce_make_mapping(nce_t *nce, uchar_t *addrpos,
uchar_t *addr);
static int nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
@@ -93,11 +94,16 @@ static mblk_t *nce_udreq_alloc(ill_t *ill);
static void nce_update(nce_t *nce, uint16_t new_state,
uchar_t *new_ll_addr);
static uint32_t nce_solicit(nce_t *nce, mblk_t *mp);
-static boolean_t nce_xmit(ill_t *ill, uint32_t operation,
- ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
+static boolean_t nce_xmit(ill_t *ill, uint8_t type,
+ boolean_t use_lla_addr, const in6_addr_t *sender,
const in6_addr_t *target, int flag);
+static boolean_t nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla,
+ const in6_addr_t *target, uint_t flags);
+static boolean_t nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla,
+ const in6_addr_t *src, uint_t flags);
static int ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
nce_t **, nce_t *);
+static ipif_t *ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill);
#ifdef DEBUG
static void nce_trace_cleanup(const nce_t *);
@@ -110,22 +116,6 @@ static void nce_trace_cleanup(const nce_t *);
(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
NCE_TABLE_SIZE)]))
-/*
- * Compute default flags to use for an advertisement of this nce's address.
- */
-static int
-nce_advert_flags(const nce_t *nce)
-{
- int flag = 0;
-
- if (nce->nce_flags & NCE_F_ISROUTER)
- flag |= NDP_ISROUTER;
- if (!(nce->nce_flags & NCE_F_ANYCAST))
- flag |= NDP_ORIDE;
-
- return (flag);
-}
-
/* Non-tunable probe interval, based on link capabilities */
#define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500)
@@ -262,8 +252,7 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
mutex_exit(&nce->nce_lock);
- dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
- &ipv6_all_zeros, addr, NDP_PROBE);
+ dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
if (dropped) {
mutex_enter(&nce->nce_lock);
nce->nce_pcnt++;
@@ -282,23 +271,20 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
mutex_exit(&nce->nce_lock);
- dropped = nce_xmit(ill,
- ND_NEIGHBOR_ADVERT,
- ill, /* ill to be used for extracting ill_nd_lla */
- B_TRUE, /* use ill_nd_lla */
- addr, /* Source and target of the advertisement pkt */
- &ipv6_all_hosts_mcast, /* Destination of the packet */
- nce_advert_flags(nce));
+ dropped = nce_xmit_advert(nce, B_TRUE, &ipv6_all_hosts_mcast,
+ 0);
mutex_enter(&nce->nce_lock);
if (dropped)
nce->nce_unsolicit_count++;
if (nce->nce_unsolicit_count != 0) {
+ ASSERT(nce->nce_timeout_id == 0);
nce->nce_timeout_id = timeout(ndp_timer, nce,
MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
}
mutex_exit(&nce->nce_lock);
mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
}
+
/*
* If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
* we call nce_fastpath as soon as the nce is resolved in ndp_process.
@@ -311,10 +297,10 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
}
int
-ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
- const in6_addr_t *mask, const in6_addr_t *extract_mask,
- uint32_t hw_extract_start, uint16_t flags, uint16_t state,
- nce_t **newnce)
+ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr,
+ const in6_addr_t *addr, const in6_addr_t *mask,
+ const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags,
+ uint16_t state, nce_t **newnce)
{
int err = 0;
nce_t *nce;
@@ -325,7 +311,7 @@ ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
/* Get head of v6 hash table */
nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
- nce = nce_lookup_addr(ill, addr, nce);
+ nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
if (nce == NULL) {
err = ndp_add_v6(ill,
hw_addr,
@@ -562,13 +548,11 @@ nce_ire_delete_list(nce_t *nce)
if (nce->nce_ipversion == IPV4_VERSION) {
ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
- IRE_CACHE, nce_ire_delete1,
- (char *)nce, nce->nce_ill);
+ IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
} else {
ASSERT(nce->nce_ipversion == IPV6_VERSION);
ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
- IRE_CACHE, nce_ire_delete1,
- (char *)nce, nce->nce_ill);
+ IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
}
NCE_REFRELE_NOTR(nce);
nce = nce_next;
@@ -628,8 +612,7 @@ ndp_restart_dad(nce_t *nce)
nce->nce_state = ND_PROBE;
nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
mutex_exit(&nce->nce_lock);
- dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
- B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
+ dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
if (dropped) {
mutex_enter(&nce->nce_lock);
nce->nce_pcnt++;
@@ -649,22 +632,19 @@ ndp_restart_dad(nce_t *nce)
* If one is found, the refcnt on the nce will be incremented.
*/
nce_t *
-ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
+ndp_lookup_v6(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
+ boolean_t caller_holds_lock)
{
nce_t *nce;
- ip_stack_t *ipst;
-
- ASSERT(ill != NULL);
- ipst = ill->ill_ipst;
+ ip_stack_t *ipst = ill->ill_ipst;
- ASSERT(ill != NULL && ill->ill_isv6);
- if (!caller_holds_lock) {
+ ASSERT(ill->ill_isv6);
+ if (!caller_holds_lock)
mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
- }
/* Get head of v6 hash table */
nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
- nce = nce_lookup_addr(ill, addr, nce);
+ nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
if (nce == NULL)
nce = nce_lookup_mapping(ill, addr);
if (!caller_holds_lock)
@@ -685,14 +665,17 @@ ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
in6_addr_t addr6;
ip_stack_t *ipst = ill->ill_ipst;
- if (!caller_holds_lock) {
+ if (!caller_holds_lock)
mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
- }
/* Get head of v4 hash table */
nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
- nce = nce_lookup_addr(ill, &addr6, nce);
+ /*
+ * NOTE: IPv4 never matches across the illgrp since the NCEs we're
+ * looking up have fastpath headers that are inherently per-ill.
+ */
+ nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
if (!caller_holds_lock)
mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
return (nce);
@@ -706,7 +689,8 @@ ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
* lock (ndp_g_lock).
*/
static nce_t *
-nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
+nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
+ nce_t *nce)
{
ndp_g_t *ndp;
ip_stack_t *ipst = ill->ill_ipst;
@@ -716,12 +700,12 @@ nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
else
ndp = ipst->ips_ndp4;
- ASSERT(ill != NULL);
ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
if (IN6_IS_ADDR_UNSPECIFIED(addr))
return (NULL);
for (; nce != NULL; nce = nce->nce_next) {
- if (nce->nce_ill == ill) {
+ if (nce->nce_ill == ill ||
+ match_illgrp && IS_IN_SAME_ILLGRP(ill, nce->nce_ill)) {
if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
&ipv6_all_ones)) {
@@ -771,8 +755,8 @@ nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
* Process passed in parameters either from an incoming packet or via
* user ioctl.
*/
-void
-ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
+static void
+nce_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
{
ill_t *ill = nce->nce_ill;
uint32_t hw_addr_len = ill->ill_nd_lla_len;
@@ -852,7 +836,7 @@ ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
} else {
/*
* Send locally originated packets back
- * into * ip_wput_v6.
+ * into ip_wput_v6.
*/
put(ill->ill_wq, mp);
}
@@ -918,6 +902,65 @@ ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
}
/*
+ * Walker state structure used by ndp_process() / ndp_process_entry():
+ * ndp_process() fills it in and ndp_process_entry() receives it as `arg'.
+ */
+typedef struct ndp_process_data {
+ ill_t *np_ill; /* ill/illgrp to match against */
+ const in6_addr_t *np_addr; /* IPv6 address to match */
+ uchar_t *np_hw_addr; /* hw_addr passed to nce_process() */
+ uint32_t np_flag; /* flag passed to nce_process() */
+ boolean_t np_is_adv; /* is_adv passed to nce_process() */
+} ndp_process_data_t;
+
+/*
+ * Per-NCE callback for the IPMP walk started by ndp_process(). An NCE is
+ * handed to nce_process() only if it is in the same IPMP group as np_ill,
+ * its address equals np_addr, and its mask is all-ones.
+ */
+static void
+ndp_process_entry(nce_t *nce, void *arg)
+{
+ ndp_process_data_t *data = arg;
+
+ if (!IS_IN_SAME_ILLGRP(nce->nce_ill, data->np_ill) ||
+ !IN6_ARE_ADDR_EQUAL(&nce->nce_addr, data->np_addr) ||
+ !IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones))
+ return;
+
+ nce_process(nce, data->np_hw_addr, data->np_flag, data->np_is_adv);
+}
+
+/*
+ * Wrapper around nce_process() that handles IPMP. In particular, for IPMP,
+ * NCEs are per-underlying-ill (because of nce_fp_mp) and thus we may have
+ * more than one NCE for a given IPv6 address to tend to. In that case, we
+ * need to walk all NCEs and call back nce_process() for each one. Since this
+ * is expensive, in the non-IPMP case we just directly call nce_process().
+ * Ultimately, nce_fp_mp needs to be moved out of the nce_t so that all IP
+ * interfaces in an IPMP group share the same NCEs -- at which point this
+ * function can be removed entirely.
+ *
+ * `hw_addr', `flag' and `is_adv' are passed through unchanged to
+ * nce_process() for every NCE that is processed.
+ */
+void
+ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
+{
+ ill_t *ill = nce->nce_ill;
+ struct ndp_g_s *ndp = ill->ill_ipst->ips_ndp6;
+ ndp_process_data_t np;
+
+ if (ill->ill_grp == NULL) {
+ nce_process(nce, hw_addr, flag, is_adv);
+ return;
+ }
+
+ /* IPMP case: walk all NCEs */
+ np.np_ill = ill;
+ np.np_addr = &nce->nce_addr;
+ np.np_flag = flag;
+ np.np_is_adv = is_adv;
+ np.np_hw_addr = hw_addr;
+
+ /*
+ * The final argument is ndp_walk_common()'s boolean `trace' flag, not
+ * a zone id, so pass B_TRUE rather than ALL_ZONES.
+ */
+ ndp_walk_common(ndp, NULL, (pfi_t)ndp_process_entry, &np, B_TRUE);
+}
+
+/*
* Pass arg1 to the pfi supplied, along with each nce in existence.
* ndp_walk() places a REFHOLD on the nce and drops the lock when
* walking the hash list.
@@ -926,7 +969,6 @@ void
ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
boolean_t trace)
{
-
nce_t *nce;
nce_t *nce1;
nce_t **ncep;
@@ -1021,27 +1063,58 @@ ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
int
ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
{
- nce_t *nce;
- int err = 0;
+ nce_t *nce, *hw_nce = NULL;
+ int err;
+ ill_t *ipmp_ill;
+ uint16_t nce_flags;
uint32_t ms;
mblk_t *mp_nce = NULL;
ip_stack_t *ipst = ill->ill_ipst;
+ uchar_t *hwaddr = NULL;
ASSERT(ill->ill_isv6);
- if (IN6_IS_ADDR_MULTICAST(dst)) {
- err = nce_set_multicast(ill, dst);
- return (err);
+
+ if (IN6_IS_ADDR_MULTICAST(dst))
+ return (nce_set_multicast(ill, dst));
+
+ nce_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0;
+
+ /*
+ * If `ill' is under IPMP, then first check to see if there's an NCE
+ * for `dst' on the IPMP meta-interface (e.g., because an application
+ * explicitly did an SIOCLIFSETND to tie a hardware address to `dst').
+ * If so, we use that hardware address when creating the NCE below.
+ * Note that we don't yet have a mechanism to remove these NCEs if the
+ * NCE for `dst' on the IPMP meta-interface is subsequently removed --
+ * but rather than build such a beast, we should fix NCEs so that they
+ * can be properly shared across an IPMP group.
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
+ hw_nce = ndp_lookup_v6(ipmp_ill, B_FALSE, dst, B_FALSE);
+ if (hw_nce != NULL && hw_nce->nce_res_mp != NULL) {
+ hwaddr = hw_nce->nce_res_mp->b_rptr +
+ NCE_LL_ADDR_OFFSET(ipmp_ill);
+ nce_flags |= hw_nce->nce_flags;
+ }
+ ill_refrele(ipmp_ill);
+ }
}
+
err = ndp_lookup_then_add_v6(ill,
- NULL, /* No hardware address */
+ B_FALSE, /* NCE fastpath is per ill; don't match across group */
+ hwaddr,
dst,
&ipv6_all_ones,
&ipv6_all_zeros,
0,
- (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
- ND_INCOMPLETE,
+ nce_flags,
+ hwaddr != NULL ? ND_REACHABLE : ND_INCOMPLETE,
&nce);
+ if (hw_nce != NULL)
+ NCE_REFRELE(hw_nce);
+
switch (err) {
case 0:
/*
@@ -1057,11 +1130,10 @@ ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
NCE_REFRELE(nce);
return (0);
}
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+
mutex_enter(&nce->nce_lock);
if (nce->nce_state != ND_INCOMPLETE) {
mutex_exit(&nce->nce_lock);
- rw_exit(&ipst->ips_ill_g_lock);
NCE_REFRELE(nce);
return (0);
}
@@ -1069,14 +1141,11 @@ ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
if (mp_nce == NULL) {
/* The caller will free mp */
mutex_exit(&nce->nce_lock);
- rw_exit(&ipst->ips_ill_g_lock);
ndp_delete(nce);
NCE_REFRELE(nce);
return (ENOMEM);
}
- ms = nce_solicit(nce, mp_nce);
- rw_exit(&ipst->ips_ill_g_lock);
- if (ms == 0) {
+ if ((ms = nce_solicit(nce, mp_nce)) == 0) {
/* The caller will free mp */
if (mp_nce != mp)
freeb(mp_nce);
@@ -1143,6 +1212,7 @@ ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
}
err = ndp_lookup_then_add_v6(ill,
+ B_FALSE, /* NCE fastpath is per ill; don't match across group */
NULL, /* hardware address */
dst,
&ipv6_all_ones,
@@ -1191,7 +1261,7 @@ nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
- nce = nce_lookup_addr(ill, dst, nce);
+ nce = nce_lookup_addr(ill, B_FALSE, dst, nce);
if (nce != NULL) {
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
NCE_REFRELE(nce);
@@ -1259,7 +1329,13 @@ ndp_query(ill_t *ill, struct lif_nd_req *lnr)
sin6 = (sin6_t *)&lnr->lnr_addr;
addr = &sin6->sin6_addr;
- nce = ndp_lookup_v6(ill, addr, B_FALSE);
+ /*
+ * NOTE: if the ill is an IPMP interface, then match against the whole
+ * illgrp. This e.g. allows in.ndpd to retrieve the link layer
+ * addresses for the data addresses on an IPMP interface even though
+ * ipif_ndp_up() created them with an nce_ill of ipif_bound_ill.
+ */
+ nce = ndp_lookup_v6(ill, IS_IPMP(ill), addr, B_FALSE);
if (nce == NULL)
return (ESRCH);
/* If in INCOMPLETE state, no link layer address is available yet */
@@ -1347,24 +1423,14 @@ ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
uint32_t
nce_solicit(nce_t *nce, mblk_t *mp)
{
- ill_t *ill;
- ill_t *src_ill;
ip6_t *ip6h;
- in6_addr_t src;
- in6_addr_t dst;
- ipif_t *ipif;
- ip6i_t *ip6i;
- boolean_t dropped = B_FALSE;
- ip_stack_t *ipst = nce->nce_ill->ill_ipst;
+ in6_addr_t sender;
+ boolean_t dropped;
- ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
ASSERT(MUTEX_HELD(&nce->nce_lock));
- ill = nce->nce_ill;
- ASSERT(ill != NULL);
- if (nce->nce_rcnt == 0) {
+ if (nce->nce_rcnt == 0)
return (0);
- }
if (mp == NULL) {
ASSERT(nce->nce_qd_mp != NULL);
@@ -1385,60 +1451,22 @@ nce_solicit(nce_t *nce, mblk_t *mp)
* could be from the nce_qd_mp which could have b_next/b_prev
* non-NULL.
*/
- ip6i = (ip6i_t *)ip6h;
- ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
- sizeof (ip6i_t) + IPV6_HDR_LEN);
+ ASSERT(MBLKL(mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
}
- src = ip6h->ip6_src;
- /*
- * If the src of outgoing packet is one of the assigned interface
- * addresses use it, otherwise we will pick the source address below.
- */
- src_ill = ill;
- if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
- if (ill->ill_group != NULL)
- src_ill = ill->ill_group->illgrp_ill;
- for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
- for (ipif = src_ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (IN6_ARE_ADDR_EQUAL(&src,
- &ipif->ipif_v6lcl_addr)) {
- break;
- }
- }
- if (ipif != NULL)
- break;
- }
- /*
- * If no relevant ipif can be found, then it's not one of our
- * addresses. Reset to :: and let nce_xmit. If an ipif can be
- * found, but it's not yet done with DAD verification, then
- * just postpone this transmission until later.
- */
- if (src_ill == NULL)
- src = ipv6_all_zeros;
- else if (!ipif->ipif_addr_ready)
- return (ill->ill_reachable_retrans_time);
- }
- dst = nce->nce_addr;
+
/*
- * If source address is unspecified, nce_xmit will choose
- * one for us and initialize the hardware address also
- * appropriately.
+ * Need to copy the sender address into a local since `mp' can
+ * go away once we drop nce_lock.
*/
- if (IN6_IS_ADDR_UNSPECIFIED(&src))
- src_ill = NULL;
+ sender = ip6h->ip6_src;
nce->nce_rcnt--;
mutex_exit(&nce->nce_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
- &dst, 0);
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ dropped = nce_xmit_solicit(nce, B_TRUE, &sender, 0);
mutex_enter(&nce->nce_lock);
if (dropped)
nce->nce_rcnt++;
- return (ill->ill_reachable_retrans_time);
+ return (nce->nce_ill->ill_reachable_retrans_time);
}
/*
@@ -1475,7 +1503,7 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
*/
mutex_enter(&ill->ill_lock);
if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
- (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) {
+ (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
mutex_exit(&ill->ill_lock);
continue;
}
@@ -1485,8 +1513,8 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
mutex_exit(&ill->ill_lock);
ipif->ipif_was_dup = B_TRUE;
- if (ipif_ndp_up(ipif) != EINPROGRESS)
- (void) ipif_up_done_v6(ipif);
+ VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
+ (void) ipif_up_done_v6(ipif);
}
freeb(mp);
}
@@ -1515,7 +1543,7 @@ ipif6_dup_recovery(void *arg)
/*
* No lock, because this is just an optimization.
*/
- if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))
+ if (ipif->ipif_state_flags & IPIF_CONDEMNED)
return;
/* If the link is down, we'll retry this later */
@@ -1542,13 +1570,20 @@ ndp_do_recovery(ipif_t *ipif)
if (mp == NULL) {
mutex_enter(&ill->ill_lock);
if (ipif->ipif_recovery_id == 0 &&
- !(ipif->ipif_state_flags & (IPIF_MOVING |
- IPIF_CONDEMNED))) {
+ !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
}
mutex_exit(&ill->ill_lock);
} else {
+ /*
+ * A recovery timer may still be running if we got here from
+ * ill_restart_dad(); cancel that timer.
+ */
+ if (ipif->ipif_recovery_id != 0)
+ (void) untimeout(ipif->ipif_recovery_id);
+ ipif->ipif_recovery_id = 0;
+
bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
sizeof (ipif->ipif_v6lcl_addr));
ill_refhold(ill);
@@ -1558,41 +1593,51 @@ ndp_do_recovery(ipif_t *ipif)
}
/*
- * Find the solicitation in the given message, and extract printable details
- * (MAC and IP addresses) from it.
+ * Find the MAC and IP addresses in an NA/NS message.
*/
-static nd_neighbor_solicit_t *
-ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
- size_t hlen, char *sbuf, size_t slen, uchar_t **haddr)
+static void
+ip_ndp_find_addresses(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, in6_addr_t *targp,
+ uchar_t **haddr, uint_t *haddrlenp)
{
- nd_neighbor_solicit_t *ns;
- ip6_t *ip6h;
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+ icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
+ nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
+ nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
uchar_t *addr;
- int alen;
+ int alen = 0;
- alen = 0;
- ip6h = (ip6_t *)mp->b_rptr;
if (dl_mp == NULL) {
nd_opt_hdr_t *opt;
- int nslen;
+ int len;
/*
* If it's from the fast-path, then it can't be a probe
- * message, and thus must include the source linkaddr option.
+ * message, and thus must include a linkaddr option.
* Extract that here.
*/
- ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
- nslen = mp->b_wptr - (uchar_t *)ns;
- if ((nslen -= sizeof (*ns)) > 0) {
- opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen,
- ND_OPT_SOURCE_LINKADDR);
- if (opt != NULL &&
- opt->nd_opt_len * 8 - sizeof (*opt) >=
- ill->ill_nd_lla_len) {
- addr = (uchar_t *)(opt + 1);
- alen = ill->ill_nd_lla_len;
+ switch (icmp6->icmp6_type) {
+ case ND_NEIGHBOR_SOLICIT:
+ len = mp->b_wptr - (uchar_t *)ns;
+ if ((len -= sizeof (*ns)) > 0) {
+ opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1),
+ len, ND_OPT_SOURCE_LINKADDR);
}
+ break;
+ case ND_NEIGHBOR_ADVERT:
+ len = mp->b_wptr - (uchar_t *)na;
+ if ((len -= sizeof (*na)) > 0) {
+ opt = ndp_get_option((nd_opt_hdr_t *)(na + 1),
+ len, ND_OPT_TARGET_LINKADDR);
+ }
+ break;
+ }
+
+ if (opt != NULL && opt->nd_opt_len * 8 - sizeof (*opt) >=
+ ill->ill_nd_lla_len) {
+ addr = (uchar_t *)(opt + 1);
+ alen = ill->ill_nd_lla_len;
}
+
/*
* We cheat a bit here for the sake of printing usable log
* messages in the rare case where the reply we got was unicast
@@ -1624,16 +1669,17 @@ ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
}
}
}
+
if (alen > 0) {
*haddr = addr;
- (void) mac_colon_addr(addr, alen, hbuf, hlen);
+ *haddrlenp = alen;
} else {
*haddr = NULL;
- (void) strcpy(hbuf, "?");
+ *haddrlenp = 0;
}
- ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
- (void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen);
- return (ns);
+
+ /* nd_ns_target and nd_na_target are at the same offset, so we cheat */
+ *targp = ns->nd_ns_target;
}
/*
@@ -1646,68 +1692,80 @@ ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
ill_t *ill = rq->q_ptr;
ipif_t *ipif;
- char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */
- char hbuf[MAC_STR_LEN];
- char sbuf[INET6_ADDRSTRLEN];
- nd_neighbor_solicit_t *ns;
- mblk_t *dl_mp = NULL;
- uchar_t *haddr;
+ mblk_t *dl_mp = NULL;
+ uchar_t *haddr;
+ uint_t haddrlen;
ip_stack_t *ipst = ill->ill_ipst;
+ in6_addr_t targ;
if (DB_TYPE(mp) != M_DATA) {
dl_mp = mp;
mp = mp->b_cont;
}
- ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf,
- sizeof (sbuf), &haddr);
- if (haddr != NULL &&
- bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
+
+ ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
+ if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
/*
- * Ignore conflicts generated by misbehaving switches that just
- * reflect our own messages back to us.
+ * Ignore conflicts generated by misbehaving switches that
+ * just reflect our own messages back to us. For IPMP, we may
+ * see reflections across any ill in the illgrp.
*/
- goto ignore_conflict;
+ if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
+ IS_UNDER_IPMP(ill) &&
+ ipmp_illgrp_find_ill(ill->ill_grp, haddr, haddrlen) != NULL)
+ goto ignore_conflict;
}
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
+ /*
+ * Look up the appropriate ipif.
+ */
+ ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, NULL, NULL, NULL,
+ NULL, ipst);
+ if (ipif == NULL)
+ goto ignore_conflict;
- if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
- !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
- &ns->nd_ns_target)) {
- continue;
- }
+ /* Reload the ill to match the ipif */
+ ill = ipif->ipif_ill;
- /* If it's already marked, then don't do anything. */
- if (ipif->ipif_flags & IPIF_DUPLICATE)
- continue;
+ /* If it's already duplicate or ineligible, then don't do anything. */
+ if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
+ ipif_refrele(ipif);
+ goto ignore_conflict;
+ }
- /*
- * If this is a failure during duplicate recovery, then don't
- * complain. It may take a long time to recover.
- */
- if (!ipif->ipif_was_dup) {
- ipif_get_name(ipif, ibuf, sizeof (ibuf));
- cmn_err(CE_WARN, "%s has duplicate address %s (in "
- "use by %s); disabled", ibuf, sbuf, hbuf);
- }
- mutex_enter(&ill->ill_lock);
- ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
- ipif->ipif_flags |= IPIF_DUPLICATE;
- ill->ill_ipif_dup_count++;
- mutex_exit(&ill->ill_lock);
- (void) ipif_down(ipif, NULL, NULL);
- ipif_down_tail(ipif);
- mutex_enter(&ill->ill_lock);
- if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
- ill->ill_net_type == IRE_IF_RESOLVER &&
- !(ipif->ipif_state_flags & (IPIF_MOVING |
- IPIF_CONDEMNED)) &&
- ipst->ips_ip_dup_recovery > 0) {
- ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
- ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
- }
- mutex_exit(&ill->ill_lock);
+ /*
+ * If this is a failure during duplicate recovery, then don't
+ * complain. It may take a long time to recover.
+ */
+ if (!ipif->ipif_was_dup) {
+ char ibuf[LIFNAMSIZ];
+ char hbuf[MAC_STR_LEN];
+ char sbuf[INET6_ADDRSTRLEN];
+
+ ipif_get_name(ipif, ibuf, sizeof (ibuf));
+ cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
+ " disabled", ibuf,
+ inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
+ mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
}
+ mutex_enter(&ill->ill_lock);
+ ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
+ ipif->ipif_flags |= IPIF_DUPLICATE;
+ ill->ill_ipif_dup_count++;
+ mutex_exit(&ill->ill_lock);
+ (void) ipif_down(ipif, NULL, NULL);
+ ipif_down_tail(ipif);
+ mutex_enter(&ill->ill_lock);
+ if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
+ ill->ill_net_type == IRE_IF_RESOLVER &&
+ !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
+ ipst->ips_ip_dup_recovery > 0) {
+ ASSERT(ipif->ipif_recovery_id == 0);
+ ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
+ ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
+ }
+ mutex_exit(&ill->ill_lock);
+ ipif_refrele(ipif);
ignore_conflict:
if (dl_mp != NULL)
freeb(dl_mp);
@@ -1721,7 +1779,7 @@ ignore_conflict:
* we start a timer on the ipif.
*/
static void
-ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
+ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
{
if ((mp = copymsg(mp)) != NULL) {
if (dl_mp == NULL)
@@ -1736,7 +1794,6 @@ ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
B_FALSE);
}
}
- ndp_delete(nce);
}
/*
@@ -1757,6 +1814,7 @@ ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
NULL, NULL, ipst);
if (ipif == NULL)
return;
+
/*
* First, figure out if this address is disposable.
*/
@@ -1786,19 +1844,21 @@ ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
* sending out an unsolicited Neighbor Advertisement.
*/
if (defs >= maxdefense) {
- ip_ndp_failure(ill, mp, dl_mp, nce);
+ ip_ndp_failure(ill, mp, dl_mp);
} else {
char hbuf[MAC_STR_LEN];
char sbuf[INET6_ADDRSTRLEN];
uchar_t *haddr;
+ uint_t haddrlen;
+ in6_addr_t targ;
- (void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf,
- sizeof (hbuf), sbuf, sizeof (sbuf), &haddr);
+ ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
- hbuf, sbuf, ill->ill_name);
- (void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE,
- &nce->nce_addr, &ipv6_all_hosts_mcast,
- nce_advert_flags(nce));
+ mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)),
+ inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
+ ill->ill_name);
+
+ (void) nce_xmit_advert(nce, B_FALSE, &ipv6_all_hosts_mcast, 0);
}
}
@@ -1843,6 +1903,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
bad_solicit = B_TRUE;
goto done;
}
+
}
if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
/* Check to see if this is a valid DAD solicitation */
@@ -1859,7 +1920,13 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
}
}
- our_nce = ndp_lookup_v6(ill, &target, B_FALSE);
+ /*
+ * NOTE: with IPMP, it's possible the nominated multicast ill (which
+ * received this packet if it's multicast) is not the ill tied to
+ * e.g. the IPMP ill's data link-local. So we match across the illgrp
+ * to ensure we find the associated NCE.
+ */
+ our_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE);
/*
* If this is a valid Solicitation, a permanent
* entry should exist in the cache
@@ -1883,7 +1950,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
haddr = (uchar_t *)&opt[1];
if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
hlen == 0) {
- ip1dbg(("ndp_input_advert: bad SLLA\n"));
+ ip1dbg(("ndp_input_solicit: bad SLLA\n"));
bad_solicit = B_TRUE;
goto done;
}
@@ -1934,6 +2001,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
goto no_source;
err = ndp_lookup_then_add_v6(ill,
+ B_FALSE,
haddr,
&src, /* Soliciting nodes address */
&ipv6_all_ones,
@@ -1949,8 +2017,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
break;
case EEXIST:
/*
- * B_FALSE indicates this is not an
- * an advertisement.
+ * B_FALSE indicates this is not an advertisement.
*/
ndp_process(nnce, haddr, 0, B_FALSE);
NCE_REFRELE(nnce);
@@ -1985,7 +2052,7 @@ no_source:
* If someone else is probing our address, then
* we've crossed wires. Declare failure.
*/
- ip_ndp_failure(ill, mp, dl_mp, our_nce);
+ ip_ndp_failure(ill, mp, dl_mp);
}
goto done;
}
@@ -1995,15 +2062,8 @@ no_source:
*/
src = ipv6_all_hosts_mcast;
}
- flag |= nce_advert_flags(our_nce);
/* Response to a solicitation */
- (void) nce_xmit(ill,
- ND_NEIGHBOR_ADVERT,
- ill, /* ill to be used for extracting ill_nd_lla */
- B_TRUE, /* use ill_nd_lla */
- &target, /* Source and target of the advertisement pkt */
- &src, /* IP Destination (source of original pkt) */
- flag);
+ (void) nce_xmit_advert(our_nce, B_TRUE, &src, flag);
done:
if (bad_solicit)
BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
@@ -2023,8 +2083,8 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
in6_addr_t target;
nd_opt_hdr_t *opt = NULL;
int len;
- mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
ip_stack_t *ipst = ill->ill_ipst;
+ mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
ip6h = (ip6_t *)mp->b_rptr;
icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
@@ -2067,66 +2127,62 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
}
/*
- * If this interface is part of the group look at all the
+ * NOTE: we match across the illgrp since we need to do DAD for all of
+ * our local addresses, and those are spread across all the active
* ills in the group.
*/
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- if (ill->ill_group != NULL)
- ill = ill->ill_group->illgrp_ill;
+ if ((dst_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE)) == NULL)
+ return;
- for (; ill != NULL; ill = ill->ill_group_next) {
- mutex_enter(&ill->ill_lock);
- if (!ILL_CAN_LOOKUP(ill)) {
- mutex_exit(&ill->ill_lock);
- continue;
- }
- ill_refhold_locked(ill);
- mutex_exit(&ill->ill_lock);
- dst_nce = ndp_lookup_v6(ill, &target, B_FALSE);
- /* We have to drop the lock since ndp_process calls put* */
- rw_exit(&ipst->ips_ill_g_lock);
- if (dst_nce != NULL) {
- if ((dst_nce->nce_flags & NCE_F_PERMANENT) &&
- dst_nce->nce_state == ND_PROBE) {
- /*
- * Someone else sent an advertisement for an
- * address that we're trying to configure.
- * Tear it down. Note that dl_mp might be NULL
- * if we're getting a unicast reply. This
- * isn't typically done (multicast is the norm
- * in response to a probe), but ip_ndp_failure
- * will handle the dl_mp == NULL case as well.
- */
- ip_ndp_failure(ill, mp, dl_mp, dst_nce);
- } else if (dst_nce->nce_flags & NCE_F_PERMANENT) {
- /*
- * Someone just announced one of our local
- * addresses. If it wasn't us, then this is a
- * conflict. Defend the address or shut it
- * down.
- */
- if (dl_mp != NULL &&
- (haddr == NULL ||
- nce_cmp_ll_addr(dst_nce, haddr,
- ill->ill_nd_lla_len))) {
- ip_ndp_conflict(ill, mp, dl_mp,
- dst_nce);
- }
- } else {
- if (na->nd_na_flags_reserved &
- ND_NA_FLAG_ROUTER) {
- dst_nce->nce_flags |= NCE_F_ISROUTER;
+ if (dst_nce->nce_flags & NCE_F_PERMANENT) {
+ /*
+ * Someone just advertised one of our local addresses. First,
+ * check if it was us -- if so, we can safely ignore it.
+ */
+ if (haddr != NULL) {
+ if (!nce_cmp_ll_addr(dst_nce, haddr, hlen))
+ goto out; /* from us -- no conflict */
+
+ /*
+ * If we're in an IPMP group, check if this is an echo
+ * from another ill in the group. Use the double-
+ * checked locking pattern to avoid grabbing
+ * ill_g_lock in the non-IPMP case.
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
+ ill->ill_grp, haddr, hlen) != NULL) {
+ rw_exit(&ipst->ips_ill_g_lock);
+ goto out;
}
- /* B_TRUE indicates this an advertisement */
- ndp_process(dst_nce, haddr,
- na->nd_na_flags_reserved, B_TRUE);
+ rw_exit(&ipst->ips_ill_g_lock);
}
- NCE_REFRELE(dst_nce);
}
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- ill_refrele(ill);
+
+ /*
+ * This appears to be a real conflict. If we're trying to
+ * configure this NCE (ND_PROBE), then shut it down.
+ * Otherwise, handle the discovered conflict.
+ *
+ * Note that dl_mp might be NULL if we're getting a unicast
+ * reply. This isn't typically done (multicast is the norm in
+ * response to a probe), but we can handle the dl_mp == NULL
+ * case as well.
+ */
+ if (dst_nce->nce_state == ND_PROBE)
+ ip_ndp_failure(ill, mp, dl_mp);
+ else
+ ip_ndp_conflict(ill, mp, dl_mp, dst_nce);
+ } else {
+ if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
+ dst_nce->nce_flags |= NCE_F_ISROUTER;
+
+ /* B_TRUE indicates this is an advertisement */
+ ndp_process(dst_nce, haddr, na->nd_na_flags_reserved, B_TRUE);
}
- rw_exit(&ipst->ips_ill_g_lock);
+out:
+ NCE_REFRELE(dst_nce);
}
/*
@@ -2194,6 +2250,40 @@ done:
}
/*
+ * Utility routine to send an advertisement. Assumes that the NCE cannot
+ * go away (e.g., because it's refheld).
+ *
+ * The advertisement is built and sent by nce_xmit() on nce->nce_ill, with
+ * nce->nce_addr as the sender. `target' is handed to nce_xmit() as its
+ * target argument, which nce_xmit() uses as the IPv6 destination of the
+ * packet. `use_nd_lla' is passed through unchanged; nce_xmit() uses it to
+ * choose between ill_nd_lla and ill_phys_addr for the link-layer option.
+ *
+ * `flags' must not contain NDP_PROBE (asserted). NDP_ISROUTER is added
+ * when the NCE has NCE_F_ISROUTER set, and NDP_ORIDE is added unless the
+ * NCE has NCE_F_ANYCAST set, so callers do not compute these themselves.
+ *
+ * Returns the result of nce_xmit(): B_TRUE if the packet was dropped
+ * rather than sent, B_FALSE otherwise.
+ */
+static boolean_t
+nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *target,
+ uint_t flags)
+{
+ ASSERT((flags & NDP_PROBE) == 0);
+
+ if (nce->nce_flags & NCE_F_ISROUTER)
+ flags |= NDP_ISROUTER;
+ if (!(nce->nce_flags & NCE_F_ANYCAST))
+ flags |= NDP_ORIDE;
+
+ return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_ADVERT, use_nd_lla,
+ &nce->nce_addr, target, flags));
+}
+
+/*
+ * Utility routine to send a solicitation. Assumes that the NCE cannot
+ * go away (e.g., because it's refheld).
+ *
+ * The solicitation is built and sent by nce_xmit() on nce->nce_ill, with
+ * nce->nce_addr as the address being solicited. `sender' is the proposed
+ * source address; if NDP_PROBE is set in `flags', the supplied sender is
+ * ignored and the unspecified address (ipv6_all_zeros) is used instead.
+ * `use_nd_lla' and `flags' are otherwise passed through to nce_xmit()
+ * unchanged.
+ *
+ * Returns the result of nce_xmit(): B_TRUE if the packet was dropped
+ * rather than sent, B_FALSE otherwise.
+ */
+static boolean_t
+nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender,
+ uint_t flags)
+{
+ if (flags & NDP_PROBE)
+ sender = &ipv6_all_zeros;
+
+ return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, use_nd_lla,
+ sender, &nce->nce_addr, flags));
+}
+
+/*
* nce_xmit is called to form and transmit a ND solicitation or
* advertisement ICMP packet.
*
@@ -2207,88 +2297,79 @@ done:
* corresponding ill's ill_wq otherwise returns B_TRUE.
*/
static boolean_t
-nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
- boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
- int flag)
+nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla,
+ const in6_addr_t *sender, const in6_addr_t *target, int flag)
{
+ ill_t *hwaddr_ill;
uint32_t len;
icmp6_t *icmp6;
mblk_t *mp;
ip6_t *ip6h;
nd_opt_hdr_t *opt;
- uint_t plen;
+ uint_t plen, maxplen;
ip6i_t *ip6i;
ipif_t *src_ipif = NULL;
uint8_t *hw_addr;
zoneid_t zoneid = GLOBAL_ZONEID;
+ char buf[INET6_ADDRSTRLEN];
+
+ ASSERT(!IS_IPMP(ill));
/*
- * If we have a unspecified source(sender) address, select a
- * proper source address for the solicitation here itself so
- * that we can initialize the h/w address correctly. This is
- * needed for interface groups as source address can come from
- * the whole group and the h/w address initialized from ill will
- * be wrong if the source address comes from a different ill.
- *
- * If the sender is specified then we use this address in order
- * to lookup the zoneid before calling ip_output_v6(). This is to
- * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
- * by IP (we cannot guarantee that the global zone has an interface
- * route to the destination).
- *
- * Note that the NA never comes here with the unspecified source
- * address. The following asserts that whenever the source
- * address is specified, the haddr also should be specified.
+ * Check that the sender is actually a usable address on `ill', and if
+ * so, track that as the src_ipif. If not, for solicitations, set the
+ * sender to :: so that a new one will be picked below; for adverts,
+ * drop the packet since we expect nce_xmit_advert() to always provide
+ * a valid sender.
*/
- ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
+ if (!IN6_IS_ADDR_UNSPECIFIED(sender)) {
+ if ((src_ipif = ip_ndp_lookup_addr_v6(sender, ill)) == NULL ||
+ !src_ipif->ipif_addr_ready) {
+ if (src_ipif != NULL) {
+ ipif_refrele(src_ipif);
+ src_ipif = NULL;
+ }
+ if (type == ND_NEIGHBOR_ADVERT) {
+ ip1dbg(("nce_xmit: No source ipif for src %s\n",
+ inet_ntop(AF_INET6, sender, buf,
+ sizeof (buf))));
+ return (B_TRUE);
+ }
+ sender = &ipv6_all_zeros;
+ }
+ }
+ /*
+ * If we still have an unspecified source (sender) address and this
+ * isn't a probe, select a source address from `ill'.
+ */
if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
- ASSERT(operation != ND_NEIGHBOR_ADVERT);
+ ASSERT(type != ND_NEIGHBOR_ADVERT);
/*
- * Pick a source address for this solicitation, but
- * restrict the selection to addresses assigned to the
- * output interface (or interface group). We do this
- * because the destination will create a neighbor cache
- * entry for the source address of this packet, so the
- * source address had better be a valid neighbor.
+ * Pick a source address for this solicitation, but restrict
+ * the selection to addresses assigned to the output
+ * interface. We do this because the destination will create
+ * a neighbor cache entry for the source address of this
+ * packet, so the source address needs to be a valid neighbor.
*/
- src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL,
+ src_ipif = ipif_select_source_v6(ill, target, B_TRUE,
IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
if (src_ipif == NULL) {
- char buf[INET6_ADDRSTRLEN];
-
ip1dbg(("nce_xmit: No source ipif for dst %s\n",
- inet_ntop(AF_INET6, (char *)target, buf,
- sizeof (buf))));
+ inet_ntop(AF_INET6, target, buf, sizeof (buf))));
return (B_TRUE);
}
sender = &src_ipif->ipif_v6src_addr;
- hwaddr_ill = src_ipif->ipif_ill;
- } else if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
- zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ill->ill_ipst);
- /*
- * It's possible for ipif_lookup_addr_zoneid_v6() to return
- * ALL_ZONES if it cannot find a matching ipif for the address
- * we are trying to use. In this case we err on the side of
- * trying to send the packet by defaulting to the GLOBAL_ZONEID.
- */
- if (zoneid == ALL_ZONES)
- zoneid = GLOBAL_ZONEID;
}
/*
- * Always make sure that the NS/NA packets don't get load
- * spread. This is needed so that the probe packets sent
- * by the in.mpathd daemon can really go out on the desired
- * interface. Probe packets are made to go out on a desired
- * interface by including a ip6i with ATTACH_IF flag. As these
- * packets indirectly end up sending/receiving NS/NA packets
- * (neighbor doing NUD), we have to make sure that NA
- * also go out on the same interface.
+ * We're either sending a probe or we have a source address.
*/
- plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8;
+ ASSERT((flag & NDP_PROBE) || src_ipif != NULL);
+
+ maxplen = roundup(sizeof (nd_opt_hdr_t) + ND_MAX_HDW_LEN, 8);
len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
- plen * 8;
+ maxplen;
mp = allocb(len, BPRI_LO);
if (mp == NULL) {
if (src_ipif != NULL)
@@ -2301,28 +2382,27 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
ip6i = (ip6i_t *)mp->b_rptr;
ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
ip6i->ip6i_nxt = IPPROTO_RAW;
- ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
+ ip6i->ip6i_flags = IP6I_HOPLIMIT;
if (flag & NDP_PROBE)
ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
- ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
ip6h->ip6_nxt = IPPROTO_ICMPV6;
ip6h->ip6_hops = IPV6_MAX_HOPS;
+ ip6h->ip6_src = *sender;
ip6h->ip6_dst = *target;
icmp6 = (icmp6_t *)&ip6h[1];
opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
sizeof (nd_neighbor_advert_t));
- if (operation == ND_NEIGHBOR_SOLICIT) {
+ if (type == ND_NEIGHBOR_SOLICIT) {
nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
if (!(flag & NDP_PROBE))
opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
- ip6h->ip6_src = *sender;
ns->nd_ns_target = *target;
if (!(flag & NDP_UNICAST)) {
/* Form multicast address of the target */
@@ -2335,7 +2415,6 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
ASSERT(!(flag & NDP_PROBE));
opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
- ip6h->ip6_src = *sender;
na->nd_na_target = *sender;
if (flag & NDP_ISROUTER)
na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
@@ -2347,22 +2426,48 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
hw_addr = NULL;
if (!(flag & NDP_PROBE)) {
+ /*
+ * Use our source address to find the hardware address to put
+ * in the packet, so that the hardware address and IP address
+ * will match up -- even if that hardware address doesn't
+ * match the ill we actually transmit the packet through.
+ */
+ if (IS_IPMP(src_ipif->ipif_ill)) {
+ hwaddr_ill = ipmp_ipif_hold_bound_ill(src_ipif);
+ if (hwaddr_ill == NULL) {
+ ip1dbg(("nce_xmit: no bound ill!\n"));
+ ipif_refrele(src_ipif);
+ freemsg(mp);
+ return (B_TRUE);
+ }
+ } else {
+ hwaddr_ill = src_ipif->ipif_ill;
+ ill_refhold(hwaddr_ill); /* for symmetry */
+ }
+
+ plen = roundup(sizeof (nd_opt_hdr_t) +
+ hwaddr_ill->ill_nd_lla_len, 8);
+
hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
hwaddr_ill->ill_phys_addr;
if (hw_addr != NULL) {
/* Fill in link layer address and option len */
- opt->nd_opt_len = (uint8_t)plen;
+ opt->nd_opt_len = (uint8_t)(plen / 8);
bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
}
+
+ ill_refrele(hwaddr_ill);
}
- if (hw_addr == NULL) {
- /* If there's no link layer address option, then strip it. */
- len -= plen * 8;
- mp->b_wptr = mp->b_rptr + len;
- ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
- }
- icmp6->icmp6_type = (uint8_t)operation;
+ if (hw_addr == NULL)
+ plen = 0;
+
+ /* Fix up the length of the packet now that plen is known */
+ len -= (maxplen - plen);
+ mp->b_wptr = mp->b_rptr + len;
+ ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
+
+ icmp6->icmp6_type = type;
icmp6->icmp6_code = 0;
/*
* Prepare for checksum by putting icmp length in the icmp
@@ -2370,8 +2475,17 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
*/
icmp6->icmp6_cksum = ip6h->ip6_plen;
- if (src_ipif != NULL)
+ /*
+ * Before we toss the src_ipif, look up the zoneid to pass to
+ * ip_output_v6(). This ensures that unicast ND_NEIGHBOR_ADVERT
+ * packets are routed correctly by IP (we cannot guarantee that the
+ * global zone has an interface route to the destination).
+ */
+ if (src_ipif != NULL) {
+ if ((zoneid = src_ipif->ipif_zoneid) == ALL_ZONES)
+ zoneid = GLOBAL_ZONEID;
ipif_refrele(src_ipif);
+ }
ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
return (B_FALSE);
@@ -2448,7 +2562,6 @@ ndp_timer(void *arg)
ill_t *ill = nce->nce_ill;
uint32_t ms;
char addrbuf[INET6_ADDRSTRLEN];
- mblk_t *mp;
boolean_t dropped = B_FALSE;
ip_stack_t *ipst = ill->ill_ipst;
@@ -2460,11 +2573,6 @@ ndp_timer(void *arg)
*/
ASSERT(nce != NULL);
- /*
- * Grab the ill_g_lock now itself to avoid lock order problems.
- * nce_solicit needs ill_g_lock to be able to traverse ills
- */
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
mutex_enter(&nce->nce_lock);
NCE_REFHOLD_LOCKED(nce);
nce->nce_timeout_id = 0;
@@ -2474,11 +2582,10 @@ ndp_timer(void *arg)
*/
switch (nce->nce_state) {
case ND_DELAY:
- rw_exit(&ipst->ips_ill_g_lock);
nce->nce_state = ND_PROBE;
mutex_exit(&nce->nce_lock);
- (void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
- &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
+ (void) nce_xmit_solicit(nce, B_FALSE, &ipv6_all_zeros,
+ NDP_UNICAST);
if (ip_debug > 3) {
/* ip2dbg */
pr_addr_dbg("ndp_timer: state for %s changed "
@@ -2489,7 +2596,6 @@ ndp_timer(void *arg)
return;
case ND_PROBE:
/* must be retransmit timer */
- rw_exit(&ipst->ips_ill_g_lock);
nce->nce_pcnt--;
ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
nce->nce_pcnt >= -1);
@@ -2504,8 +2610,8 @@ ndp_timer(void *arg)
nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
addrbuf, sizeof (addrbuf))));
mutex_exit(&nce->nce_lock);
- dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL,
- B_FALSE, &ipv6_all_zeros, &nce->nce_addr,
+ dropped = nce_xmit_solicit(nce, B_FALSE,
+ &ipv6_all_zeros,
(nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
NDP_UNICAST);
if (dropped) {
@@ -2542,8 +2648,8 @@ ndp_timer(void *arg)
*/
nce->nce_state = ND_REACHABLE;
mutex_exit(&nce->nce_lock);
- ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill,
- ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
+ ipif = ip_ndp_lookup_addr_v6(&nce->nce_addr,
+ nce->nce_ill);
if (ipif != NULL) {
if (ipif->ipif_was_dup) {
char ibuf[LIFNAMSIZ + 10];
@@ -2566,9 +2672,8 @@ ndp_timer(void *arg)
}
/* Begin defending our new address */
nce->nce_unsolicit_count = 0;
- dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill,
- B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast,
- nce_advert_flags(nce));
+ dropped = nce_xmit_advert(nce, B_FALSE,
+ &ipv6_all_hosts_mcast, 0);
if (dropped) {
nce->nce_unsolicit_count = 1;
NDP_RESTART_TIMER(nce,
@@ -2589,51 +2694,40 @@ ndp_timer(void *arg)
}
NCE_REFRELE(nce);
return;
- case ND_INCOMPLETE:
+ case ND_INCOMPLETE: {
+ ip6_t *ip6h;
+ ip6i_t *ip6i;
+ mblk_t *mp, *datamp, *nextmp, **prevmpp;
+
/*
- * Must be resolvers retransmit timer.
+ * Per case (2) in the nce_queue_mp() comments, scan nce_qd_mp
+ * for any IPMP probe packets, and toss 'em. IPMP probe
+ * packets will always be at the head of nce_qd_mp and always
+ * have an ip6i_t header, so we can stop at the first queued
+ * ND packet without an ip6i_t.
*/
- for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
- ip6i_t *ip6i;
- ip6_t *ip6h;
- mblk_t *data_mp;
-
- /*
- * Walk the list of packets queued, and see if there
- * are any multipathing probe packets. Such packets
- * are always queued at the head. Since this is a
- * retransmit timer firing, mark such packets as
- * delayed in ND resolution. This info will be used
- * in ip_wput_v6(). Multipathing probe packets will
- * always have an ip6i_t. Once we hit a packet without
- * it, we can break out of this loop.
- */
- if (mp->b_datap->db_type == M_CTL)
- data_mp = mp->b_cont;
- else
- data_mp = mp;
-
- ip6h = (ip6_t *)data_mp->b_rptr;
+ prevmpp = &nce->nce_qd_mp;
+ for (mp = nce->nce_qd_mp; mp != NULL; mp = nextmp) {
+ nextmp = mp->b_next;
+ datamp = (DB_TYPE(mp) == M_CTL) ? mp->b_cont : mp;
+ ip6h = (ip6_t *)datamp->b_rptr;
if (ip6h->ip6_nxt != IPPROTO_RAW)
break;
- /*
- * This message should have been pulled up already in
- * ip_wput_v6. We can't do pullups here because the
- * b_next/b_prev is non-NULL.
- */
ip6i = (ip6i_t *)ip6h;
- ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
- sizeof (ip6i_t) + IPV6_HDR_LEN);
-
- /* Mark this packet as delayed due to ND resolution */
- if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
- ip6i->ip6i_flags |= IP6I_ND_DELAYED;
+ if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) {
+ inet_freemsg(mp);
+ *prevmpp = nextmp;
+ } else {
+ prevmpp = &mp->b_next;
+ }
}
+
+ /*
+ * Must be resolver's retransmit timer.
+ */
if (nce->nce_qd_mp != NULL) {
- ms = nce_solicit(nce, NULL);
- rw_exit(&ipst->ips_ill_g_lock);
- if (ms == 0) {
+ if ((ms = nce_solicit(nce, NULL)) == 0) {
if (nce->nce_state != ND_REACHABLE) {
mutex_exit(&nce->nce_lock);
nce_resolv_failed(nce);
@@ -2649,11 +2743,10 @@ ndp_timer(void *arg)
return;
}
mutex_exit(&nce->nce_lock);
- rw_exit(&ipst->ips_ill_g_lock);
NCE_REFRELE(nce);
break;
- case ND_REACHABLE :
- rw_exit(&ipst->ips_ill_g_lock);
+ }
+ case ND_REACHABLE:
if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
nce->nce_unsolicit_count != 0) ||
((nce->nce_flags & NCE_F_PERMANENT) &&
@@ -2661,13 +2754,8 @@ ndp_timer(void *arg)
if (nce->nce_unsolicit_count > 0)
nce->nce_unsolicit_count--;
mutex_exit(&nce->nce_lock);
- dropped = nce_xmit(ill,
- ND_NEIGHBOR_ADVERT,
- ill, /* ill to be used for hw addr */
- B_FALSE, /* use ill_phys_addr */
- &nce->nce_addr,
- &ipv6_all_hosts_mcast,
- nce_advert_flags(nce));
+ dropped = nce_xmit_advert(nce, B_FALSE,
+ &ipv6_all_hosts_mcast, 0);
if (dropped) {
mutex_enter(&nce->nce_lock);
nce->nce_unsolicit_count++;
@@ -2686,7 +2774,6 @@ ndp_timer(void *arg)
NCE_REFRELE(nce);
break;
default:
- rw_exit(&ipst->ips_ill_g_lock);
mutex_exit(&nce->nce_lock);
NCE_REFRELE(nce);
break;
@@ -2819,23 +2906,20 @@ void
nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
{
uint_t count = 0;
- mblk_t **mpp;
+ mblk_t **mpp, *tmp;
ASSERT(MUTEX_HELD(&nce->nce_lock));
- for (mpp = &nce->nce_qd_mp; *mpp != NULL;
- mpp = &(*mpp)->b_next) {
- if (++count >
- nce->nce_ill->ill_max_buf) {
- mblk_t *tmp = nce->nce_qd_mp->b_next;
-
+ for (mpp = &nce->nce_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
+ if (++count > nce->nce_ill->ill_max_buf) {
+ tmp = nce->nce_qd_mp->b_next;
nce->nce_qd_mp->b_next = NULL;
nce->nce_qd_mp->b_prev = NULL;
freemsg(nce->nce_qd_mp);
nce->nce_qd_mp = tmp;
}
}
- /* put this on the list */
+
if (head_insert) {
mp->b_next = nce->nce_qd_mp;
nce->nce_qd_mp = mp;
@@ -2849,8 +2933,8 @@ nce_queue_mp(nce_t *nce, mblk_t *mp)
{
boolean_t head_insert = B_FALSE;
ip6_t *ip6h;
- ip6i_t *ip6i;
- mblk_t *data_mp;
+ ip6i_t *ip6i;
+ mblk_t *data_mp;
ASSERT(MUTEX_HELD(&nce->nce_lock));
@@ -2867,43 +2951,28 @@ nce_queue_mp(nce_t *nce, mblk_t *mp)
* non-NULL.
*/
ip6i = (ip6i_t *)ip6h;
- ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
- sizeof (ip6i_t) + IPV6_HDR_LEN);
+ ASSERT(MBLKL(data_mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
+
/*
- * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
- * This has 2 aspects mentioned below.
- * 1. Perform head insertion in the nce_qd_mp for these packets.
- * This ensures that next retransmit of ND solicitation
- * will use the interface specified by the probe packet,
- * for both NS and NA. This corresponds to the src address
- * in the IPv6 packet. If we insert at tail, we will be
- * depending on the packet at the head for successful
- * ND resolution. This is not reliable, because the interface
- * on which the NA arrives could be different from the interface
- * on which the NS was sent, and if the receiving interface is
- * failed, it will appear that the sending interface is also
- * failed, causing in.mpathd to misdiagnose this as link
- * failure.
- * 2. Drop the original packet, if the ND resolution did not
- * succeed in the first attempt. However we will create the
- * nce and the ire, as soon as the ND resolution succeeds.
- * We don't gain anything by queueing multiple probe packets
- * and sending them back-to-back once resolution succeeds.
- * It is sufficient to send just 1 packet after ND resolution
- * succeeds. Since mpathd is sending down probe packets at a
- * constant rate, we don't need to send the queued packet. We
- * need to queue it only for NDP resolution. The benefit of
- * dropping the probe packets that were delayed in ND
- * resolution, is that in.mpathd will not see inflated
- * RTT. If the ND resolution does not succeed within
- * in.mpathd's failure detection time, mpathd may detect
- * a failure, and it does not matter whether the packet
- * was queued or dropped.
+ * If this packet is marked IP6I_IPMP_PROBE, then we need to:
+ *
+ * 1. Insert it at the head of the nce_qd_mp list. Consider
+ * the normal (non-probe) load-spreading case where the
+ * source address of the ND packet is not tied to nce_ill.
+ * If the ill bound to the source address cannot receive,
+ * the response to the ND packet will not be received.
+ * However, if ND packets for nce_ill's probes are queued
+ * behind that ND packet, those probes will also fail to
+ * be sent, and thus in.mpathd will erroneously conclude
+ * that nce_ill has also failed.
+ *
+ * 2. Drop the probe packet in ndp_timer() if the ND did
+ * not succeed on the first attempt. This ensures that
+ * ND problems do not manifest as probe RTT spikes.
*/
- if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
+ if (ip6i->ip6i_flags & IP6I_IPMP_PROBE)
head_insert = B_TRUE;
}
-
nce_queue_mp_common(nce, mp, head_insert);
}
@@ -2988,13 +3057,17 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
(lnr->lnr_state_create != ND_STALE))
return (EINVAL);
+ if (lnr->lnr_hdw_len > ND_MAX_HDW_LEN)
+ return (EINVAL);
+
sin6 = (sin6_t *)&lnr->lnr_addr;
addr = &sin6->sin6_addr;
mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
/* We know it can not be mapping so just look in the hash table */
nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
- nce = nce_lookup_addr(ill, addr, nce);
+ /* See comment in ndp_query() regarding IS_IPMP(ill) usage */
+ nce = nce_lookup_addr(ill, IS_IPMP(ill), addr, nce);
if (nce != NULL)
new_flags = nce->nce_flags;
@@ -3065,7 +3138,7 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
* the link layer address passed in to determine the state
* much like incoming packets.
*/
- ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
+ nce_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
NCE_REFRELE(nce);
return (0);
}
@@ -3463,7 +3536,11 @@ ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
- nce = nce_lookup_addr(ill, &addr6, nce);
+ /*
+ * NOTE: IPv4 never matches across the illgrp since the NCE's we're
+ * looking up have fastpath headers that are inherently per-ill.
+ */
+ nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
if (nce == NULL) {
err = ndp_add_v4(ill, addr, flags, newnce, src_nce);
} else {
@@ -3718,3 +3795,26 @@ ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
return (nce != NULL);
}
+
+/*
+ * Wrapper around ipif_lookup_addr_exact_v6() that allows ND to work properly
+ * with IPMP. Specifically, since neighbor discovery is always done on
+ * underlying interfaces (even for addresses owned by an IPMP interface), we
+ * need to check for `v6addrp' on both `ill' and on the IPMP meta-interface
+ * associated with `ill' (if it exists).
+ *
+ * Returns whatever ipif_lookup_addr_exact_v6() returned for the first of
+ * those two ills that matched, or NULL if neither did. The IPMP ill is
+ * only consulted when `ill' is under IPMP and the lookup on `ill' itself
+ * came back NULL; the reference obtained on it via ipmp_ill_hold_ipmp_ill()
+ * is released with ill_refrele() before returning.
+ */
+static ipif_t *
+ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill)
+{
+ ipif_t *ipif;
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
+ if (ipif == NULL && IS_UNDER_IPMP(ill)) {
+ if ((ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
+ ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
+ ill_refrele(ill); /* `ill' is the IPMP ill at this point */
+ }
+ }
+ return (ipif);
+}
diff --git a/usr/src/uts/common/inet/ip/ip_netinfo.c b/usr/src/uts/common/inet/ip/ip_netinfo.c
index 53665593be..e81c7a0e1f 100644
--- a/usr/src/uts/common/inet/ip/ip_netinfo.c
+++ b/usr/src/uts/common/inet/ip/ip_netinfo.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -568,33 +568,17 @@ ip_getifname_impl(phy_if_t phy_ifdata,
char *buffer, const size_t buflen, boolean_t isv6, ip_stack_t *ipst)
{
ill_t *ill;
- char *name;
ASSERT(buffer != NULL);
ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, NULL, NULL,
NULL, NULL, ipst);
- if (ill != NULL) {
- name = ill->ill_name;
- } else {
- /* Fallback to group names only if hook_emulation is set */
- if (ipst->ips_ipmp_hook_emulation) {
- ill = ill_group_lookup_on_ifindex((uint_t)phy_ifdata,
- isv6, ipst);
- }
- if (ill == NULL)
- return (1);
- name = ill->ill_phyint->phyint_groupname;
- }
- if (name != NULL) {
- (void) strlcpy(buffer, name, buflen);
- ill_refrele(ill);
- return (0);
- } else {
- ill_refrele(ill);
+ if (ill == NULL)
return (1);
- }
+ (void) strlcpy(buffer, ill->ill_name, buflen);
+ ill_refrele(ill);
+ return (0);
}
/*
@@ -625,9 +609,6 @@ ipv6_getmtu(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata)
/*
* Shared implementation to determine the MTU of a network interface
- *
- * Note: this does not handle a non-zero ifdata when ipmp_hook_emulation is set.
- * But IP Filter only uses a zero ifdata.
*/
/* ARGSUSED */
static int
@@ -653,16 +634,7 @@ ip_getmtu_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6,
if ((ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6,
NULL, NULL, NULL, NULL, ipst)) == NULL) {
- /*
- * Fallback to group names only if hook_emulation
- * is set
- */
- if (ipst->ips_ipmp_hook_emulation) {
- ill = ill_group_lookup_on_ifindex(
- (uint_t)phy_ifdata, isv6, ipst);
- }
- if (ill == NULL)
- return (0);
+ return (0);
}
mtu = ill->ill_max_frag;
ill_refrele(ill);
@@ -686,9 +658,6 @@ ip_getpmtuenabled(net_handle_t neti)
/*
* Get next interface from the current list of IPv4 physical network interfaces
- *
- * Note: this does not handle the case when ipmp_hook_emulation is set.
- * But IP Filter does not use this function.
*/
static phy_if_t
ip_phygetnext(net_handle_t neti, phy_if_t phy_ifdata)
@@ -752,15 +721,10 @@ ip_phylookup_impl(const char *name, boolean_t isv6, ip_stack_t *ipst)
ill = ill_lookup_on_name((char *)name, B_FALSE, isv6, NULL, NULL,
NULL, NULL, NULL, ipst);
-
- /* Fallback to group names only if hook_emulation is set */
- if (ill == NULL && ipst->ips_ipmp_hook_emulation) {
- ill = ill_group_lookup_on_name((char *)name, isv6, ipst);
- }
if (ill == NULL)
return (0);
- phy = ill->ill_phyint->phyint_hook_ifindex;
+ phy = ill->ill_phyint->phyint_ifindex;
ill_refrele(ill);
@@ -798,9 +762,6 @@ ipv6_lifgetnext(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata)
/*
* Shared implementation to get next interface from the current list of
* logical network interfaces
- *
- * Note: this does not handle the case when ipmp_hook_emulation is set.
- * But IP Filter does not use this function.
*/
static lif_if_t
ip_lifgetnext_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6,
@@ -834,7 +795,7 @@ ip_lifgetnext_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6,
/*
* It's safe to iterate the ill_ipif list when holding an ill_lock.
* And it's also safe to access ipif_id without ipif refhold.
- * See ipif_get_id().
+ * See the field access rules in ip.h.
*/
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
if (!IPIF_CAN_LOOKUP(ipif))
@@ -1013,8 +974,8 @@ ip_inject_impl(inject_t style, net_inject_t *packet, boolean_t isv6,
if (ire->ire_nce == NULL ||
ire->ire_nce->nce_fp_mp == NULL &&
ire->ire_nce->nce_res_mp == NULL) {
- ip_newroute_v6(ire->ire_stq, mp,
- &sin6->sin6_addr, NULL, NULL, ALL_ZONES, ipst);
+ ip_newroute_v6(ire->ire_stq, mp, &sin6->sin6_addr,
+ &ip6h->ip6_src, NULL, ALL_ZONES, ipst);
ire_refrele(ire);
return (0);
@@ -1170,7 +1131,7 @@ ip_routeto_impl(struct sockaddr *address, struct sockaddr *nexthop,
}
ASSERT(ill != NULL);
- phy_if = (phy_if_t)ill->ill_phyint->phyint_hook_ifindex;
+ phy_if = (phy_if_t)ill->ill_phyint->phyint_ifindex;
if (sire != NULL)
ire_refrele(sire);
ire_refrele(ire);
@@ -1305,9 +1266,6 @@ ipv6_getlifaddr(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata,
/*
* Shared implementation to determine the network addresses for an interface
- *
- * Note: this does not handle a non-zero ifdata when ipmp_hook_emulation is set.
- * But IP Filter only uses a zero ifdata.
*/
/* ARGSUSED */
static int
@@ -1531,12 +1489,6 @@ ip_ni_queue_func_impl(injection_t *inject, boolean_t out)
ill = ill_lookup_on_ifindex((uint_t)packet->ni_physical,
B_FALSE, NULL, NULL, NULL, NULL, ipst);
-
- /* Fallback to group names only if hook_emulation is set */
- if (ill == NULL && ipst->ips_ipmp_hook_emulation) {
- ill = ill_group_lookup_on_ifindex((uint_t)packet->ni_physical,
- B_FALSE, ipst);
- }
if (ill == NULL) {
kmem_free(inject, sizeof (*inject));
return;
@@ -1613,65 +1565,3 @@ done:
kmem_free(info->hnei_event.hne_data, info->hnei_event.hne_datalen);
kmem_free(arg, sizeof (hook_nic_event_int_t));
}
-
-/*
- * Temporary function to support IPMP emulation for IP Filter.
- * Lookup an ill based on the ifindex assigned to the group.
- * Skips unusable ones i.e. where any of these flags are set:
- * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE)
- */
-ill_t *
-ill_group_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
-{
- ill_t *ill;
- phyint_t *phyi;
-
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- phyi = phyint_lookup_group_ifindex(index, ipst);
- if (phyi != NULL) {
- ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4;
- if (ill != NULL) {
- mutex_enter(&ill->ill_lock);
- if (ILL_CAN_LOOKUP(ill)) {
- ill_refhold_locked(ill);
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- return (ill);
- }
- mutex_exit(&ill->ill_lock);
- }
- }
- rw_exit(&ipst->ips_ill_g_lock);
- return (NULL);
-}
-
-/*
- * Temporary function to support IPMP emulation for IP Filter.
- * Lookup an ill based on the group name.
- * Skips unusable ones i.e. where any of these flags are set:
- * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE)
- */
-ill_t *
-ill_group_lookup_on_name(char *name, boolean_t isv6, ip_stack_t *ipst)
-{
- ill_t *ill;
- phyint_t *phyi;
-
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- phyi = phyint_lookup_group(name, B_TRUE, ipst);
- if (phyi != NULL) {
- ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4;
- if (ill != NULL) {
- mutex_enter(&ill->ill_lock);
- if (ILL_CAN_LOOKUP(ill)) {
- ill_refhold_locked(ill);
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- return (ill);
- }
- mutex_exit(&ill->ill_lock);
- }
- }
- rw_exit(&ipst->ips_ill_g_lock);
- return (NULL);
-}
diff --git a/usr/src/uts/common/inet/ip/ip_opt_data.c b/usr/src/uts/common/inet/ip/ip_opt_data.c
index bb6e98a99e..1c91ea667f 100644
--- a/usr/src/uts/common/inet/ip/ip_opt_data.c
+++ b/usr/src/uts/common/inet/ip/ip_opt_data.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -119,9 +119,6 @@ opdes_t ip_opt_arr[] = {
{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 /* no ifindex */ },
-{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
- sizeof (struct in_addr), 0 /* not initialized */ },
-
{ IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
sizeof (int), 0 },
@@ -199,12 +196,6 @@ opdes_t ip_opt_arr[] = {
{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 /* no ifindex */ },
-{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 /* no ifindex */ },
-
-{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 /* no ifindex */ },
-
{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
sizeof (int), 0 },
diff --git a/usr/src/uts/common/inet/ip/ip_rts.c b/usr/src/uts/common/inet/ip/ip_rts.c
index 3324d1d833..77ab2cc220 100644
--- a/usr/src/uts/common/inet/ip/ip_rts.c
+++ b/usr/src/uts/common/inet/ip/ip_rts.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -93,34 +93,52 @@ static void rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics);
static void ip_rts_request_retry(ipsq_t *, queue_t *q, mblk_t *mp, void *);
/*
- * Send the ack to all the routing queues. In case of the originating queue,
- * send it only if the loopback is set.
- *
- * Messages are sent upstream only on routing sockets that did not specify an
- * address family when they were created or when the address family matches the
- * one specified by the caller.
+ * Send `mp' to all eligible routing queues. A queue is ineligible if:
*
+ * 1. SO_USELOOPBACK is off and it is not the originating queue.
+ * 2. RTAW_UNDER_IPMP is on and RTSQ_UNDER_IPMP is clear in `flags'.
+ * 3. RTAW_UNDER_IPMP is off and RTSQ_NORMAL is clear in `flags'.
+ * 4. It is not the same address family as `af', and `af' isn't AF_UNSPEC.
*/
void
-rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst)
+rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags,
+ ip_stack_t *ipst)
{
mblk_t *mp1;
conn_t *connp, *next_connp;
+ /*
+ * Since we don't have an ill_t here, RTSQ_DEFAULT must already be
+ * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP by now.
+ */
+ ASSERT(!(flags & RTSQ_DEFAULT));
+
mutex_enter(&ipst->ips_rts_clients->connf_lock);
connp = ipst->ips_rts_clients->connf_head;
- while (connp != NULL) {
+ for (; connp != NULL; connp = next_connp) {
+ next_connp = connp->conn_next;
+
/*
* If there was a family specified when this routing socket was
* created and it doesn't match the family of the message to
* copy, then continue.
*/
if ((connp->conn_proto != AF_UNSPEC) &&
- (connp->conn_proto != af)) {
- connp = connp->conn_next;
+ (connp->conn_proto != af))
continue;
+
+ /*
+ * Queue the message only if the conn_t and flags match.
+ */
+ if (connp->conn_rtaware & RTAW_UNDER_IPMP) {
+ if (!(flags & RTSQ_UNDER_IPMP))
+ continue;
+ } else {
+ if (!(flags & RTSQ_NORMAL))
+ continue;
}
+
/*
* For the originating queue, we only copy the message upstream
* if loopback is set. For others reading on the routing
@@ -128,8 +146,8 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst)
* message.
*/
if ((o_connp == connp) && connp->conn_loopback == 0) {
- connp = connp->conn_next;
- continue;
+ connp = connp->conn_next;
+ continue;
}
CONN_INC_REF(connp);
mutex_exit(&ipst->ips_rts_clients->connf_lock);
@@ -145,10 +163,9 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst)
}
mutex_enter(&ipst->ips_rts_clients->connf_lock);
- /* Follow the next pointer before releasing the conn. */
+ /* reload next_connp since conn_next may have changed */
next_connp = connp->conn_next;
CONN_DEC_REF(connp);
- connp = next_connp;
}
mutex_exit(&ipst->ips_rts_clients->connf_lock);
freemsg(mp);
@@ -209,7 +226,7 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst)
rtm->rtm_errno = error;
else
rtm->rtm_flags |= RTF_DONE;
- rts_queue_input(mp, NULL, af, ipst);
+ rts_queue_input(mp, NULL, af, RTSQ_ALL, ipst);
}
/* ARGSUSED */
@@ -430,7 +447,7 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr)
if (index != 0) {
ill_t *ill;
-
+lookup:
/*
* IPC must be refheld somewhere in ip_wput_nondata or
* ip_wput_ioctl etc... and cleaned up if ioctl is killed.
@@ -445,16 +462,33 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr)
goto done;
}
- ipif = ipif_get_next_ipif(NULL, ill);
- ill_refrele(ill);
/*
- * If this is replacement ipif, prevent a route from
- * being added.
+ * Since all interfaces in an IPMP group must be equivalent,
+ * we prevent changes to a specific underlying interface's
+ * routing configuration. However, for backward compatibility,
+ * we interpret a request to add a route on an underlying
+ * interface as a request to add a route on its IPMP interface.
*/
- if (ipif != NULL && ipif->ipif_replace_zero) {
- error = ENETDOWN;
- goto done;
+ if (IS_UNDER_IPMP(ill)) {
+ switch (rtm->rtm_type) {
+ case RTM_CHANGE:
+ case RTM_DELETE:
+ ill_refrele(ill);
+ error = EINVAL;
+ goto done;
+ case RTM_ADD:
+ index = ipmp_ill_get_ipmp_ifindex(ill);
+ ill_refrele(ill);
+ if (index == 0) {
+ error = EINVAL;
+ goto done;
+ }
+ goto lookup;
+ }
}
+
+ ipif = ipif_get_next_ipif(NULL, ill);
+ ill_refrele(ill);
match_flags |= MATCH_IRE_ILL;
}
@@ -1037,7 +1071,7 @@ done:
/* OK ACK already set up by caller except this */
ip2dbg(("ip_rts_request: OK ACK\n"));
}
- rts_queue_input(mp, connp, af, ipst);
+ rts_queue_input(mp, connp, af, RTSQ_ALL, ipst);
}
iocp->ioc_error = error;
@@ -1724,7 +1758,7 @@ ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask,
rtm->rtm_errno = error;
rtm->rtm_flags |= RTF_DONE;
rtm->rtm_addrs = rtm_addrs;
- rts_queue_input(mp, NULL, AF_INET, ipst);
+ rts_queue_input(mp, NULL, AF_INET, RTSQ_ALL, ipst);
}
/*
@@ -1733,7 +1767,13 @@ ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask,
* Message type generated RTM_IFINFO.
*/
void
-ip_rts_ifmsg(const ipif_t *ipif)
+ip_rts_ifmsg(const ipif_t *ipif, uint_t flags)
+{
+ ip_rts_xifmsg(ipif, 0, 0, flags);
+}
+
+void
+ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags)
{
if_msghdr_t *ifm;
mblk_t *mp;
@@ -1741,12 +1781,12 @@ ip_rts_ifmsg(const ipif_t *ipif)
ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
/*
- * This message should be generated only
- * when the physical device is changing
- * state.
+ * This message should be generated only when the physical interface
+ * is changing state.
*/
if (ipif->ipif_id != 0)
return;
+
if (ipif->ipif_isv6) {
af = AF_INET6;
mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0);
@@ -1765,11 +1805,22 @@ ip_rts_ifmsg(const ipif_t *ipif)
}
ifm = (if_msghdr_t *)mp->b_rptr;
ifm->ifm_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
- ifm->ifm_flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags |
- ipif->ipif_ill->ill_phyint->phyint_flags;
+ ifm->ifm_flags = (ipif->ipif_flags | ipif->ipif_ill->ill_flags |
+ ipif->ipif_ill->ill_phyint->phyint_flags | set) & ~clear;
rts_getifdata(&ifm->ifm_data, ipif);
ifm->ifm_addrs = RTA_IFP;
- rts_queue_input(mp, NULL, af, ipst);
+
+ if (flags & RTSQ_DEFAULT) {
+ flags = RTSQ_ALL;
+ /*
+ * If this message is for an underlying interface, prevent
+ * "normal" (IPMP-unaware) routing sockets from seeing it.
+ */
+ if (IS_UNDER_IPMP(ipif->ipif_ill))
+ flags &= ~RTSQ_NORMAL;
+ }
+
+ rts_queue_input(mp, NULL, af, flags, ipst);
}
/*
@@ -1778,7 +1829,7 @@ ip_rts_ifmsg(const ipif_t *ipif)
* The structure of the code is based on the 4.4BSD-Lite2 <net/rtsock.c>.
*/
void
-ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif)
+ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags)
{
int pass;
int ncmd;
@@ -1793,6 +1844,17 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif)
af = AF_INET6;
else
af = AF_INET;
+
+ if (flags & RTSQ_DEFAULT) {
+ flags = RTSQ_ALL;
+ /*
+ * If this message is for an underlying interface, prevent
+ * "normal" (IPMP-unaware) routing sockets from seeing it.
+ */
+ if (IS_UNDER_IPMP(ipif->ipif_ill))
+ flags &= ~RTSQ_NORMAL;
+ }
+
/*
* If the request is DELETE, send RTM_DELETE and RTM_DELADDR.
* if the request is ADD, send RTM_NEWADDR and RTM_ADD.
@@ -1827,7 +1889,7 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif)
ifam->ifam_metric = ipif->ipif_metric;
ifam->ifam_flags = ((cmd == RTM_ADD) ? RTF_UP : 0);
ifam->ifam_addrs = rtm_addrs;
- rts_queue_input(mp, NULL, af, ipst);
+ rts_queue_input(mp, NULL, af, flags, ipst);
}
if ((cmd == RTM_ADD && pass == 2) ||
(cmd == RTM_DELETE && pass == 1)) {
@@ -1857,7 +1919,7 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif)
if (error == 0)
rtm->rtm_flags |= RTF_DONE;
rtm->rtm_addrs = rtm_addrs;
- rts_queue_input(mp, NULL, af, ipst);
+ rts_queue_input(mp, NULL, af, flags, ipst);
}
}
}
diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c
index 59ddb7461f..5afa70160d 100644
--- a/usr/src/uts/common/inet/ip/ipclassifier.c
+++ b/usr/src/uts/common/inet/ip/ipclassifier.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -2322,11 +2322,8 @@ ipcl_conn_cleanup(conn_t *connp)
* We should replace these pointers with ifindex/ipaddr_t to
* make the code less complex.
*/
- ASSERT(connp->conn_xmit_if_ill == NULL);
- ASSERT(connp->conn_nofailover_ill == NULL);
ASSERT(connp->conn_outgoing_ill == NULL);
ASSERT(connp->conn_incoming_ill == NULL);
- ASSERT(connp->conn_outgoing_pill == NULL);
ASSERT(connp->conn_multicast_ipif == NULL);
ASSERT(connp->conn_multicast_ill == NULL);
#endif
diff --git a/usr/src/uts/common/inet/ip/ipmp.c b/usr/src/uts/common/inet/ip/ipmp.c
new file mode 100644
index 0000000000..b8f3768834
--- /dev/null
+++ b/usr/src/uts/common/inet/ip/ipmp.c
@@ -0,0 +1,2201 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <inet/arp.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_multi.h>
+#include <inet/ip_rts.h>
+#include <inet/mi.h>
+#include <net/if_types.h>
+#include <sys/dlpi.h>
+#include <sys/kmem.h>
+#include <sys/modhash.h>
+#include <sys/sdt.h>
+#include <sys/strsun.h>
+#include <sys/sunddi.h>
+#include <sys/types.h>
+
+/*
+ * Convenience macros for getting the ip_stack_t associated with an
+ * ipmp_illgrp_t or ipmp_grp_t.
+ */
+#define IPMP_GRP_TO_IPST(grp) PHYINT_TO_IPST((grp)->gr_phyint)
+#define IPMP_ILLGRP_TO_IPST(illg) ((illg)->ig_ipmp_ill->ill_ipst)
+
+/*
+ * Assorted constants that aren't important enough to be tunable.
+ */
+#define IPMP_GRP_HASH_SIZE 64
+#define IPMP_ILL_REFRESH_TIMEOUT 120 /* seconds */
+
+/*
+ * Templates for IPMP ARP messages.
+ */
+static const arie_t ipmp_aract_template = {
+ AR_IPMP_ACTIVATE,
+ sizeof (arie_t), /* Name offset */
+ sizeof (arie_t) /* Name length (set by ill_arp_alloc) */
+};
+
+static const arie_t ipmp_ardeact_template = {
+ AR_IPMP_DEACTIVATE,
+ sizeof (arie_t), /* Name offset */
+ sizeof (arie_t) /* Name length (set by ill_arp_alloc) */
+};
+
+/*
+ * IPMP meta-interface kstats (based on those in PSARC/1997/198).
+ */
+static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = {
+ { "obytes", KSTAT_DATA_UINT32 },
+ { "obytes64", KSTAT_DATA_UINT64 },
+ { "rbytes", KSTAT_DATA_UINT32 },
+ { "rbytes64", KSTAT_DATA_UINT64 },
+ { "opackets", KSTAT_DATA_UINT32 },
+ { "opackets64", KSTAT_DATA_UINT64 },
+ { "oerrors", KSTAT_DATA_UINT32 },
+ { "ipackets", KSTAT_DATA_UINT32 },
+ { "ipackets64", KSTAT_DATA_UINT64 },
+ { "ierrors", KSTAT_DATA_UINT32 },
+ { "multircv", KSTAT_DATA_UINT32 },
+ { "multixmt", KSTAT_DATA_UINT32 },
+ { "brdcstrcv", KSTAT_DATA_UINT32 },
+ { "brdcstxmt", KSTAT_DATA_UINT32 },
+ { "link_up", KSTAT_DATA_UINT32 }
+};
+
+static void ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t);
+static int ipmp_grp_create_kstats(ipmp_grp_t *);
+static int ipmp_grp_update_kstats(kstat_t *, int);
+static void ipmp_grp_destroy_kstats(ipmp_grp_t *);
+static ill_t *ipmp_illgrp_min_ill(ipmp_illgrp_t *);
+static ill_t *ipmp_illgrp_max_ill(ipmp_illgrp_t *);
+static void ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
+static void ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t);
+static boolean_t ipmp_ill_activate(ill_t *);
+static void ipmp_ill_deactivate(ill_t *);
+static void ipmp_ill_ire_mark_testhidden(ire_t *, char *);
+static void ipmp_ill_ire_clear_testhidden(ire_t *, char *);
+static void ipmp_ill_refresh_active_timer_start(ill_t *);
+static void ipmp_ill_rtsaddrmsg(ill_t *, int);
+static void ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action);
+static ipif_t *ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t);
+static void ipmp_phyint_get_kstats(phyint_t *, uint64_t *);
+static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *);
+
+/*
+ * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init().
+ *
+ * Two pieces of per-stack state are set up: the group hash
+ * (ips_ipmp_grp_hash, IPMP_GRP_HASH_SIZE chains, string keys hashed with
+ * mod_hash_bystr and compared with mod_hash_strkey_cmp) and the
+ * ips_ipmp_lock reader/writer lock. The hash is created with the null key
+ * and value destructors, so keys and values placed in it are not freed by
+ * the hash itself. The hash is created with KM_SLEEP.
+ */
+void
+ipmp_init(ip_stack_t *ipst)
+{
+ ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash",
+ IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
+ mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
+ rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0);
+}
+
+/*
+ * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini().
+ *
+ * Undoes ipmp_init(): destroys the group hash (ips_ipmp_grp_hash) and then
+ * the ips_ipmp_lock reader/writer lock.
+ */
+void
+ipmp_destroy(ip_stack_t *ipst)
+{
+ mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash);
+ rw_destroy(&ipst->ips_ipmp_lock);
+}
+
+/*
+ * Create an IPMP group named `grname', associate it with IPMP phyint `phyi',
+ * and add it to the hash. On success, return a pointer to the created group.
+ * Caller must ensure `grname' is not yet in the hash. Assumes that the IPMP
+ * meta-interface associated with the group also has the same name (but they
+ * may differ later via ipmp_grp_rename()).
+ *
+ * Caller must hold ips_ipmp_lock as writer. Returns NULL if the KM_NOSLEEP
+ * allocation, the kstat creation, or the hash-entry reservation fails; in
+ * each of those cases everything set up so far (the group structure and, if
+ * already created, its kstats) is torn down again before returning.
+ */
+ipmp_grp_t *
+ipmp_grp_create(const char *grname, phyint_t *phyi)
+{
+ ipmp_grp_t *grp;
+ ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
+ mod_hash_hndl_t mh;
+
+ ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+ if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL)
+ return (NULL);
+
+ (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
+ (void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname));
+
+ /*
+ * Cache the group's phyint. This is safe since a phyint_t will
+ * outlive its ipmp_grp_t.
+ */
+ grp->gr_phyint = phyi;
+
+ /*
+ * Create IPMP group kstats. This must follow the gr_ifname and
+ * gr_phyint assignments above, since ipmp_grp_create_kstats() reads
+ * both (the latter through IPMP_GRP_TO_IPST()).
+ */
+ if (ipmp_grp_create_kstats(grp) != 0) {
+ kmem_free(grp, sizeof (ipmp_grp_t));
+ return (NULL);
+ }
+
+ /*
+ * Insert the group into the hash. The entry is reserved first (and
+ * without sleeping) so that the only step that can fail happens
+ * before ipmp_grp_insert(), which panics on failure.
+ */
+ if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) {
+ ipmp_grp_destroy_kstats(grp);
+ kmem_free(grp, sizeof (ipmp_grp_t));
+ return (NULL);
+ }
+ ipmp_grp_insert(grp, mh);
+
+ return (grp);
+}
+
+/*
+ * Create IPMP kstat structures for `grp'. Return an errno upon failure.
+ *
+ * The kstat is a named kstat with IPMP_KSTAT_MAX entries, created in module
+ * "ipmp", instance 0, class "net", under the name grp->gr_ifname and in the
+ * netstack that `grp' belongs to. Its data area is seeded from the
+ * ipmp_kstats[] template, its update callback is ipmp_grp_update_kstats()
+ * and its private pointer is `grp'. The only failure reported is ENOMEM,
+ * when kstat_create_netstack() returns NULL; otherwise the kstat is
+ * installed, remembered in grp->gr_ksp, and 0 is returned.
+ */
+static int
+ipmp_grp_create_kstats(ipmp_grp_t *grp)
+{
+ kstat_t *ksp;
+ netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
+
+ ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net",
+ KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id);
+ if (ksp == NULL)
+ return (ENOMEM);
+
+ ksp->ks_update = ipmp_grp_update_kstats;
+ ksp->ks_private = grp;
+ bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats));
+
+ kstat_install(ksp);
+ grp->gr_ksp = ksp;
+ return (0);
+}
+
+/*
+ * Update the IPMP kstats tracked by `ksp'; called by the kstats framework.
+ *
+ * The kstats are read-only: a `rw' of KSTAT_WRITE is rejected with EACCES.
+ * Otherwise every named entry is rebuilt from scratch as the group's
+ * gr_kstats0[] value plus, for each phyint reached by walking ipsq_next
+ * from the group phyint's IPSQ until arriving back at it, that phyint's
+ * current value (from ipmp_phyint_get_kstats()) minus its phyint_kstats0[]
+ * value. The "link_up" entry is then overwritten with 1 or 0 according to
+ * whether PHYI_RUNNING is set on the group's phyint. Returns 0.
+ */
+static int
+ipmp_grp_update_kstats(kstat_t *ksp, int rw)
+{
+ uint_t i;
+ kstat_named_t *kn = KSTAT_NAMED_PTR(ksp);
+ ipmp_grp_t *grp = ksp->ks_private;
+ ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+ ipsq_t *ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq;
+ phyint_t *phyi;
+ uint64_t phyi_kstats[IPMP_KSTAT_MAX];
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ /*
+ * Start with the group's baseline values. Entries are either 32-bit
+ * or 64-bit as declared in the kstat's data_type; nothing else is
+ * expected (see the ASSERT).
+ */
+ for (i = 0; i < IPMP_KSTAT_MAX; i++) {
+ if (kn[i].data_type == KSTAT_DATA_UINT32) {
+ kn[i].value.ui32 = grp->gr_kstats0[i];
+ } else {
+ ASSERT(kn[i].data_type == KSTAT_DATA_UINT64);
+ kn[i].value.ui64 = grp->gr_kstats0[i];
+ }
+ }
+
+ /*
+ * Add in the stats of each phyint currently in the group. Since we
+ * don't directly track the phyints in a group, we cheat by walking
+ * the IPSQ set under ill_g_lock. (The IPSQ list cannot change while
+ * ill_g_lock is held.)
+ */
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ ipsq = grp_ipsq->ipsq_next;
+ for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) {
+ phyi = ipsq->ipsq_phyint;
+
+ /*
+ * If a phyint in a group is being unplumbed, it's possible
+ * that ill_glist_delete() -> phyint_free() already freed the
+ * phyint (and set ipsq_phyint to NULL), but the unplumb
+ * operation has yet to complete (and thus ipsq_dq() has yet
+ * to remove the phyint's IPSQ from the group IPSQ's phyint
+ * list). We skip those phyints here (note that their kstats
+ * have already been added to gr_kstats0[]).
+ */
+ if (phyi == NULL)
+ continue;
+
+ ipmp_phyint_get_kstats(phyi, phyi_kstats);
+
+ for (i = 0; i < IPMP_KSTAT_MAX; i++) {
+ phyi_kstats[i] -= phyi->phyint_kstats0[i]; /* delta only */
+ if (kn[i].data_type == KSTAT_DATA_UINT32)
+ kn[i].value.ui32 += phyi_kstats[i];
+ else
+ kn[i].value.ui64 += phyi_kstats[i];
+ }
+ }
+
+ kn[IPMP_KSTAT_LINK_UP].value.ui32 =
+ (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0;
+
+ rw_exit(&ipst->ips_ill_g_lock);
+ return (0);
+}
+
+/*
+ * Destroy IPMP kstat structures for `grp'.
+ *
+ * Deletes grp->gr_ksp from the netstack that `grp' belongs to, zeroes the
+ * gr_kstats0[] array, and clears gr_ksp so that no stale kstat pointer is
+ * left in the group.
+ */
+static void
+ipmp_grp_destroy_kstats(ipmp_grp_t *grp)
+{
+ netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
+
+ kstat_delete_netstack(grp->gr_ksp, id);
+ bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0));
+ grp->gr_ksp = NULL;
+}
+
+/*
+ * Look up an IPMP group named `grname' on IP stack `ipst'. Return NULL if it
+ * does not exist.
+ *
+ * Caller must hold ips_ipmp_lock (the ASSERT accepts either a read or a
+ * write hold). The lookup goes through ips_ipmp_grp_hash with `grname' as
+ * the key; no reference is taken on the group that is returned here.
+ */
+ipmp_grp_t *
+ipmp_grp_lookup(const char *grname, ip_stack_t *ipst)
+{
+ ipmp_grp_t *grp;
+
+ ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
+
+ if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
+ (mod_hash_val_t *)&grp) == 0)
+ return (grp);
+
+ return (NULL);
+}
+
+/*
+ * Place information about group `grp' into `lifgr'.
+ *
+ * Caller must hold ips_ipmp_lock (read or write). The fields filled in are:
+ *
+ * gi_v4, gi_v6 whether gr_v4 / gr_v6 is non-NULL.
+ * gi_nv4, gi_nv6 gr_nv4 + gr_pendv4 and gr_nv6 + gr_pendv6, i.e. the
+ * pending counts are included.
+ * gi_mactype gr_mactype if the group has at least one interface
+ * (gr_nif > 0), else SUNW_DL_IPMP.
+ * gi_grifname gr_ifname.
+ * gi_m4ifname, the name of gr_v4's ig_cast_ill, when there is one;
+ * gi_bcifname the same ill name is reported for both.
+ * gi_m6ifname the name of gr_v6's ig_cast_ill, when there is one.
+ *
+ * The three *ifname fields above are left as empty strings when there is no
+ * corresponding ig_cast_ill.
+ */
+void
+ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr)
+{
+ ill_t *ill;
+ ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+
+ ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
+
+ lifgr->gi_v4 = (grp->gr_v4 != NULL);
+ lifgr->gi_v6 = (grp->gr_v6 != NULL);
+ lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4;
+ lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6;
+ lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP;
+ (void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ);
+ lifgr->gi_m4ifname[0] = '\0';
+ lifgr->gi_m6ifname[0] = '\0';
+ lifgr->gi_bcifname[0] = '\0';
+
+ if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) {
+ (void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ);
+ (void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ);
+ }
+
+ if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL)
+ (void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ);
+}
+
+/*
+ * Add `grp' to the group hash, consuming the pre-reserved hash entry `mh'.
+ * `grp' must not already be in the hash, and the caller must hold
+ * ips_ipmp_lock as writer. Panics if the insertion fails.
+ */
+static void
+ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh)
+{
+	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);
+	int		err;
+
+	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+	/*
+	 * The key is grp->gr_name itself rather than a copy: the name lives
+	 * at least as long as `grp' remains in the hash.
+	 */
+	err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash,
+	    (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh);
+	if (err == 0)
+		return;
+
+	/*
+	 * Not expected to be reachable, since `mh' was reserved up front.
+	 */
+	panic("cannot insert IPMP group \"%s\" (err %d)",
+	    grp->gr_name, err);
+}
+
+/*
+ * Take `grp' out of the group hash. `grp' must currently be in the hash and
+ * the caller must hold ips_ipmp_lock as writer. Panics if the removal fails
+ * or if the entry stored under the group's name is not `grp' itself.
+ */
+static void
+ipmp_grp_remove(ipmp_grp_t *grp)
+{
+	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);
+	mod_hash_val_t	removed;
+	int		err;
+
+	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+	err = mod_hash_remove(ipst->ips_ipmp_grp_hash,
+	    (mod_hash_key_t)grp->gr_name, &removed);
+	if (err == 0 && removed == grp)
+		return;
+
+	panic("cannot remove IPMP group \"%s\" (err %d)",
+	    grp->gr_name, err);
+}
+
+/*
+ * Attempt to rename `grp' to new name `grname'. Return an errno if the new
+ * group name already exists or is invalid, or if there isn't enough memory:
+ *
+ *	EINVAL	`grname' is empty or does not fit in grp->gr_name
+ *	EEXIST	the hash lookup of `grname' did not report "not found"
+ *	ENOMEM	a hash entry could not be reserved for the re-insertion
+ *
+ * Caller must hold ips_ipmp_lock as writer.
+ */
+int
+ipmp_grp_rename(ipmp_grp_t *grp, const char *grname)
+{
+	mod_hash_hndl_t	mh;
+	mod_hash_val_t	existing;
+	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);
+
+	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+	if (grname[0] == '\0')
+		return (EINVAL);
+
+	/*
+	 * Reject names that would be truncated by the strlcpy() below: the
+	 * collision check is done on the full name, so a truncated name
+	 * could collide with an existing group at re-insertion time.
+	 */
+	if (strlen(grname) >= sizeof (grp->gr_name))
+		return (EINVAL);
+
+	/*
+	 * Look the new name up into a scratch variable rather than into
+	 * `grp' itself, so that the group being renamed can never be
+	 * clobbered by the lookup regardless of what it stores on a miss.
+	 */
+	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
+	    &existing) != MH_ERR_NOTFOUND)
+		return (EEXIST);
+
+	/*
+	 * Before we remove the group from the hash, ensure we'll be able to
+	 * re-insert it by reserving space.
+	 */
+	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0)
+		return (ENOMEM);
+
+	/*
+	 * gr_name doubles as the hash key, so the group must be out of the
+	 * hash while the name is rewritten.
+	 */
+	ipmp_grp_remove(grp);
+	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
+	ipmp_grp_insert(grp, mh);
+
+	return (0);
+}
+
+/*
+ * Remove `grp' from the hash, destroy its kstats, and free it. `grp' must be
+ * in the hash with no interfaces left on it, and the caller must hold
+ * ips_ipmp_lock as writer.
+ */
+void
+ipmp_grp_destroy(ipmp_grp_t *grp)
+{
+	ASSERT(RW_WRITE_HELD(&IPMP_GRP_TO_IPST(grp)->ips_ipmp_lock));
+
+	/*
+	 * Destroying a group that still has interfaces would leave them
+	 * pointing at freed memory, so panic right away instead.
+	 */
+	if (grp->gr_nif != 0) {
+		panic("cannot destroy IPMP group \"%s\": in use",
+		    grp->gr_name);
+	}
+
+	ipmp_grp_remove(grp);
+	ipmp_grp_destroy_kstats(grp);
+
+	/* Everything else hanging off the group must already be gone. */
+	ASSERT(grp->gr_v4 == NULL);
+	ASSERT(grp->gr_v6 == NULL);
+	ASSERT(grp->gr_nv4 == 0);
+	ASSERT(grp->gr_nv6 == 0);
+	ASSERT(grp->gr_nactif == 0);
+	ASSERT(grp->gr_linkdownmp == NULL);
+
+	grp->gr_phyint = NULL;
+	kmem_free(grp, sizeof (ipmp_grp_t));
+}
+
+/*
+ * Decide whether `ill' may be placed into `grp'. Returns 0 if so, otherwise
+ * an errno identifying the first failed check. NOTE: ifconfig interprets
+ * many of these errno values, takes corrective action, and retries the
+ * SIOCSLIFGROUPNAME, so neither the values nor the order of the checks
+ * should be changed casually.
+ */
+static int
+ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill)
+{
+	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
+
+	/*
+	 * Every address that had been brought up (including those currently
+	 * down because of DAD) must be brought down before the interface
+	 * joins a group. This avoids complicated in-kernel address migration
+	 * and forces the kernel's all-hosts multicast memberships to be
+	 * discarded. After the join, bringing the addresses back up moves
+	 * them to the IPMP meta-interface. This check precedes the
+	 * ill_appaddr_cnt() check because bringing down the link-local
+	 * causes in.ndpd to remove its ADDRCONF'd addresses.
+	 */
+	if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) {
+		return (EADDRINUSE);
+	}
+
+	/*
+	 * Addresses under application control must be released first, so
+	 * that applications are not confused by changes made beneath them.
+	 */
+	if (ill_appaddr_cnt(ill) != 0) {
+		return (EADDRNOTAVAIL);
+	}
+
+	/*
+	 * PTP addresses do not share the same broadcast domain and so are
+	 * not permitted in an IPMP group.
+	 */
+	if (ill_ptpaddr_cnt(ill) != 0) {
+		return (EINVAL);
+	}
+
+	/*
+	 * Multicast support is mandatory.
+	 */
+	if ((ill->ill_flags & ILLF_MULTICAST) == 0) {
+		return (ENOTSUP);
+	}
+
+	/*
+	 * Address resolution must be strictly ARP and/or ND.
+	 */
+	if ((ill->ill_flags & (ILLF_NONUD | ILLF_NOARP | ILLF_XRESOLV)) != 0) {
+		return (ENOTSUP);
+	}
+
+	/*
+	 * Membership in a usesrc group is incompatible with IPMP. (usesrc
+	 * makes all of its modifications as writer too, so there is no need
+	 * to take ill_g_usesrc_lock here.)
+	 */
+	if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill)) {
+		return (ENOTSUP);
+	}
+
+	/*
+	 * Once a group has interfaces, newcomers must match its mactype.
+	 */
+	if (grp->gr_nif > 0) {
+		if (grp->gr_mactype != ill->ill_mactype)
+			return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * Decide whether `phyi' may be placed into `grp' by vetting each ill that is
+ * plumbed on it (IPv4 first, then IPv6). Returns 0 if suitable, otherwise an
+ * errno; the errno caveats described above ipmp_grp_vet_ill() apply here too.
+ */
+int
+ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi)
+{
+	int		err;
+	ill_t		*v4ill = phyi->phyint_illv4;
+	ill_t		*v6ill = phyi->phyint_illv6;
+	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);
+
+	ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq));
+	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
+
+	/*
+	 * Each address family plumbed on the interface must also be
+	 * configured in the group.
+	 */
+	if ((v4ill != NULL && grp->gr_v4 == NULL) ||
+	    (v6ill != NULL && grp->gr_v6 == NULL))
+		return (EAFNOSUPPORT);
+
+	if (v4ill != NULL) {
+		err = ipmp_grp_vet_ill(grp, v4ill);
+		if (err != 0)
+			return (err);
+	}
+
+	if (v6ill != NULL)
+		return (ipmp_grp_vet_ill(grp, v6ill));
+
+	return (0);
+}
+
+/*
+ * Allocate and initialize an illgrp for IPMP meta-interface `ill', attach it
+ * to `ill', and set its MTU to the minimum for the ill's address family.
+ * Returns the new illgrp, or NULL if memory could not be allocated.
+ */
+ipmp_illgrp_t *
+ipmp_illgrp_create(ill_t *ill)
+{
+	ipmp_illgrp_t	*illg;
+	uint_t		mtu;
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(IS_IPMP(ill));
+	ASSERT(ill->ill_grp == NULL);
+
+	illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP);
+	if (illg == NULL)
+		return (NULL);
+
+	/* Member list, active-member list, and tracked ARP entries. */
+	list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode));
+	list_create(&illg->ig_actif, sizeof (ill_t),
+	    offsetof(ill_t, ill_actnode));
+	list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t),
+	    offsetof(ipmp_arpent_t, ia_node));
+
+	/* Cross-link the illgrp and its IPMP meta-interface. */
+	ill->ill_grp = illg;
+	illg->ig_ipmp_ill = ill;
+
+	if (ill->ill_isv6)
+		mtu = IPV6_MIN_MTU;
+	else
+		mtu = IP_MIN_MTU;
+	ipmp_illgrp_set_mtu(illg, mtu);
+
+	return (illg);
+}
+
+/*
+ * Detach illgrp `illg' from its IPMP meta-interface and free it. The illgrp
+ * must already be empty (no members, no nominations, no ARP entries).
+ */
+void
+ipmp_illgrp_destroy(ipmp_illgrp_t *illg)
+{
+	ill_t *ipmp_ill = illg->ig_ipmp_ill;
+
+	ASSERT(IAM_WRITER_ILL(ipmp_ill));
+	ASSERT(IS_IPMP(ipmp_ill));
+
+	/*
+	 * Nothing may still be referring to, or be tracked by, `illg'.
+	 */
+	ASSERT(illg->ig_next_ill == NULL);
+	ASSERT(illg->ig_cast_ill == NULL);
+	ASSERT(list_is_empty(&illg->ig_arpent));
+	ASSERT(list_is_empty(&illg->ig_if));
+	ASSERT(list_is_empty(&illg->ig_actif));
+	ASSERT(illg->ig_nactif == 0);
+
+	/*
+	 * Sever the link in both directions, then release the lists and
+	 * the illgrp itself.
+	 */
+	ipmp_ill->ill_grp = NULL;
+	illg->ig_ipmp_ill = NULL;
+
+	list_destroy(&illg->ig_arpent);
+	list_destroy(&illg->ig_actif);
+	list_destroy(&illg->ig_if);
+
+	kmem_free(illg, sizeof (ipmp_illgrp_t));
+}
+
+/*
+ * Make data address `ipif' usable on `illg' and try to bind it to the active
+ * underlying ill that currently has the fewest addresses, so the addresses
+ * stay evenly spread. Returns the ill `ipif' ended up bound to, or NULL if
+ * it is not bound.
+ */
+ill_t *
+ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
+{
+	ill_t		*targetill;
+	ipmp_arpent_t	*staleent;
+
+	ASSERT(IAM_WRITER_IPIF(ipif));
+	ASSERT(ipmp_ipif_is_dataaddr(ipif));
+
+	/*
+	 * IP manages the mappings for IPMP data addresses itself, so any
+	 * ARP entry already tracked for this (IPv4) address is discarded.
+	 */
+	if (!ipif->ipif_isv6) {
+		staleent = ipmp_illgrp_lookup_arpent(illg,
+		    &ipif->ipif_lcl_addr);
+		if (staleent != NULL)
+			ipmp_illgrp_destroy_arpent(illg, staleent);
+	}
+
+	targetill = ipmp_illgrp_min_ill(illg);
+	if (targetill != NULL)
+		ipmp_ill_bind_ipif(targetill, ipif, Res_act_none);
+
+	if (!ipif->ipif_bound)
+		return (NULL);
+
+	return (ipif->ipif_bound_ill);
+}
+
+/*
+ * Withdraw `ipif' from the usable data addresses on `illg'. If `ipif' was
+ * bound, it is unbound; then, if the busiest active ill now carries more
+ * than one address more than the ill that just lost `ipif', one address is
+ * moved across to keep the distribution even.
+ */
+void
+ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
+{
+	ill_t	*boundill = ipif->ipif_bound_ill;
+	ill_t	*busiest;
+	ipif_t	*movedipif;
+
+	ASSERT(IAM_WRITER_IPIF(ipif));
+
+	if (boundill == NULL)
+		return;
+
+	(void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE);
+
+	busiest = ipmp_illgrp_max_ill(illg);
+	if (busiest->ill_bound_cnt > boundill->ill_bound_cnt + 1) {
+		movedipif = ipmp_ill_unbind_ipif(busiest, NULL, B_TRUE);
+		ipmp_ill_bind_ipif(boundill, movedipif, Res_act_rebind);
+	}
+}
+
+/*
+ * Find the active ill in `illg' with the highest ill_bound_cnt. The first
+ * such ill in list order wins ties. Returns NULL if no ill is active.
+ */
+static ill_t *
+ipmp_illgrp_max_ill(ipmp_illgrp_t *illg)
+{
+	ill_t	*cur;
+	ill_t	*maxill = NULL;
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+	for (cur = list_head(&illg->ig_actif); cur != NULL;
+	    cur = list_next(&illg->ig_actif, cur)) {
+		if (maxill != NULL &&
+		    cur->ill_bound_cnt <= maxill->ill_bound_cnt)
+			continue;
+		maxill = cur;
+	}
+
+	return (maxill);
+}
+
+/*
+ * Find the active ill in `illg' with the lowest ill_bound_cnt. The first
+ * such ill in list order wins ties. Returns NULL if no ill is active.
+ */
+static ill_t *
+ipmp_illgrp_min_ill(ipmp_illgrp_t *illg)
+{
+	ill_t	*cur;
+	ill_t	*minill = NULL;
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+	for (cur = list_head(&illg->ig_actif); cur != NULL;
+	    cur = list_next(&illg->ig_actif, cur)) {
+		if (minill != NULL &&
+		    cur->ill_bound_cnt >= minill->ill_bound_cnt)
+			continue;
+
+		/* An ill with nothing bound cannot be beaten; stop early. */
+		if (cur->ill_bound_cnt == 0)
+			return (cur);
+
+		minill = cur;
+	}
+
+	return (minill);
+}
+
+/*
+ * Return the IPMP meta-interface that `illg' (which must exist) belongs to.
+ * No locking is needed because ig_ipmp_ill never changes for a given illg.
+ */
+ill_t *
+ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg)
+{
+	ill_t *ipmp_ill = illg->ig_ipmp_ill;
+
+	return (ipmp_ill);
+}
+
+/*
+ * Return the underlying ill that is next in line on `illg' and advance the
+ * rotation to the following active ill (wrapping to the head of the active
+ * list). Returns NULL if there is no next ill. Caller must be inside the
+ * IPSQ.
+ */
+ill_t *
+ipmp_illgrp_next_ill(ipmp_illgrp_t *illg)
+{
+	ill_t		*curill, *followill;
+	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+	curill = illg->ig_next_ill;
+	if (curill != NULL) {
+		followill = list_next(&illg->ig_actif, curill);
+		if (followill == NULL)
+			followill = list_head(&illg->ig_actif);
+		illg->ig_next_ill = followill;
+	}
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	return (curill);
+}
+
+/*
+ * Like ipmp_illgrp_next_ill(), but usable from outside the IPSQ: walk the
+ * rotation at most once around the active ills of `illg', and return the
+ * first one that passes ILL_CAN_LOOKUP() with a reference held on it.
+ * Returns NULL if no such ill is found.
+ */
+ill_t *
+ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
+{
+	ill_t		*curill, *followill;
+	ill_t		*heldill = NULL;
+	uint_t		tries;
+	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+	for (tries = 0; tries < illg->ig_nactif; tries++) {
+		/* Advance the rotation whether or not this ill is usable. */
+		curill = illg->ig_next_ill;
+		followill = list_next(&illg->ig_actif, curill);
+		if (followill == NULL)
+			followill = list_head(&illg->ig_actif);
+		illg->ig_next_ill = followill;
+
+		if (ILL_CAN_LOOKUP(curill)) {
+			ill_refhold(curill);
+			heldill = curill;
+			break;
+		}
+	}
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	return (heldill);
+}
+
+/*
+ * Return the ill nominated for multicast on `illg', or NULL if there is no
+ * nomination or `illg' itself is NULL. Caller must be inside the IPSQ.
+ */
+ill_t *
+ipmp_illgrp_cast_ill(ipmp_illgrp_t *illg)
+{
+	/*
+	 * A NULL `illg' is tolerated because an IPMP ill's ill_grp is
+	 * cleared during I_PUNLINK, yet this function can still be called
+	 * after that point.
+	 */
+	if (illg != NULL) {
+		ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+		return (illg->ig_cast_ill);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Return the ill nominated for multicast on `illg' with a reference held on
+ * it. Returns NULL if there is no nomination or the nominated ill does not
+ * pass ILL_CAN_LOOKUP(). Caller need not be inside the IPSQ.
+ */
+ill_t *
+ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
+{
+	ill_t		*heldill;
+	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+	heldill = illg->ig_cast_ill;
+	if (heldill != NULL) {
+		if (ILL_CAN_LOOKUP(heldill))
+			ill_refhold(heldill);
+		else
+			heldill = NULL;
+	}
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	return (heldill);
+}
+
+/*
+ * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL,
+ * any existing nomination is removed. Caller must be inside the IPSQ.
+ *
+ * The steps run in a fixed order: the old nominee (if any) has ill_nom_cast
+ * cleared and, if the IPMP ill's ill_dl_up is set, the IPMP ill leaves
+ * multicast; ig_cast_ill is then switched while holding ips_ipmp_lock as
+ * writer; IRE_CACHE entries matching the old nominee are walked with
+ * ill_stq_cache_delete(); the new nominee (if any) has ill_nom_cast set and,
+ * if ill_dl_up is set, multicast is recovered on the IPMP ill; finally, for
+ * an IPv4 IPMP ill, ill_refresh_bcast() is called on it.
+ */
+static void
+ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
+{
+ ill_t *ocastill = illg->ig_cast_ill;
+ ill_t *ipmp_ill = illg->ig_ipmp_ill;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(IAM_WRITER_ILL(ipmp_ill));
+
+ /*
+ * Disable old nominated ill (if any).
+ */
+ if (ocastill != NULL) {
+ DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *,
+ illg, ill_t *, ocastill);
+ ASSERT(ocastill->ill_nom_cast);
+ ocastill->ill_nom_cast = B_FALSE;
+ /*
+ * If the IPMP meta-interface is down, we never did the join,
+ * so we must not try to leave.
+ */
+ if (ipmp_ill->ill_dl_up)
+ ill_leave_multicast(ipmp_ill);
+ }
+
+ /*
+ * Set new nomination. The write lock pairs with the lock taken
+ * around the read of ig_cast_ill in ipmp_illgrp_hold_cast_ill().
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ illg->ig_cast_ill = castill;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ if (ocastill != NULL) {
+ /*
+ * Delete any IREs tied to the old nomination. We must do
+ * this after the new castill is set and has reached global
+ * visibility since the datapath has not been quiesced.
+ */
+ ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
+ ill_stq_cache_delete, ocastill, ocastill);
+ }
+
+ /*
+ * Enable new nominated ill (if any).
+ */
+ if (castill != NULL) {
+ DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *,
+ illg, ill_t *, castill);
+ ASSERT(!castill->ill_nom_cast);
+ castill->ill_nom_cast = B_TRUE;
+ /*
+ * If the IPMP meta-interface is down, the attempt to recover
+ * will silently fail but ill_need_recover_multicast will be
+ * erroneously cleared -- so check first.
+ */
+ if (ipmp_ill->ill_dl_up)
+ ill_recover_multicast(ipmp_ill);
+ }
+
+ /*
+ * For IPv4, refresh our broadcast IREs. This needs to be done even
+ * if there's no new nomination since ill_refresh_bcast() still must
+ * update the IPMP meta-interface's broadcast IREs to point back at
+ * the IPMP meta-interface itself.
+ */
+ if (!ipmp_ill->ill_isv6)
+ ill_refresh_bcast(ipmp_ill);
+}
+
+/*
+ * Create an IPMP ARP entry and add it to the set tracked on `illg'. If an
+ * entry for the same IP address already exists, destroy it first. Return the
+ * created IPMP ARP entry, or NULL on failure.
+ *
+ * `mp' holds an area_t describing the mapping; it is copied (and the copy
+ * retyped to M_PROTO), so the caller keeps ownership of `mp'. `proxyarp' is
+ * recorded in the entry's ia_proxyarp field. Failure means that memory
+ * could not be allocated or that the protocol address described by the
+ * area_t does not lie within the message.
+ */
+ipmp_arpent_t *
+ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, mblk_t *mp, boolean_t proxyarp)
+{
+	uchar_t		*addrp;
+	area_t		*area = (area_t *)mp->b_rptr;
+	mblk_t		*area_mp;
+	ipmp_arpent_t	*entp, *oentp;
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+	ASSERT(area->area_proto_addr_length == sizeof (ipaddr_t));
+
+	if ((entp = kmem_zalloc(sizeof (ipmp_arpent_t), KM_NOSLEEP)) == NULL)
+		return (NULL);
+
+	if ((area_mp = copyb(mp)) == NULL) {
+		kmem_free(entp, sizeof (ipmp_arpent_t));
+		return (NULL);
+	}
+
+	DB_TYPE(area_mp) = M_PROTO;
+
+	/*
+	 * mi_offset_paramc() yields NULL when the requested range is not
+	 * inside the message; treat that as a failure instead of handing a
+	 * NULL source pointer to bcopy().
+	 */
+	addrp = mi_offset_paramc(area_mp, area->area_proto_addr_offset,
+	    sizeof (ipaddr_t));
+	if (addrp == NULL) {
+		freeb(area_mp);
+		kmem_free(entp, sizeof (ipmp_arpent_t));
+		return (NULL);
+	}
+
+	entp->ia_area_mp = area_mp;
+	entp->ia_proxyarp = proxyarp;
+	bcopy(addrp, &entp->ia_ipaddr, sizeof (ipaddr_t));
+
+	/* At most one entry is tracked per IP address; the newest wins. */
+	if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL)
+		ipmp_illgrp_destroy_arpent(illg, oentp);
+
+	list_insert_head(&illg->ig_arpent, entp);
+	return (entp);
+}
+
+/*
+ * Unlink IPMP ARP entry `entp' from the set tracked on `illg', then free
+ * both its saved message and the entry itself.
+ */
+void
+ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
+{
+	mblk_t *area_mp;
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+	list_remove(&illg->ig_arpent, entp);
+
+	area_mp = entp->ia_area_mp;
+	kmem_free(entp, sizeof (ipmp_arpent_t));
+	freeb(area_mp);
+}
+
+/*
+ * Record on `entp' that ARP has been told about its IP address. `illg' is
+ * not used by the function itself; it is accepted only so that it is
+ * available to DTrace FBT probes as a debugging aid.
+ */
+/* ARGSUSED */
+void
+ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
+{
+	entp->ia_notified = B_TRUE;
+}
+
+/*
+ * Find the IPMP ARP entry on `illg' whose IP address equals `*addrp'. A NULL
+ * `addrp' matches anything, in which case the first tracked entry is
+ * returned. Returns NULL if there is no matching entry.
+ */
+ipmp_arpent_t *
+ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp)
+{
+	ipmp_arpent_t *cur;
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+	cur = list_head(&illg->ig_arpent);
+	if (addrp == NULL)
+		return (cur);
+
+	while (cur != NULL && cur->ia_ipaddr != *addrp)
+		cur = list_next(&illg->ig_arpent, cur);
+
+	return (cur);
+}
+
+/*
+ * Refresh ARP entries on `illg' to be distributed across its active
+ * interfaces. Entries that cannot be refreshed (e.g., because there are no
+ * active interfaces) are marked so that subsequent calls can try again.
+ *
+ * Active ills are assigned to entries round-robin: `ill' starts at the head
+ * of ig_actif and advances (wrapping back to the head) only after a copy of
+ * an entry's message has been passed up the IPMP ill's read queue. Proxy
+ * ARP entries are skipped when they have already been notified and their
+ * stored hardware address already equals the chosen ill's ill_phys_addr;
+ * otherwise that address is copied into the stored message first. An entry
+ * is left with ia_notified cleared when there is no active ill, when the
+ * IPMP ill has no ipifs up, or when copyb() fails. IPv4 only.
+ */
+void
+ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
+{
+ ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill;
+ uint_t paddrlen = ipmp_ill->ill_phys_addr_length;
+ area_t *area;
+ mblk_t *area_mp;
+ uchar_t *physaddr;
+ ipmp_arpent_t *entp;
+
+ ASSERT(IAM_WRITER_ILL(ipmp_ill));
+ ASSERT(!ipmp_ill->ill_isv6);
+
+ ill = list_head(&illg->ig_actif);
+ entp = list_head(&illg->ig_arpent);
+ for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) {
+ if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) {
+ entp->ia_notified = B_FALSE;
+ continue;
+ }
+
+ area = (area_t *)entp->ia_area_mp->b_rptr;
+ ASSERT(paddrlen == ill->ill_phys_addr_length);
+ ASSERT(paddrlen == area->area_hw_addr_length);
+ physaddr = mi_offset_paramc(entp->ia_area_mp,
+ area->area_hw_addr_offset, paddrlen);
+
+ /*
+ * If this is a proxy ARP entry, we can skip notifying ARP if
+ * the entry is already up-to-date. If it has changed, we
+ * update the entry's hardware address before notifying ARP.
+ * NOTE(review): `physaddr' is used without a NULL check;
+ * presumably the stored message is always well-formed --
+ * confirm against the callers that create entries.
+ */
+ if (entp->ia_proxyarp) {
+ if (bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0 &&
+ entp->ia_notified)
+ continue;
+ bcopy(ill->ill_phys_addr, physaddr, paddrlen);
+ }
+
+ if ((area_mp = copyb(entp->ia_area_mp)) == NULL) {
+ entp->ia_notified = B_FALSE;
+ continue;
+ }
+
+ putnext(ipmp_ill->ill_rq, area_mp);
+ ipmp_illgrp_mark_arpent(illg, entp);
+
+ if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
+ ill = list_head(&illg->ig_actif);
+ }
+}
+
+/*
+ * Search the members of `illg' for an interface whose physical address is
+ * the `paddrlen' bytes at `physaddr'. Returns the first match, or NULL if
+ * there is none. Caller must either be inside the IPSQ or hold ill_g_lock.
+ */
+ill_t *
+ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen)
+{
+	ill_t		*cur;
+	ill_t		*ipmp_ill = illg->ig_ipmp_ill;
+	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+	ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock));
+
+	cur = list_head(&illg->ig_if);
+	while (cur != NULL) {
+		if (cur->ill_phys_addr_length == paddrlen &&
+		    bcmp(cur->ill_phys_addr, physaddr, paddrlen) == 0)
+			break;
+		cur = list_next(&illg->ig_if, cur);
+	}
+
+	return (cur);
+}
+
+/*
+ * Record `mtu' as the MTU of `illg' and asynchronously propagate it to the
+ * IPMP ill by injecting a DL_NOTE_SDU_SIZE notification on its read queue.
+ * If the notification cannot be allocated, nothing is changed. Caller must
+ * be inside the IPSQ, except during initialization (ig_mtu still zero).
+ */
+static void
+ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu)
+{
+	ill_t	*ipmp_ill = illg->ig_ipmp_ill;
+	mblk_t	*notemp;
+
+	ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ipmp_ill));
+
+	/*
+	 * An allocation failure here is silently tolerated: a system that
+	 * short on memory has bigger problems than a stale MTU.
+	 */
+	notemp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu);
+	if (notemp == NULL)
+		return;
+
+	illg->ig_mtu = mtu;
+	put(ipmp_ill->ill_rq, notemp);
+}
+
+/*
+ * Recompute the MTU of `illg' as the smallest ill_max_mtu among its member
+ * ills, but never below the minimum MTU for the address family, and push it
+ * to the IPMP ill if it differs from the current group MTU.
+ */
+void
+ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
+{
+	ill_t	*cur;
+	ill_t	*ipmp_ill = illg->ig_ipmp_ill;
+	uint_t	minmtu;
+	uint_t	newmtu = 0;
+
+	ASSERT(IAM_WRITER_ILL(ipmp_ill));
+
+	/*
+	 * ill_max_mtu only changes under ill_lock, so each ill's lock is
+	 * held while its value is sampled. A change that slips in after we
+	 * have looked is not a problem: any change to ill_max_mtu triggers
+	 * its own update, which will pick it up.
+	 */
+	for (cur = list_head(&illg->ig_if); cur != NULL;
+	    cur = list_next(&illg->ig_if, cur)) {
+		mutex_enter(&cur->ill_lock);
+		if (newmtu == 0 || cur->ill_max_mtu < newmtu)
+			newmtu = cur->ill_max_mtu;
+		mutex_exit(&cur->ill_lock);
+	}
+
+	/*
+	 * Clamp to the protocol's minimum MTU.
+	 */
+	minmtu = ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
+	if (newmtu < minmtu)
+		newmtu = minmtu;
+
+	if (newmtu != illg->ig_mtu)
+		ipmp_illgrp_set_mtu(illg, newmtu);
+}
+
+/*
+ * Attach illgrp `illg' to IPMP group `grp' in the slot for its address
+ * family. Re-establishing a link that already exists is silently allowed,
+ * which keeps callers simple. Caller must hold ips_ipmp_lock as writer.
+ */
+void
+ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp)
+{
+	ipmp_illgrp_t	**slotp;
+	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+	slotp = illg->ig_ipmp_ill->ill_isv6 ? &grp->gr_v6 : &grp->gr_v4;
+
+	ASSERT(*slotp == NULL || *slotp == illg);
+	*slotp = illg;
+}
+
+/*
+ * Detach illgrp `illg' from the IPMP group of its IPMP ill. Returns EBUSY,
+ * leaving the link in place, if the group still has joined or pending
+ * interfaces of the illgrp's address family; returns 0 on success. Caller
+ * must hold ips_ipmp_lock as writer.
+ */
+int
+ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg)
+{
+	ipmp_grp_t	*grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp;
+	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);
+	ipmp_illgrp_t	**slotp;
+	boolean_t	inuse;
+
+	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+	if (illg->ig_ipmp_ill->ill_isv6) {
+		inuse = (grp->gr_nv6 + grp->gr_pendv6 != 0);
+		slotp = &grp->gr_v6;
+	} else {
+		inuse = (grp->gr_nv4 + grp->gr_pendv4 != 0);
+		slotp = &grp->gr_v4;
+	}
+
+	if (inuse)
+		return (EBUSY);
+
+	*slotp = NULL;
+	return (0);
+}
+
+/*
+ * Place `ill' into `illg', and rebalance the data addresses on `illg'
+ * to be spread evenly across the ills now in it. Also, adjust the IPMP
+ * ill as necessary to account for `ill' (e.g., MTU).
+ *
+ * Caller must be writer on `ill', which must not itself be an IPMP ill,
+ * must already have a phyint_grp, and must not yet belong to an illgrp (see
+ * the ASSERTs below). ips_ipmp_lock and ips_ill_g_lock are acquired (and
+ * released) internally as writer. In order, this function: bumps the
+ * group's per-family interface count; mirrors the IPMP ill's ILLF_ROUTER
+ * setting onto `ill'; discards the multicast memberships on `ill'; updates
+ * the IPMP ill's physical address length, type, CoS flag and MTU; inserts
+ * `ill' at the tail of ig_if; marks the IREs on `ill' as test-hidden;
+ * refreshes broadcast IREs for IPv4; and finally calls
+ * ipmp_ill_refresh_active() on `ill'.
+ */
+void
+ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
+{
+ ill_t *ipmp_ill;
+ ipif_t *ipif;
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ /* IS_UNDER_IPMP() requires ill_grp to be non-NULL */
+ ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL);
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(ill->ill_grp == NULL);
+
+ ipmp_ill = illg->ig_ipmp_ill;
+
+ /*
+ * Account for `ill' joining the illgrp.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ if (ill->ill_isv6)
+ ill->ill_phyint->phyint_grp->gr_nv6++;
+ else
+ ill->ill_phyint->phyint_grp->gr_nv4++;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ /*
+ * Ensure the ILLF_ROUTER flag remains consistent across the group.
+ */
+ mutex_enter(&ill->ill_lock);
+ if (ipmp_ill->ill_flags & ILLF_ROUTER)
+ ill->ill_flags |= ILLF_ROUTER;
+ else
+ ill->ill_flags &= ~ILLF_ROUTER;
+ mutex_exit(&ill->ill_lock);
+
+ /*
+ * Blow away all multicast memberships that currently exist on `ill'.
+ * This may seem odd, but it's consistent with the application view
+ * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
+ */
+ if (ill->ill_isv6) {
+ reset_conn_ill(ill);
+ reset_mrt_ill(ill);
+ } else {
+ ipif = ill->ill_ipif;
+ for (; ipif != NULL; ipif = ipif->ipif_next) {
+ reset_conn_ipif(ipif);
+ reset_mrt_vif_ipif(ipif);
+ }
+ }
+ ip_purge_allmulti(ill);
+
+ /*
+ * Borrow the first ill's ill_phys_addr_length value for the illgrp's
+ * physical address length. All other ills must have the same value,
+ * since they are required to all be the same mactype. Also update
+ * the IPMP ill's MTU and CoS marking, if necessary.
+ */
+ if (list_is_empty(&illg->ig_if)) {
+ ASSERT(ipmp_ill->ill_phys_addr_length == 0);
+ /*
+ * NOTE: we leave ill_phys_addr NULL since the IPMP group
+ * doesn't have a physical address. This means that code must
+ * not assume that ill_phys_addr is non-NULL just because
+ * ill_phys_addr_length is non-zero. Likewise for ill_nd_lla.
+ */
+ ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
+ ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
+ ipmp_ill->ill_type = ill->ill_type;
+
+ if (ill->ill_flags & ILLF_COS_ENABLED) {
+ mutex_enter(&ipmp_ill->ill_lock);
+ ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
+ mutex_exit(&ipmp_ill->ill_lock);
+ }
+ ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu);
+ } else {
+ ASSERT(ipmp_ill->ill_phys_addr_length ==
+ ill->ill_phys_addr_length);
+ ASSERT(ipmp_ill->ill_type == ill->ill_type);
+
+ if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
+ mutex_enter(&ipmp_ill->ill_lock);
+ ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
+ mutex_exit(&ipmp_ill->ill_lock);
+ }
+ if (illg->ig_mtu > ill->ill_max_mtu)
+ ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu);
+ }
+
+ rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
+ list_insert_tail(&illg->ig_if, ill);
+ ill->ill_grp = illg;
+ rw_exit(&ipst->ips_ill_g_lock);
+
+ /*
+ * Hide the IREs on `ill' so that we don't accidentally find them when
+ * sending data traffic.
+ */
+ ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);
+
+ /*
+ * Merge any broadcast IREs, if need be.
+ */
+ if (!ill->ill_isv6)
+ ill_refresh_bcast(ill);
+
+ ipmp_ill_refresh_active(ill);
+}
+
+/*
+ * Remove `ill' from its illgrp, and rebalance the data addresses in that
+ * illgrp to be spread evenly across the remaining ills. Also, adjust the
+ * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
+ *
+ * Caller must be writer on `ill', which must currently be under IPMP (see
+ * the ASSERTs below). ips_ipmp_lock and ips_ill_g_lock are acquired (and
+ * released) internally as writer. If `ill' was the last member, the ARP
+ * entries tracked on the illgrp are destroyed and the IPMP ill's physical
+ * address length, type and CoS flag are reset; otherwise CoS is re-enabled
+ * on the IPMP ill if `ill' lacked it and every remaining member has it.
+ */
+void
+ipmp_ill_leave_illgrp(ill_t *ill)
+{
+ ill_t *ipmp_ill;
+ ipif_t *ipif;
+ ipmp_arpent_t *entp;
+ ipmp_illgrp_t *illg = ill->ill_grp;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(IS_UNDER_IPMP(ill));
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(illg != NULL);
+
+ ipmp_ill = illg->ig_ipmp_ill;
+
+ /*
+ * Cancel IPMP-specific ill timeouts.
+ * NOTE(review): ill_refresh_tid is not reset to 0 here, and
+ * ipmp_ill_refresh_active_timer_start() declines to schedule a
+ * timeout while it is non-zero -- confirm that a cancelled timeout
+ * cannot leave a stale id behind that blocks later retries.
+ */
+ (void) untimeout(ill->ill_refresh_tid);
+
+ /*
+ * Expose any previously-hidden IREs on `ill'.
+ */
+ ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);
+
+ /*
+ * Ensure the multicast state for each ipif on `ill' is down so that
+ * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
+ * all eligible groups.
+ */
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
+ if (ipif->ipif_flags & IPIF_UP)
+ ipif_multicast_down(ipif);
+
+ /*
+ * Account for `ill' leaving the illgrp.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ if (ill->ill_isv6)
+ ill->ill_phyint->phyint_grp->gr_nv6--;
+ else
+ ill->ill_phyint->phyint_grp->gr_nv4--;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ /*
+ * Pull `ill' out of the interface lists.
+ */
+ if (list_link_active(&ill->ill_actnode))
+ ipmp_ill_deactivate(ill);
+ rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
+ list_remove(&illg->ig_if, ill);
+ ill->ill_grp = NULL;
+ rw_exit(&ipst->ips_ill_g_lock);
+
+ /*
+ * Recreate any broadcast IREs that had been shared, if need be.
+ */
+ if (!ill->ill_isv6)
+ ill_refresh_bcast(ill);
+
+ /*
+ * Re-establish multicast memberships that were previously being
+ * handled by the IPMP meta-interface.
+ */
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
+ if (ipif->ipif_flags & IPIF_UP)
+ ipif_multicast_up(ipif);
+
+ /*
+ * Refresh the group MTU based on the new interface list.
+ */
+ ipmp_illgrp_refresh_mtu(illg);
+
+ if (list_is_empty(&illg->ig_if)) {
+ /*
+ * No ills left in the illgrp; we no longer have a physical
+ * address length, nor can we support ARP, CoS, or anything
+ * else that depends on knowing the link layer type.
+ */
+ while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
+ ipmp_illgrp_destroy_arpent(illg, entp);
+
+ ipmp_ill->ill_phys_addr_length = 0;
+ ipmp_ill->ill_nd_lla_len = 0;
+ ipmp_ill->ill_type = IFT_OTHER;
+ mutex_enter(&ipmp_ill->ill_lock);
+ ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
+ mutex_exit(&ipmp_ill->ill_lock);
+ } else {
+ /*
+ * If `ill' didn't support CoS, see if it can now be enabled.
+ * Note that `ill' is reused below as the cursor over the
+ * remaining members; it ends up NULL only if every one of
+ * them has ILLF_COS_ENABLED set.
+ */
+ if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
+ ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));
+
+ ill = list_head(&illg->ig_if);
+ do {
+ if (!(ill->ill_flags & ILLF_COS_ENABLED))
+ break;
+ } while ((ill = list_next(&illg->ig_if, ill)) != NULL);
+
+ if (ill == NULL) {
+ mutex_enter(&ipmp_ill->ill_lock);
+ ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
+ mutex_exit(&ipmp_ill->ill_lock);
+ }
+ }
+ }
+}
+
+/*
+ * Bring the active-list membership of `ill' in line with whether it should
+ * be active: activate it if it should be active but is not on the list, or
+ * deactivate it if it is on the list but should not be active. Returns
+ * B_FALSE only if an activation was needed and failed; B_TRUE otherwise.
+ */
+static boolean_t
+ipmp_ill_try_refresh_active(ill_t *ill)
+{
+	boolean_t	wantactive;
+	int		onlist;
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(IS_UNDER_IPMP(ill));
+
+	wantactive = ipmp_ill_is_active(ill);
+	onlist = list_link_active(&ill->ill_actnode);
+
+	if (wantactive && !onlist)
+		return (ipmp_ill_activate(ill));
+
+	if (!wantactive && onlist)
+		ipmp_ill_deactivate(ill);
+
+	return (B_TRUE);
+}
+
+/*
+ * Activate or deactivate `ill' as appropriate via
+ * ipmp_ill_try_refresh_active(). If that fails, arrange for a timer to
+ * retry the refresh later.
+ */
+void
+ipmp_ill_refresh_active(ill_t *ill)
+{
+	if (ipmp_ill_try_refresh_active(ill))
+		return;
+
+	ipmp_ill_refresh_active_timer_start(ill);
+}
+
+/*
+ * Timeout callback: retry ipmp_ill_try_refresh_active() on the ill passed
+ * as `ill_arg'. If the IPSQ cannot be entered or the refresh fails again,
+ * another attempt is scheduled.
+ */
+static void
+ipmp_ill_refresh_active_timer(void *ill_arg)
+{
+	ill_t		*ill = ill_arg;
+	boolean_t	condemned;
+	boolean_t	refreshed = B_FALSE;
+
+	/*
+	 * Zero ill_refresh_tid to show that no timeout is pending any more.
+	 * (Another thread may schedule a new timeout while we are still
+	 * running, but that is harmless.) Give up if the ill is going away.
+	 */
+	mutex_enter(&ill->ill_lock);
+	ill->ill_refresh_tid = 0;
+	condemned = ((ill->ill_state_flags & ILL_CONDEMNED) != 0);
+	mutex_exit(&ill->ill_lock);
+
+	if (condemned)
+		return;
+
+	if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
+		refreshed = ipmp_ill_try_refresh_active(ill);
+		ipsq_exit(ill->ill_phyint->phyint_ipsq);
+	}
+
+	/*
+	 * Success means we are done; otherwise queue up another attempt.
+	 */
+	if (refreshed)
+		return;
+
+	ipmp_ill_refresh_active_timer_start(ill);
+}
+
+/*
+ * Schedule ipmp_ill_refresh_active_timer() to run on `ill' after
+ * IPMP_ILL_REFRESH_TIMEOUT seconds, recording the timeout id in
+ * ill_refresh_tid. Does nothing if a refresh is already scheduled or if
+ * the ill is going away.
+ */
+static void
+ipmp_ill_refresh_active_timer_start(ill_t *ill)
+{
+	mutex_enter(&ill->ill_lock);
+
+	/*
+	 * Only arm the timer when none is pending and the ill is not
+	 * condemned.
+	 */
+	if (ill->ill_refresh_tid == 0 &&
+	    !(ill->ill_state_flags & ILL_CONDEMNED)) {
+		ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer,
+		    ill, SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));
+	}
+
+	mutex_exit(&ill->ill_lock);
+}
+
+/*
+ * Activate `ill' so it will be used to send and receive data traffic. Return
+ * B_FALSE if `ill' cannot be activated. Note that we allocate any messages
+ * needed to deactivate `ill' here as well so that deactivation cannot fail.
+ *
+ * The only failure mode is a message allocation failure; both `goto fail'
+ * sites are reached before any ill, illgrp or group state has been modified,
+ * so a B_FALSE return leaves everything as it was on entry.
+ */
+static boolean_t
+ipmp_ill_activate(ill_t *ill)
+{
+ ipif_t *ipif;
+ mblk_t *actmp = NULL, *deactmp = NULL;
+ mblk_t *linkupmp = NULL, *linkdownmp = NULL;
+ ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
+ const char *grifname = grp->gr_ifname;
+ ipmp_illgrp_t *illg = ill->ill_grp;
+ ill_t *maxill;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(IS_UNDER_IPMP(ill));
+
+ /*
+ * If this will be the first active interface in the group, allocate
+ * the link-up and link-down messages.
+ */
+ if (grp->gr_nactif == 0) {
+ linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
+ linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
+ if (linkupmp == NULL || linkdownmp == NULL)
+ goto fail;
+ }
+
+ /*
+ * For IPv4, allocate the activate/deactivate messages, and tell ARP.
+ */
+ if (!ill->ill_isv6) {
+ actmp = ill_arie_alloc(ill, grifname, &ipmp_aract_template);
+ deactmp = ill_arie_alloc(ill, grifname, &ipmp_ardeact_template);
+ if (actmp == NULL || deactmp == NULL)
+ goto fail;
+
+ /*
+ * Stash the deactivate message on the ill for later use by
+ * ipmp_ill_deactivate(), and send the activate message now.
+ */
+ ASSERT(ill->ill_ardeact_mp == NULL);
+ ill->ill_ardeact_mp = deactmp;
+ putnext(illg->ig_ipmp_ill->ill_rq, actmp);
+ }
+
+ if (list_is_empty(&illg->ig_actif)) {
+ /*
+ * Now that we have an active ill, nominate it for multicast
+ * and broadcast duties. Do this before ipmp_ill_bind_ipif()
+ * since that may need to send multicast packets (e.g., IPv6
+ * neighbor discovery probes).
+ */
+ ipmp_illgrp_set_cast(illg, ill);
+
+ /*
+ * This is the first active ill in the illgrp -- add 'em all.
+ * We can access/walk ig_ipmp_ill's ipif list since we're
+ * writer on its IPSQ as well.
+ */
+ ipif = illg->ig_ipmp_ill->ill_ipif;
+ for (; ipif != NULL; ipif = ipif->ipif_next)
+ if (ipmp_ipif_is_up_dataaddr(ipif))
+ ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
+ } else {
+ /*
+ * Redistribute the addresses by moving them from the ill with
+ * the most addresses until the ill being activated is at the
+ * same level as the rest of the ills.
+ */
+ for (;;) {
+ maxill = ipmp_illgrp_max_ill(illg);
+ ASSERT(maxill != NULL);
+ /*
+ * Stop once `ill' is within one address of the
+ * most heavily loaded ill.
+ */
+ if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
+ break;
+ ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
+ ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
+ }
+
+ /*
+ * TODO: explore whether it's advantageous to flush IRE_CACHE
+ * bindings to force existing connections to be redistributed
+ * to the new ill.
+ */
+ }
+
+ /*
+ * Put the interface in the active list.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ list_insert_tail(&illg->ig_actif, ill);
+ illg->ig_nactif++;
+ illg->ig_next_ill = ill;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ /*
+ * Refresh ARP entries to use `ill', if need be.
+ */
+ if (!ill->ill_isv6)
+ ipmp_illgrp_refresh_arpent(illg);
+
+ /*
+ * Finally, mark the group link up, if necessary. The post-increment
+ * tests the old count, so this is only true for the first active
+ * interface -- the same case in which the messages were allocated.
+ */
+ if (grp->gr_nactif++ == 0) {
+ ASSERT(grp->gr_linkdownmp == NULL);
+ grp->gr_linkdownmp = linkdownmp;
+ put(illg->ig_ipmp_ill->ill_rq, linkupmp);
+ }
+ return (B_TRUE);
+fail:
+ /* Release whichever messages were successfully allocated. */
+ freemsg(actmp);
+ freemsg(deactmp);
+ freemsg(linkupmp);
+ freemsg(linkdownmp);
+ return (B_FALSE);
+}
+
+/*
+ * Deactivate `ill' so it will not be used to send or receive data traffic.
+ * Any ipifs bound to `ill' are rebound to the remaining active ills (if any).
+ * This cannot fail: it consumes the messages preallocated and stashed by
+ * ipmp_ill_activate() (ill_ardeact_mp and gr_linkdownmp).
+ */
+static void
+ipmp_ill_deactivate(ill_t *ill)
+{
+ ill_t *minill;
+ ipif_t *ipif, *ubnextipif, *ubheadipif = NULL;
+ mblk_t *mp;
+ ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
+ ipmp_illgrp_t *illg = ill->ill_grp;
+ ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(IS_UNDER_IPMP(ill));
+
+ /*
+ * Delete IRE_CACHE entries tied to this ill before they become stale.
+ */
+ ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
+ ill_stq_cache_delete, ill, ill);
+
+ /*
+ * Pull the interface out of the active list.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ list_remove(&illg->ig_actif, ill);
+ illg->ig_nactif--;
+ illg->ig_next_ill = list_head(&illg->ig_actif);
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ /*
+ * If the ill that's being deactivated had been nominated for
+ * multicast/broadcast, nominate a new one.
+ */
+ if (ill == illg->ig_cast_ill)
+ ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));
+
+ /*
+ * Unbind all of the ipifs bound to this ill, and save 'em in a list;
+ * we'll rebind them after we tell the resolver the ill is no longer
+ * active. We must do things in this order or the resolver could
+ * accidentally rebind to the ill we're trying to remove if multiple
+ * ills in the group have the same hardware address (which is
+ * unsupported, but shouldn't lead to a wedged machine).
+ *
+ * The temporary list is threaded through ipif_bound_next, with each
+ * unbound ipif pushed onto the head (ubheadipif).
+ */
+ while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
+ ipif->ipif_bound_next = ubheadipif;
+ ubheadipif = ipif;
+ }
+
+ if (!ill->ill_isv6) {
+ /*
+ * Tell ARP `ill' is no longer active in the group.
+ */
+ mp = ill->ill_ardeact_mp;
+ ill->ill_ardeact_mp = NULL;
+ ASSERT(mp != NULL);
+ putnext(illg->ig_ipmp_ill->ill_rq, mp);
+
+ /*
+ * Refresh any ARP entries that had been using `ill'.
+ */
+ ipmp_illgrp_refresh_arpent(illg);
+ }
+
+ /*
+ * Rebind each ipif from the deactivated ill to the active ill with
+ * the fewest ipifs. If there are no active ills, the ipifs will
+ * remain unbound.
+ */
+ for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
+ /*
+ * Save the successor and clear the link before binding, since
+ * ipmp_ill_bind_ipif() requires ipif_bound_next to be NULL
+ * and then reuses it for the new ill's bound list.
+ */
+ ubnextipif = ipif->ipif_bound_next;
+ ipif->ipif_bound_next = NULL;
+
+ if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
+ ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
+ }
+
+ /*
+ * Finally, mark the group link down, if necessary. The pre-decrement
+ * makes this true only when the last active interface has left.
+ */
+ if (--grp->gr_nactif == 0) {
+ mp = grp->gr_linkdownmp;
+ grp->gr_linkdownmp = NULL;
+ ASSERT(mp != NULL);
+ put(illg->ig_ipmp_ill->ill_rq, mp);
+ }
+}
+
+/*
+ * Generate the routing socket messages that make `ill' and its up addresses
+ * "appear" (cmd == RTM_ADD) or "disappear" (cmd == RTM_DELETE) from the point
+ * of view of routing socket listeners that are not IPMP-aware.
+ */
+static void
+ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
+{
+ ipif_t *ipif;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);
+
+ /*
+ * Messages are only needed when `ill' has at least one up ipif:
+ *
+ * 1. For RTM_DELETE we'd be hiding the interface and its addresses
+ * by reporting them down. If they're already down, there's
+ * nothing to hide.
+ *
+ * 2. For RTM_ADD we'd be reporting previously-hidden up addresses
+ * (and the interface) as up again. If nothing is up, there's
+ * nothing to expose.
+ */
+ if (ill->ill_ipif_up_count != 0) {
+ /* When adding, the interface must appear before its addrs. */
+ if (cmd == RTM_ADD) {
+ ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);
+ }
+
+ for (ipif = ill->ill_ipif; ipif != NULL;
+ ipif = ipif->ipif_next) {
+ if (!(ipif->ipif_flags & IPIF_UP))
+ continue;
+ ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL);
+ }
+
+ /* When deleting, the interface vanishes after its addrs. */
+ if (cmd == RTM_DELETE) {
+ ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
+ }
+ }
+}
+
+/*
+ * Bind the address named by `ipif' to the underlying ill named by `ill'.
+ * If `act' is Res_act_none, don't notify the resolver. Otherwise, `act'
+ * will indicate to the resolver whether this is an initial bringup of
+ * `ipif', or just a rebind to another ill.
+ *
+ * `ipif' must currently be unbound (ipif_bound_ill and ipif_bound_next both
+ * NULL). On return it is always linked onto `ill's bound list; ipif_bound
+ * records whether the resolver notification (if any) succeeded.
+ */
+static void
+ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
+{
+ int err = 0;
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
+ ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
+ ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
+ ASSERT(ipif->ipif_bound_ill == NULL);
+ ASSERT(ipif->ipif_bound_next == NULL);
+
+ /* Push `ipif' onto the head of ill's list of bound ipifs. */
+ ipif->ipif_bound_next = ill->ill_bound_ipif;
+ ill->ill_bound_ipif = ipif;
+ ill->ill_bound_cnt++;
+ /*
+ * ipif_bound_ill is set under ips_ipmp_lock as writer since
+ * ipmp_ipif_hold_bound_ill() reads it under that lock without
+ * being inside the IPSQ.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ ipif->ipif_bound_ill = ill;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ /*
+ * If necessary, tell ARP/NDP about the new mapping. Note that
+ * ipif_resolver_up() cannot fail for non-XRESOLV IPv6 ills.
+ */
+ if (act != Res_act_none) {
+ if (ill->ill_isv6) {
+ VERIFY(ipif_resolver_up(ipif, act) == 0);
+ err = ipif_ndp_up(ipif, act == Res_act_initial);
+ } else {
+ err = ipif_resolver_up(ipif, act);
+ }
+
+ /*
+ * Since ipif_ndp_up() never returns EINPROGRESS and
+ * ipif_resolver_up() only returns EINPROGRESS when the
+ * associated ill is not up, we should never be here with
+ * EINPROGRESS. We rely on this to simplify the design.
+ */
+ ASSERT(err != EINPROGRESS);
+ }
+ /* TODO: retry binding on failure? when? */
+ /* Note that err is still 0 (so ipif_bound is set) for Res_act_none. */
+ ipif->ipif_bound = (err == 0);
+}
+
+/*
+ * Detach `ipif' from the underlying ill `ill' it is currently bound to and
+ * return it. Passing a NULL `ipif' asks for an arbitrary bound ipif to be
+ * unbound instead; in that case NULL is returned if `ill' has no bound
+ * ipifs. The resolver is told about the change only if `notifyres' is
+ * B_TRUE.
+ */
+static ipif_t *
+ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
+{
+ ill_t *ipmp_ill;
+ ipif_t **ipifp;
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(IS_UNDER_IPMP(ill));
+
+ ipmp_ill = ill->ill_grp->ig_ipmp_ill;
+
+ /*
+ * With no ipif named by the caller, take whatever is at the head of
+ * the bound list; an empty list means there's nothing to do.
+ */
+ if (ipif == NULL && (ipif = ill->ill_bound_ipif) == NULL) {
+ ASSERT(ill->ill_bound_cnt == 0);
+ return (NULL);
+ }
+
+ ASSERT(IAM_WRITER_IPIF(ipif));
+ ASSERT(IS_IPMP(ipif->ipif_ill));
+ ASSERT(ipif->ipif_bound_ill == ill);
+ ASSERT(ill->ill_bound_cnt > 0);
+
+ /*
+ * Sever the ipif -> ill binding.
+ */
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+ ipif->ipif_bound_ill = NULL;
+ rw_exit(&ipst->ips_ipmp_lock);
+ ill->ill_bound_cnt--;
+
+ /*
+ * Unlink the ipif from ill's bound list: find the link (either the
+ * list head or a predecessor's ipif_bound_next) that points at it
+ * and make that link skip over it.
+ */
+ ipifp = &ill->ill_bound_ipif;
+ while (*ipifp != ipif)
+ ipifp = &(*ipifp)->ipif_bound_next;
+ *ipifp = ipif->ipif_bound_next;
+ ipif->ipif_bound_next = NULL;
+
+ /*
+ * Tell the resolvers, but only when asked to and only when the ipif
+ * had actually been successfully bound.
+ */
+ if (notifyres && ipif->ipif_bound) {
+ if (!ill->ill_isv6) {
+ ASSERT(ipif->ipif_arp_del_mp != NULL);
+ putnext(ipmp_ill->ill_rq, ipif->ipif_arp_del_mp);
+ ipif->ipif_arp_del_mp = NULL;
+ } else {
+ ipif_ndp_down(ipif);
+ }
+ }
+ ipif->ipif_bound = B_FALSE;
+
+ return (ipif);
+}
+
+/*
+ * Report whether `ill' is active. A caller that is not inside the IPSQ must
+ * hold both ill_lock and phyint_lock. Because ipmp_ill_try_refresh_active()
+ * uses this to decide whether an ill should be considered active, other
+ * consumers can race with it and see that an ill ought to be activated or
+ * deactivated before IPMP has actually done so. That should be safe: at
+ * worst, e.g., ire_atomic_start() prematurely deletes an IRE that
+ * ipmp_ill_deactivate() would have cleaned up anyway.
+ */
+boolean_t
+ipmp_ill_is_active(ill_t *ill)
+{
+ phyint_t *phyi = ill->ill_phyint;
+
+ ASSERT(IS_UNDER_IPMP(ill));
+ ASSERT(IAM_WRITER_ILL(ill) ||
+ (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));
+
+ /*
+ * An ill is active when it has at least one up ipif and its phyint
+ * is not offline, inactive or failed. PHYI_RUNNING is deliberately
+ * not consulted: in.mpathd is relied upon to set PHYI_FAILED whenever
+ * PHYI_RUNNING is cleared, which keeps the link flapping logic in
+ * in.mpathd alone and lets us ignore changes to PHYI_RUNNING.
+ */
+ return (ill->ill_ipif_up_count != 0 &&
+ !(phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED)));
+}
+
+/*
+ * IRE walker callback: for an IRE whose ipif is on `ill_arg', set
+ * IRE_MARK_TESTHIDDEN if the IRE is of type host, prefix, default, cache
+ * or interface (resolver or no-resolver). Other IREs are left untouched.
+ */
+static void
+ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
+{
+ ill_t *ill = (ill_t *)ill_arg;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(!IS_IPMP(ill));
+
+ if (ire->ire_ipif->ipif_ill == ill) {
+ switch (ire->ire_type) {
+ case IRE_HOST:
+ case IRE_PREFIX:
+ case IRE_DEFAULT:
+ case IRE_CACHE:
+ case IRE_IF_RESOLVER:
+ case IRE_IF_NORESOLVER:
+ DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
+ ire->ire_marks |= IRE_MARK_TESTHIDDEN;
+ break;
+ default:
+ /* Not an IRE type we hide. */
+ break;
+ }
+ }
+}
+
+/*
+ * IRE walker callback: remove IRE_MARK_TESTHIDDEN from any IRE whose ipif
+ * is on `ill_arg', regardless of IRE type.
+ */
+static void
+ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
+{
+ ill_t *ill = (ill_t *)ill_arg;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(!IS_IPMP(ill));
+
+ /* IREs belonging to other ills are none of our business. */
+ if (ire->ire_ipif->ipif_ill != ill)
+ return;
+
+ DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
+ ire->ire_marks &= ~IRE_MARK_TESTHIDDEN;
+}
+
+/*
+ * Return a held pointer to the IPMP ill for underlying interface `ill', or
+ * NULL if one doesn't exist. (Unfortunately, this function needs to take an
+ * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
+ * ill_grp pointer may become stale when not under an IPSQ and not holding
+ * ipmp_lock.) Caller need not be inside the IPSQ.
+ *
+ * On a non-NULL return the caller owns a reference on the returned ill
+ * (taken with ill_refhold()) and is responsible for releasing it.
+ */
+ill_t *
+ipmp_ill_hold_ipmp_ill(ill_t *ill)
+{
+ ip_stack_t *ipst = ill->ill_ipst;
+ ipmp_illgrp_t *illg;
+ ill_t *ipmp_ill = NULL;
+
+ ASSERT(!IS_IPMP(ill));
+
+ rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+ illg = ill->ill_grp;
+ if (illg != NULL && ILL_CAN_LOOKUP(illg->ig_ipmp_ill)) {
+ /*
+ * Capture the IPMP ill while ipmp_lock is still held: once
+ * the lock is dropped `illg' may be stale and must not be
+ * dereferenced again.
+ */
+ ipmp_ill = illg->ig_ipmp_ill;
+ ill_refhold(ipmp_ill);
+ }
+ /*
+ * If we found nothing, assume `ill' was removed from the illgrp in
+ * the meantime and return NULL.
+ */
+ rw_exit(&ipst->ips_ipmp_lock);
+ return (ipmp_ill);
+}
+
+/*
+ * Look up the interface index of the IPMP ill associated with underlying
+ * interface `ill'. Returns zero if `ill's phyint is not in a group. Caller
+ * need not be inside the IPSQ.
+ */
+uint_t
+ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
+{
+ uint_t ifindex;
+ ip_stack_t *ipst = ill->ill_ipst;
+ ipmp_grp_t *grp;
+
+ ASSERT(!IS_IPMP(ill));
+
+ /* phyint_grp is read under ipmp_lock since we may not be writer. */
+ rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+ grp = ill->ill_phyint->phyint_grp;
+ if (grp == NULL)
+ ifindex = 0;
+ else
+ ifindex = grp->gr_phyint->phyint_ifindex;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ return (ifindex);
+}
+
+/*
+ * Place phyint `phyi' into IPMP group `grp'. The caller must be writer on
+ * `phyi's IPSQ, and `phyi' must have at least one ill (IPv4 or IPv6).
+ */
+void
+ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
+{
+ ill_t *ill;
+ ipsq_t *ipsq = phyi->phyint_ipsq;
+ ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq;
+ ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
+
+ ASSERT(IAM_WRITER_IPSQ(ipsq));
+ ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL);
+
+ /*
+ * Send routing socket messages indicating that the phyint's ills
+ * and ipifs vanished.
+ *
+ * As a side effect, `ill' is left pointing at one of the phyint's
+ * ills (the IPv6 ill if it exists, otherwise the IPv4 ill); the
+ * ASSERT above guarantees at least one assignment happens. It is
+ * used below for the group mactype check.
+ */
+ if (phyi->phyint_illv4 != NULL) {
+ ill = phyi->phyint_illv4;
+ ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
+ }
+
+ if (phyi->phyint_illv6 != NULL) {
+ ill = phyi->phyint_illv6;
+ ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
+ }
+
+ /*
+ * Snapshot the phyint's initial kstats as a baseline.
+ */
+ ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);
+
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+
+ /*
+ * The first interface to join establishes the group's mactype; every
+ * later joiner is expected to match it.
+ */
+ phyi->phyint_grp = grp;
+ if (++grp->gr_nif == 1)
+ grp->gr_mactype = ill->ill_mactype;
+ else
+ ASSERT(grp->gr_mactype == ill->ill_mactype);
+
+ /*
+ * Now that we're in the group, request a switch to the group's xop
+ * when we ipsq_exit(). All future operations will be exclusive on
+ * the group xop until ipmp_phyint_leave_grp() is called.
+ */
+ ASSERT(ipsq->ipsq_swxop == NULL);
+ ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
+ ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;
+
+ rw_exit(&ipst->ips_ipmp_lock);
+}
+
+/*
+ * Remove phyint `phyi' from its current IPMP group. The caller must be
+ * writer on `phyi's IPSQ. Any of the phyint's ills that are still in an
+ * illgrp are removed from it first.
+ */
+void
+ipmp_phyint_leave_grp(phyint_t *phyi)
+{
+ uint_t i;
+ ipsq_t *ipsq = phyi->phyint_ipsq;
+ ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
+ uint64_t phyi_kstats[IPMP_KSTAT_MAX];
+
+ ASSERT(IAM_WRITER_IPSQ(ipsq));
+
+ /*
+ * If any of the phyint's ills are still in an illgrp, kick 'em out.
+ */
+ if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
+ ipmp_ill_leave_illgrp(phyi->phyint_illv4);
+ if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
+ ipmp_ill_leave_illgrp(phyi->phyint_illv6);
+
+ /*
+ * Send routing socket messages indicating that the phyint's ills
+ * and ipifs have reappeared.
+ */
+ if (phyi->phyint_illv4 != NULL)
+ ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD);
+ if (phyi->phyint_illv6 != NULL)
+ ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD);
+
+ /*
+ * Calculate the phyint's cumulative kstats while it was in the group,
+ * and add that to the group's baseline.
+ */
+ ipmp_phyint_get_kstats(phyi, phyi_kstats);
+ for (i = 0; i < IPMP_KSTAT_MAX; i++) {
+ /*
+ * phyint_kstats0 holds the snapshot taken by
+ * ipmp_phyint_join_grp(), so the difference is what
+ * accumulated during group membership.
+ */
+ phyi_kstats[i] -= phyi->phyint_kstats0[i];
+ atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]);
+ }
+
+ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+
+ /* Drop the group's interface count before forgetting the group. */
+ phyi->phyint_grp->gr_nif--;
+ phyi->phyint_grp = NULL;
+
+ /*
+ * As our final act in leaving the group, request a switch back to our
+ * IPSQ's own xop when we ipsq_exit().
+ */
+ ASSERT(ipsq->ipsq_swxop == NULL);
+ ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;
+
+ rw_exit(&ipst->ips_ipmp_lock);
+}
+
+/*
+ * Fill `kstats' (which must have room for at least IPMP_KSTAT_MAX entries)
+ * with the current values of the IPMP-related "link" kstats for `phyi'.
+ * Any statistic that can't be found -- or every statistic, if the kstat
+ * itself is unavailable or not a named kstat -- is reported as zero.
+ */
+static void
+ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
+{
+ uint_t i, j;
+ const char *name;
+ kstat_t *ksp;
+ kstat_named_t *kn;
+
+ /* Start from all zeroes so anything we fail to look up reads as 0. */
+ bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
+
+ /*
+ * NOTE: ALL_ZONES here assumes that there's at most one link
+ * with a given name on a given system (safe for now).
+ */
+ ksp = kstat_hold_byname("link", 0, phyi->phyint_name, ALL_ZONES);
+ if (ksp == NULL)
+ return;
+
+ KSTAT_ENTER(ksp);
+
+ if (ksp->ks_data == NULL || ksp->ks_type != KSTAT_TYPE_NAMED)
+ goto done;
+
+ /*
+ * Bring kstats up-to-date before recording.
+ */
+ (void) KSTAT_UPDATE(ksp, KSTAT_READ);
+
+ kn = KSTAT_NAMED_PTR(ksp);
+ for (i = 0; i < IPMP_KSTAT_MAX; i++) {
+ name = ipmp_kstats[i].name;
+
+ /*
+ * Use the first named entry matching `name'; kstats[i] is
+ * already zero if there's no match or the type is unknown.
+ */
+ for (j = 0; j < ksp->ks_ndata; j++) {
+ if (strcmp(kn[j].name, name) == 0) {
+ switch (kn[j].data_type) {
+ case KSTAT_DATA_INT32:
+ case KSTAT_DATA_UINT32:
+ kstats[i] = kn[j].value.ui32;
+ break;
+#ifdef _LP64
+ case KSTAT_DATA_LONG:
+ case KSTAT_DATA_ULONG:
+ kstats[i] = kn[j].value.ul;
+ break;
+#endif
+ case KSTAT_DATA_INT64:
+ case KSTAT_DATA_UINT64:
+ kstats[i] = kn[j].value.ui64;
+ break;
+ }
+ break;
+ }
+ }
+ }
+
+done:
+ KSTAT_EXIT(ksp);
+ kstat_rele(ksp);
+}
+
+/*
+ * Call ipmp_ill_refresh_active() on each ill (IPv4 first, then IPv6) that
+ * exists on `phyi'.
+ */
+void
+ipmp_phyint_refresh_active(phyint_t *phyi)
+{
+ ill_t *ill;
+
+ if ((ill = phyi->phyint_illv4) != NULL)
+ ipmp_ill_refresh_active(ill);
+
+ if ((ill = phyi->phyint_illv6) != NULL)
+ ipmp_ill_refresh_active(ill);
+}
+
+/*
+ * Look up the underlying ill that `ipif' is bound to and return it with a
+ * reference held. Returns NULL if `ipif' is unbound or the bound ill can no
+ * longer be looked up. Caller need not be inside the IPSQ.
+ */
+ill_t *
+ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
+{
+ ill_t *boundill;
+ ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+
+ ASSERT(IS_IPMP(ipif->ipif_ill));
+
+ rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+ boundill = ipif->ipif_bound_ill;
+ /* Take the hold while ipmp_lock still pins the binding. */
+ if (boundill != NULL && ILL_CAN_LOOKUP(boundill))
+ ill_refhold(boundill);
+ else
+ boundill = NULL;
+ rw_exit(&ipst->ips_ipmp_lock);
+
+ return (boundill);
+}
+
+/*
+ * Return the underlying ill that `ipif' is bound to (no reference is taken),
+ * or NULL if it is unbound. `ipif' must be on an IPMP ill and the caller
+ * must be inside that ill's IPSQ.
+ */
+ill_t *
+ipmp_ipif_bound_ill(const ipif_t *ipif)
+{
+ ASSERT(IS_IPMP(ipif->ipif_ill));
+ ASSERT(IAM_WRITER_ILL(ipif->ipif_ill));
+
+ return (ipif->ipif_bound_ill);
+}
+
+/*
+ * Check if `ipif' is a "stub": a placeholder that is not IPIF_UP and whose
+ * local address is unspecified (IPv6) or INADDR_ANY (IPv4).
+ */
+boolean_t
+ipmp_ipif_is_stubaddr(const ipif_t *ipif)
+{
+ /* An address that is up is in use, so it can't be a stub. */
+ if (ipif->ipif_flags & IPIF_UP)
+ return (B_FALSE);
+
+ return (ipif->ipif_ill->ill_isv6 ?
+ IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
+ ipif->ipif_lcl_addr == INADDR_ANY);
+}
+
+/*
+ * Check if `ipif' is an IPMP data address: one that is not marked
+ * IPIF_NOFAILOVER and whose local address is set (i.e., not the unspecified
+ * IPv6 address or INADDR_ANY for IPv4).
+ */
+boolean_t
+ipmp_ipif_is_dataaddr(const ipif_t *ipif)
+{
+ /* IPIF_NOFAILOVER addresses are never data addresses. */
+ if (ipif->ipif_flags & IPIF_NOFAILOVER)
+ return (B_FALSE);
+
+ return (ipif->ipif_ill->ill_isv6 ?
+ !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
+ ipif->ipif_lcl_addr != INADDR_ANY);
+}
+
+/*
+ * Check if `ipif' is an IPMP data address (see ipmp_ipif_is_dataaddr()) that
+ * is also IPIF_UP.
+ */
+static boolean_t
+ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
+{
+ if (!ipmp_ipif_is_dataaddr(ipif))
+ return (B_FALSE);
+
+ return ((ipif->ipif_flags & IPIF_UP) != 0);
+}
diff --git a/usr/src/uts/common/inet/ip/rts.c b/usr/src/uts/common/inet/ip/rts.c
index 4999f28d1e..2751b19993 100644
--- a/usr/src/uts/common/inet/ip/rts.c
+++ b/usr/src/uts/common/inet/ip/rts.c
@@ -561,7 +561,6 @@ rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
case SO_TYPE:
*i1 = SOCK_RAW;
break;
-
/*
* The following three items are available here,
* but are only meaningful to IP.
@@ -597,6 +596,15 @@ rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
return (-1);
}
break;
+ case SOL_ROUTE:
+ switch (name) {
+ case RT_AWARE:
+ mutex_enter(&connp->conn_lock);
+ *i1 = connp->conn_rtaware;
+ mutex_exit(&connp->conn_lock);
+ break;
+ }
+ break;
default:
return (-1);
}
@@ -701,6 +709,20 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
return (EINVAL);
}
break;
+ case SOL_ROUTE:
+ switch (name) {
+ case RT_AWARE:
+ if (!checkonly) {
+ mutex_enter(&connp->conn_lock);
+ connp->conn_rtaware = *i1;
+ mutex_exit(&connp->conn_lock);
+ }
+ break; /* goto sizeof (int) option return */
+ default:
+ *outlenp = 0;
+ return (EINVAL);
+ }
+ break;
default:
*outlenp = 0;
return (EINVAL);
diff --git a/usr/src/uts/common/inet/ip/rts_opt_data.c b/usr/src/uts/common/inet/ip/rts_opt_data.c
index bac0eabdc4..7397b53b9e 100644
--- a/usr/src/uts/common/inet/ip/rts_opt_data.c
+++ b/usr/src/uts/common/inet/ip/rts_opt_data.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -60,6 +60,7 @@ opdes_t rts_opt_arr[] = {
{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
+{ RT_AWARE, SOL_ROUTE, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
};
/*
diff --git a/usr/src/uts/common/inet/ip/spd.c b/usr/src/uts/common/inet/ip/spd.c
index f785d8a3f6..8a3aa86d60 100644
--- a/usr/src/uts/common/inet/ip/spd.c
+++ b/usr/src/uts/common/inet/ip/spd.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -3989,7 +3989,7 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h)
ipsec_out_t *io;
boolean_t v4;
mblk_t *mp;
- boolean_t secure, attach_if;
+ boolean_t secure;
uint_t ifindex;
ipsec_selector_t sel;
ipsec_action_t *reflect_action = NULL;
@@ -4012,7 +4012,6 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h)
} else if (!ii->ipsec_in_loopback)
reflect_action = ipsec_in_to_out_action(ii);
secure = ii->ipsec_in_secure;
- attach_if = ii->ipsec_in_attach_if;
ifindex = ii->ipsec_in_ill_index;
zoneid = ii->ipsec_in_zoneid;
ASSERT(zoneid != ALL_ZONES);
@@ -4057,7 +4056,6 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h)
io->ipsec_out_proc_begin = B_FALSE;
io->ipsec_out_secure = secure;
io->ipsec_out_v4 = v4;
- io->ipsec_out_attach_if = attach_if;
io->ipsec_out_ill_index = ifindex;
io->ipsec_out_zoneid = zoneid;
io->ipsec_out_ns = ns; /* No netstack_hold */
@@ -4549,7 +4547,6 @@ ipsec_out_to_in(mblk_t *ipsec_mp)
ii->ipsec_in_secure = B_TRUE;
ii->ipsec_in_v4 = v4;
ii->ipsec_in_icmp_loopback = icmp_loopback;
- ii->ipsec_in_attach_if = B_FALSE;
}
/*
diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h
index d463c3f6ee..ad331d5706 100644
--- a/usr/src/uts/common/inet/ip6.h
+++ b/usr/src/uts/common/inet/ip6.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -133,10 +133,8 @@ typedef struct ip6_info ip6i_t;
#define IP6I_RAW_CHECKSUM 0x10
/* Compute checksum and stuff in ip6i_checksum_off */
#define IP6I_VERIFY_SRC 0x20 /* Verify ip6_src. Used when IPV6_PKTINFO */
-#define IP6I_ATTACH_IF 0x40 /* Bind to no failover address or BOUND_PIF. */
-#define IP6I_DROP_IFDELAYED 0x80
- /* Drop the packet if delayed in ndp resolver */
-#define IP6I_ND_DELAYED 0x100 /* Packet was delayed in ndp resolver */
+#define IP6I_IPMP_PROBE 0x40 /* IPMP (in.mpathd) probe packet */
+ /* 0x80 - 0x100 available */
#define IP6I_DONTFRAG 0x200 /* Don't fragment this packet */
#define IP6I_HOPLIMIT 0x400 /* hoplimit has been set by the sender */
@@ -340,7 +338,7 @@ extern void icmp_time_exceeded_v6(queue_t *, mblk_t *, uint8_t,
extern void icmp_unreachable_v6(queue_t *, mblk_t *, uint8_t,
boolean_t, boolean_t, zoneid_t, ip_stack_t *);
extern void icmp_inbound_error_fanout_v6(queue_t *, mblk_t *, ip6_t *,
- icmp6_t *, ill_t *, boolean_t, zoneid_t);
+ icmp6_t *, ill_t *, ill_t *, boolean_t, zoneid_t);
extern boolean_t conn_wantpacket_v6(conn_t *, ill_t *, ip6_t *, int, zoneid_t);
extern mblk_t *ip_add_info_v6(mblk_t *, ill_t *, const in6_addr_t *);
extern in6addr_scope_t ip_addr_scope_v6(const in6_addr_t *);
@@ -382,7 +380,7 @@ extern int ip_multirt_apply_membership_v6(int (*fn)(conn_t *, boolean_t,
ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t,
const in6_addr_t *, mblk_t *);
extern void ip_newroute_ipif_v6(queue_t *, mblk_t *, ipif_t *,
- in6_addr_t, int, zoneid_t);
+ const in6_addr_t *, const in6_addr_t *, int, zoneid_t);
extern void ip_newroute_v6(queue_t *, mblk_t *, const in6_addr_t *,
const in6_addr_t *, ill_t *, zoneid_t, ip_stack_t *);
extern void *ip6_kstat_init(netstackid_t, ip6_stat_t *);
diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h
index c5982de059..094800197e 100644
--- a/usr/src/uts/common/inet/ip_if.h
+++ b/usr/src/uts/common/inet/ip_if.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -80,7 +80,7 @@ extern "C" {
*/
#define IFF_PHYINT_FLAGS (IFF_LOOPBACK|IFF_RUNNING|IFF_PROMISC| \
IFF_ALLMULTI|IFF_INTELLIGENT|IFF_MULTI_BCAST|IFF_FAILED|IFF_STANDBY| \
- IFF_INACTIVE|IFF_OFFLINE|IFF_VIRTUAL)
+ IFF_INACTIVE|IFF_OFFLINE|IFF_VIRTUAL|IFF_IPMP)
#define IFF_PHYINTINST_FLAGS (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP| \
IFF_MULTICAST|IFF_ROUTER|IFF_NONUD|IFF_NORTEXCH|IFF_IPV4|IFF_IPV6| \
@@ -91,11 +91,6 @@ extern "C" {
IFF_DEPRECATED|IFF_ADDRCONF|IFF_ANYCAST|IFF_NOFAILOVER| \
IFF_PREFERRED|IFF_TEMPORARY|IFF_FIXEDMTU|IFF_DUPLICATE)
-#define IPIF_REPL_CHECK(to_ipif, failback_cmd) \
- (((to_ipif)->ipif_replace_zero) || ((failback_cmd) && \
- !(to_ipif)->ipif_isv6 && !((to_ipif)->ipif_flags & IPIF_UP) && \
- (to_ipif)->ipif_lcl_addr == INADDR_ANY))
-
#define PHYI_LOOPBACK IFF_LOOPBACK /* is a loopback net */
#define PHYI_RUNNING IFF_RUNNING /* resources allocated */
#define PHYI_PROMISC IFF_PROMISC /* receive all packets */
@@ -107,6 +102,7 @@ extern "C" {
#define PHYI_INACTIVE IFF_INACTIVE /* Standby active or not ? */
#define PHYI_OFFLINE IFF_OFFLINE /* NIC has been offlined */
#define PHYI_VIRTUAL IFF_VIRTUAL /* Will not send or recv pkts */
+#define PHYI_IPMP IFF_IPMP /* IPMP meta-interface */
#define ILLF_DEBUG IFF_DEBUG /* turn on debugging */
#define ILLF_NOTRAILERS IFF_NOTRAILERS /* avoid use of trailers */
@@ -137,11 +133,6 @@ extern "C" {
#define IPIF_FIXEDMTU IFF_FIXEDMTU /* set with SIOCSLIFMTU */
#define IPIF_DUPLICATE IFF_DUPLICATE /* address is in use */
-/* Source selection values for ipif_select_source_v6 */
-#define RESTRICT_TO_NONE 0x0 /* No restriction in source selection */
-#define RESTRICT_TO_GROUP 0x1 /* Restrict to IPMP group */
-#define RESTRICT_TO_ILL 0x2 /* Restrict to ILL */
-
#ifdef DEBUG
#define ILL_MAC_PERIM_HELD(ill) ill_mac_perim_held(ill)
#else
@@ -151,24 +142,23 @@ extern "C" {
/* for ipif_resolver_up */
enum ip_resolver_action {
Res_act_initial, /* initial address establishment */
- Res_act_move, /* address move (IPMP, new DL addr) */
- Res_act_defend /* address defense */
+ Res_act_rebind, /* IPMP address rebind (new hwaddr) */
+ Res_act_defend, /* address defense */
+ Res_act_none /* do nothing */
};
-extern ill_t *illgrp_scheduler(ill_t *);
-extern mblk_t *ill_arp_alloc(ill_t *, uchar_t *, caddr_t);
-extern mblk_t *ipif_area_alloc(ipif_t *);
+extern mblk_t *ill_arp_alloc(ill_t *, const uchar_t *, caddr_t);
+extern mblk_t *ipif_area_alloc(ipif_t *, uint_t);
extern mblk_t *ipif_ared_alloc(ipif_t *);
extern mblk_t *ill_ared_alloc(ill_t *, ipaddr_t);
-extern void ill_dlpi_done(ill_t *, t_uscalar_t);
+extern mblk_t *ill_arie_alloc(ill_t *, const char *, const void *);
extern boolean_t ill_dlpi_pending(ill_t *, t_uscalar_t);
+extern void ill_dlpi_done(ill_t *, t_uscalar_t);
extern void ill_dlpi_send(ill_t *, mblk_t *);
extern void ill_dlpi_send_deferred(ill_t *);
extern void ill_capability_done(ill_t *);
extern mblk_t *ill_dlur_gen(uchar_t *, uint_t, t_uscalar_t, t_scalar_t);
-extern ill_t *ill_group_lookup_on_ifindex(uint_t, boolean_t, ip_stack_t *);
-extern ill_t *ill_group_lookup_on_name(char *, boolean_t, ip_stack_t *);
/* NOTE: Keep unmodified ill_lookup_on_ifindex for ipp for now */
extern ill_t *ill_lookup_on_ifindex_global_instance(uint_t, boolean_t,
queue_t *, mblk_t *, ipsq_func_t, int *);
@@ -180,6 +170,7 @@ extern ill_t *ill_lookup_on_name(char *, boolean_t,
extern uint_t ill_get_next_ifindex(uint_t, boolean_t, ip_stack_t *);
extern uint_t ill_get_ifindex_by_name(char *, ip_stack_t *);
extern void ill_ipif_cache_delete(ire_t *, char *);
+extern void ill_stq_cache_delete(ire_t *, char *);
extern void ill_delete(ill_t *);
extern void ill_delete_tail(ill_t *);
extern int ill_dl_phys(ill_t *, ipif_t *, mblk_t *, queue_t *);
@@ -193,9 +184,9 @@ extern void ill_frag_prune(ill_t *, uint_t);
extern void ill_frag_free_pkts(ill_t *, ipfb_t *, ipf_t *, int);
extern time_t ill_frag_timeout(ill_t *, time_t);
extern int ill_init(queue_t *, ill_t *);
-extern int ill_nominate_mcast_rcv(ill_group_t *);
-extern boolean_t ill_setdefaulttoken(ill_t *);
+extern void ill_refresh_bcast(ill_t *);
extern void ill_restart_dad(ill_t *, boolean_t);
+extern boolean_t ill_setdefaulttoken(ill_t *);
extern int ill_set_phys_addr(ill_t *, mblk_t *);
extern void ill_set_ndmp(ill_t *, mblk_t *, uint_t, uint_t);
@@ -222,11 +213,9 @@ extern void ill_capability_reset(ill_t *, boolean_t);
extern void ill_taskq_dispatch(ip_stack_t *);
extern void ill_mtu_change(ire_t *, char *);
-extern void ill_group_cleanup(ill_t *);
-extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *);
-extern boolean_t ill_is_probeonly(ill_t *);
-extern boolean_t ill_hook_event_create(ill_t *, lif_if_t, nic_event_t,
- nic_event_data_t, size_t);
+extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *);
+extern uint_t ill_appaddr_cnt(const ill_t *);
+extern uint_t ill_ptpaddr_cnt(const ill_t *);
extern void ip_loopback_cleanup(ip_stack_t *);
extern void ipif_get_name(const ipif_t *, char *, int);
@@ -239,6 +228,8 @@ extern ipif_t *ipif_lookup_addr_v6(const in6_addr_t *, ill_t *, zoneid_t,
queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *);
extern boolean_t ip_addr_exists_v6(const in6_addr_t *, zoneid_t,
ip_stack_t *);
+extern ipif_t *ipif_lookup_addr_exact_v6(const in6_addr_t *, ill_t *,
+ ip_stack_t *);
extern zoneid_t ipif_lookup_addr_zoneid(ipaddr_t, ill_t *, ip_stack_t *);
extern zoneid_t ipif_lookup_addr_zoneid_v6(const in6_addr_t *, ill_t *,
ip_stack_t *);
@@ -251,31 +242,30 @@ extern ipif_t *ipif_lookup_multicast(ip_stack_t *, zoneid_t, boolean_t);
extern ipif_t *ipif_lookup_remote(ill_t *, ipaddr_t, zoneid_t);
extern ipif_t *ipif_lookup_onlink_addr(ipaddr_t, zoneid_t, ip_stack_t *);
extern ipif_t *ipif_lookup_seqid(ill_t *, uint_t);
-extern boolean_t ipif_lookup_zoneid(ill_t *, zoneid_t, int,
- ipif_t **);
-extern boolean_t ipif_lookup_zoneid_group(ill_t *, zoneid_t, int,
- ipif_t **);
+extern boolean_t ipif_lookup_zoneid(ill_t *, zoneid_t, int, ipif_t **);
extern ipif_t *ipif_select_source(ill_t *, ipaddr_t, zoneid_t);
extern boolean_t ipif_usesrc_avail(ill_t *, zoneid_t);
extern void ipif_refhold(ipif_t *);
extern void ipif_refhold_locked(ipif_t *);
-extern void ipif_refrele(ipif_t *);
+extern void ipif_refrele(ipif_t *);
extern void ipif_all_down_tail(ipsq_t *, queue_t *, mblk_t *, void *);
+extern void ipif_resolver_down(ipif_t *);
extern int ipif_resolver_up(ipif_t *, enum ip_resolver_action);
extern int ipif_arp_setup_multicast(ipif_t *, mblk_t **);
extern int ipif_down(ipif_t *, queue_t *, mblk_t *);
extern void ipif_down_tail(ipif_t *);
+extern void ipif_multicast_down(ipif_t *);
extern void ipif_multicast_up(ipif_t *);
extern void ipif_ndp_down(ipif_t *);
-extern int ipif_ndp_up(ipif_t *);
+extern int ipif_ndp_up(ipif_t *, boolean_t);
extern int ipif_ndp_setup_multicast(ipif_t *, struct nce_s **);
extern int ipif_up_done(ipif_t *);
extern int ipif_up_done_v6(ipif_t *);
extern void ipif_up_notify(ipif_t *);
-extern void ipif_update_other_ipifs_v6(ipif_t *, ill_group_t *);
+extern void ipif_update_other_ipifs_v6(ipif_t *);
extern void ipif_recreate_interface_routes_v6(ipif_t *, ipif_t *);
extern void ill_update_source_selection(ill_t *);
-extern ipif_t *ipif_select_source_v6(ill_t *, const in6_addr_t *, uint_t,
+extern ipif_t *ipif_select_source_v6(ill_t *, const in6_addr_t *, boolean_t,
uint32_t, zoneid_t);
extern boolean_t ipif_cant_setlinklocal(ipif_t *);
extern int ipif_setlinklocal(ipif_t *);
@@ -284,11 +274,8 @@ extern ipif_t *ipif_lookup_on_ifindex(uint_t, boolean_t, zoneid_t, queue_t *,
mblk_t *, ipsq_func_t, int *, ip_stack_t *);
extern ipif_t *ipif_get_next_ipif(ipif_t *curr, ill_t *ill);
extern void ipif_ill_refrele_tail(ill_t *ill);
-extern void ipif_arp_down(ipif_t *ipif);
extern void ipif_mask_reply(ipif_t *);
-
-extern int illgrp_insert(ill_group_t **, ill_t *, char *, ill_group_t *,
- boolean_t);
+extern int ipif_up(ipif_t *, queue_t *, mblk_t *);
extern void ipsq_current_start(ipsq_t *, ipif_t *, int);
extern void ipsq_current_finish(ipsq_t *);
@@ -451,13 +438,13 @@ extern int ip_sioctl_tmyaddr(ipif_t *, sin_t *, queue_t *, mblk_t *,
extern int ip_sioctl_tunparam(ipif_t *, sin_t *, queue_t *, mblk_t *,
ip_ioctl_cmd_t *, void *);
+extern int ip_sioctl_get_binding(ipif_t *, sin_t *, queue_t *,
+ mblk_t *, ip_ioctl_cmd_t *, void *);
extern int ip_sioctl_groupname(ipif_t *, sin_t *, queue_t *,
mblk_t *, ip_ioctl_cmd_t *, void *);
extern int ip_sioctl_get_groupname(ipif_t *, sin_t *, queue_t *,
mblk_t *, ip_ioctl_cmd_t *, void *);
-extern int ip_sioctl_slifoindex(ipif_t *, sin_t *, queue_t *,
- mblk_t *, ip_ioctl_cmd_t *, void *);
-extern int ip_sioctl_get_oindex(ipif_t *, sin_t *, queue_t *,
+extern int ip_sioctl_groupinfo(ipif_t *, sin_t *, queue_t *,
mblk_t *, ip_ioctl_cmd_t *, void *);
extern int ip_sioctl_get_lifzone(ipif_t *, sin_t *, queue_t *,
@@ -473,15 +460,11 @@ extern int ip_sioctl_slifusesrc(ipif_t *, sin_t *, queue_t *,
mblk_t *, ip_ioctl_cmd_t *, void *);
extern int ip_sioctl_get_lifsrcof(ipif_t *, sin_t *, queue_t *,
mblk_t *, ip_ioctl_cmd_t *, void *);
-extern int ip_sioctl_set_ipmpfailback(ipif_t *, sin_t *, queue_t *,
- mblk_t *, ip_ioctl_cmd_t *, void *);
extern void ip_sioctl_copyin_resume(ipsq_t *, queue_t *, mblk_t *, void *);
extern void ip_sioctl_copyin_setup(queue_t *, mblk_t *);
-extern void ip_sioctl_iocack(queue_t *, mblk_t *);
+extern void ip_sioctl_iocack(ipsq_t *, queue_t *, mblk_t *, void *);
extern ip_ioctl_cmd_t *ip_sioctl_lookup(int);
-extern int ip_sioctl_move(ipif_t *, sin_t *, queue_t *, mblk_t *,
- ip_ioctl_cmd_t *, void *);
extern void conn_delete_ire(conn_t *, caddr_t);
diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h
index dae62ab499..369ba60005 100644
--- a/usr/src/uts/common/inet/ip_impl.h
+++ b/usr/src/uts/common/inet/ip_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -394,11 +394,9 @@ typedef struct ip_lso_info_s {
#define CONN_IS_LSO_MD_FASTPATH(connp) \
((connp)->conn_dontroute == 0 && /* SO_DONTROUTE */ \
!((connp)->conn_nexthop_set) && /* IP_NEXTHOP */ \
- (connp)->conn_nofailover_ill == NULL && /* IPIF_NOFAILOVER */ \
- (connp)->conn_outgoing_pill == NULL && /* IP{V6}_BOUND_PIF */ \
(connp)->conn_outgoing_ill == NULL) /* IP{V6}_BOUND_IF */
-/* Definitons for fragmenting IP packets using MDT. */
+/* Definitions for fragmenting IP packets using MDT. */
/*
* Smaller and private version of pdescinfo_t used specifically for IP,
diff --git a/usr/src/uts/common/inet/ip_ire.h b/usr/src/uts/common/inet/ip_ire.h
index 7accbbcfa3..0a9f8add85 100644
--- a/usr/src/uts/common/inet/ip_ire.h
+++ b/usr/src/uts/common/inet/ip_ire.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -86,31 +86,17 @@ extern "C" {
/* return the ire. No recursive */
/* lookup should be done. */
#define MATCH_IRE_IHANDLE 0x0200 /* Match IRE on ihandle */
-#define MATCH_IRE_MARK_HIDDEN 0x0400 /* Match IRE ire_marks with */
- /* IRE_MARK_HIDDEN. */
+#define MATCH_IRE_MARK_TESTHIDDEN 0x0400 /* Match IRE_MARK_TESTHIDDEN IREs */
+
/*
- * MATCH_IRE_ILL is used whenever we want to specifically match an IRE
- * whose ire_ipif->ipif_ill or (ill_t *)ire_stq->q_ptr matches a given
- * ill. When MATCH_IRE_ILL is used to locate an IRE_CACHE, it implies
- * that the packet will not be load balanced. This is normally used
- * by in.mpathd to send out failure detection probes.
- *
- * MATCH_IRE_ILL_GROUP is used whenever we are not specific about which
- * interface (ill) the packet should be sent out. This implies that the
- * packets will be subjected to load balancing and it might go out on
- * any interface in the group. When there is only interface in the group,
- * MATCH_IRE_ILL_GROUP becomes MATCH_IRE_ILL. Most of the code uses
- * MATCH_IRE_ILL_GROUP and MATCH_IRE_ILL is used in very few cases where
- * we want to disable load balancing.
- *
* MATCH_IRE_PARENT is used whenever we unconditionally want to get the
* parent IRE (sire) while recursively searching IREs for an offsubnet
* destination. With this flag, even if no IRE_CACHETABLE or IRE_INTERFACE
* is found to help resolving IRE_OFFSUBNET in lookup routines, the
* IRE_OFFSUBNET sire, if any, is returned to the caller.
*/
-#define MATCH_IRE_ILL_GROUP 0x0800 /* Match IRE on ill or the ill_group. */
-#define MATCH_IRE_ILL 0x1000 /* Match IRE on the ill only */
+/* UNUSED 0x0800 */
+#define MATCH_IRE_ILL 0x1000 /* Match IRE on the ill */
#define MATCH_IRE_PARENT 0x2000 /* Match parent ire, if any, */
/* even if ire is not matched. */
@@ -305,7 +291,7 @@ extern ire_t *ire_ihandle_lookup_onlink(ire_t *);
extern ire_t *ire_ihandle_lookup_offlink(ire_t *, ire_t *);
extern ire_t *ire_ihandle_lookup_offlink_v6(ire_t *, ire_t *);
-extern boolean_t ire_local_same_ill_group(ire_t *, ire_t *);
+extern boolean_t ire_local_same_lan(ire_t *, ire_t *);
extern boolean_t ire_local_ok_across_zones(ire_t *, zoneid_t, void *,
const struct ts_label_s *, ip_stack_t *);
@@ -354,7 +340,7 @@ extern ire_t *ipif_lookup_multi_ire_v6(ipif_t *, const in6_addr_t *);
extern ire_t *ire_get_next_bcast_ire(ire_t *, ire_t *);
extern ire_t *ire_get_next_default_ire(ire_t *, ire_t *);
-extern void ire_arpresolve(ire_t *, ill_t *);
+extern void ire_arpresolve(ire_t *);
extern void ire_freemblk(ire_t *);
extern boolean_t ire_match_args(ire_t *, ipaddr_t, ipaddr_t, ipaddr_t,
int, const ipif_t *, zoneid_t, uint32_t, const struct ts_label_s *, int,
diff --git a/usr/src/uts/common/inet/ip_multi.h b/usr/src/uts/common/inet/ip_multi.h
index a3f4282cc7..7dee133967 100644
--- a/usr/src/uts/common/inet/ip_multi.h
+++ b/usr/src/uts/common/inet/ip_multi.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -49,6 +49,15 @@ typedef enum {
} ilg_stat_t;
/*
+ * Flags shared via ips_mrt_flags, used by mcast_restart_timers_thread().
+ */
+typedef enum {
+ IP_MRT_STOP = 0x1, /* request to stop thread */
+ IP_MRT_DONE = 0x2, /* indication that thread is stopped */
+ IP_MRT_RUN = 0x4 /* request to restart timers */
+} ip_mrt_flags_t;
+
+/*
* Extern functions
*/
extern mblk_t *igmp_input(queue_t *, mblk_t *, ill_t *);
@@ -78,9 +87,7 @@ extern int ip_get_dlpi_mbcast(ill_t *, mblk_t *);
extern void ilm_free(ipif_t *);
extern ilm_t *ilm_lookup_ill(ill_t *, ipaddr_t, zoneid_t);
extern ilm_t *ilm_lookup_ill_v6(ill_t *, const in6_addr_t *,
- zoneid_t);
-extern ilm_t *ilm_lookup_ill_index_v6(ill_t *, const in6_addr_t *,
- int, zoneid_t);
+ boolean_t, zoneid_t);
extern ilm_t *ilm_lookup_ipif(ipif_t *, ipaddr_t);
extern int ilm_numentries_v6(ill_t *, const in6_addr_t *);
@@ -92,10 +99,10 @@ extern int ip_ll_send_enabmulti_req(ill_t *, const in6_addr_t *);
extern int ip_addmulti(ipaddr_t, ipif_t *, ilg_stat_t,
mcast_record_t, slist_t *);
-extern int ip_addmulti_v6(const in6_addr_t *, ill_t *, int,
+extern int ip_addmulti_v6(const in6_addr_t *, ill_t *,
zoneid_t, ilg_stat_t, mcast_record_t, slist_t *);
extern int ip_delmulti(ipaddr_t, ipif_t *, boolean_t, boolean_t);
-extern int ip_delmulti_v6(const in6_addr_t *, ill_t *, int,
+extern int ip_delmulti_v6(const in6_addr_t *, ill_t *,
zoneid_t, boolean_t, boolean_t);
extern int ill_join_allmulti(ill_t *);
extern void ill_leave_allmulti(ill_t *);
@@ -140,9 +147,11 @@ extern void reset_conn_ipif(ipif_t *);
extern void reset_conn_ill(ill_t *);
extern void reset_mrt_ill(ill_t *);
extern void reset_mrt_vif_ipif(ipif_t *);
-extern void igmp_start_timers(unsigned, ip_stack_t *);
-extern void mld_start_timers(unsigned, ip_stack_t *);
+extern void mcast_restart_timers_thread(ip_stack_t *);
extern void ilm_inactive(ilm_t *);
+extern ilm_t *ilm_walker_start(ilm_walker_t *, ill_t *);
+extern ilm_t *ilm_walker_step(ilm_walker_t *, ilm_t *);
+extern void ilm_walker_finish(ilm_walker_t *);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/ip_ndp.h b/usr/src/uts/common/inet/ip_ndp.h
index 4dbb56a884..5eda155c0e 100644
--- a/usr/src/uts/common/inet/ip_ndp.h
+++ b/usr/src/uts/common/inet/ip_ndp.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _INET_IP_NDP_H
#define _INET_IP_NDP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/mutex.h>
#include <sys/stream.h>
#include <netinet/in.h>
@@ -318,7 +316,8 @@ extern nd_opt_hdr_t *ndp_get_option(nd_opt_hdr_t *, int, int);
extern void ndp_inactive(nce_t *);
extern void ndp_input(ill_t *, mblk_t *, mblk_t *);
extern boolean_t ndp_lookup_ipaddr(in_addr_t, netstack_t *);
-extern nce_t *ndp_lookup_v6(ill_t *, const in6_addr_t *, boolean_t);
+extern nce_t *ndp_lookup_v6(ill_t *, boolean_t, const in6_addr_t *,
+ boolean_t);
extern nce_t *ndp_lookup_v4(ill_t *, const in_addr_t *, boolean_t);
extern int ndp_mcastreq(ill_t *, const in6_addr_t *, uint32_t, uint32_t,
mblk_t *);
@@ -346,7 +345,7 @@ extern void nce_fastpath(nce_t *);
extern int ndp_add_v6(ill_t *, uchar_t *, const in6_addr_t *,
const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t,
nce_t **);
-extern int ndp_lookup_then_add_v6(ill_t *, uchar_t *,
+extern int ndp_lookup_then_add_v6(ill_t *, boolean_t, uchar_t *,
const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, uint32_t,
uint16_t, uint16_t, nce_t **);
extern int ndp_lookup_then_add_v4(ill_t *,
diff --git a/usr/src/uts/common/inet/ip_rts.h b/usr/src/uts/common/inet/ip_rts.h
index 70b33e0278..61bc451995 100644
--- a/usr/src/uts/common/inet/ip_rts.h
+++ b/usr/src/uts/common/inet/ip_rts.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -37,19 +37,28 @@ extern "C" {
*/
#define TSOL_RTSA_REQUEST_MAX 1 /* one per route destination */
+/*
+ * Flags for RTS queuing operations.
+ */
+#define RTSQ_UNDER_IPMP 0x01 /* send only on RTAW_UNDER_IPMP queues */
+#define RTSQ_NORMAL 0x02 /* send only on normal queues */
+#define RTSQ_ALL (RTSQ_UNDER_IPMP|RTSQ_NORMAL) /* send on all queues */
+#define RTSQ_DEFAULT 0x04 /* use standard filtering */
+
#ifdef _KERNEL
extern void ip_rts_change(int, ipaddr_t, ipaddr_t,
- ipaddr_t, ipaddr_t, ipaddr_t, int, int,
- int, ip_stack_t *);
+ ipaddr_t, ipaddr_t, ipaddr_t, int, int, int, ip_stack_t *);
extern void ip_rts_change_v6(int, const in6_addr_t *, const in6_addr_t *,
const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, int, int, int,
ip_stack_t *);
-extern void ip_rts_ifmsg(const ipif_t *);
+extern void ip_rts_ifmsg(const ipif_t *, uint_t);
-extern void ip_rts_newaddrmsg(int, int, const ipif_t *);
+extern void ip_rts_xifmsg(const ipif_t *, uint64_t, uint64_t, uint_t);
+
+extern void ip_rts_newaddrmsg(int, int, const ipif_t *, uint_t);
extern int ip_rts_request(queue_t *, mblk_t *, cred_t *);
@@ -70,9 +79,11 @@ extern void rts_fill_msg_v6(int, int, const in6_addr_t *,
extern size_t rts_header_msg_size(int);
-extern void rts_queue_input(mblk_t *, conn_t *, sa_family_t, ip_stack_t *);
+extern void rts_queue_input(mblk_t *, conn_t *, sa_family_t, uint_t,
+ ip_stack_t *);
extern int ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *, cred_t *);
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h
index 3c53e1a3d3..750378f587 100644
--- a/usr/src/uts/common/inet/ip_stack.h
+++ b/usr/src/uts/common/inet/ip_stack.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -33,6 +33,7 @@ extern "C" {
#include <sys/netstack.h>
#include <netinet/igmp_var.h>
+#include <sys/modhash.h>
#ifdef _KERNEL
#include <sys/list.h>
@@ -172,9 +173,6 @@ struct ip_stack {
krwlock_t ips_ill_g_usesrc_lock;
- struct ill_group *ips_illgrp_head_v4; /* Head of IPv4 ill groups */
- struct ill_group *ips_illgrp_head_v6; /* Head of IPv6 ill groups */
-
/* Taskq dispatcher for capability operations */
kmutex_t ips_capab_taskq_lock;
kcondvar_t ips_capab_taskq_cv;
@@ -204,7 +202,6 @@ struct ip_stack {
int ips_igmp_timer_scheduled_last;
int ips_igmp_deferred_next;
timeout_id_t ips_igmp_timeout_id;
- kthread_t *ips_igmp_timer_thread;
boolean_t ips_igmp_timer_setter_active;
/* Following protected by mld_timer_lock */
@@ -212,7 +209,6 @@ struct ip_stack {
int ips_mld_timer_scheduled_last;
int ips_mld_deferred_next;
timeout_id_t ips_mld_timeout_id;
- kthread_t *ips_mld_timer_thread;
boolean_t ips_mld_timer_setter_active;
/* Protected by igmp_slowtimeout_lock */
@@ -269,8 +265,6 @@ struct ip_stack {
int ips_ip_g_forward;
int ips_ipv6_forward;
- int ips_ipmp_hook_emulation; /* ndd variable */
-
time_t ips_ip_g_frag_timeout;
clock_t ips_ip_g_frag_timo_ms;
@@ -280,8 +274,6 @@ struct ip_stack {
clock_t ips_icmp_pkt_err_last;
/* Number of packets sent in burst */
uint_t ips_icmp_pkt_err_sent;
- /* Used by icmp_send_redirect_v6 for picking random src. */
- uint_t ips_icmp_redirect_v6_src_index;
/* Protected by ip_mi_lock */
void *ips_ip_g_head; /* Instance Data List Head */
@@ -356,8 +348,6 @@ struct ip_stack {
kstat_t *ips_loopback_ksp;
- uint_t ips_ipif_src_random;
-
struct idl_s *ips_conn_drain_list; /* Array of conn drain lists */
uint_t ips_conn_drain_list_cnt; /* Count of conn_drain_list */
int ips_conn_drain_list_index; /* Next drain_list */
@@ -375,15 +365,6 @@ struct ip_stack {
uint64_t ips_ipif_g_seqid;
union phyint_list_u *ips_phyint_g_list; /* start of phyint list */
- /*
- * Reflects value of FAILBACK variable in IPMP config file
- * /etc/default/mpathd. Default value is B_TRUE.
- * Set to B_FALSE if user disabled failback by configuring
- * "FAILBACK=no" in.mpathd uses SIOCSIPMPFAILBACK ioctl to pass this
- * information to kernel.
- */
- boolean_t ips_ipmp_enable_failback;
-
/* ip_neti.c */
hook_family_t ips_ipv4root;
hook_family_t ips_ipv6root;
@@ -427,12 +408,25 @@ struct ip_stack {
kcondvar_t ips_ipobs_cb_cv;
struct __ldi_ident *ips_ldi_ident;
+
+/* ipmp.c */
+ krwlock_t ips_ipmp_lock;
+ mod_hash_t *ips_ipmp_grp_hash;
+
+/* igmp.c */
+ /* multicast restart timers thread logic */
+ kmutex_t ips_mrt_lock;
+ uint_t ips_mrt_flags;
+ kcondvar_t ips_mrt_cv;
+ kcondvar_t ips_mrt_done_cv;
+ kthread_t *ips_mrt_thread;
};
typedef struct ip_stack ip_stack_t;
/* Finding an ip_stack_t */
#define CONNQ_TO_IPST(_q) (Q_TO_CONN(_q)->conn_netstack->netstack_ip)
#define ILLQ_TO_IPST(_q) (((ill_t *)(_q)->q_ptr)->ill_ipst)
+#define PHYINT_TO_IPST(phyi) ((phyi)->phyint_ipsq->ipsq_ipst)
#else /* _KERNEL */
typedef int ip_stack_t;
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h
index 5fb86a5262..d80123a977 100644
--- a/usr/src/uts/common/inet/ipclassifier.h
+++ b/usr/src/uts/common/inet/ipclassifier.h
@@ -249,7 +249,6 @@ struct conn_s {
squeue_t *conn_initial_sqp; /* Squeue at open time */
squeue_t *conn_final_sqp; /* Squeue after connect */
- ill_t *conn_nofailover_ill; /* Failover ill */
ill_t *conn_dhcpinit_ill; /* IP_DHCPINIT_IF */
ipsec_latch_t *conn_latch; /* latched state */
ill_t *conn_outgoing_ill; /* IP{,V6}_BOUND_IF */
@@ -295,7 +294,6 @@ struct conn_s {
uint_t conn_proto; /* SO_PROTOTYPE state */
ill_t *conn_incoming_ill; /* IP{,V6}_BOUND_IF */
- ill_t *conn_outgoing_pill; /* IP{,V6}_BOUND_PIF */
ill_t *conn_oper_pending_ill; /* pending shared ioctl */
ilg_t *conn_ilg; /* Group memberships */
@@ -307,9 +305,6 @@ struct conn_s {
struct ipif_s *conn_multicast_ipif; /* IP_MULTICAST_IF */
ill_t *conn_multicast_ill; /* IPV6_MULTICAST_IF */
- int conn_orig_bound_ifindex; /* BOUND_IF before MOVE */
- int conn_orig_multicast_ifindex;
- /* IPv6 MC IF before MOVE */
struct conn_s *conn_drain_next; /* Next conn in drain list */
struct conn_s *conn_drain_prev; /* Prev conn in drain list */
idl_t *conn_idl; /* Ptr to the drain list head */
@@ -322,7 +317,7 @@ struct conn_s {
uchar_t conn_broadcast_ttl; /* IP_BROADCAST_TTL */
#define conn_nexthop_v4 V4_PART_OF_V6(conn_nexthop_v6)
cred_t *conn_peercred; /* Peer credentials, if any */
-
+ int conn_rtaware; /* RT_AWARE sockopt value */
kcondvar_t conn_sq_cv; /* For non-STREAMS socket IO */
kthread_t *conn_sq_caller; /* Caller of squeue sync ops */
sock_upcalls_t *conn_upcalls; /* Upcalls to sockfs */
diff --git a/usr/src/uts/common/inet/ipnet/ipnet.c b/usr/src/uts/common/inet/ipnet/ipnet.c
index 577205f25a..e94af50424 100644
--- a/usr/src/uts/common/inet/ipnet/ipnet.c
+++ b/usr/src/uts/common/inet/ipnet/ipnet.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -229,16 +229,19 @@ ipnet_if_init(void)
int
_init(void)
{
- int ret;
+ int ret;
+ boolean_t netstack_registered = B_FALSE;
if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
return (ENODEV);
ipnet_minor_space = id_space_create("ipnet_minor_space",
IPNET_MINOR_MIN, MAXMIN32);
- netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
+
/*
* We call ddi_taskq_create() with nthread == 1 to ensure in-order
- * delivery of packets to clients.
+ * delivery of packets to clients. Note that we need to create the
+ * taskqs before calling netstack_register() since ipnet_stack_init()
+ * registers callbacks that use 'em.
*/
ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
@@ -247,6 +250,10 @@ _init(void)
ret = ENOMEM;
goto done;
}
+
+ netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
+ netstack_registered = B_TRUE;
+
if ((ret = ipnet_if_init()) == 0)
ret = mod_install(&modlinkage);
done:
@@ -255,7 +262,8 @@ done:
ddi_taskq_destroy(ipnet_taskq);
if (ipnet_nicevent_taskq != NULL)
ddi_taskq_destroy(ipnet_nicevent_taskq);
- netstack_unregister(NS_IPNET);
+ if (netstack_registered)
+ netstack_unregister(NS_IPNET);
id_space_destroy(ipnet_minor_space);
}
return (ret);
@@ -268,9 +276,10 @@ _fini(void)
if ((err = mod_remove(&modlinkage)) != 0)
return (err);
+
+ netstack_unregister(NS_IPNET);
ddi_taskq_destroy(ipnet_nicevent_taskq);
ddi_taskq_destroy(ipnet_taskq);
- netstack_unregister(NS_IPNET);
id_space_destroy(ipnet_minor_space);
return (0);
}
@@ -987,6 +996,7 @@ static boolean_t
ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src,
ipnet_addrp_t *dst)
{
+ boolean_t obsif;
uint64_t ifindex = ipnet->ipnet_if->if_index;
ipnet_addrtype_t srctype, dsttype;
@@ -994,6 +1004,13 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src,
dsttype = ipnet_get_addrtype(ipnet, dst);
/*
+ * If the packet's ifindex matches ours, or the packet's group ifindex
+ * matches ours, it's on the interface we're observing. (Thus,
+ * observing on the group ifindex matches all ifindexes in the group.)
+ */
+ obsif = (ihd->ihd_ifindex == ifindex || ihd->ihd_grifindex == ifindex);
+
+ /*
* Do not allow an ipnet stream to see packets that are not from or to
* its zone. The exception is when zones are using the shared stack
* model. In this case, streams in the global zone have visibility
@@ -1025,7 +1042,7 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src,
* have our source address (this allows us to see packets we send).
*/
if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
- if (ihd->ihd_ifindex == ifindex || srctype == IPNETADDR_MYADDR)
+ if (srctype == IPNETADDR_MYADDR || obsif)
return (B_TRUE);
}
@@ -1033,7 +1050,7 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src,
* We accept multicast and broadcast packets transmitted or received
* on the interface we're observing.
*/
- if (dsttype == IPNETADDR_MBCAST && ihd->ihd_ifindex == ifindex)
+ if (dsttype == IPNETADDR_MBCAST && obsif)
return (B_TRUE);
return (B_FALSE);
diff --git a/usr/src/uts/common/inet/ipsec_info.h b/usr/src/uts/common/inet/ipsec_info.h
index b014bdade0..0348e10b91 100644
--- a/usr/src/uts/common/inet/ipsec_info.h
+++ b/usr/src/uts/common/inet/ipsec_info.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _INET_IPSEC_INFO_H
#define _INET_IPSEC_INFO_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -114,12 +112,11 @@ typedef struct ipsec_in_s {
ipsec_in_decaps : 1, /* Was this packet decapsulated from */
/* a matching inner packet? */
- ipsec_in_attach_if : 1, /* Don't load spread this packet */
ipsec_in_accelerated : 1, /* hardware accelerated packet */
ipsec_in_icmp_loopback : 1, /* Looped-back ICMP packet, */
/* all should trust this. */
- ipsec_in_pad_bits : 24;
+ ipsec_in_pad_bits : 25;
int ipsec_in_ill_index; /* interface on which ipha_dst was */
/* configured when pkt was recv'd */
@@ -197,12 +194,11 @@ typedef struct ipsec_out_s {
ipsec_out_reserved : 1,
ipsec_out_v4 : 1,
- ipsec_out_attach_if : 1,
ipsec_out_unspec_src : 1, /* IPv6 ip6i_t info */
ipsec_out_reachable : 1, /* NDP reachability info */
ipsec_out_failed: 1,
-
ipsec_out_se_done: 1,
+
ipsec_out_esp_done: 1,
ipsec_out_ah_done: 1,
ipsec_out_need_policy: 1,
@@ -225,7 +221,7 @@ typedef struct ipsec_out_s {
*/
ipsec_out_icmp_loopback: 1,
ipsec_out_ip_nexthop : 1, /* IP_NEXTHOP option is set */
- ipsec_out_pad_bits : 12;
+ ipsec_out_pad_bits : 13;
cred_t *ipsec_out_cred;
uint32_t ipsec_out_capab_ill_index;
diff --git a/usr/src/uts/common/inet/mib2.h b/usr/src/uts/common/inet/mib2.h
index 5abfc06581..a467abaee9 100644
--- a/usr/src/uts/common/inet/mib2.h
+++ b/usr/src/uts/common/inet/mib2.h
@@ -17,9 +17,8 @@
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -27,8 +26,6 @@
#ifndef _INET_MIB2_H
#define _INET_MIB2_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <netinet/in.h> /* For in6_addr_t */
#include <sys/tsol/label.h> /* For brange_t */
#include <sys/tsol/label_macro.h> /* For brange_t */
@@ -65,9 +62,14 @@ extern "C" {
* #define OPTLEN(x) ((((x) + sizeof(long) - 1) / sizeof(long)) * sizeof(long))
* #define OPTVAL(opt) ((char *)(opt + 1))
*
- * For get requests (T_NEGOTIATE), any MIB2_xxx value can be used (only
+ * For get requests (T_CURRENT), any MIB2_xxx value can be used (only
* "get all" is supported, so all modules get a copy of the request to
- * return everything it knows. Recommend: Use MIB2_IP
+ * return everything it knows. In general, we use MIB2_IP. There is
+ * one exception: in general, IP will not report information related to
+ * IRE_MARK_TESTHIDDEN routes (e.g., in the MIB2_IP_ROUTE table).
+ * However, using the special value EXPER_IP_AND_TESTHIDDEN will cause
+ * all information to be reported. This special value should only be
+ * used by IPMP-aware low-level utilities (e.g. in.mpathd).
*
* IMPORTANT: some fields are grouped in a different structure than
* suggested by MIB-II, e.g., checksum error counts. The original MIB-2
@@ -79,7 +81,6 @@ extern "C" {
#define IPPROTO_MAX 256
#endif
-
#define MIB2_SYSTEM (IPPROTO_MAX+1)
#define MIB2_INTERFACES (IPPROTO_MAX+2)
#define MIB2_AT (IPPROTO_MAX+3)
@@ -108,12 +109,13 @@ extern "C" {
#define EXPER_IGMP (EXPER+1)
#define EXPER_DVMRP (EXPER+2)
#define EXPER_RAWIP (EXPER+3)
+#define EXPER_IP_AND_TESTHIDDEN (EXPER+4)
/*
* Define range of levels for experimental use
*/
#define EXPER_RANGE_START (EXPER+1)
-#define EXPER_RANGE_END (EXPER+3)
+#define EXPER_RANGE_END (EXPER+4)
#define BUMP_MIB(s, x) { \
extern void __dtrace_probe___mib_##x(int, void *); \
diff --git a/usr/src/uts/common/inet/sctp/sctp_addr.c b/usr/src/uts/common/inet/sctp/sctp_addr.c
index 1761396031..94cc8e8883 100644
--- a/usr/src/uts/common/inet/sctp/sctp_addr.c
+++ b/usr/src/uts/common/inet/sctp/sctp_addr.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/stream.h>
diff --git a/usr/src/uts/common/inet/sctp_ip.h b/usr/src/uts/common/inet/sctp_ip.h
index 16ab99abab..7b20d3fd2b 100644
--- a/usr/src/uts/common/inet/sctp_ip.h
+++ b/usr/src/uts/common/inet/sctp_ip.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _INET_SCTP_IP_H
#define _INET_SCTP_IP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 488f8ee021..68e0883222 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -31,7 +31,6 @@
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
-#include <sys/strsun.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/timod.h>
@@ -4683,18 +4682,10 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
/* ifindex must be already set */
ASSERT(ifindex != 0);
- if (ltcp->tcp_bound_if != 0) {
- /*
- * Set newtcp's bound_if equal to
- * listener's value. If ifindex is
- * not the same as ltcp->tcp_bound_if,
- * it must be a packet for the ipmp group
- * of interfaces
- */
+ if (ltcp->tcp_bound_if != 0)
tcp->tcp_bound_if = ltcp->tcp_bound_if;
- } else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
+ else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
tcp->tcp_bound_if = ifindex;
- }
tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary;
tcp->tcp_recvifindex = 0;
@@ -10716,9 +10707,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
ipp->ipp_fields |= IPPF_USE_MIN_MTU;
ipp->ipp_use_min_mtu = *i1;
break;
- case IPV6_BOUND_PIF:
- /* Handled at the IP level */
- return (-EINVAL);
case IPV6_SEC_OPT:
/*
* We should not allow policy setting after
@@ -18895,7 +18883,6 @@ tcp_zcopy_check(tcp_t *tcp)
connp->conn_dontroute == 0 &&
!connp->conn_nexthop_set &&
connp->conn_outgoing_ill == NULL &&
- connp->conn_nofailover_ill == NULL &&
do_tcpzcopy == 1) {
/*
* the checks above closely resemble the fast path checks
@@ -19139,7 +19126,6 @@ tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp)
ipaddr_t dst;
ire_t *ire;
ill_t *ill;
- conn_t *connp = tcp->tcp_connp;
mblk_t *ire_fp_mp;
tcp_stack_t *tcps = tcp->tcp_tcps;
@@ -19164,14 +19150,6 @@ tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp)
}
ill = ire_to_ill(ire);
- if (connp->conn_outgoing_ill != NULL) {
- ill_t *conn_outgoing_ill = NULL;
- /*
- * Choose a good ill in the group to send the packets on.
- */
- ire = conn_set_outgoing_ill(connp, ire, &conn_outgoing_ill);
- ill = ire_to_ill(ire);
- }
ASSERT(ill != NULL);
if (!tcp->tcp_ire_ill_check_done) {
diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c
index 15b5d04d61..8c8eee3b58 100644
--- a/usr/src/uts/common/inet/tcp/tcp_fusion.c
+++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -35,6 +35,7 @@
#include <inet/common.h>
#include <inet/optcom.h>
#include <inet/ip.h>
+#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
index d977c27e53..e2314f8104 100644
--- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -151,9 +151,6 @@ opdes_t tcp_opt_arr[] = {
{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT,
sizeof (in_addr_t), -1 /* not initialized */ },
-{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
- sizeof (int), 0 /* no ifindex */ },
-
{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
sizeof (int), 0 },
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index 7c9433caa0..1178315cb5 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -80,6 +80,7 @@
#include <inet/ipp_common.h>
#include <sys/squeue_impl.h>
#include <inet/ipnet.h>
+#include <sys/ethernet.h>
/*
* The ipsec_info.h header file is here since it has the definition for the
@@ -2141,7 +2142,6 @@ udp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
case MCAST_UNBLOCK_SOURCE:
case MCAST_JOIN_SOURCE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
- case IP_DONTFAILOVER_IF:
/* cannot "get" the value for these */
return (-1);
case IP_BOUND_IF:
@@ -3152,9 +3152,7 @@ udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
ipp->ipp_use_min_mtu = *i1;
break;
- case IPV6_BOUND_PIF:
case IPV6_SEC_OPT:
- case IPV6_DONTFAILOVER_IF:
case IPV6_SRC_PREFERENCES:
case IPV6_V6ONLY:
/* Handled at the IP level */
@@ -5351,7 +5349,6 @@ udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port,
if ((connp->conn_flags & IPCL_CHECK_POLICY) != 0 ||
CONN_OUTBOUND_POLICY_PRESENT(connp, ipss) ||
connp->conn_dontroute ||
- connp->conn_nofailover_ill != NULL ||
connp->conn_outgoing_ill != NULL || optinfo.ip_opt_flags != 0 ||
optinfo.ip_opt_ill_index != 0 ||
ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION ||
@@ -5419,8 +5416,7 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha)
ill_t *stq_ill = (ill_t *)ire->ire_stq->q_ptr;
ASSERT(ipif != NULL);
- if (stq_ill != ipif->ipif_ill && (stq_ill->ill_group == NULL ||
- stq_ill->ill_group != ipif->ipif_ill->ill_group))
+ if (!IS_ON_SAME_LAN(stq_ill, ipif->ipif_ill))
retry_caching = B_TRUE;
}
@@ -5444,7 +5440,7 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha)
ASSERT(ipif != NULL);
ire = ire_ctable_lookup(dst, 0, 0, ipif,
connp->conn_zoneid, MBLK_GETLABEL(mp),
- MATCH_IRE_ILL_GROUP, ipst);
+ MATCH_IRE_ILL, ipst);
} else {
ASSERT(ipif == NULL);
ire = ire_cache_lookup(dst, connp->conn_zoneid,
@@ -5622,12 +5618,7 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid)
}
if (CLASSD(dst)) {
- boolean_t ilm_exists;
-
- ILM_WALKER_HOLD(ill);
- ilm_exists = (ilm_lookup_ill(ill, dst, ALL_ZONES) != NULL);
- ILM_WALKER_RELE(ill);
- if (ilm_exists) {
+ if (ilm_lookup_ill(ill, dst, ALL_ZONES) != NULL) {
ip_multicast_loopback(q, ill, mp,
connp->conn_multicast_loop ? 0 :
IP_FF_NO_MCAST_LOOP, zoneid);
diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c
index 0ec5a2c45e..65729b82f1 100644
--- a/usr/src/uts/common/inet/udp/udp_opt_data.c
+++ b/usr/src/uts/common/inet/udp/udp_opt_data.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -132,9 +132,6 @@ opdes_t udp_opt_arr[] = {
{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
sizeof (int), 0 /* no ifindex */ },
-{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
- sizeof (struct in_addr), 0 /* not initialized */ },
-
{ IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT,
sizeof (int), 0 },
@@ -191,12 +188,6 @@ opdes_t udp_opt_arr[] = {
{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
sizeof (int), 0 /* no ifindex */ },
-{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
- sizeof (int), 0 /* no ifindex */ },
-
-{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
- sizeof (int), 0 /* no ifindex */ },
-
{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
sizeof (int), 0 },
diff --git a/usr/src/uts/common/inet/vni/vni.c b/usr/src/uts/common/inet/vni/vni.c
deleted file mode 100644
index a370a7b4be..0000000000
--- a/usr/src/uts/common/inet/vni/vni.c
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-
-#include "vni_impl.h"
-#include <sys/conf.h>
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-#include <sys/dlpi.h>
-#include <sys/stat.h>
-#include <sys/ethernet.h>
-#include <sys/strsun.h>
-#include <sys/stropts.h>
-
-static int vniopen(queue_t *, dev_t *, int, int, cred_t *);
-static int vniclose(queue_t *, int, cred_t *);
-static int vniwput(queue_t *, mblk_t *);
-static int vniattach(dev_info_t *, ddi_attach_cmd_t);
-static int vnidetach(dev_info_t *, ddi_detach_cmd_t);
-
-static struct module_info minfo = {
- VNIIDNUM, /* mi_idnum */
- VNINAME, /* mi_idname */
- VNIMINPSZ, /* mi_minpsz */
- VNIMAXPSZ, /* mi_maxpsz */
- VNIHIWAT, /* mi_hiwat */
- VNILOWAT /* mi_lowat */
-};
-
-static struct qinit vnirinit = {
- NULL, /* qi_putp */
- NULL, /* qi_srvp */
- vniopen, /* qi_qopen */
- vniclose, /* qi_qclose */
- NULL, /* qi_qadmin */
- &minfo, /* qi_minfo */
- NULL /* qi_mstat */
-};
-
-static struct qinit vniwinit = {
- vniwput, /* qi_putp */
- NULL, /* qi_srvp */
- NULL, /* qi_qopen */
- NULL, /* qi_qclose */
- NULL, /* qi_qadmin */
- &minfo, /* qi_minfo */
- NULL /* qi_mstat */
-};
-
-static struct streamtab vni_info = {
- &vnirinit, /* st_rdinit */
- &vniwinit, /* st_wrinit */
- NULL, /* st_muxrinit */
- NULL /* st_muxwrinit */
-};
-
-DDI_DEFINE_STREAM_OPS(vni_ops, nulldev, nulldev, vniattach, \
- vnidetach, nodev, nodev, VNIFLAGS, &vni_info, ddi_quiesce_not_supported);
-
-static struct modldrv modldrv = {
- &mod_driverops,
- "Virtual network interface",
- &vni_ops,
-};
-
-static struct modlinkage modlinkage = {
- MODREV_1, &modldrv, NULL
-};
-
-static vni_str_t *vni_strlist_head;
-
-/*
- * DL_INFO_ACK template for VNI pseudo interface.
- */
-static dl_info_ack_t dlvni_infoack = {
- DL_INFO_ACK, /* dl_primitive */
- 0, /* dl_max_sdu */
- 0, /* dl_min_sdu */
- 0, /* dl_addr_length */
- SUNW_DL_VNI, /* dl_mac_type */
- 0, /* dl_reserved */
- 0, /* dl_current_state */
- 0, /* dl_sap_length */
- DL_CLDLS, /* dl_service_mode */
- 0, /* dl_qos_length */
- 0, /* dl_qos_offset */
- 0, /* dl_range_length */
- 0, /* dl_range_offset */
- DL_STYLE2, /* dl_provider_style */
- 0, /* dl_addr_offset */
- DL_VERSION_2, /* dl_version */
- 0, /* dl_brdcst_addr_length */
- 0, /* dl_brdcst_addr_offset */
- 0 /* dl_growth */
-};
-
-int
-_init(void)
-{
- return (mod_install(&modlinkage));
-}
-
-int
-_fini(void)
-{
- return (mod_remove(&modlinkage));
-}
-
-int
-_info(struct modinfo *modinfop)
-{
- return (mod_info(&modlinkage, modinfop));
-}
-
-static int
-vniattach(dev_info_t *devi, ddi_attach_cmd_t cmd)
-{
- if (cmd != DDI_ATTACH) {
- cmn_err(CE_NOTE, "vniattach failure: cmd != DDI_ATTACH\n");
- return (DDI_FAILURE);
- }
-
- if (ddi_create_minor_node(devi, VNINAME, S_IFCHR,
- ddi_get_instance(devi), DDI_PSEUDO, CLONE_DEV) ==
- DDI_FAILURE) {
- ddi_remove_minor_node(devi, NULL);
- cmn_err(CE_NOTE, "vniattach failure: ddi_create_minor_node\n");
- return (DDI_FAILURE);
- }
-
- return (DDI_SUCCESS);
-}
-
-static int
-vnidetach(dev_info_t *devi, ddi_detach_cmd_t cmd)
-{
- if (cmd != DDI_DETACH)
- return (DDI_FAILURE);
-
- ddi_remove_minor_node(devi, NULL);
- return (DDI_SUCCESS);
-}
-
-/* ARGSUSED */
-static int
-vniopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
-{
- vni_str_t *stp, *prevstp;
- minor_t minordev = 0;
-
- if (sflag != CLONEOPEN)
- return (EINVAL);
-
- prevstp = NULL;
-
- for (stp = vni_strlist_head; stp != NULL; stp = stp->st_next) {
- if (minordev < stp->st_minor)
- break;
- minordev++;
- prevstp = stp;
- }
-
- stp = kmem_zalloc(sizeof (vni_str_t), KM_SLEEP);
-
- *devp = makedevice(getmajor(*devp), minordev);
-
- stp->st_minor = minordev;
- stp->st_state = DL_UNATTACHED;
- stp->st_next = NULL;
-
- q->q_ptr = stp;
- WR(q)->q_ptr = stp;
-
- if (prevstp != NULL) {
- stp->st_next = prevstp->st_next;
- prevstp->st_next = stp;
- } else {
- stp->st_next = vni_strlist_head;
- vni_strlist_head = stp;
- }
-
- qprocson(q);
- return (0);
-}
-
-/* ARGSUSED */
-static int
-vniclose(queue_t *q, int flag, cred_t *credp)
-{
- vni_str_t *stp, **prevstpp;
-
- qprocsoff(q);
- stp = (vni_str_t *)q->q_ptr;
- stp->st_state = DL_UNATTACHED;
-
- /* Unlink the per-stream entry from the list and free it */
- stp = vni_strlist_head;
- prevstpp = &vni_strlist_head;
-
- for (; stp != NULL; stp = stp->st_next) {
- if (stp == (vni_str_t *)q->q_ptr)
- break;
- prevstpp = &stp->st_next;
- }
-
- ASSERT(stp != NULL);
-
- *prevstpp = stp->st_next;
-
- kmem_free(stp, sizeof (vni_str_t));
-
- q->q_ptr = WR(q)->q_ptr = NULL;
- return (0);
-}
-
-static int
-vniwput(queue_t *q, mblk_t *mp)
-{
- union DL_primitives *dlp;
- vni_str_t *stp;
- dl_info_ack_t *dlip;
- t_scalar_t prim;
-
- stp = q->q_ptr;
-
- switch ((mp)->b_datap->db_type) {
- case M_PROTO:
- case M_PCPROTO:
- if (MBLKL(mp) < sizeof (t_scalar_t)) {
- dlerrorack(q, mp, DL_PRIM_INVAL, DL_UNSUPPORTED, 0);
- return (0);
- }
- dlp = (void *)mp->b_rptr;
- prim = dlp->dl_primitive;
- switch (prim) {
- case DL_ATTACH_REQ:
- if (MBLKL(mp) < DL_ATTACH_REQ_SIZE) {
- dlerrorack(q, mp, DL_ATTACH_REQ, DL_BADPRIM, 0);
- return (0);
- }
- if (stp->st_state != DL_UNATTACHED) {
- dlerrorack(q, mp, DL_ATTACH_REQ, DL_OUTSTATE,
- 0);
- return (0);
- }
- stp->st_ppa = dlp->attach_req.dl_ppa;
- stp->st_state = DL_UNBOUND;
- dlokack(q, mp, DL_ATTACH_REQ);
- break;
- case DL_BIND_REQ:
- if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
- dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
- return (0);
- }
- if (stp->st_state != DL_UNBOUND) {
- dlerrorack(q, mp, DL_BIND_REQ, DL_OUTSTATE, 0);
- return (0);
- }
- stp->st_state = DL_IDLE;
- dlbindack(q, mp, dlp->bind_req.dl_sap, NULL, 0, 0, 0);
- break;
- case DL_INFO_REQ:
- if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
- dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
- return (0);
- }
- if ((mp = mexchange(q, mp, sizeof (dl_info_ack_t),
- M_PCPROTO, DL_INFO_ACK)) == NULL) {
- return (0);
- }
- dlip = (void *)mp->b_rptr;
- *dlip = dlvni_infoack;
- dlip->dl_current_state = stp->st_state;
- qreply(q, mp);
- break;
- case DL_PHYS_ADDR_REQ:
- if (MBLKL(mp) < DL_PHYS_ADDR_REQ_SIZE) {
- dlerrorack(q, mp, DL_PHYS_ADDR_REQ, DL_BADPRIM,
- 0);
- return (0);
- }
- dlphysaddrack(q, mp, NULL, 0);
- break;
- case DL_UNBIND_REQ:
- if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
- dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
- return (0);
- }
- if (stp->st_state != DL_IDLE) {
- dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE,
- 0);
- return (0);
- }
- /* Nothing to flush. But DLPI spec says to; so do it */
- flushq(q, FLUSHALL);
- flushq(RD(q), FLUSHALL);
- stp->st_state = DL_UNBOUND;
- dlokack(q, mp, DL_UNBIND_REQ);
- break;
- case DL_DETACH_REQ:
- if (MBLKL(mp) < DL_DETACH_REQ_SIZE) {
- dlerrorack(q, mp, DL_DETACH_REQ, DL_BADPRIM, 0);
- return (0);
- }
- if (stp->st_state != DL_UNBOUND) {
- dlerrorack(q, mp, DL_DETACH_REQ, DL_OUTSTATE,
- 0);
- return (0);
- }
- stp->st_state = DL_UNATTACHED;
- dlokack(q, mp, DL_DETACH_REQ);
- break;
- default:
- dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
- }
- break;
- case M_IOCTL:
- /*
- * No ioctl's currently supported. Need to have the NAK since
- * ifconfig calls SIOCGTUNPARAM during the end of plumb
- */
- miocnak(q, mp, 0, EINVAL);
- break;
- case M_FLUSH:
- /* Really nothing to flush since no msgs enqueued */
- if (*mp->b_rptr & FLUSHR) {
- qreply(q, mp);
- } else {
- freemsg(mp);
- }
- break;
- default:
- freemsg(mp);
- break;
- }
- return (0);
-}
diff --git a/usr/src/uts/common/inet/vni/vni_impl.h b/usr/src/uts/common/inet/vni/vni_impl.h
deleted file mode 100644
index ffba1b08bf..0000000000
--- a/usr/src/uts/common/inet/vni/vni_impl.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _INET_VNI_IMPL_H
-#define _INET_VNI_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/modctl.h>
-#include <sys/stream.h>
-
-typedef struct vni_str {
- struct vni_str *st_next; /* next in list */
- t_uscalar_t st_state; /* DLPI state */
- minor_t st_minor; /* corresponding minor */
- uint32_t st_ppa; /* physical point of attachment */
-} vni_str_t;
-
-#define DL_MAXPRIM DL_GET_STATISTICS_ACK
-#define VNIIDNUM 0x2a84
-#define VNINAME "vni"
-#define VNIFLAGS (D_MP|D_MTPERMOD)
-#define VNIHIWAT 1024
-#define VNILOWAT 512
-#define VNIMINPSZ 0
-#define VNIMAXPSZ INFPSZ
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _INET_VNI_IMPL_H */
diff --git a/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c b/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c
index 69feb36606..03d82fbcab 100644
--- a/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c
+++ b/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c
@@ -19,14 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#define AF_INET_OFFLOAD 30
-
#include <sys/sockio.h>
#include <sys/stream.h>
#include <sys/errno.h>
@@ -34,27 +30,24 @@
#include <sys/strsun.h>
#include <inet/common.h>
#include <net/if.h>
+#include <net/if_types.h>
#include <inet/mi.h>
#include <sys/t_kuser.h>
#include <sys/stropts.h>
#include <sys/pathname.h>
#include <sys/kstr.h>
#include <sys/timod.h>
+#include <sys/sunddi.h>
#include <sys/ib/clients/rds/rds.h>
#include <sys/ib/clients/rds/rds_transport.h>
static sin_t sin_null; /* Zero address for quick clears */
-#define isdigit(ch) ((ch) >= '0' && (ch) <= '9')
-
-#define isalpha(ch) (((ch) >= 'a' && (ch) <= 'z') || \
- ((ch) >= 'A' && (ch) <= 'Z'))
-
/*
* Just pass the ioctl to IP and the result to the caller.
*/
int
-rds_do_ip_ioctl(int cmd, int len, caddr_t arg)
+rds_do_ip_ioctl(int cmd, int len, void *arg)
{
vnode_t *kvp, *vp;
TIUSER *tiptr;
@@ -62,8 +55,7 @@ rds_do_ip_ioctl(int cmd, int len, caddr_t arg)
k_sigset_t smask;
int err = 0;
- if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP,
- &kvp) == 0) {
+ if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
&tiptr, CRED()) == 0) {
vp = tiptr->fp->f_vnode;
@@ -72,13 +64,13 @@ rds_do_ip_ioctl(int cmd, int len, caddr_t arg)
return (EPROTO);
}
} else {
- return (EPROTO);
+ return (EPROTO);
}
iocb.ic_cmd = cmd;
iocb.ic_timout = 0;
iocb.ic_len = len;
- iocb.ic_dp = arg;
+ iocb.ic_dp = (caddr_t)arg;
sigintr(&smask, 0);
err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
sigunintr(&smask);
@@ -88,197 +80,166 @@ rds_do_ip_ioctl(int cmd, int len, caddr_t arg)
}
/*
- * Return 0 if the interface is IB.
- * Return error (>0) if any error is encountered during processing.
- * Return -1 if the interface is not IB and no error.
+ * Check if the IP interface named by `lifrp' is RDS-capable.
*/
-static int
-rds_is_ib_interface(char *name)
+static boolean_t
+rds_capable_interface(struct lifreq *lifrp)
{
+ char ifname[LIFNAMSIZ];
+ char drv[MAXLINKNAMELEN];
+ uint_t ppa;
+ char *cp;
- char dev_path[MAXPATHLEN];
- char devname[MAXNAMELEN];
- ldi_handle_t lh;
- dl_info_ack_t info;
- int ret = 0;
- int i;
- k_sigset_t smask;
+ if (lifrp->lifr_type == IFT_IB)
+ return (B_TRUE);
/*
- * ibd devices are only style 2 devices
- * so we will open only style 2 devices
- * by ignoring the ppa
+ * Strip off the logical interface portion before getting
+ * intimate with the name.
*/
- i = strlen(name) - 1;
- while ((i >= 0) && (!isalpha(name[i]))) i--;
- if (i < 0) {
- /* Invalid interface name, no alphabet */
- return (-1);
- }
- (void) strncpy(devname, name, i + 1);
- devname[i + 1] = '\0';
+ (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
+ if ((cp = strchr(ifname, ':')) != NULL)
+ *cp = '\0';
- if (strcmp("lo", devname) == 0) {
+ if (strcmp("lo0", ifname) == 0) {
/*
- * loopback interface is considered RDS capable
+ * loopback is considered RDS-capable
*/
- return (0);
+ return (B_TRUE);
}
- (void) strncpy(dev_path, "/dev/", MAXPATHLEN);
- if (strlcat(dev_path, devname, MAXPATHLEN) >= MAXPATHLEN) {
- /* string overflow */
- return (-1);
- }
+ return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS &&
+ rds_transport_ops->rds_transport_if_lookup_by_name(drv));
+}
- ret = ldi_open_by_name(dev_path, FREAD|FWRITE, kcred, &lh, rds_li);
- if (ret != 0) {
- return (ret);
- }
+/*
+ * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
+ * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
+ */
+static int
+rds_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
+{
+ int err;
+ int nifs;
- sigintr(&smask, 0);
- ret = dl_info(lh, &info, NULL, NULL, NULL);
- sigunintr(&smask);
- (void) ldi_close(lh, FREAD|FWRITE, kcred);
- if (ret != 0) {
- return (ret);
- }
+ if ((err = rds_do_ip_ioctl(SIOCGIFNUM, sizeof (int), &nifs)) != 0)
+ return (err);
- if (info.dl_mac_type != DL_IB &&
- !rds_transport_ops->rds_transport_if_lookup_by_name(devname)) {
- return (-1);
+ /*
+ * Pad the interface count to account for additional interfaces that
+ * may have been configured between the SIOCGIFNUM and SIOCGLIFCONF.
+ */
+ nifs += 4;
+
+ bzero(lifcp, sizeof (struct lifconf));
+ lifcp->lifc_family = AF_INET;
+ lifcp->lifc_len = *bufsizep = (nifs * sizeof (struct lifreq));
+ lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_NOSLEEP);
+ if (lifcp->lifc_buf == NULL)
+ return (ENOMEM);
+
+ err = rds_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
+ if (err != 0) {
+ kmem_free(lifcp->lifc_buf, *bufsizep);
+ return (err);
}
-
return (0);
}
void
rds_ioctl_copyin_done(queue_t *q, mblk_t *mp)
{
- char *addr;
+ void *addr;
mblk_t *mp1;
int err = 0;
- struct iocblk *iocp = (struct iocblk *)(uintptr_t)mp->b_rptr;
+ struct iocblk *iocp = (void *)mp->b_rptr;
if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) {
err = EPROTO;
goto done;
}
- addr = (char *)mp1->b_rptr;
+ addr = mp1->b_rptr;
switch (iocp->ioc_cmd) {
-
case SIOCGIFNUM: {
- /* Get number of interfaces. */
- struct ifconf kifc;
- struct ifreq *ifr;
- int num_ifs;
- int n;
-
- err = rds_do_ip_ioctl(iocp->ioc_cmd, sizeof (int),
- (char *)&num_ifs);
- if (err != 0) {
- break;
- }
+ uint_t bufsize;
+ struct lifconf lifc;
+ struct lifreq *lifrp;
+ int i, nifs, retval = 0;
- kifc.ifc_len = num_ifs * sizeof (struct ifreq);
- kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP);
- err = rds_do_ip_ioctl(SIOCGIFCONF,
- sizeof (struct ifconf), (caddr_t)&kifc);
- if (err != 0) {
- kmem_free(kifc.ifc_buf, kifc.ifc_len);
+ if ((err = rds_do_lifconf(&lifc, &bufsize)) != 0)
break;
- }
- ifr = kifc.ifc_req;
- n = num_ifs;
- for (num_ifs = 0; n > 0; ifr++) {
- err = rds_is_ib_interface(ifr->ifr_name);
- if (err == 0) {
- num_ifs++;
- } else if (err > 0) {
- num_ifs = 0;
- break;
- } else {
- err = 0;
+
+ nifs = lifc.lifc_len / sizeof (struct lifreq);
+ for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
+ if (strlen(lifrp->lifr_name) <= IFNAMSIZ &&
+ rds_capable_interface(lifrp)) {
+ retval++;
}
- n--;
}
- *((int *)(uintptr_t)addr) = num_ifs;
- kmem_free(kifc.ifc_buf, kifc.ifc_len);
- }
+ *((int *)addr) = retval;
+ kmem_free(lifc.lifc_buf, bufsize);
break;
+ }
case O_SIOCGIFCONF:
case SIOCGIFCONF: {
STRUCT_HANDLE(ifconf, ifc);
caddr_t ubuf_addr;
int ubuf_size;
- struct ifconf kifc;
- struct ifreq *ifr, *ptr;
- int num_ifs;
-
- STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
- (struct ifconf *)(uintptr_t)addr);
+ uint_t bufsize;
+ int i, nifs;
+ struct lifconf lifc;
+ struct lifreq *lifrp;
+ struct ifreq *ifrp;
+ STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, (struct ifconf *)addr);
ubuf_size = STRUCT_FGET(ifc, ifc_len);
ubuf_addr = STRUCT_FGETP(ifc, ifc_buf);
- err = rds_do_ip_ioctl(SIOCGIFNUM, sizeof (int),
- (char *)&num_ifs);
- if (err != 0) {
+ if ((err = rds_do_lifconf(&lifc, &bufsize)) != 0)
break;
- }
- kifc.ifc_len = num_ifs * sizeof (struct ifreq);
- kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP);
- err = rds_do_ip_ioctl(iocp->ioc_cmd,
- sizeof (struct ifconf), (caddr_t)&kifc);
- if (err != 0) {
- kmem_free(kifc.ifc_buf, kifc.ifc_len);
- break;
- }
mp1 = mi_copyout_alloc(q, mp, ubuf_addr, ubuf_size, B_FALSE);
if (mp1 == NULL) {
err = ENOMEM;
- kmem_free(kifc.ifc_buf, ubuf_size);
+ kmem_free(lifc.lifc_buf, bufsize);
break;
}
- ifr = kifc.ifc_req;
- ptr = (struct ifreq *)(uintptr_t)mp1->b_rptr;
- for (; num_ifs > 0 &&
- (int)((uintptr_t)mp1->b_wptr - (uintptr_t)mp1->b_rptr) <
- ubuf_size; num_ifs--, ifr++) {
- err = rds_is_ib_interface(ifr->ifr_name);
- if (err == 0) {
- ifr->ifr_addr.sa_family = AF_INET_OFFLOAD;
- bcopy((caddr_t)ifr, ptr, sizeof (struct ifreq));
- ptr++;
- mp1->b_wptr = (uchar_t *)ptr;
- } else if (err > 0) {
- break;
- } else {
- err = 0;
+ ifrp = (void *)mp1->b_rptr;
+ nifs = lifc.lifc_len / sizeof (struct lifreq);
+ for (lifrp = lifc.lifc_req, i = 0; i < nifs &&
+ MBLKTAIL(mp1) >= sizeof (struct ifreq); i++, lifrp++) {
+ /*
+ * Skip entries that are impossible to return with
+ * SIOCGIFCONF, or not RDS-capable.
+ */
+ if (strlen(lifrp->lifr_name) > IFNAMSIZ ||
+ !rds_capable_interface(lifrp)) {
+ continue;
}
+
+ ifrp->ifr_addr = *(struct sockaddr *)&lifrp->lifr_addr;
+ ifrp->ifr_addr.sa_family = AF_INET_OFFLOAD;
+ (void) strlcpy(ifrp->ifr_name, lifrp->lifr_name,
+ IFNAMSIZ);
+ ifrp++;
+ mp1->b_wptr += sizeof (struct ifreq);
}
- STRUCT_FSET(ifc, ifc_len, (int)((uintptr_t)mp1->b_wptr -
- (uintptr_t)mp1->b_rptr));
- kmem_free(kifc.ifc_buf, kifc.ifc_len);
- }
+ STRUCT_FSET(ifc, ifc_len, MBLKL(mp1));
+ kmem_free(lifc.lifc_buf, bufsize);
break;
+ }
case SIOCGIFMTU:
- err = rds_do_ip_ioctl(iocp->ioc_cmd,
- sizeof (struct ifreq), addr);
- break;
-
case SIOCGIFFLAGS:
- err = rds_do_ip_ioctl(iocp->ioc_cmd,
- sizeof (struct ifreq), addr);
+ err = rds_do_ip_ioctl(iocp->ioc_cmd, sizeof (struct ifreq),
+ addr);
break;
- case TI_GETMYNAME: {
+ case TI_GETMYNAME: {
rds_t *rds;
STRUCT_HANDLE(strbuf, sb);
ipaddr_t v4addr;
@@ -287,8 +248,7 @@ rds_ioctl_copyin_done(queue_t *q, mblk_t *mp)
sin_t *sin;
STRUCT_SET_HANDLE(sb,
- ((struct iocblk *)(uintptr_t)mp->b_rptr)->ioc_flag,
- (void *)(uintptr_t)addr);
+ ((struct iocblk *)(uintptr_t)mp->b_rptr)->ioc_flag, addr);
rds = (rds_t *)q->q_ptr;
ASSERT(rds->rds_family == AF_INET_OFFLOAD);
addrlen = sizeof (sin_t);
@@ -320,7 +280,6 @@ done:
mi_copy_done(q, mp, err);
}
-
void
rds_ioctl_copyin_setup(queue_t *q, mblk_t *mp)
{
@@ -383,38 +342,26 @@ rds_ioctl(queue_t *q, mblk_t *mp)
boolean_t
rds_verify_bind_address(ipaddr_t addr)
{
- int numifs;
- struct ifconf kifc;
- struct ifreq *ifr;
- boolean_t ret = B_FALSE;
-
-
- if (rds_do_ip_ioctl(SIOCGIFNUM, sizeof (int), (caddr_t)&numifs)) {
- return (ret);
- }
-
- kifc.ifc_len = numifs * sizeof (struct ifreq);
- kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP);
-
- if (rds_do_ip_ioctl(SIOCGIFCONF, sizeof (struct ifconf),
- (caddr_t)&kifc)) {
- goto done;
- }
-
- ifr = kifc.ifc_req;
- for (numifs = kifc.ifc_len / sizeof (struct ifreq);
- numifs > 0; numifs--, ifr++) {
- struct sockaddr_in *sin;
-
- sin = (struct sockaddr_in *)(uintptr_t)&ifr->ifr_addr;
- if ((sin->sin_addr.s_addr == addr) &&
- (rds_is_ib_interface(ifr->ifr_name) == 0)) {
- ret = B_TRUE;
- break;
+ int i, nifs;
+ uint_t bufsize;
+ struct lifconf lifc;
+ struct lifreq *lifrp;
+ struct sockaddr_in *sinp;
+ boolean_t retval = B_FALSE;
+
+ if (rds_do_lifconf(&lifc, &bufsize) != 0)
+ return (B_FALSE);
+
+ nifs = lifc.lifc_len / sizeof (struct lifreq);
+ for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
+ sinp = (struct sockaddr_in *)&lifrp->lifr_addr;
+ if (rds_capable_interface(lifrp) &&
+ sinp->sin_addr.s_addr == addr) {
+ retval = B_TRUE;
+ break;
}
}
-done:
- kmem_free(kifc.ifc_buf, kifc.ifc_len);
- return (ret);
+ kmem_free(lifc.lifc_buf, bufsize);
+ return (retval);
}
diff --git a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c
index bcb3c235be..dd7c9554a5 100644
--- a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c
+++ b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -33,6 +33,7 @@
#include <sys/socket.h>
#include <sys/stat.h>
#include <net/if_arp.h>
+#include <net/if_types.h>
#include <sys/file.h>
#include <sys/sockio.h>
#include <sys/pathname.h>
@@ -528,62 +529,112 @@ ibcm_arp_get_ibd_insts(ibcm_arp_ibd_insts_t *ibds)
}
/*
- * Return ibd interfaces and ibd instances.
+ * Issue an ioctl down to IP. There are several similar versions of this
+ * function (e.g., rpcib_do_ip_ioctl()); clearly a utility routine is needed.
*/
static int
-ibcm_arp_get_ibd_ipaddr(ibcm_arp_ibd_insts_t *ibds)
+ibcm_do_ip_ioctl(int cmd, int len, void *arg)
{
- TIUSER *tiptr;
- vnode_t *kvp;
- vnode_t *vp = NULL;
- struct strioctl iocb;
- struct lifreq lif_req;
- int k, ip_cnt;
- ibcm_arp_ip_t *ipp;
+ vnode_t *kvp;
+ TIUSER *tiptr;
+ struct strioctl iocb;
+ int err = 0;
- if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
- if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
- &tiptr, CRED()) == 0) {
- vp = tiptr->fp->f_vnode;
- } else {
- VN_RELE(kvp);
- }
- }
+ if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) != 0)
+ return (EPROTO);
- if (vp == NULL)
- return (-1);
+ if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE, &tiptr, CRED()) != 0) {
+ VN_RELE(kvp);
+ return (EPROTO);
+ }
- /* Get ibd ip's */
- ip_cnt = 0;
- for (k = 0, ipp = ibds->ibcm_arp_ip; k < ibds->ibcm_arp_ibd_cnt;
- k++, ipp++) {
+ iocb.ic_cmd = cmd;
+ iocb.ic_timout = 0;
+ iocb.ic_len = len;
+ iocb.ic_dp = (caddr_t)arg;
+ err = kstr_ioctl(tiptr->fp->f_vnode, I_STR, (intptr_t)&iocb);
+ (void) t_kclose(tiptr, 0);
+ VN_RELE(kvp);
+ return (err);
+}
- (void) bzero((void *)&lif_req, sizeof (struct lifreq));
- (void) snprintf(lif_req.lifr_name, sizeof (lif_req.lifr_name),
- "%s%d", IBCM_ARP_IBD_NAME, ipp->ip_inst);
+/*
+ * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
+ * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
+ */
+static int
+ibcm_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
+{
+ int err;
+ struct lifnum lifn;
+
+ bzero(&lifn, sizeof (struct lifnum));
+ lifn.lifn_family = AF_UNSPEC;
+
+ err = ibcm_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
+ if (err != 0)
+ return (err);
+
+ /*
+ * Pad the interface count to account for additional interfaces that
+ * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
+ */
+ lifn.lifn_count += 4;
+
+ bzero(lifcp, sizeof (struct lifconf));
+ lifcp->lifc_family = AF_UNSPEC;
+ lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
+ lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
+
+ err = ibcm_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
+ if (err != 0) {
+ kmem_free(lifcp->lifc_buf, *bufsizep);
+ return (err);
+ }
+ return (0);
+}
- (void) bzero((void *)&iocb, sizeof (struct strioctl));
- iocb.ic_cmd = SIOCGLIFADDR;
- iocb.ic_timout = 0;
- iocb.ic_len = sizeof (struct lifreq);
- iocb.ic_dp = (caddr_t)&lif_req;
+/*
+ * Fill in `ibds' with IP addresses tied to IFT_IB IP interfaces. Returns
+ * B_TRUE if at least one address was filled in.
+ */
+static boolean_t
+ibcm_arp_get_ibd_ipaddr(ibcm_arp_ibd_insts_t *ibds)
+{
+ int i, nifs, naddr = 0;
+ uint_t bufsize;
+ struct lifconf lifc;
+ struct lifreq *lifrp;
+ ibcm_arp_ip_t *ipp;
+
+ if (ibcm_do_lifconf(&lifc, &bufsize) != 0)
+ return (B_FALSE);
+
+ nifs = lifc.lifc_len / sizeof (struct lifreq);
+ for (lifrp = lifc.lifc_req, i = 0;
+ i < nifs && naddr < ibds->ibcm_arp_ibd_cnt; i++, lifrp++) {
+ if (lifrp->lifr_type != IFT_IB)
+ continue;
- if (kstr_ioctl(vp, I_STR, (intptr_t)&iocb) == 0) {
+ ipp = &ibds->ibcm_arp_ip[naddr];
+ switch (lifrp->lifr_addr.ss_family) {
+ case AF_INET:
ipp->ip_inet_family = AF_INET;
- bcopy(&lif_req.lifr_addr, &ipp->ip_cm_sin,
+ bcopy(&lifrp->lifr_addr, &ipp->ip_cm_sin,
sizeof (struct sockaddr_in));
- ip_cnt++;
- continue;
+ naddr++;
+ break;
+ case AF_INET6:
+ ipp->ip_inet_family = AF_INET6;
+ bcopy(&lifrp->lifr_addr, &ipp->ip_cm_sin6,
+ sizeof (struct sockaddr_in6));
+ naddr++;
+ break;
}
}
- (void) t_kclose(tiptr, 0);
- VN_RELE(kvp);
-
- if (ip_cnt == 0)
- return (-1);
- else
- return (0);
+ kmem_free(lifc.lifc_buf, bufsize);
+ return (naddr > 0);
}
ibt_status_t
@@ -600,7 +651,7 @@ ibcm_arp_get_ibds(ibcm_arp_ibd_insts_t *ibdp)
return (IBT_SRC_IP_NOT_FOUND);
/* Get the IP addresses of active ports. */
- if (ibcm_arp_get_ibd_ipaddr(ibdp) != 0) {
+ if (!ibcm_arp_get_ibd_ipaddr(ibdp)) {
IBTF_DPRINTF_L2(cmlog, "ibcm_arp_get_ibds: failed to get "
"ibd instance: IBT_SRC_IP_NOT_FOUND");
return (IBT_SRC_IP_NOT_FOUND);
diff --git a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c
index af622d5c8f..29b5116446 100644
--- a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c
+++ b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
@@ -35,24 +33,13 @@
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/socket.h>
-#include <sys/tihdr.h>
#include <net/if.h>
-#include <net/if_arp.h>
#include <net/if_types.h>
-#include <net/if_dl.h>
-#include <net/route.h>
-#include <sys/sockio.h>
#include <netinet/in.h>
-#include <netinet/ip6.h>
-#include <netinet/icmp6.h>
#include <sys/ethernet.h>
-#include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */
-#include <inet/mi.h>
#include <inet/arp.h>
#include <inet/ip.h>
-#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
-#include <inet/ip_rts.h>
#include <inet/ip_if.h>
#include <sys/ib/mgt/ibcm/ibcm_arp.h>
#include <inet/ip_ftable.h>
@@ -389,21 +376,16 @@ ibcm_arp_pr_callback(ibcm_arp_prwqn_t *wqnp, int status)
wqnp->func((void *)wqnp, status);
}
+/*
+ * Check if the interface is loopback or IB.
+ */
static int
-ibcm_arp_check_interface(ibcm_arp_prwqn_t *wqnp, int length)
+ibcm_arp_check_interface(ill_t *ill)
{
- /*
- * if the i/f is not ib or lo device, fail the request
- */
- if (bcmp(wqnp->ifname, "ibd", 3) == 0) {
- if (length != IPOIB_ADDRL) {
- return (EINVAL);
- }
- } else if (bcmp(wqnp->ifname, "lo", 2)) {
- return (ETIMEDOUT);
- }
+ if (IS_LOOPBACK(ill) || ill->ill_type == IFT_IB)
+ return (0);
- return (0);
+ return (ETIMEDOUT);
}
#define IBTL_IPV4_ADDR(a) (a->un.ip4addr)
@@ -414,11 +396,10 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
ibcm_arp_pr_comp_func_t func)
{
ibcm_arp_prwqn_t *wqnp;
- ire_t *ire;
- ire_t *src_ire;
+ ire_t *ire = NULL;
+ ire_t *src_ire = NULL;
ipif_t *ipif;
- ill_t *ill;
- int length;
+ ill_t *ill, *hwaddr_ill = NULL;
ip_stack_t *ipst;
IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup(src %p dest %p)",
@@ -449,13 +430,10 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
if (src_ire == NULL) {
IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: "
"ire_ctable_lookup failed");
- netstack_rele(ipst->ips_netstack);
- ibcm_arp_prwqn_delete(wqnp);
ib_s->status = EFAULT;
- return (1);
+ goto fail;
}
-
/*
* get an ire for the destination address with the matching source
* address
@@ -463,16 +441,11 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
ire = ire_ftable_lookup(IBTL_IPV4_ADDR(dst_addr), 0, 0, 0,
src_ire->ire_ipif, 0, src_ire->ire_zoneid, 0, NULL, MATCH_IRE_SRC,
ipst);
-
- netstack_rele(ipst->ips_netstack);
-
if (ire == NULL) {
IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: "
"ire_ftable_lookup failed");
- IRE_REFRELE(src_ire);
- ibcm_arp_prwqn_delete(wqnp);
ib_s->status = EFAULT;
- return (1);
+ goto fail;
}
wqnp->src_addr.un.ip4addr = ire->ire_src_addr;
@@ -480,35 +453,56 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
ipif = src_ire->ire_ipif;
ill = ipif->ipif_ill;
- length = ill->ill_name_length;
- bcopy(ill->ill_name, &wqnp->ifname, ill->ill_name_length);
- wqnp->ifname[length] = '\0';
- bcopy(ill->ill_phys_addr, &wqnp->src_mac,
- ill->ill_phys_addr_length);
+ (void) strlcpy(wqnp->ifname, ill->ill_name, sizeof (wqnp->ifname));
- IRE_REFRELE(ire);
- IRE_REFRELE(src_ire);
+ /*
+ * For IPMP data addresses, we need to use the hardware address of the
+ * interface bound to the given address.
+ */
+ if (IS_IPMP(ill)) {
+ if ((hwaddr_ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) {
+ IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: no bound "
+ "ill for IPMP interface %s", ill->ill_name);
+ ib_s->status = EFAULT;
+ goto fail;
+ }
+ } else {
+ hwaddr_ill = ill;
+ ill_refhold(hwaddr_ill); /* for symmetry */
+ }
- ib_s->status =
- ibcm_arp_check_interface(wqnp, ill->ill_phys_addr_length);
- if (ib_s->status) {
+ bcopy(hwaddr_ill->ill_phys_addr, &wqnp->src_mac,
+ hwaddr_ill->ill_phys_addr_length);
+
+ if ((ib_s->status = ibcm_arp_check_interface(hwaddr_ill)) != 0) {
IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: "
"ibcm_arp_check_interface failed");
- ibcm_arp_prwqn_delete(wqnp);
- return (1);
+ goto fail;
}
- ib_s->status = ibcm_arp_squery_arp(wqnp);
- if (ib_s->status) {
+ if ((ib_s->status = ibcm_arp_squery_arp(wqnp)) != 0) {
IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: "
"ibcm_arp_squery_arp failed");
- ibcm_arp_prwqn_delete(wqnp);
- return (1);
+ goto fail;
}
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup: Return: 0x%p", wqnp);
+ ill_refrele(hwaddr_ill);
+ IRE_REFRELE(ire);
+ IRE_REFRELE(src_ire);
+ netstack_rele(ipst->ips_netstack);
+ IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup: Return: 0x%p", wqnp);
return (0);
+fail:
+ if (hwaddr_ill != NULL)
+ ill_refrele(hwaddr_ill);
+ if (ire != NULL)
+ IRE_REFRELE(ire);
+ if (src_ire != NULL)
+ IRE_REFRELE(src_ire);
+ ibcm_arp_prwqn_delete(wqnp);
+ netstack_rele(ipst->ips_netstack);
+ return (1);
}
#define IBCM_H2N_GID(gid) \
diff --git a/usr/src/uts/common/ipp/ipgpc/classifier-objects.h b/usr/src/uts/common/ipp/ipgpc/classifier-objects.h
index f1cb20b88d..4002a39573 100644
--- a/usr/src/uts/common/ipp/ipgpc/classifier-objects.h
+++ b/usr/src/uts/common/ipp/ipgpc/classifier-objects.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _IPP_IPGPC_CLASSIFIER_OBJECTS_H
#define _IPP_IPGPC_CLASSIFIER_OBJECTS_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/time.h>
#include <ipp/ipp.h>
#include <ipp/ipgpc/ipgpc.h>
@@ -64,14 +61,12 @@ extern "C" {
#define IPGPC_TABLE_UID 8
#define IPGPC_TABLE_PROJID 9
#define IPGPC_TABLE_IF 10
-#define IPGPC_TABLE_IF_GRPNM 11
-#define IPGPC_TABLE_DIR 12
+#define IPGPC_TABLE_DIR 11
#define TABLE_ID_OFFSET IPGPC_TABLE_PROTOID
#define PROTOID_IDX (IPGPC_TABLE_PROTOID - TABLE_ID_OFFSET)
#define UID_IDX (IPGPC_TABLE_UID - TABLE_ID_OFFSET)
#define PROJID_IDX (IPGPC_TABLE_PROJID - TABLE_ID_OFFSET)
#define IF_IDX (IPGPC_TABLE_IF - TABLE_ID_OFFSET)
-#define IF_GRPNM_IDX (IPGPC_TABLE_IF_GRPNM - TABLE_ID_OFFSET)
#define DIR_IDX (IPGPC_TABLE_DIR - TABLE_ID_OFFSET)
/* Match types for selector searching */
@@ -91,11 +86,10 @@ extern "C" {
#define UID_MASK 0x40
#define PROJID_MASK 0x80
#define IF_MASK 0x100
-#define IF_GRPNM_MASK 0x200
-#define DIR_MASK 0x400
+#define DIR_MASK 0x200
#define ALL_MATCH_MASK (DS_MASK | PROTO_MASK | SADDR_MASK | DADDR_MASK | \
SPORT_MASK | DPORT_MASK | UID_MASK | PROJID_MASK | \
- IF_MASK | IF_GRPNM_MASK | DIR_MASK)
+ IF_MASK | DIR_MASK)
#define HASH_SIZE 11 /* default hash table size */
@@ -108,7 +102,6 @@ typedef struct ipgpc_filter_s {
char filter_name[MAXNAMELEN]; /* null terminated name of filter */
/* exact match selectors */
- char if_groupname[LIFNAMSIZ]; /* null terminated iface groupname */
uid_t uid; /* uid key, value = exact or IPGPC_WILDCARD */
projid_t projid; /* project id, " " */
uint_t if_index; /* interface index, " " or 0 for wildcard */
diff --git a/usr/src/uts/common/ipp/ipgpc/classifier.c b/usr/src/uts/common/ipp/ipgpc/classifier.c
index bb09a3ca89..9137fcba9a 100644
--- a/usr/src/uts/common/ipp/ipgpc/classifier.c
+++ b/usr/src/uts/common/ipp/ipgpc/classifier.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/socket.h>
@@ -78,7 +76,6 @@ common_classify(ipgpc_packet_t *packet, ht_match_t *fid_table,
uint16_t *slctrs_srchd)
{
int match_status;
- int if_grpnm_hv;
/* Find on packet direction */
match_status =
@@ -96,19 +93,6 @@ common_classify(ipgpc_packet_t *packet, ht_match_t *fid_table,
return (match_status);
}
- /* Find on IF_GRPNM of packet */
- if (packet->if_groupname_len > 0) {
- if_grpnm_hv = name_hash(packet->if_groupname, TABLE_SIZE);
- } else {
- if_grpnm_hv = IPGPC_WILDCARD;
- }
- match_status =
- ipgpc_findfilters(IPGPC_TABLE_IF_GRPNM, if_grpnm_hv, fid_table);
- if (CHECK_MATCH_STATUS(match_status, slctrs_srchd,
- ipgpc_table_list[IF_GRPNM_IDX].info.mask) != NORMAL_MATCH) {
- return (match_status);
- }
-
/* Find on DS field */
match_status =
ipgpc_findfilters(IPGPC_BA_DSID, packet->dsfield, fid_table);
@@ -149,9 +133,8 @@ common_classify(ipgpc_packet_t *packet, ht_match_t *fid_table,
/* Find on IP Source Port field */
if (packet->sport > 0) {
- match_status =
- ipgpc_findfilters(IPGPC_TRIE_SPORTID, packet->sport,
- fid_table);
+ match_status = ipgpc_findfilters(IPGPC_TRIE_SPORTID,
+ packet->sport, fid_table);
if (CHECK_MATCH_STATUS(match_status, slctrs_srchd,
ipgpc_trie_list[IPGPC_TRIE_SPORTID].info.mask)
!= NORMAL_MATCH) {
@@ -164,9 +147,8 @@ common_classify(ipgpc_packet_t *packet, ht_match_t *fid_table,
/* Find on IP Destination Port field */
if (packet->dport > 0) {
- match_status =
- ipgpc_findfilters(IPGPC_TRIE_DPORTID, packet->dport,
- fid_table);
+ match_status = ipgpc_findfilters(IPGPC_TRIE_DPORTID,
+ packet->dport, fid_table);
if (CHECK_MATCH_STATUS(match_status, slctrs_srchd,
ipgpc_trie_list[IPGPC_TRIE_DPORTID].info.mask)
!= NORMAL_MATCH) {
@@ -261,12 +243,11 @@ ipgpc_classify(int af, ipgpc_packet_t *packet)
match_status = 0;
slctrs_srchd = ALL_MATCH_MASK;
-
bzero(fid_table, sizeof (ht_match_t) * HASH_SIZE);
/* first search all address family independent selectors */
- if ((rc = common_classify(packet, fid_table, &slctrs_srchd)) !=
- NORMAL_MATCH) {
+ rc = common_classify(packet, fid_table, &slctrs_srchd);
+ if (rc != NORMAL_MATCH) {
/* free all dynamic allocated memory */
FREE_FID_TABLE(fid_table, p, q, i);
if (rc == NO_MATCHES) {
@@ -453,7 +434,7 @@ bestmatch(ht_match_t *fid_table, uint16_t bestmask)
*/
real_prio =
((uint64_t)ipgpc_fid_list[key].filter.priority
- << 32) |
+ << 32) |
(uint64_t)~ipgpc_fid_list[key].filter.precedence;
/* check to see if this is the new bestmatch */
@@ -689,35 +670,32 @@ parse_packet6(ipgpc_packet_t *packet, mblk_t *mp)
void
print_packet(int af, ipgpc_packet_t *pkt)
{
+ char saddrbuf[INET6_ADDRSTRLEN];
+ char daddrbuf[INET6_ADDRSTRLEN];
+
if (af == AF_INET) {
- char saddrbuf[INET_ADDRSTRLEN];
- char daddrbuf[INET_ADDRSTRLEN];
+ (void) inet_ntop(af, &V4_PART_OF_V6(pkt->saddr), saddrbuf,
+ sizeof (saddrbuf));
+ (void) inet_ntop(af, &V4_PART_OF_V6(pkt->daddr), daddrbuf,
+ sizeof (daddrbuf));
+
ipgpc4dbg(("print_packet: saddr = %s, daddr = %s, sport = %u" \
", dport = %u, proto = %u, dsfield = %x, uid = %d," \
- " if_index = %d, if_groupname = %s, projid = %d, " \
- "direction = %d",
- inet_ntop(af, &V4_PART_OF_V6(pkt->saddr), saddrbuf,
- sizeof (saddrbuf)),
- inet_ntop(af, &V4_PART_OF_V6(pkt->daddr), daddrbuf,
- sizeof (daddrbuf)),
- ntohs(pkt->sport), ntohs(pkt->dport), pkt->proto,
+ " if_index = %d, projid = %d, direction = %d", saddrbuf,
+ daddrbuf, ntohs(pkt->sport), ntohs(pkt->dport), pkt->proto,
pkt->dsfield, pkt->uid, pkt->if_index,
- (pkt->if_groupname != NULL) ? pkt->if_groupname : "NULL",
pkt->projid, pkt->direction));
} else if (af == AF_INET6) {
- char saddrbuf[INET6_ADDRSTRLEN];
- char daddrbuf[INET6_ADDRSTRLEN];
+ (void) inet_ntop(af, pkt->saddr.s6_addr32, saddrbuf,
+ sizeof (saddrbuf));
+ (void) inet_ntop(af, pkt->daddr.s6_addr32, daddrbuf,
+ sizeof (daddrbuf));
+
ipgpc4dbg(("print_packet: saddr = %s, daddr = %s, sport = %u" \
", dport = %u, proto = %u, dsfield = %x, uid = %d," \
- " if_index = %d, if_groupname = %s, projid = %d, " \
- "direction = %d",
- inet_ntop(af, pkt->saddr.s6_addr32, saddrbuf,
- sizeof (saddrbuf)),
- inet_ntop(af, pkt->daddr.s6_addr32, daddrbuf,
- sizeof (daddrbuf)),
- ntohs(pkt->sport), ntohs(pkt->dport), pkt->proto,
+ " if_index = %d, projid = %d, direction = %d", saddrbuf,
+ daddrbuf, ntohs(pkt->sport), ntohs(pkt->dport), pkt->proto,
pkt->dsfield, pkt->uid, pkt->if_index,
- (pkt->if_groupname != NULL) ? pkt->if_groupname : "NULL",
pkt->projid, pkt->direction));
}
}
diff --git a/usr/src/uts/common/ipp/ipgpc/classifier.h b/usr/src/uts/common/ipp/ipgpc/classifier.h
index 4ee36ae32b..629aeab2f5 100644
--- a/usr/src/uts/common/ipp/ipgpc/classifier.h
+++ b/usr/src/uts/common/ipp/ipgpc/classifier.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _IPP_IPGPC_CLASSIFIER_H
#define _IPP_IPGPC_CLASSIFIER_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/cmn_err.h>
#include <ipp/ipgpc/filters.h>
@@ -74,8 +71,6 @@ typedef struct ipgpc_packet_s {
projid_t projid; /* project id for packet */
uint_t if_index; /* interface index */
uint32_t direction; /* packet direction */
- char *if_groupname; /* interface group name */
- uint_t if_groupname_len; /* interface group name length */
uint_t len; /* length of packet */
} ipgpc_packet_t;
diff --git a/usr/src/uts/common/ipp/ipgpc/classifierddi.c b/usr/src/uts/common/ipp/ipgpc/classifierddi.c
index d9955d84a6..4d31da6396 100644
--- a/usr/src/uts/common/ipp/ipgpc/classifierddi.c
+++ b/usr/src/uts/common/ipp/ipgpc/classifierddi.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/systm.h>
#include <sys/socket.h>
#include <netinet/in.h>
@@ -433,12 +431,6 @@ ipgpc_invoke_action(ipp_action_id_t aid, ipp_packet_t *packet)
}
}
- /* The ill_index could be 0 when called from forwarding (read) path */
- if (ill_idx > 0) {
- ill = ill_lookup_on_ifindex_global_instance(ill_idx, B_FALSE,
- NULL, NULL, NULL, NULL);
- }
-
/* parse the packet from the message block */
ipha = (ipha_t *)mp->b_rptr;
/* Determine IP Header Version */
@@ -452,23 +444,27 @@ ipgpc_invoke_action(ipp_action_id_t aid, ipp_packet_t *packet)
pkt.direction = callout_pos; /* set packet direction */
+ /* The ill_index could be 0 when called from forwarding (read) path */
+ if (ill_idx > 0) {
+ ill = ill_lookup_on_ifindex_global_instance(ill_idx, B_FALSE,
+ NULL, NULL, NULL, NULL);
+ }
if (ill != NULL) {
- pkt.if_index = ill->ill_phyint->phyint_ifindex;
- pkt.if_groupname_len =
- ill->ill_phyint->phyint_groupname_len;
- if (pkt.if_groupname_len > 0) {
- pkt.if_groupname =
- ill->ill_phyint->phyint_groupname;
- } else {
- pkt.if_groupname = NULL;
- }
- /* Got the fields from the ILL, go ahead and refrele */
+ /*
+ * Since all IPP actions in an IPMP group are performed
+ * relative to the IPMP group interface, if this is an
+ * underlying interface in an IPMP group, use the IPMP
+ * group interface's index.
+ */
+ if (IS_UNDER_IPMP(ill))
+ pkt.if_index = ipmp_ill_get_ipmp_ifindex(ill);
+ else
+ pkt.if_index = ill->ill_phyint->phyint_ifindex;
+ /* Got the field from the ILL, go ahead and refrele */
ill_refrele(ill);
} else {
- /* unknown if_index and if_group */
+ /* unknown if_index */
pkt.if_index = IPGPC_UNSPECIFIED;
- pkt.if_groupname = NULL;
- pkt.if_groupname_len = 0;
}
if (ipgpc_debug > 5) {
diff --git a/usr/src/uts/common/ipp/ipgpc/filters.c b/usr/src/uts/common/ipp/ipgpc/filters.c
index 7dd4dce48e..3a2f954d0a 100644
--- a/usr/src/uts/common/ipp/ipgpc/filters.c
+++ b/usr/src/uts/common/ipp/ipgpc/filters.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/atomic.h>
#include <sys/types.h>
#include <sys/systm.h>
@@ -83,7 +81,6 @@ static ht_node_t proto_table[TABLE_SIZE]; /* protocol table */
static ht_node_t uid_table[TABLE_SIZE]; /* IPGPC_UID table */
static ht_node_t projid_table[TABLE_SIZE]; /* IPGPC_PROJID table */
static ht_node_t if_table[TABLE_SIZE]; /* Interface ID table */
-static ht_node_t if_grpnm_table[TABLE_SIZE]; /* Interface Group Name table */
static ht_node_t dir_table[TABLE_SIZE]; /* packet direction table */
static ipp_action_id_t ipgpc_aid; /* the action id for ipgpc */
@@ -262,9 +259,6 @@ initialize_tables(void)
/* IF_INDEX selector structure */
insert_ipgpc_table_list_info(IF_IDX, if_table, IPGPC_UNSPECIFIED,
IF_MASK);
- /* IF_GRPNM_INDEX selector structure */
- insert_ipgpc_table_list_info(IF_GRPNM_IDX, if_grpnm_table,
- IPGPC_WILDCARD, IF_GRPNM_MASK);
/* DIR selector structure */
insert_ipgpc_table_list_info(DIR_IDX, dir_table, IPGPC_UNSPECIFIED,
DIR_MASK);
@@ -617,19 +611,6 @@ ipgpc_parse_filter(ipgpc_filter_t *filter, nvlist_t *nvlp)
bcopy(s, filter->filter_name, (strlen(s) + 1));
- /* parse interface group name */
- if (nvlist_lookup_string(nvlp, IPGPC_IF_GROUPNAME, &s) != 0) {
- filter->if_groupname[0] = '\0';
- } else {
- /* check max interface group name lenght */
- if ((strlen(s) + 1) > LIFNAMSIZ) {
- ipgpc0dbg(("ipgpc_parse_filter: interface group name" \
- " > LIFNAMSIZ"));
- return (EINVAL);
- }
- bcopy(s, filter->if_groupname, (strlen(s) + 1));
- }
-
/* parse uid */
if (nvlist_lookup_uint32(nvlp, IPGPC_UID, &filter->uid) != 0) {
filter->uid = (uid_t)IPGPC_WILDCARD;
@@ -976,8 +957,6 @@ insertfid(int filter_id, ipgpc_filter_t *filter, uint_t class_id)
static void
common_addfilter(fid_t *fid, int filter_id)
{
- int if_grpnm_hv;
-
/* start trie inserts */
/* add source port selector */
if (t_insert(&ipgpc_trie_list[IPGPC_TRIE_SPORTID], filter_id,
@@ -1025,17 +1004,6 @@ common_addfilter(fid_t *fid, int filter_id)
fid->insert_map |= IF_MASK;
}
- /* add interface groupname selector */
- if (fid->filter.if_groupname[0] == '\0') {
- if_grpnm_hv = IPGPC_WILDCARD;
- } else {
- if_grpnm_hv = name_hash(fid->filter.if_groupname, TABLE_SIZE);
- }
- if (ht_insert(&ipgpc_table_list[IF_GRPNM_IDX], filter_id, if_grpnm_hv)
- == NORMAL_VALUE) {
- fid->insert_map |= IF_GRPNM_MASK;
- }
-
/* add direction selector */
if (ht_insert(&ipgpc_table_list[DIR_IDX], filter_id,
fid->filter.direction) == NORMAL_VALUE) {
@@ -1102,8 +1070,8 @@ ipgpc_addfilter(ipgpc_filter_t *filter, char *class_name, ipp_flags_t flags)
fid_t *fid;
unsigned class_id;
- if ((err = class_name2id(&class_id, class_name, ipgpc_num_cls)) !=
- EEXIST) {
+ err = class_name2id(&class_id, class_name, ipgpc_num_cls);
+ if (err != EEXIST) {
ipgpc0dbg(("ipgpc_addfilter: class lookup error %d", err));
return (err);
}
@@ -1376,9 +1344,8 @@ insertcid(ipgpc_class_t *in_class, int *out_class_id)
/* init kstat entry */
if ((rc = class_statinit(in_class, class_id)) != 0) {
ipgpc_cid_list[class_id].info = -1;
- ipgpc0dbg(("insertcid: " \
- "class_statinit failed with " \
- "error %d", rc));
+ ipgpc0dbg(("insertcid: "
+ "class_statinit failed with error %d", rc));
mutex_exit(&ipgpc_cid_list_lock);
return (rc);
}
@@ -1409,8 +1376,6 @@ insertcid(ipgpc_class_t *in_class, int *out_class_id)
static void
common_removefilter(int in_filter_id, fid_t *fid)
{
- int if_grpnm_hv;
-
/* start trie removes */
t_remove(&ipgpc_trie_list[IPGPC_TRIE_SPORTID], in_filter_id,
fid->filter.sport, fid->filter.sport_mask);
@@ -1438,14 +1403,6 @@ common_removefilter(int in_filter_id, fid_t *fid)
/* remove id from interface id table */
ht_remove(&ipgpc_table_list[IF_IDX], in_filter_id,
fid->filter.if_index);
-
- /* remove id from interface group name table */
- if (fid->filter.if_groupname[0] == '\0') {
- if_grpnm_hv = IPGPC_WILDCARD;
- } else {
- if_grpnm_hv = name_hash(fid->filter.if_groupname, TABLE_SIZE);
- }
- ht_remove(&ipgpc_table_list[IF_GRPNM_IDX], in_filter_id, if_grpnm_hv);
/* remove id from direction table */
ht_remove(&ipgpc_table_list[DIR_IDX], in_filter_id,
fid->filter.direction);
@@ -1782,7 +1739,6 @@ int
ipgpc_modifyclass(nvlist_t **nvlpp, ipp_flags_t flags)
{
unsigned class_id;
- ipp_stat_t *cl_stats;
ipgpc_class_t in_class;
char *name;
int rc;
@@ -1837,15 +1793,14 @@ ipgpc_modifyclass(nvlist_t **nvlpp, ipp_flags_t flags)
/* check to see if gather_stats booleans differ */
if ((ipgpc_cid_list[class_id].aclass.gather_stats !=
in_class.gather_stats)) {
- if (ipgpc_cid_list[class_id].aclass.gather_stats ==
- B_TRUE) {
- /* delete kstat entry */
- if (ipgpc_cid_list[class_id].cl_stats != NULL) {
- cl_stats =
- ipgpc_cid_list[class_id].cl_stats;
- ipp_stat_destroy(cl_stats);
- ipgpc_cid_list[class_id].cl_stats = NULL;
- }
+ if (ipgpc_cid_list[class_id].aclass.gather_stats) {
+ /* delete kstat entry */
+ if (ipgpc_cid_list[class_id].cl_stats != NULL) {
+ ipp_stat_destroy(
+ ipgpc_cid_list[class_id].cl_stats);
+ ipgpc_cid_list[class_id].cl_stats =
+ NULL;
+ }
} else { /* gather_stats == B_FALSE */
if ((rc = class_statinit(&in_class, class_id))
!= 0) {
@@ -2326,14 +2281,6 @@ build_filter_nvlist(nvlist_t **nvlpp, ipgpc_filter_t *in_filter,
return (rc);
}
- /* add interface groupname */
- if (in_filter->if_groupname[0] != '\0') {
- if ((rc = nvlist_add_string(nvlp, IPGPC_IF_GROUPNAME,
- in_filter->if_groupname)) != 0) {
- return (rc);
- }
- }
-
/* add uid */
if (in_filter->uid != IPGPC_WILDCARD) {
if ((rc = nvlist_add_uint32(nvlp, IPGPC_UID, in_filter->uid))
diff --git a/usr/src/uts/common/ipp/ipgpc/ipgpc.h b/usr/src/uts/common/ipp/ipgpc/ipgpc.h
index f2e1354132..51edc313f8 100644
--- a/usr/src/uts/common/ipp/ipgpc/ipgpc.h
+++ b/usr/src/uts/common/ipp/ipgpc/ipgpc.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _IPP_IPGPC_IPGPC_H
#define _IPP_IPGPC_IPGPC_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/socket.h>
@@ -48,7 +45,6 @@ extern "C" {
#define IPGPC_NAME "ipgpc"
/* config names of name-value pairs and type */
-#define IPGPC_IF_GROUPNAME "ipgpc.if_groupname" /* string */
#define IPGPC_UID "ipgpc.user" /* int32_t */
#define IPGPC_PROJID "ipgpc.projid" /* int32_t */
#define IPGPC_IF_INDEX "ipgpc.if_index" /* uint32_t */
diff --git a/usr/src/uts/common/net/if.h b/usr/src/uts/common/net/if.h
index 904fe078cb..05f013e4dc 100644
--- a/usr/src/uts/common/net/if.h
+++ b/usr/src/uts/common/net/if.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -12,7 +12,6 @@
#ifndef _NET_IF_H
#define _NET_IF_H
-#pragma ident "%Z%%M% %I% %E% SMI"
/* if.h 1.26 90/05/29 SMI; from UCB 7.1 6/4/86 */
#include <sys/feature_tests.h>
@@ -105,7 +104,7 @@ struct ifnet {
* If you define a flag here, you need to define one in ip_if.h before
* using the new flag in IP. Don't use these flags directly in IP.
*/
-#define IFF_UP 0x0000000001 /* interface is up */
+#define IFF_UP 0x0000000001 /* address is up */
#define IFF_BROADCAST 0x0000000002 /* broadcast address valid */
#define IFF_DEBUG 0x0000000004 /* turn on debugging */
#define IFF_LOOPBACK 0x0000000008 /* is a loopback net */
@@ -138,7 +137,7 @@ struct ifnet {
*/
#define IFF_NOXMIT 0x0000010000 /* Do not transmit packets */
#define IFF_NOLOCAL 0x0000020000 /* No address - just on-link subnet */
-#define IFF_DEPRECATED 0x0000040000 /* interface address deprecated */
+#define IFF_DEPRECATED 0x0000040000 /* Address is deprecated */
#define IFF_ADDRCONF 0x0000080000 /* address from stateless addrconf */
#define IFF_ROUTER 0x0000100000 /* router on this interface */
@@ -149,14 +148,12 @@ struct ifnet {
#define IFF_IPV4 0x0001000000 /* IPv4 interface */
#define IFF_IPV6 0x0002000000 /* IPv6 interface */
/* 0x0004000000 was IFF_MIPRUNNING */
-#define IFF_NOFAILOVER 0x0008000000 /* Don't failover on NIC failure */
+#define IFF_NOFAILOVER 0x0008000000 /* in.mpathd(1M) test address */
-#define IFF_FAILED 0x0010000000 /* NIC has failed */
-#define IFF_STANDBY 0x0020000000 /* Standby NIC to be used on failures */
-#define IFF_INACTIVE 0x0040000000 /* NIC active or not ? */
- /* Used for Standby NIC or */
- /* when FAILBACK is disabled by user */
-#define IFF_OFFLINE 0x0080000000 /* NIC has been offlined */
+#define IFF_FAILED 0x0010000000 /* Interface has failed */
+#define IFF_STANDBY 0x0020000000 /* Interface is a hot-spare */
+#define IFF_INACTIVE 0x0040000000 /* Functioning but not used for data */
+#define IFF_OFFLINE 0x0080000000 /* Interface is offline */
/*
* The IFF_XRESOLV flag is an evolving interface and is subject
@@ -170,14 +167,22 @@ struct ifnet {
#define IFF_FIXEDMTU 0x1000000000ll /* MTU manually set with SIOCSLIFMTU */
#define IFF_VIRTUAL 0x2000000000ll /* Does not send or receive packets */
#define IFF_DUPLICATE 0x4000000000ll /* Local address already in use */
+#define IFF_IPMP 0x8000000000ll /* IPMP IP interface */
-/* flags set internally only: */
+/* flags that cannot be changed by userland on any interface */
#define IFF_CANTCHANGE \
(IFF_BROADCAST | IFF_POINTOPOINT | IFF_RUNNING | IFF_PROMISC | \
IFF_MULTICAST | IFF_MULTI_BCAST | IFF_UNNUMBERED | IFF_IPV4 | \
- IFF_IPV6 | IFF_INACTIVE | IFF_FIXEDMTU | IFF_VIRTUAL | \
+ IFF_IPV6 | IFF_IPMP | IFF_FIXEDMTU | IFF_VIRTUAL | \
IFF_LOOPBACK | IFF_ALLMULTI | IFF_DUPLICATE | IFF_COS_ENABLED)
+/* flags that cannot be changed by userland on an IPMP interface */
+#define IFF_IPMP_CANTCHANGE IFF_FAILED
+
+/* flags that can never be set on an IPMP interface */
+#define IFF_IPMP_INVALID (IFF_STANDBY | IFF_INACTIVE | IFF_OFFLINE | \
+ IFF_NOFAILOVER | IFF_NOARP | IFF_NONUD | IFF_XRESOLV)
+
/*
* Output queues (ifp->if_snd) and internetwork datagram level (pup level 1)
* input routines have queues of messages stored on ifqueue structures
@@ -354,7 +359,7 @@ struct lifreq {
} lifr_lifru1;
#define lifr_addrlen lifr_lifru1.lifru_addrlen
#define lifr_ppa lifr_lifru1.lifru_ppa /* Driver's ppa */
- uint_t lifr_movetoindex; /* FAILOVER/FAILBACK ifindex */
+ uint_t lifr_type; /* IFT_ETHER, ... */
union {
struct sockaddr_storage lifru_addr;
struct sockaddr_storage lifru_dstaddr;
@@ -371,6 +376,7 @@ struct lifreq {
struct lif_nd_req lifru_nd_req;
struct lif_ifinfo_req lifru_ifinfo_req;
char lifru_groupname[LIFGRNAMSIZ]; /* SIOC[GS]LIFGROUPNAME */
+ char lifru_binding[LIFNAMSIZ]; /* SIOCGLIFBINDING */
uint_t lifru_delay; /* SIOC[GS]LIFNOTIFYDELAY */
zoneid_t lifru_zoneid; /* SIOC[GS]LIFZONE */
} lifr_lifru;
@@ -392,6 +398,7 @@ struct lifreq {
#define lifr_nd lifr_lifru.lifru_nd_req /* SIOCLIF*ND */
#define lifr_ifinfo lifr_lifru.lifru_ifinfo_req /* SIOC[GS]LIFLNKINFO */
#define lifr_groupname lifr_lifru.lifru_groupname
+#define lifr_binding lifr_lifru.lifru_binding
#define lifr_delay lifr_lifru.lifru_delay
#define lifr_zoneid lifr_lifru.lifru_zoneid
};
@@ -556,6 +563,7 @@ struct lifsrcof {
#define LIFC_TEMPORARY 0x04 /* Include IFF_TEMPORARY interfaces */
#define LIFC_ALLZONES 0x08 /* Include all zones */
/* (must be issued from global zone) */
+#define LIFC_UNDER_IPMP 0x10 /* Include underlying IPMP interfaces */
#if defined(_SYSCALL32)
@@ -582,6 +590,22 @@ struct lifsrcof32 {
#endif /* _SYSCALL32 */
/*
+ * IPMP group information, for use with SIOCGLIFGROUPINFO.
+ */
+typedef struct lifgroupinfo {
+ char gi_grname[LIFGRNAMSIZ]; /* group name (set by caller) */
+ char gi_grifname[LIFNAMSIZ]; /* IPMP meta-interface name */
+ char gi_m4ifname[LIFNAMSIZ]; /* v4 mcast interface name */
+ char gi_m6ifname[LIFNAMSIZ]; /* v6 mcast interface name */
+ char gi_bcifname[LIFNAMSIZ]; /* v4 bcast interface name */
+ boolean_t gi_v4; /* group is plumbed for v4 */
+ boolean_t gi_v6; /* group is plumbed for v6 */
+ uint_t gi_nv4; /* # of underlying v4 if's */
+ uint_t gi_nv6; /* # of underlying v6 if's */
+ uint_t gi_mactype; /* DLPI mac type of group */
+} lifgroupinfo_t;
+
+/*
* OBSOLETE: Structure used in SIOCGIFCONF request.
* Used to retrieve interface configuration
* for machine (useful for programs which
diff --git a/usr/src/uts/common/net/route.h b/usr/src/uts/common/net/route.h
index 078971918d..3e4307f25e 100644
--- a/usr/src/uts/common/net/route.h
+++ b/usr/src/uts/common/net/route.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -45,7 +45,6 @@
#ifndef _NET_ROUTE_H
#define _NET_ROUTE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
/* from UCB 8.5 (Berkeley) 2/8/95 */
#include <sys/tsol/label.h>
@@ -254,6 +253,18 @@ typedef struct tsol_rtsecattr_s {
#define RTSA_CIPSO 0x100 /* CIPSO protocol */
#define RTSA_SLRANGE (RTSA_MINSL|RTSA_MAXSL)
+/*
+ * Routing socket options.
+ */
+#define RT_AWARE 0x0001 /* set awareness of hidden interfaces */
+
+/*
+ * Supported RT_AWARE values. As a convenience, the bit-values here mirror
+ * the LIFC_* values.
+ */
+#define RTAW_DEFAULT 0x0000 /* unaware application */
+#define RTAW_UNDER_IPMP 0x0010 /* aware of underlying IPMP interfaces */
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/netinet/in.h b/usr/src/uts/common/netinet/in.h
index 782e2dc340..fc2c750ba7 100644
--- a/usr/src/uts/common/netinet/in.h
+++ b/usr/src/uts/common/netinet/in.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -932,15 +932,7 @@ typedef struct ipsec_req {
#define IP_BOUND_IF 0x41 /* bind socket to an ifindex */
#define IP_UNSPEC_SRC 0x42 /* use unspecified source address */
#define IP_BROADCAST_TTL 0x43 /* use specific TTL for broadcast */
-
-/*
- * IP_DONTFAILOVER_IF option is used to indicate that outbound unicast and
- * multicast packets go through the specified interface, no load spreading,
- * no failover.
- * This is a Sun private interface.
- */
-#define IP_DONTFAILOVER_IF 0x44
-
+/* 0x44 was IP_DONTFAILOVER_IF; can be reused */
#define IP_DHCPINIT_IF 0x45 /* accept all unicast DHCP traffic */
/*
@@ -1258,15 +1250,6 @@ typedef struct {
#define IPV6_BOUND_IF 0x41 /* bind to an ifindex */
#define IPV6_UNSPEC_SRC 0x42 /* source of packets set to */
/* unspecified (all zeros) */
-#define IPV6_BOUND_PIF 0x43 /* Bind to Physical interface */
- /* No load balancing or failover */
-/*
- * IPV6_DONTFAILOVER_IF option is used to indicate that outbound unicast and
- * multicast packets go through the specified interface, no load spreading,
- * no failover.
- * This is a Sun private interface.
- */
-#define IPV6_DONTFAILOVER_IF 0x44
/*
* Miscellaneous IPv6 constants.
diff --git a/usr/src/uts/common/rpc/rpcib.c b/usr/src/uts/common/rpc/rpcib.c
index d0edb2e8f0..aba7803131 100644
--- a/usr/src/uts/common/rpc/rpcib.c
+++ b/usr/src/uts/common/rpc/rpcib.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -56,7 +56,6 @@
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
-#include <sys/systm.h>
#include <sys/pathname.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
@@ -67,47 +66,43 @@
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
-#include <sys/sunldi.h>
#include <sys/sdt.h>
-#include <sys/dlpi.h>
#include <sys/ib/ibtl/ibti.h>
#include <rpc/rpc.h>
#include <rpc/ib.h>
-
#include <sys/modctl.h>
-
-#include <sys/pathname.h>
#include <sys/kstr.h>
#include <sys/sockio.h>
#include <sys/vnode.h>
#include <sys/tiuser.h>
#include <net/if.h>
+#include <net/if_types.h>
#include <sys/cred.h>
#include <rpc/rpc_rdma.h>
-
#include <nfs/nfs.h>
-#include <sys/kstat.h>
#include <sys/atomic.h>
#define NFS_RDMA_PORT 2050
-extern char *inet_ntop(int, const void *, char *, int);
-
+/*
+ * Convenience structure used by rpcib_get_ib_addresses()
+ */
+typedef struct rpcib_ipaddrs {
+ void *ri_list; /* pointer to list of addresses */
+ uint_t ri_count; /* number of addresses in list */
+ uint_t ri_size; /* size of ri_list in bytes */
+} rpcib_ipaddrs_t;
/*
* Prototype declarations for driver ops
*/
-
static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
void *, void **);
static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
-static int rpcib_is_ib_interface(char *);
-static int rpcib_dl_info(ldi_handle_t, dl_info_ack_t *);
-static int rpcib_do_ip_ioctl(int, int, caddr_t);
-static boolean_t rpcib_get_ib_addresses(struct sockaddr_in *,
- struct sockaddr_in6 *, uint_t *, uint_t *);
-static uint_t rpcib_get_number_interfaces(void);
+static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
+static int rpcib_do_ip_ioctl(int, int, void *);
+static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
static int rpcib_cache_kstat_update(kstat_t *, int);
static void rib_force_cleanup(void *);
@@ -147,9 +142,6 @@ static struct cb_ops rpcib_cbops = {
nodev /* int (*cb_awrite)() */
};
-
-
-
/*
* Device options
*/
@@ -205,8 +197,7 @@ typedef struct cache_struct {
avl_node_t avl_link;
} cache_avl_struct_t;
-
-static uint64_t rib_total_buffers = 0;
+static uint64_t rib_total_buffers = 0;
uint64_t cache_limit = 100 * 1024 * 1024;
static volatile uint64_t cache_allocation = 0;
static uint64_t cache_watermark = 80 * 1024 * 1024;
@@ -409,12 +400,10 @@ rpcib_t rpcib;
*/
int rib_debug = 0;
-
int
_init(void)
{
- int error;
- int ret;
+ int error;
error = mod_install((struct modlinkage *)&rib_modlinkage);
if (error != 0) {
@@ -423,11 +412,7 @@ _init(void)
*/
return (error);
}
- ret = ldi_ident_from_mod(&rib_modlinkage, &rpcib_li);
- if (ret != 0)
- rpcib_li = NULL;
mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
-
return (0);
}
@@ -448,7 +433,6 @@ _fini()
return (status);
}
mutex_destroy(&plugin_state_lock);
- ldi_ident_release(rpcib_li);
return (0);
}
@@ -458,7 +442,6 @@ _info(struct modinfo *modinfop)
return (mod_info(&rib_modlinkage, modinfop));
}
-
/*
* rpcib_getinfo()
* Given the device number, return the devinfo pointer or the
@@ -1822,124 +1805,100 @@ refresh:
rdma_stat
rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
{
- struct sockaddr_in *sin4, *sin4arr;
- struct sockaddr_in6 *sin6, *sin6arr;
- uint_t nif, nif4, nif6, i;
+ uint_t i;
ibt_path_info_t path;
ibt_status_t ibt_status;
uint8_t num_paths_p;
ibt_ip_path_attr_t ipattr;
ibt_ip_addr_t dstip;
ibt_path_ip_src_t srcip;
-
+ rpcib_ipaddrs_t addrs4;
+ rpcib_ipaddrs_t addrs6;
+ struct sockaddr_in *sinp;
+ struct sockaddr_in6 *sin6p;
+ rdma_stat retval = RDMA_SUCCESS; /* failure paths overwrite this */
*hca = NULL;
-
ASSERT(raddr->buf != NULL);
bzero(&path, sizeof (ibt_path_info_t));
bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
bzero(&srcip, sizeof (ibt_path_ip_src_t));
- /* Obtain the source IP addresses for the system */
- nif = rpcib_get_number_interfaces();
- sin4arr = (struct sockaddr_in *)
- kmem_zalloc(sizeof (struct sockaddr_in) * nif, KM_SLEEP);
- sin6arr = (struct sockaddr_in6 *)
- kmem_zalloc(sizeof (struct sockaddr_in6) * nif, KM_SLEEP);
-
- (void) rpcib_get_ib_addresses(sin4arr, sin6arr, &nif4, &nif6);
-
- /* Are there really any IB interfaces available */
- if (nif4 == 0 && nif6 == 0) {
- kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
- kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
- return (RDMA_FAILED);
+ if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
+ (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
+ retval = RDMA_FAILED;
+ goto done; /* any lists that were allocated are freed at done */
}
/* Prep the destination address */
switch (addr_type) {
case AF_INET:
- sin4 = (struct sockaddr_in *)raddr->buf;
+ sinp = (struct sockaddr_in *)raddr->buf;
dstip.family = AF_INET;
- dstip.un.ip4addr = sin4->sin_addr.s_addr;
+ dstip.un.ip4addr = sinp->sin_addr.s_addr;
+ sinp = addrs4.ri_list; /* now walks the local IPv4 addresses */
- for (i = 0; i < nif4; i++) {
+ for (i = 0; i < addrs4.ri_count; i++) {
num_paths_p = 0;
ipattr.ipa_dst_ip = &dstip;
ipattr.ipa_hca_guid = rib_stat->hca->hca_guid;
ipattr.ipa_ndst = 1;
ipattr.ipa_max_paths = 1;
ipattr.ipa_src_ip.family = dstip.family;
- ipattr.ipa_src_ip.un.ip4addr =
- sin4arr[i].sin_addr.s_addr;
+ ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr;
ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
- IBT_PATH_NO_FLAGS,
- &ipattr,
- &path,
- &num_paths_p,
+ IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p,
&srcip);
if (ibt_status == IBT_SUCCESS &&
num_paths_p != 0 &&
path.pi_hca_guid == rib_stat->hca->hca_guid) {
*hca = rib_stat->hca;
-
- kmem_free(sin4arr,
- sizeof (struct sockaddr_in) * nif);
- kmem_free(sin6arr,
- sizeof (struct sockaddr_in6) * nif);
-
- return (RDMA_SUCCESS);
+ goto done; /* path found; retval is still RDMA_SUCCESS */
}
}
+ retval = RDMA_FAILED; /* no IPv4 source address had a path */
break;
case AF_INET6:
- sin6 = (struct sockaddr_in6 *)raddr->buf;
+ sin6p = (struct sockaddr_in6 *)raddr->buf;
dstip.family = AF_INET6;
- dstip.un.ip6addr = sin6->sin6_addr;
+ dstip.un.ip6addr = sin6p->sin6_addr;
+ sin6p = addrs6.ri_list; /* now walks the local IPv6 addresses */
- for (i = 0; i < nif6; i++) {
+ for (i = 0; i < addrs6.ri_count; i++) {
num_paths_p = 0;
ipattr.ipa_dst_ip = &dstip;
ipattr.ipa_hca_guid = rib_stat->hca->hca_guid;
ipattr.ipa_ndst = 1;
ipattr.ipa_max_paths = 1;
ipattr.ipa_src_ip.family = dstip.family;
- ipattr.ipa_src_ip.un.ip6addr = sin6arr[i].sin6_addr;
+ ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr;
ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
- IBT_PATH_NO_FLAGS,
- &ipattr,
- &path,
- &num_paths_p,
+ IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p,
&srcip);
if (ibt_status == IBT_SUCCESS &&
num_paths_p != 0 &&
path.pi_hca_guid == rib_stat->hca->hca_guid) {
*hca = rib_stat->hca;
-
- kmem_free(sin4arr,
- sizeof (struct sockaddr_in) * nif);
- kmem_free(sin6arr,
- sizeof (struct sockaddr_in6) * nif);
-
- return (RDMA_SUCCESS);
+ goto done; /* path found; retval is still RDMA_SUCCESS */
}
}
-
+ retval = RDMA_FAILED; /* no IPv6 source address had a path */
break;
default:
- kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
- kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
- return (RDMA_INVAL);
+ retval = RDMA_INVAL; /* addr_type is neither AF_INET nor AF_INET6 */
+ break;
}
-
- kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
- kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
- return (RDMA_FAILED);
+done: /* single exit point: release both address lists */
+ if (addrs4.ri_size > 0) /* 0 when no list was allocated */
+ kmem_free(addrs4.ri_list, addrs4.ri_size);
+ if (addrs6.ri_size > 0) /* 0 when no list was allocated */
+ kmem_free(addrs6.ri_list, addrs6.ri_size);
+ return (retval);
}
/*
@@ -4668,123 +4627,31 @@ rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
return (RDMA_SUCCESS);
}
-
/*
- * Return 0 if the interface is IB.
- * Return error (>0) if any error is encountered during processing.
- * Return -1 if the interface is not IB and no error.
+ * Check if the IP interface named by `lifrp' is RDMA-capable.
+ * An interface qualifies if its lifr_type is IFT_IB, or if its name,
+ * with any ":<suffix>" logical-interface portion removed, is "lo0".
+ * Returns B_TRUE for a qualifying interface and B_FALSE otherwise.
*/
-#define isalpha(ch) (((ch) >= 'a' && (ch) <= 'z') || \
- ((ch) >= 'A' && (ch) <= 'Z'))
-static int
-rpcib_is_ib_interface(char *name)
+static boolean_t
+rpcib_rdma_capable_interface(struct lifreq *lifrp)
{
+ char ifname[LIFNAMSIZ];
+ char *cp;
- char dev_path[MAXPATHLEN];
- char devname[MAXNAMELEN];
- ldi_handle_t lh;
- dl_info_ack_t info;
- int ret = 0;
- int i;
+ if (lifrp->lifr_type == IFT_IB)
+ return (B_TRUE);
/*
- * ibd devices are only style 2 devices
- * so we will open only style 2 devices
- * by ignoring the ppa
+ * Strip off the logical interface portion before getting
+ * intimate with the name.
*/
+ (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
+ if ((cp = strchr(ifname, ':')) != NULL)
+ *cp = '\0';
- i = strlen(name) - 1;
- while ((i >= 0) && (!isalpha(name[i]))) i--;
-
- if (i < 0) {
- /* Invalid interface name, no alphabet */
- return (-1);
- }
-
- (void) strncpy(devname, name, i + 1);
- devname[i + 1] = '\0';
-
- if (strcmp("lo", devname) == 0) {
- /*
- * loopback interface not rpc/rdma capable
- */
- return (-1);
- }
-
- (void) strncpy(dev_path, "/dev/", MAXPATHLEN);
- if (strlcat(dev_path, devname, MAXPATHLEN) >= MAXPATHLEN) {
- /* string overflow */
- return (-1);
- }
-
- ret = ldi_open_by_name(dev_path, FREAD|FWRITE, kcred, &lh, rpcib_li);
- if (ret != 0) {
- return (ret);
- }
- ret = rpcib_dl_info(lh, &info);
- (void) ldi_close(lh, FREAD|FWRITE, kcred);
- if (ret != 0) {
- return (ret);
- }
-
- if (info.dl_mac_type != DL_IB) {
- return (-1);
- }
-
- return (0);
+ return (strcmp("lo0", ifname) == 0); /* lo0 is accepted as well */
}
static int
-rpcib_dl_info(ldi_handle_t lh, dl_info_ack_t *info)
-{
- dl_info_req_t *info_req;
- union DL_primitives *dl_prim;
- mblk_t *mp;
- k_sigset_t smask;
- int error;
-
- if ((mp = allocb(sizeof (dl_info_req_t), BPRI_MED)) == NULL) {
- return (ENOMEM);
- }
-
- mp->b_datap->db_type = M_PROTO;
-
- info_req = (dl_info_req_t *)(uintptr_t)mp->b_wptr;
- mp->b_wptr += sizeof (dl_info_req_t);
- info_req->dl_primitive = DL_INFO_REQ;
-
- sigintr(&smask, 0);
- if ((error = ldi_putmsg(lh, mp)) != 0) {
- sigunintr(&smask);
- return (error);
- }
- if ((error = ldi_getmsg(lh, &mp, (timestruc_t *)NULL)) != 0) {
- sigunintr(&smask);
- return (error);
- }
- sigunintr(&smask);
-
- dl_prim = (union DL_primitives *)(uintptr_t)mp->b_rptr;
- switch (dl_prim->dl_primitive) {
- case DL_INFO_ACK:
- if (((uintptr_t)mp->b_wptr - (uintptr_t)mp->b_rptr) <
- sizeof (dl_info_ack_t)) {
- error = -1;
- } else {
- *info = *(dl_info_ack_t *)(uintptr_t)mp->b_rptr;
- error = 0;
- }
- break;
- default:
- error = -1;
- break;
- }
-
- freemsg(mp);
- return (error);
-}
-static int
-rpcib_do_ip_ioctl(int cmd, int len, caddr_t arg)
+rpcib_do_ip_ioctl(int cmd, int len, void *arg)
{
vnode_t *kvp, *vp;
TIUSER *tiptr;
@@ -4792,23 +4659,22 @@ rpcib_do_ip_ioctl(int cmd, int len, caddr_t arg)
k_sigset_t smask;
int err = 0;
- if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP,
- &kvp) == 0) {
- if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
+ if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
+ if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE,
&tiptr, CRED()) == 0) {
- vp = tiptr->fp->f_vnode;
- } else {
- VN_RELE(kvp);
- return (EPROTO);
+ vp = tiptr->fp->f_vnode;
+ } else {
+ VN_RELE(kvp);
+ return (EPROTO);
}
} else {
- return (EPROTO);
+ return (EPROTO);
}
iocb.ic_cmd = cmd;
iocb.ic_timout = 0;
iocb.ic_len = len;
- iocb.ic_dp = arg;
+ iocb.ic_dp = (caddr_t)arg;
sigintr(&smask, 0);
err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
sigunintr(&smask);
@@ -4817,65 +4683,89 @@ rpcib_do_ip_ioctl(int cmd, int len, caddr_t arg)
return (err);
}
-static uint_t rpcib_get_number_interfaces(void) {
-uint_t numifs;
- if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (uint_t), (caddr_t)&numifs)) {
- return (0);
+/*
+ * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
+ * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
+ *
+ * Returns 0 on success, in which case the caller must kmem_free()
+ * lifcp->lifc_buf using *bufsizep. Otherwise returns the error from
+ * the failed ioctl, and no buffer is left allocated.
+ *
+ * NOTE(review): *bufsizep is recorded separately from lifc_len,
+ * presumably because SIOCGLIFCONF rewrites lifc_len to the number of
+ * bytes actually returned -- confirm against the ioctl documentation.
+ */
+static int
+rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
+{
+ int err;
+ struct lifnum lifn;
+
+ bzero(&lifn, sizeof (struct lifnum));
+ lifn.lifn_family = AF_UNSPEC;
+
+ err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
+ if (err != 0)
+ return (err);
+
+ /*
+ * Pad the interface count to account for additional interfaces that
+ * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
+ */
+ lifn.lifn_count += 4;
+
+ bzero(lifcp, sizeof (struct lifconf));
+ lifcp->lifc_family = AF_UNSPEC;
+ lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
+ lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
+
+ err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
+ if (err != 0) {
+ kmem_free(lifcp->lifc_buf, *bufsizep); /* do not leak on error */
+ return (err);
}
- return (numifs);
+ return (0);
}
static boolean_t
-rpcib_get_ib_addresses(
- struct sockaddr_in *saddr4,
- struct sockaddr_in6 *saddr6,
- uint_t *number4,
- uint_t *number6)
+rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
{
- int numifs;
- struct ifconf kifc;
- struct ifreq *ifr;
- boolean_t ret = B_FALSE;
+ uint_t i, nifs;
+ uint_t bufsize;
+ struct lifconf lifc;
+ struct lifreq *lifrp;
+ struct sockaddr_in *sinp;
+ struct sockaddr_in6 *sin6p;
- *number4 = 0;
- *number6 = 0;
+ bzero(addrs4, sizeof (rpcib_ipaddrs_t));
+ bzero(addrs6, sizeof (rpcib_ipaddrs_t)); /* all fields 0 on failure */
- if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (int), (caddr_t)&numifs)) {
- return (ret);
+ if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
+ return (B_FALSE); /* interface list could not be obtained */
+
+ if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
+ kmem_free(lifc.lifc_buf, bufsize);
+ return (B_FALSE); /* no interfaces were returned */
}
- kifc.ifc_len = numifs * sizeof (struct ifreq);
- kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP);
+ /*
+ * Worst case is that all of the addresses are IB-capable and have
+ * the same address family, so size our buffers accordingly.
+ * Both lists are handed back to the caller, which must release
+ * each ri_list with kmem_free() using the matching ri_size.
+ */
+ addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
+ addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
+ addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
+ addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
- if (rpcib_do_ip_ioctl(SIOCGIFCONF, sizeof (struct ifconf),
- (caddr_t)&kifc)) {
- goto done;
- }
+ for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
+ if (!rpcib_rdma_capable_interface(lifrp))
+ continue; /* skip interfaces that are not RDMA-capable */
- ifr = kifc.ifc_req;
- for (numifs = kifc.ifc_len / sizeof (struct ifreq);
- numifs > 0; numifs--, ifr++) {
- struct sockaddr_in *sin4;
- struct sockaddr_in6 *sin6;
-
- if ((rpcib_is_ib_interface(ifr->ifr_name) == 0)) {
- sin4 = (struct sockaddr_in *)(uintptr_t)&ifr->ifr_addr;
- sin6 = (struct sockaddr_in6 *)(uintptr_t)&ifr->ifr_addr;
- if (sin4->sin_family == AF_INET) {
- saddr4[*number4] = *(struct sockaddr_in *)
- (uintptr_t)&ifr->ifr_addr;
- *number4 = *number4 + 1;
- } else if (sin6->sin6_family == AF_INET6) {
- saddr6[*number6] = *(struct sockaddr_in6 *)
- (uintptr_t)&ifr->ifr_addr;
- *number6 = *number6 + 1;
- }
+ if (lifrp->lifr_addr.ss_family == AF_INET) {
+ sinp = addrs4->ri_list;
+ bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
+ sizeof (struct sockaddr_in));
+ } else if (lifrp->lifr_addr.ss_family == AF_INET6) {
+ sin6p = addrs6->ri_list;
+ bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
+ sizeof (struct sockaddr_in6));
}
}
- ret = B_TRUE;
-done:
- kmem_free(kifc.ifc_buf, kifc.ifc_len);
- return (ret);
+
+ kmem_free(lifc.lifc_buf, bufsize);
+ return (B_TRUE); /* B_TRUE even if both ri_count values are 0 */
}
/* ARGSUSED */
diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h
index aa01ddeed6..9f9c95c78d 100644
--- a/usr/src/uts/common/sys/dlpi.h
+++ b/usr/src/uts/common/sys/dlpi.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -266,13 +266,16 @@ typedef struct dl_ipnetinfo {
#define DL_OTHER 0x09 /* Any other medium not listed above */
/*
* Private media types. These must be above the value 0x80000000 as
- * stated in the DLPI specification.
+ * stated in the DLPI specification. NOTE: The SUNW_ prefix is used
+ * to denote synthetic DLPI types that are internal to the stack.
*/
#define DL_IPV4 0x80000001ul /* IPv4 Tunnel Link */
#define DL_IPV6 0x80000002ul /* IPv6 Tunnel Link */
#define SUNW_DL_VNI 0x80000003ul /* Virtual network interface */
#define DL_WIFI 0x80000004ul /* IEEE 802.11 */
#define DL_IPNET 0x80000005ul /* ipnet(7D) link */
+#define SUNW_DL_IPMP 0x80000006ul /* IPMP stub interface */
+
/*
* DLPI provider service supported.
* These must be allowed to be bitwise-OR for dl_service_mode in
diff --git a/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h b/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h
index e421c0b9c0..7bb54ad12e 100644
--- a/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h
+++ b/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h
@@ -19,34 +19,23 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_IB_MGT_IBCM_IBCM_ARP_H
#define _SYS_IB_MGT_IBCM_IBCM_ARP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
-
#include <sys/ib/mgt/ibcm/ibcm_impl.h>
#include <sys/modhash.h>
#include <sys/ib/clients/ibd/ibd.h>
#include <sys/strsun.h>
-#include <sys/strsubr.h>
#include <sys/socket.h>
#include <sys/stat.h> /* for S_IFCHR */
-#include <inet/common.h>
-#include <inet/ip.h>
-#include <inet/ip_if.h>
-#include <inet/ip_ire.h>
-#include <inet/ip_rts.h>
-#include <sys/dlpi.h>
-#include <net/route.h>
/*
* IPoIB addr lookup completion function
@@ -103,7 +92,6 @@ typedef struct ibcm_arp_streams_s {
/* GID to IP-Addr and Ip-Addr to GID look-up functions. */
-#define IBCM_ARP_IBD_NAME "ibd"
#define IBCM_ARP_IBD_INSTANCES 4
typedef struct ibcm_arp_ip_s {
diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h
index 593505a426..4e3b2b5778 100644
--- a/usr/src/uts/common/sys/socket.h
+++ b/usr/src/uts/common/sys/socket.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -156,12 +156,10 @@ struct so_snd_bufinfo {
/* "Socket"-level control message types: */
#define SCM_RIGHTS 0x1010 /* access rights (array of int) */
-
#define SO_SECATTR 0x1011 /* socket's security attributes */
#define SCM_UCRED 0x1012 /* sender's ucred */
#define SO_TIMESTAMP 0x1013 /* socket-level timestamp option */
#define SCM_TIMESTAMP SO_TIMESTAMP /* socket control message timestamp */
-
#define SO_ALLZONES 0x1014 /* bind in all zones */
#define SO_EXCLBIND 0x1015 /* exclusive binding */
@@ -203,9 +201,12 @@ struct linger {
};
/*
- * Level number for (get/set)sockopt() to apply to socket itself.
+ * Levels for (get/set)sockopt() that don't apply to a specific protocol.
*/
#define SOL_SOCKET 0xffff /* options for socket level */
+#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
+#define SOL_ROUTE 0xfffe /* options for routing socket level */
+#endif
/*
* Address families.
diff --git a/usr/src/uts/common/sys/sockio.h b/usr/src/uts/common/sys/sockio.h
index 9e107ff3ef..0ef5394fea 100644
--- a/usr/src/uts/common/sys/sockio.h
+++ b/usr/src/uts/common/sys/sockio.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -175,7 +175,7 @@ extern "C" {
#define SIOCSLIFNETMASK _IOW('i', 126, struct lifreq) /* set subnetmask */
#define SIOCGLIFMETRIC _IOWR('i', 127, struct lifreq) /* get if metric */
#define SIOCSLIFMETRIC _IOW('i', 128, struct lifreq) /* set if metric */
-#define SIOCSLIFNAME _IOWR('i', 129, struct lifreq) /* set interface name */
+#define SIOCSLIFNAME _IOWR('i', 129, struct lifreq) /* set interface name */
#define SIOCGLIFNUM _IOWR('i', 130, struct lifnum) /* get number of ifs */
#define SIOCGLIFMUXID _IOWR('i', 131, struct lifreq) /* get if muxid */
#define SIOCSLIFMUXID _IOW('i', 132, struct lifreq) /* set if muxid */
@@ -223,22 +223,21 @@ extern "C" {
#define SIOCLIPSECONFIG _IOW('i', 152, 0) /* List Policy */
/*
- * IOCTLS for implementing load balancing and failover within IP.
+ * 153 can be reused (was consolidation-private SIOCLIFFAILOVER).
*/
-#define SIOCLIFFAILOVER _IOW('i', 153, struct lifreq) /* Failover */
-#define SIOCLIFFAILBACK _IOW('i', 154, struct lifreq) /* Failback */
-#define SIOCSLIFGROUPNAME _IOW('i', 155, struct lifreq) /* Group interfaces */
-#define SIOCGLIFGROUPNAME _IOWR('i', 156, struct lifreq) /* Get group name */
-#define SIOCGLIFOINDEX _IOWR('i', 157, struct lifreq) /* get orig if index */
/*
- * Leave 158 - 160 unused; used to be SIOC*IFARP ioctls.
+ * IP Multipathing ioctls.
*/
+#define SIOCGLIFBINDING _IOWR('i', 154, struct lifreq)
+#define SIOCSLIFGROUPNAME _IOW('i', 155, struct lifreq)
+#define SIOCGLIFGROUPNAME _IOWR('i', 156, struct lifreq)
+#define SIOCGLIFGROUPINFO _IOWR('i', 157, struct lifgroupinfo)
/*
- * IOCTL for implementing load balancing and failover within IP.
+ * Leave 158 - 160 unused; used to be SIOC*IFARP ioctls.
+ * However, 161 can be reused (was consolidation-private SIOCSLIFOINDEX).
*/
-#define SIOCSLIFOINDEX _IOWR('i', 161, struct lifreq) /* set orig if index */
/*
* IOCTLS which provide an interface to the IPv6 address selection policy.
@@ -309,10 +308,8 @@ extern "C" {
#define SIOCSIPMSFILTER _IOW('i', 181, 0)
/*
- * IOCTL for implementing "disable FAILBACK" IPMP configuration.
+ * 182 can be reused (was consolidation-private SIOCSIPMPFAILBACK).
*/
-#define SIOCSIPMPFAILBACK _IOW('i', 182, int) /* enable/disable */
- /* FAILBACK */
#define SIOCSENABLESDP _IOWR('i', 183, int) /* Enable SDP */
diff --git a/usr/src/uts/common/sys/sysevent/eventdefs.h b/usr/src/uts/common/sys/sysevent/eventdefs.h
index ac21686e84..dcf36f748c 100644
--- a/usr/src/uts/common/sys/sysevent/eventdefs.h
+++ b/usr/src/uts/common/sys/sysevent/eventdefs.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -179,6 +179,8 @@ extern "C" {
/* Interface within an IPMP group has changed state or type */
#define ESC_IPMP_IF_CHANGE "ESC_ipmp_if_change"
+/* IPMP probe has changed state */
+#define ESC_IPMP_PROBE_STATE "ESC_ipmp_probe_state"
/*
* EC_DEV_ADD and EC_DEV_REMOVE subclass definitions - supporting attributes
diff --git a/usr/src/uts/common/sys/sysevent/ipmp.h b/usr/src/uts/common/sys/sysevent/ipmp.h
index 137fa918cd..ba39a5bb2b 100644
--- a/usr/src/uts/common/sys/sysevent/ipmp.h
+++ b/usr/src/uts/common/sys/sysevent/ipmp.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,16 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_SYSEVENT_IPMP_H
#define _SYS_SYSEVENT_IPMP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-
/*
* IPMP sysevent definitions. Note that all of these definitions are
* Sun-private and are subject to change at any time.
@@ -39,13 +35,18 @@
extern "C" {
#endif
+/*
+ * Event channel associated with these events
+ */
+#define IPMP_EVENT_CHAN "com.sun:ipmp:events"
/*
* Event type EC_IPMP/ESC_IPMP_GROUP_STATE event schema
*
* Event Class - EC_IPMP
* Event Sub-Class - ESC_IPMP_GROUP_STATE
- * Event Publisher - SUNW:usr:in.mpathd
+ * Event Vendor - com.sun
+ * Event Publisher - in.mpathd
*
* Attribute Name - IPMP_EVENT_VERSION
* Attribute Type - SE_DATA_TYPE_UINT32
@@ -70,18 +71,20 @@ extern "C" {
#define IPMP_GROUP_STATE "ipmp_group_state"
typedef enum {
- IPMP_GROUP_OK, /* at least one interface in group is ok */
- IPMP_GROUP_FAILED /* all interfaces in the group have failed */
+ IPMP_GROUP_OK, /* all interfaces in the group are ok */
+ IPMP_GROUP_FAILED, /* all interfaces in the group are unusable */
+ IPMP_GROUP_DEGRADED /* some interfaces in the group are unusable */
} ipmp_group_state_t;
-#define IPMP_EVENT_CUR_VERSION 1
+#define IPMP_EVENT_CUR_VERSION 2
/*
* Event type EC_IPMP/ESC_IPMP_GROUP_CHANGE event schema
*
* Event Class - EC_IPMP
* Event Sub-Class - ESC_IPMP_GROUP_CHANGE
- * Event Publisher - SUNW:usr:in.mpathd
+ * Event Vendor - com.sun
+ * Event Publisher - in.mpathd
*
* Attribute Name - IPMP_GROUP_NAME
* Attribute Type - SE_DATA_TYPE_STRING
@@ -113,7 +116,8 @@ typedef enum {
*
* Event Class - EC_IPMP
* Event Sub-Class - ESC_IPMP_GROUP_MEMBER_CHANGE
- * Event Publisher - SUNW:usr:in.mpathd
+ * Event Vendor - com.sun
+ * Event Publisher - in.mpathd
*
* Attribute Name - IPMP_GROUP_NAME
* Attribute Type - SE_DATA_TYPE_STRING
@@ -171,7 +175,8 @@ typedef enum {
*
* Event Class - EC_IPMP
* Event Sub-Class - ESC_IPMP_IF_CHANGE
- * Event Publisher - SUNW:usr:in.mpathd
+ * Event Vendor - com.sun
+ * Event Publisher - in.mpathd
*
* Attribute Name - IPMP_GROUP_NAME
* Attribute Type - SE_DATA_TYPE_STRING
@@ -198,6 +203,75 @@ typedef enum {
* Attribute Value - <if-type>
*/
+#define IPMP_PROBE_ID "ipmp_probe_id"
+#define IPMP_PROBE_STATE "ipmp_probe_state"
+#define IPMP_PROBE_START_TIME "ipmp_probe_start_time"
+#define IPMP_PROBE_SENT_TIME "ipmp_probe_sent_time"
+#define IPMP_PROBE_ACKRECV_TIME "ipmp_probe_ackrecv_time"
+#define IPMP_PROBE_ACKPROC_TIME "ipmp_probe_ackproc_time"
+#define IPMP_PROBE_TARGET "ipmp_probe_target"
+#define IPMP_PROBE_TARGET_RTTAVG "ipmp_probe_target_rttavg"
+#define IPMP_PROBE_TARGET_RTTDEV "ipmp_probe_target_rttdev"
+
+typedef enum {
+ IPMP_PROBE_SENT, /* the probe has been sent */
+ IPMP_PROBE_ACKED, /* the probe has been acked */
+ IPMP_PROBE_LOST /* the probe has been lost */
+} ipmp_probe_state_t;
+
+/*
+ * Event type EC_IPMP/ESC_IPMP_PROBE_STATE event schema
+ *
+ * Event Class - EC_IPMP
+ * Event Sub-Class - ESC_IPMP_PROBE_STATE
+ * Event Vendor - com.sun
+ * Event Publisher - in.mpathd
+ *
+ * Attribute Name - IPMP_PROBE_ID
+ * Attribute Type - SE_DATA_TYPE_UINT32
+ * Attribute Value - <probe-id>
+ *
+ * Attribute Name - IPMP_EVENT_VERSION
+ * Attribute Type - SE_DATA_TYPE_UINT32
+ * Attribute Value - <version>
+ *
+ * Attribute Name - IPMP_IF_NAME
+ * Attribute Type - SE_DATA_TYPE_STRING
+ * Attribute Value - <if-name>
+ *
+ * Attribute Name - IPMP_PROBE_STATE
+ * Attribute Type - SE_DATA_TYPE_UINT32
+ * Attribute Value - <probe-state>
+ *
+ * Attribute Name - IPMP_PROBE_START_TIME
+ * Attribute Type - SE_DATA_TYPE_TIME
+ * Attribute Value - <probe-start-time>
+ *
+ * Attribute Name - IPMP_PROBE_SENT_TIME
+ * Attribute Type - SE_DATA_TYPE_TIME
+ * Attribute Value - <probe-sent-time>
+ *
+ * Attribute Name - IPMP_PROBE_ACKRECV_TIME
+ * Attribute Type - SE_DATA_TYPE_TIME
+ * Attribute Value - <probe-ackrecv-time>
+ *
+ * Attribute Name - IPMP_PROBE_ACKPROC_TIME
+ * Attribute Type - SE_DATA_TYPE_TIME
+ * Attribute Value - <probe-ackproc-time>
+ *
+ * Attribute Name - IPMP_PROBE_TARGET
+ * Attribute Type - SE_DATA_TYPE_BYTES
+ * Attribute Value - <probe-target-ip>
+ *
+ * Attribute Name - IPMP_PROBE_TARGET_RTTAVG
+ * Attribute Type - SE_DATA_TYPE_UINT32
+ * Attribute Value - <probe-target-rttavg>
+ *
+ * Attribute Name - IPMP_PROBE_TARGET_RTTDEV
+ * Attribute Type - SE_DATA_TYPE_UINT32
+ * Attribute Value - <probe-target-rttdev>
+ */
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/intel/Makefile.intel.shared b/usr/src/uts/intel/Makefile.intel.shared
index 9585034efb..b25c2fb0cc 100644
--- a/usr/src/uts/intel/Makefile.intel.shared
+++ b/usr/src/uts/intel/Makefile.intel.shared
@@ -216,6 +216,7 @@ DRV_KMODS += cryptoadm
DRV_KMODS += dda
DRV_KMODS += devinfo
DRV_KMODS += dld
+DRV_KMODS += dlpistub
DRV_KMODS += dmd
DRV_KMODS_32 += dnet
DRV_KMODS += dump
@@ -321,7 +322,6 @@ DRV_KMODS += udp6
DRV_KMODS += ucode
DRV_KMODS += ural
DRV_KMODS += vgatext
-DRV_KMODS += vni
DRV_KMODS += vnic
DRV_KMODS += vscan
DRV_KMODS += wc
diff --git a/usr/src/uts/intel/vni/Makefile b/usr/src/uts/intel/dlpistub/Makefile
index aa32704615..53cf2092a7 100644
--- a/usr/src/uts/intel/vni/Makefile
+++ b/usr/src/uts/intel/dlpistub/Makefile
@@ -18,18 +18,11 @@
#
# CDDL HEADER END
#
-#
-# uts/intel/vni/Makefile
-#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
-# This makefile drives the production of the vni streams kernel
-# module.
-#
-# intel architecture dependent
+# This makefile drives the production of the dlpistub STREAMS module.
+# intel architecture dependent
#
#
@@ -40,11 +33,11 @@ UTSBASE = ../..
#
# Define the module and object file sets.
#
-MODULE = vni
-OBJECTS = $(VNI_OBJS:%=$(OBJS_DIR)/%)
-LINTS = $(VNI_OBJS:%.o=$(LINTS_DIR)/%.ln)
+MODULE = dlpistub
+OBJECTS = $(DLPISTUB_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(DLPISTUB_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
-CONF_SRCDIR = $(UTSBASE)/common/inet/vni
+CONF_SRCDIR = $(UTSBASE)/common/inet/dlpistub
#
# Include common rules.
diff --git a/usr/src/uts/intel/ip/ip.global-objs.debug64 b/usr/src/uts/intel/ip/ip.global-objs.debug64
index 3972f1b4ec..d89224677b 100644
--- a/usr/src/uts/intel/ip/ip.global-objs.debug64
+++ b/usr/src/uts/intel/ip/ip.global-objs.debug64
@@ -19,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -161,6 +161,9 @@ ipinfov4
ipinfov6
iplrinit
iplwinit
+ipmp_aract_template
+ipmp_ardeact_template
+ipmp_kstats
iprinitv4
iprinitv6
ipsec_action_cache
diff --git a/usr/src/uts/intel/ip/ip.global-objs.obj64 b/usr/src/uts/intel/ip/ip.global-objs.obj64
index f6a97be29b..0e58fdc219 100644
--- a/usr/src/uts/intel/ip/ip.global-objs.obj64
+++ b/usr/src/uts/intel/ip/ip.global-objs.obj64
@@ -19,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -160,6 +160,9 @@ ipinfov4
ipinfov6
iplrinit
iplwinit
+ipmp_aract_template
+ipmp_ardeact_template
+ipmp_kstats
iprinitv4
iprinitv6
ipsec_action_cache
diff --git a/usr/src/uts/intel/os/name_to_major b/usr/src/uts/intel/os/name_to_major
index 3d58c314b7..eb70695abd 100644
--- a/usr/src/uts/intel/os/name_to_major
+++ b/usr/src/uts/intel/os/name_to_major
@@ -102,7 +102,7 @@ kmdb 171
sctp 172
sctp6 173
scsi_vhci 174
-vni 175
+dlpistub 175
cpuid 176
bmc 177
dld 178
diff --git a/usr/src/uts/sparc/Makefile.sparc.shared b/usr/src/uts/sparc/Makefile.sparc.shared
index 3723be6f32..39fba551aa 100644
--- a/usr/src/uts/sparc/Makefile.sparc.shared
+++ b/usr/src/uts/sparc/Makefile.sparc.shared
@@ -20,7 +20,7 @@
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# This makefile contains the common definitions for all sparc
@@ -216,7 +216,8 @@ DRV_KMODS += ippctl sctp sctp6
DRV_KMODS += dld
DRV_KMODS += ipf
DRV_KMODS += rpcib
-DRV_KMODS += vni vnic
+DRV_KMODS += dlpistub
+DRV_KMODS += vnic
DRV_KMODS += xge
DRV_KMODS += rds
DRV_KMODS += chxge
diff --git a/usr/src/uts/sparc/vni/Makefile b/usr/src/uts/sparc/dlpistub/Makefile
index 6a96edc17e..548361738a 100644
--- a/usr/src/uts/sparc/vni/Makefile
+++ b/usr/src/uts/sparc/dlpistub/Makefile
@@ -18,18 +18,11 @@
#
# CDDL HEADER END
#
-#
-# uts/sparc/vni/Makefile
-#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
-# This makefile drives the production of the vni streams kernel
-# module.
-#
-# sparc architecture dependent
+# This makefile drives the production of the dlpistub STREAMS module.
+# sparc architecture dependent
#
#
@@ -40,11 +33,11 @@ UTSBASE = ../..
#
# Define the module and object file sets.
#
-MODULE = vni
-OBJECTS = $(VNI_OBJS:%=$(OBJS_DIR)/%)
-LINTS = $(VNI_OBJS:%.o=$(LINTS_DIR)/%.ln)
+MODULE = dlpistub
+OBJECTS = $(DLPISTUB_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(DLPISTUB_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
-CONF_SRCDIR = $(UTSBASE)/common/inet/vni
+CONF_SRCDIR = $(UTSBASE)/common/inet/dlpistub
#
# Include common rules.
diff --git a/usr/src/uts/sparc/ip/ip.global-objs.debug64 b/usr/src/uts/sparc/ip/ip.global-objs.debug64
index 279bd92d0b..6606b472bf 100644
--- a/usr/src/uts/sparc/ip/ip.global-objs.debug64
+++ b/usr/src/uts/sparc/ip/ip.global-objs.debug64
@@ -19,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -161,6 +161,9 @@ ipinfov4
ipinfov6
iplrinit
iplwinit
+ipmp_aract_template
+ipmp_ardeact_template
+ipmp_kstats
iprinitv4
iprinitv6
ipsec_action_cache
diff --git a/usr/src/uts/sparc/ip/ip.global-objs.obj64 b/usr/src/uts/sparc/ip/ip.global-objs.obj64
index 4f4bc3e376..89d40afbbb 100644
--- a/usr/src/uts/sparc/ip/ip.global-objs.obj64
+++ b/usr/src/uts/sparc/ip/ip.global-objs.obj64
@@ -19,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -160,6 +160,9 @@ ipinfov4
ipinfov6
iplrinit
iplwinit
+ipmp_aract_template
+ipmp_ardeact_template
+ipmp_kstats
iprinitv4
iprinitv6
ipsec_action_cache
diff --git a/usr/src/uts/sparc/os/name_to_major b/usr/src/uts/sparc/os/name_to_major
index ff58cf5113..9702d00ad7 100644
--- a/usr/src/uts/sparc/os/name_to_major
+++ b/usr/src/uts/sparc/os/name_to_major
@@ -182,7 +182,7 @@ pic16f819 233
kmdb 234
sctp 235
sctp6 236
-vni 237
+dlpistub 237
cpuid 238
did 239
ntwdt 240