diff options
author | meem <Peter.Memishian@Sun.COM> | 2009-01-06 20:16:25 -0500 |
---|---|---|
committer | meem <Peter.Memishian@Sun.COM> | 2009-01-06 20:16:25 -0500 |
commit | e11c3f44f531fdff80941ce57c065d2ae861cefc (patch) | |
tree | e921e957d727a9596275a1119fd627ef2ecca47d | |
parent | 732675dd38771d280fdc276731344e9652071753 (diff) | |
download | illumos-gate-e11c3f44f531fdff80941ce57c065d2ae861cefc.tar.gz |
PSARC/2007/272 Project Clearview: IPMP Rearchitecture
PSARC/2008/773 IPQoS if_groupname Selector Removal
PSARC/2009/001 Move in.mpathd into /lib/inet
6783149 Clearview IPMP Rearchitecture
4472956 libipmp should provide administrative interfaces
4494577 ipmp is opaque - there's no way to get current status
4509788 IPMP's usage of interface flags is not backward compatible
4509869 IPMP's address move mechanism needs to be transparent to applications
4531232 "in.rdiscd: sendto: Bad file number" seen during IPMP DR
4533876 new instances of interfaces under ipmp are generated with each dr/op
4699003 in.mpathd should squawk if interfaces in a group have the same hwaddr
4704937 SUNW_ip_rcm.so is sloppy with strings
4713308 IPMP shouldn't failover unconfigured logical interfaces
4785694 non-homogeneous IPMP group does not do failback
4850407 if_mpadm and IPMP DR failure
5015757 ip can panic with ASSERT(attach_ill == ipif->ipif_ill) failure
5086201 in.ndpd's phyint_reach_random() spews "SIOCSLIFLNKINFO Invalid argument"
6184000 routes cannot be created on failed interfaces
6246564 if_mpadm -r <ifname> doesn't bring up IPv6 link-local data address
6359058 SIOCLIFFAILBACK repeatedly fails with EAGAIN; in.mpathd fills logs
6359536 enabling STANDBY on an interface with no test address acts oddly
6378487 in.dhcpd doesn't work well in an IPMP setup
6462335 cannot offline to IPMP interfaces that have no probe targets
6516992 in.routed spews "Address already in use" during IPMP address move
6518460 ip_rcm`update_pif() must remain calm when logical interfaces disappear
6549957 failed IP interfaces at boot may go unreported
6591186 rpcbind can't deal with indirect calls if all addresses are deprecated
6667143 NCE_F_UNSOL_ADV broken
6698480 IGMP version not retained during IPMP failover
6726235 IPMP group failure can sometimes lead to an extra failover
6726645 in.routed skips DEPRECATED addresses even when no others exist
6738310 ip_ndp_recover() checks IPIF_CONDEMNED on the wrong ipif flags field
6739454 system panics at sdpib`sdp_rts_announce
6739531 IPv6 DAD doesn't work well with IPMP
6740719 in.mpathd may fail to switch to router target mode
6743260 ipif_resolver_up() can fail and leave ARP bringup pending
6746613 ip's DL_NOTE_SDU_SIZE logic mishandles ill_max_frag < ill_max_mtu
6748145 in.ndpd's IPv6 link-local hardware address mappings can go stale
6753560 ilg_delete_all() can race with ill_delete_tail() when ilg_ill changes
6755987 stillborn IFF_POINTOPOINT in.mpathd logic should be hauled out
6775126 SUBDIRS ipsecutils element does not in order be
6775811 NCEs can get stuck in ND_INCOMPLETE if ARP fails when IPMP is in-use
6777496 receive-side ILL_HCKSUM_CAPABLE checks look at the wrong ill
6781488 IPSQ timer restart logic can deadlock under stress
6781883 ip_ndp_find_solicitation() can be passed adverts, and has API issues
6784852 RPCIB, SDP, and RDS all break when vanity naming is used
6786048 IPv6 ND probes create IREs with incorrect source addresses
6786091 I_PLINK handling in IP must not request reentry via ipsq_try_enter()
6786711 IPQoS if_groupname selector needs to go
6787091 assertion failure in ipcl_conn_cleanup() due to non-NULL conn_ilg
6789235 INADDR_ANY ILMs can trigger an assertion failure in IPMP environments
6789502 ipif_resolver_up() calls after ipif_ndp_up() clobber ipif_addr_ready
6789718 ip6.tun0 cannot be plumbed in a non-global-zone post-6745288
6789732 libdlpi may get stuck in i_dlpi_strgetmsg()
6789870 ipif6_dup_recovery() may operate on a freed ipif, corrupting memory
6789874 ipnet_nicevent_cb() may call taskq_dispatch() on a bogus taskq
6790310 in.mpathd may core with "phyint_inst_timer: invalid state 4"
--HG--
rename : usr/src/lib/libinetutil/common/inetutil4.c => usr/src/lib/libinetutil/common/inetutil.c
rename : usr/src/uts/common/inet/vni/vni.c => usr/src/uts/common/inet/dlpistub/dlpistub.c
rename : usr/src/uts/common/inet/vni/vni.conf => usr/src/uts/common/inet/dlpistub/dlpistub.conf
rename : usr/src/uts/common/inet/vni/vni_impl.h => usr/src/uts/common/inet/dlpistub/dlpistub_impl.h
rename : usr/src/uts/intel/vni/Makefile => usr/src/uts/intel/dlpistub/Makefile
rename : usr/src/uts/sparc/vni/Makefile => usr/src/uts/sparc/dlpistub/Makefile
168 files changed, 17264 insertions, 18066 deletions
diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c index 34bb772632..5a4779cfa5 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -133,6 +133,7 @@ main(int argc, char **argv) boolean_t is_verbose; int ipc_fd; int c; + int aware = RTAW_UNDER_IPMP; struct rlimit rl; debug_level = df_get_int("", B_FALSE, DF_DEBUG_LEVEL); @@ -301,6 +302,17 @@ main(int argc, char **argv) dhcpmsg(MSG_ERR, "cannot open routing socket"); return (EXIT_FAILURE); } + + /* + * We're IPMP-aware and can manage IPMP test addresses, so issue + * RT_AWARE to get routing socket messages for interfaces under IPMP. + */ + if (setsockopt(rtsock_fd, SOL_ROUTE, RT_AWARE, &aware, + sizeof (aware)) == -1) { + dhcpmsg(MSG_ERR, "cannot set RT_AWARE on routing socket"); + return (EXIT_FAILURE); + } + if (iu_register_event(eh, rtsock_fd, POLLIN, rtsock_event, 0) == -1) { dhcpmsg(MSG_ERR, "cannot register routing socket for messages"); return (EXIT_FAILURE); @@ -1182,7 +1194,7 @@ check_lif(dhcp_lif_t *lif, const struct ifa_msghdr *ifam, int msglen) lif->lif_name); lif_mark_decline(lif, "duplicate address"); close_ip_lif(lif); - (void) open_ip_lif(lif, INADDR_ANY); + (void) open_ip_lif(lif, INADDR_ANY, B_TRUE); } dad_wait = lif->lif_dad_wait; diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c index 4637ecc346..6cfce9f0a9 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c @@ -19,14 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
* * BOUND state of the DHCP client state machine. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/socket.h> #include <sys/types.h> #include <string.h> @@ -358,7 +356,8 @@ dhcp_bound_complete(dhcp_smach_t *dsmp) lif = dsmp->dsm_lif; if (router_list != NULL && (router_list->len % sizeof (ipaddr_t)) == 0 && - strchr(lif->lif_name, ':') == NULL) { + strchr(lif->lif_name, ':') == NULL && + !lif->lif_pif->pif_under_ipmp) { dsmp->dsm_nrouters = router_list->len / sizeof (ipaddr_t); dsmp->dsm_routers = malloc(router_list->len); diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c index 0cfdad40e3..5d2d5fb99e 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -76,6 +76,7 @@ insert_pif(const char *pname, boolean_t isv6, int *error) { dhcp_pif_t *pif; struct lifreq lifr; + lifgroupinfo_t lifgr; dlpi_handle_t dh = NULL; int fd = isv6 ? v6_sock_fd : v4_sock_fd; @@ -127,12 +128,60 @@ insert_pif(const char *pname, boolean_t isv6, int *error) } /* - * For IPv4, use DLPI to determine the hardware type, hardware - * address, and hardware address length. + * Check if the pif is in an IPMP group. Interfaces using IPMP don't + * have dedicated hardware addresses, and get their hardware type from + * the SIOCGLIFGROUPINFO ioctl rather than DLPI. 
*/ - if (!isv6) { - int rc; - dlpi_info_t dlinfo; + if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) == -1) { + *error = DHCP_IPC_E_INT; + dhcpmsg(MSG_ERR, "insert_pif: SIOCGLIFGROUPNAME for %s", pname); + goto failure; + } + + if (lifr.lifr_groupname[0] != '\0') { + (void) strlcpy(lifgr.gi_grname, lifr.lifr_groupname, + LIFGRNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPINFO, &lifgr) == -1) { + *error = DHCP_IPC_E_INT; + dhcpmsg(MSG_ERR, "insert_pif: SIOCGLIFGROUPINFO for %s", + lifgr.gi_grname); + goto failure; + } + + pif->pif_hwtype = dlpi_arptype(lifgr.gi_mactype); + pif->pif_under_ipmp = (strcmp(pname, lifgr.gi_grifname) != 0); + (void) strlcpy(pif->pif_grifname, lifgr.gi_grifname, LIFNAMSIZ); + + /* + * For IPMP underlying interfaces, stash the interface index + * of the IPMP meta-interface; we'll use it to send/receive + * traffic. This is both necessary (since IP_BOUND_IF for + * non-unicast traffic won't work on underlying interfaces) + * and preferred (since a test address lease will be able to + * be maintained as long as another interface in the group is + * still functioning). + */ + if (pif->pif_under_ipmp) { + (void) strlcpy(lifr.lifr_name, pif->pif_grifname, + LIFNAMSIZ); + + if (ioctl(fd, SIOCGLIFINDEX, &lifr) == -1) { + *error = DHCP_IPC_E_INT; + dhcpmsg(MSG_ERR, "insert_pif: SIOCGLIFINDEX " + "for %s", lifr.lifr_name); + goto failure; + } + pif->pif_grindex = lifr.lifr_index; + } + } + + /* + * For IPv4, if the hardware type is still unknown, use DLPI to + * determine it, the hardware address, and hardware address length. 
+ */ + if (!isv6 && pif->pif_hwtype == 0) { + int rc; + dlpi_info_t dlinfo; if ((rc = dlpi_open(pname, &dh, 0)) != DLPI_SUCCESS) { dhcpmsg(MSG_ERROR, "insert_pif: dlpi_open: %s", @@ -661,11 +710,12 @@ verify_lif(const dhcp_lif_t *lif) boolean_t isv6; int fd; struct lifreq lifr; + dhcp_pif_t *pif = lif->lif_pif; (void) memset(&lifr, 0, sizeof (struct lifreq)); (void) strlcpy(lifr.lifr_name, lif->lif_name, LIFNAMSIZ); - isv6 = lif->lif_pif->pif_isv6; + isv6 = pif->pif_isv6; fd = isv6 ? v6_sock_fd : v4_sock_fd; if (ioctl(fd, SIOCGLIFFLAGS, &lifr) == -1) { @@ -689,43 +739,41 @@ verify_lif(const dhcp_lif_t *lif) } /* - * Special case: if the interface has gone down as a duplicate, then - * this alone does _not_ mean that we're abandoning it just yet. Allow - * the state machine to handle this normally by trying to get a new - * lease. - */ - if ((lifr.lifr_flags & (IFF_UP|IFF_DUPLICATE)) == IFF_DUPLICATE) { - dhcpmsg(MSG_DEBUG, "verify_lif: duplicate address on %s", - lif->lif_name); - return (B_TRUE); - } - - /* - * If the user has torn down or started up the interface manually, then - * abandon the lease. - */ - if ((lif->lif_flags ^ lifr.lifr_flags) & IFF_UP) { - dhcpmsg(MSG_DEBUG, "verify_lif: user has %s %s", - lifr.lifr_flags & IFF_UP ? "started up" : "shut down", - lif->lif_name); - return (B_FALSE); - } - - /* * Check for delete and recreate. 
*/ if (ioctl(fd, SIOCGLIFINDEX, &lifr) == -1) { - dhcpmsg(MSG_ERR, "verify_lif: SIOCGLIFINDEX failed on %s", - lif->lif_name); + if (errno != ENXIO) { + dhcpmsg(MSG_ERR, "verify_lif: SIOCGLIFINDEX failed " + "on %s", lif->lif_name); + } return (B_FALSE); } - if (lifr.lifr_index != lif->lif_pif->pif_index) { + if (lifr.lifr_index != pif->pif_index) { dhcpmsg(MSG_DEBUG, "verify_lif: ifindex on %s changed: %u to %u", - lif->lif_name, lif->lif_pif->pif_index, lifr.lifr_index); + lif->lif_name, pif->pif_index, lifr.lifr_index); return (B_FALSE); } + if (pif->pif_under_ipmp) { + (void) strlcpy(lifr.lifr_name, pif->pif_grifname, LIFNAMSIZ); + + if (ioctl(fd, SIOCGLIFINDEX, &lifr) == -1) { + if (errno != ENXIO) { + dhcpmsg(MSG_ERR, "verify_lif: SIOCGLIFINDEX " + "failed on %s", lifr.lifr_name); + } + return (B_FALSE); + } + + if (lifr.lifr_index != pif->pif_grindex) { + dhcpmsg(MSG_DEBUG, "verify_lif: IPMP group ifindex " + "on %s changed: %u to %u", lifr.lifr_name, + pif->pif_grindex, lifr.lifr_index); + return (B_FALSE); + } + } + /* * If the IP address, netmask, or broadcast address have changed, or * the interface has been unplumbed, then we act like there has been an @@ -934,6 +982,13 @@ plumb_lif(dhcp_pif_t *pif, const in6_addr_t *addr) lifr.lifr_name); goto failure; } + + /* + * See comment in set_lif_dhcp(). + */ + if (pif->pif_under_ipmp && !(lifr.lifr_flags & IFF_NOFAILOVER)) + lifr.lifr_flags |= IFF_NOFAILOVER | IFF_DEPRECATED; + lifr.lifr_flags |= IFF_UP | IFF_DHCPRUNNING; if (ioctl(v6_sock_fd, SIOCSLIFFLAGS, &lifr) == -1) { dhcpmsg(MSG_ERR, "plumb_lif: SIOCSLIFFLAGS %s", @@ -1060,8 +1115,9 @@ set_lif_dhcp(dhcp_lif_t *lif, boolean_t is_adopting) int fd; int err; struct lifreq lifr; + dhcp_pif_t *pif = lif->lif_pif; - fd = lif->lif_pif->pif_isv6 ? v6_sock_fd : v4_sock_fd; + fd = pif->pif_isv6 ? 
v6_sock_fd : v4_sock_fd; (void) strlcpy(lifr.lifr_name, lif->lif_name, LIFNAMSIZ); @@ -1098,6 +1154,17 @@ set_lif_dhcp(dhcp_lif_t *lif, boolean_t is_adopting) "set on %s", lif->lif_name); } } else { + /* + * If the lif is on an interface under IPMP, IFF_NOFAILOVER + * must be set or the kernel will prevent us from setting + * IFF_DHCPRUNNING (since the subsequent IFF_UP would lead to + * migration). We set IFF_DEPRECATED too since the kernel + * will set it automatically when setting IFF_NOFAILOVER, + * causing our lif_flags value to grow stale. + */ + if (pif->pif_under_ipmp && !(lifr.lifr_flags & IFF_NOFAILOVER)) + lifr.lifr_flags |= IFF_NOFAILOVER | IFF_DEPRECATED; + lifr.lifr_flags |= IFF_DHCPRUNNING; if (ioctl(fd, SIOCSLIFFLAGS, &lifr) == -1) { dhcpmsg(MSG_ERR, "set_lif_dhcp: SIOCSLIFFLAGS for %s", @@ -1207,6 +1274,13 @@ clear_lif_deprecated(dhcp_lif_t *lif) return (B_FALSE); } + /* + * Don't try to clear IFF_DEPRECATED if this is a test address, + * since IPMP's use of IFF_DEPRECATED is not compatible with ours. + */ + if (lifr.lifr_flags & IFF_NOFAILOVER) + return (B_TRUE); + if (!(lifr.lifr_flags & IFF_DEPRECATED)) return (B_TRUE); @@ -1226,16 +1300,19 @@ clear_lif_deprecated(dhcp_lif_t *lif) * * input: dhcp_lif_t *: the logical interface to operate on * in_addr_t: the address the socket will be bound to (in hbo) + * boolean_t: B_TRUE if the address should be brought up (if needed) * output: boolean_t: B_TRUE if the socket was opened successfully. 
*/ boolean_t -open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo) +open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo, boolean_t bringup) { const char *errmsg; struct lifreq lifr; int on = 1; uchar_t ttl = 255; + uint32_t ifindex; + dhcp_pif_t *pif = lif->lif_pif; if (lif->lif_sock_ip_fd != -1) { dhcpmsg(MSG_WARNING, "open_ip_lif: socket already open on %s", @@ -1270,7 +1347,7 @@ open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo) } if (setsockopt(lif->lif_sock_ip_fd, IPPROTO_IP, IP_DHCPINIT_IF, - &lif->lif_pif->pif_index, sizeof (int)) == -1) { + &pif->pif_index, sizeof (int)) == -1) { errmsg = "cannot set IP_DHCPINIT_IF"; goto failure; } @@ -1288,23 +1365,40 @@ open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo) goto failure; } - if (setsockopt(lif->lif_sock_ip_fd, IPPROTO_IP, IP_BOUND_IF, - &lif->lif_pif->pif_index, sizeof (int)) == -1) { + ifindex = pif->pif_under_ipmp ? pif->pif_grindex : pif->pif_index; + if (setsockopt(lif->lif_sock_ip_fd, IPPROTO_IP, IP_BOUND_IF, &ifindex, + sizeof (int)) == -1) { errmsg = "cannot set IP_BOUND_IF"; goto failure; } - /* - * Make sure at least one lif on the interface we used in IP_BOUND_IF - * is IFF_UP so that we can send and receive IP packets. - */ (void) strlcpy(lifr.lifr_name, lif->lif_name, LIFNAMSIZ); if (ioctl(v4_sock_fd, SIOCGLIFFLAGS, &lifr) == -1) { errmsg = "cannot get interface flags"; goto failure; } - if (!(lifr.lifr_flags & IFF_UP)) { + /* + * If the lif is part of an interface under IPMP, IFF_NOFAILOVER must + * be set or the kernel will prevent us from setting IFF_DHCPRUNNING + * (since the subsequent IFF_UP would lead to migration). We set + * IFF_DEPRECATED too since the kernel will set it automatically when + * setting IFF_NOFAILOVER, causing our lif_flags value to grow stale. 
+ */ + if (pif->pif_under_ipmp && !(lifr.lifr_flags & IFF_NOFAILOVER)) { + lifr.lifr_flags |= IFF_NOFAILOVER | IFF_DEPRECATED; + if (ioctl(v4_sock_fd, SIOCSLIFFLAGS, &lifr) == -1) { + errmsg = "cannot set IFF_NOFAILOVER"; + goto failure; + } + } + lif->lif_flags = lifr.lifr_flags; + + /* + * If this is initial bringup, make sure the address we're acquiring a + * lease on is IFF_UP. + */ + if (bringup && !(lifr.lifr_flags & IFF_UP)) { /* * Start from a clean slate. */ @@ -1330,6 +1424,30 @@ open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo) ((struct sockaddr_in *)&lifr.lifr_addr)->sin_addr.s_addr; } + /* + * Usually, bringing up the address we're acquiring a lease on is + * sufficient to allow packets to be sent and received via the + * IP_BOUND_IF we did earlier. However, if we're acquiring a lease on + * an underlying IPMP interface, the group interface will be used for + * sending and receiving IP packets via IP_BOUND_IF. Thus, ensure at + * least one address on the group interface is IFF_UP. + */ + if (bringup && pif->pif_under_ipmp) { + (void) strlcpy(lifr.lifr_name, pif->pif_grifname, LIFNAMSIZ); + if (ioctl(v4_sock_fd, SIOCGLIFFLAGS, &lifr) == -1) { + errmsg = "cannot get IPMP group interface flags"; + goto failure; + } + + if (!(lifr.lifr_flags & IFF_UP)) { + lifr.lifr_flags |= IFF_UP; + if (ioctl(v4_sock_fd, SIOCSLIFFLAGS, &lifr) == -1) { + errmsg = "cannot bring up IPMP group interface"; + goto failure; + } + } + } + lif->lif_packet_id = iu_register_event(eh, lif->lif_sock_ip_fd, POLLIN, dhcp_packet_lif, lif); if (lif->lif_packet_id == -1) { diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h index a59e3ea68d..46cf30bedb 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ #ifndef INTERFACE_H #define INTERFACE_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Interface.[ch] encapsulate all of the agent's knowledge of network * interfaces from the DHCP agent's perspective. See interface.c for @@ -66,6 +64,9 @@ struct dhcp_pif_s { boolean_t pif_running; /* interface is running */ uint_t pif_hold_count; /* reference count */ char pif_name[LIFNAMSIZ]; + char pif_grifname[LIFNAMSIZ]; + uint32_t pif_grindex; /* interface index for pif_grifname */ + boolean_t pif_under_ipmp; /* is an ipmp underlying interface */ }; struct dhcp_lif_s { @@ -182,7 +183,7 @@ dhcp_lif_t *attach_lif(const char *, boolean_t, int *); int set_lif_dhcp(dhcp_lif_t *, boolean_t); void set_lif_deprecated(dhcp_lif_t *); boolean_t clear_lif_deprecated(dhcp_lif_t *); -boolean_t open_ip_lif(dhcp_lif_t *, in_addr_t); +boolean_t open_ip_lif(dhcp_lif_t *, in_addr_t, boolean_t); void close_ip_lif(dhcp_lif_t *); void lif_mark_decline(dhcp_lif_t *, const char *); boolean_t schedule_lif_timer(dhcp_lif_t *, dhcp_timer_t *, diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c index 8a32b55ea5..a763530436 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <string.h> #include <sys/types.h> #include <stdlib.h> @@ -970,7 +968,10 @@ send_pkt_internal(dhcp_smach_t *dsmp) ipi6->ipi6_addr = lif->lif_v6addr; else ipi6->ipi6_addr = my_in6addr_any; - ipi6->ipi6_ifindex = lif->lif_pif->pif_index; + if (lif->lif_pif->pif_under_ipmp) + ipi6->ipi6_ifindex = lif->lif_pif->pif_grindex; + else + ipi6->ipi6_ifindex = lif->lif_pif->pif_index; cmsg->cmsg_len = (char *)(ipi6 + 1) - (char *)cmsg; /* diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c index a8c05de986..78da07aebf 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c @@ -19,14 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * REQUESTING state of the client state machine. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdlib.h> #include <string.h> #include <search.h> @@ -1008,7 +1006,8 @@ dhcp_acknak_global(iu_eh_t *ehp, int fd, short events, iu_event_id_t id, for (dsmp = lookup_smach_by_xid(xid, NULL, isv6); dsmp != NULL; dsmp = lookup_smach_by_xid(xid, dsmp, isv6)) { pif = dsmp->dsm_lif->lif_pif; - if (pif->pif_index == plp->ifindex) + if (pif->pif_index == plp->ifindex || + pif->pif_under_ipmp && pif->pif_grindex == plp->ifindex) break; } diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c index 9ae7fd7aba..852b428551 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * This module contains core functions for managing DHCP state machine * instances. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdlib.h> #include <search.h> #include <string.h> @@ -151,7 +149,7 @@ insert_smach(dhcp_lif_t *lif, int *error) /* * With IPv4 DHCP, we use a socket per lif. */ - if (!open_ip_lif(lif, INADDR_ANY)) { + if (!open_ip_lif(lif, INADDR_ANY, B_TRUE)) { dhcpmsg(MSG_ERR, "unable to open socket for %s", lif->lif_name); /* This will also dispose of the LIF */ @@ -696,14 +694,15 @@ set_smach_state(dhcp_smach_t *dsmp, DHCPSTATE state) if (is_bound_state(dsmp->dsm_state)) { if (!is_bound_state(state)) { close_ip_lif(lif); - if (!open_ip_lif(lif, INADDR_ANY)) + if (!open_ip_lif(lif, INADDR_ANY, + B_FALSE)) return (B_FALSE); } } else { if (is_bound_state(state)) { close_ip_lif(lif); if (!open_ip_lif(lif, - ntohl(lif->lif_addr))) + ntohl(lif->lif_addr), B_FALSE)) return (B_FALSE); } } @@ -952,11 +951,14 @@ no_specified_id: * unable to parse it. We need to determine if a Client ID is required * and, if so, generate one. * - * If it's IPv4 and not a logical interface, then we need to preserve - * backward-compatibility by avoiding new-fangled DUID/IAID - * construction. + * If it's IPv4, not in an IPMP group, and not a logical interface, + * then we need to preserve backward-compatibility by avoiding + * new-fangled DUID/IAID construction. (Note: even for IPMP test + * addresses, we construct a DUID/IAID since we may renew a lease for + * an IPMP test address on any functioning IP interface in the group.) */ - if (!pif->pif_isv6 && strchr(dsmp->dsm_name, ':') == NULL) { + if (!pif->pif_isv6 && pif->pif_grifname[0] == '\0' && + strchr(dsmp->dsm_name, ':') == NULL) { if (pif->pif_hwtype == ARPHRD_IB) { /* * This comes from the DHCP over IPoIB specification. 
diff --git a/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c b/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c index 47e1202b32..d73722cc55 100644 --- a/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c +++ b/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -8,8 +8,6 @@ * specifies the terms and conditions for redistribution. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Ifparse splits up an ifconfig command line, and was written for use * with the networking boot scripts; see $SRC/cmd/svc/shell/net_include.sh @@ -184,6 +182,7 @@ struct cmd { { "auto-revarp", 0, AF_INET, PARSEFIXED}, { "plumb", 0, AF_ANY, PARSENOW }, { "unplumb", 0, AF_ANY, PARSENOW }, + { "ipmp", 0, AF_ANY, PARSELOG0 }, { "subnet", NEXTARG, AF_ANY, 0 }, { "token", NEXTARG, AF_INET6, PARSELOG0 }, { "tsrc", NEXTARG, AF_ANY, PARSELOG0 }, diff --git a/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c b/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c index b9a02b54e7..2d115e221b 100644 --- a/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c +++ b/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -29,8 +29,6 @@ * MROUTING Revision 3.5 */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * simple netstat based on snmp/mib-2 interface to the TCP/IP stack * @@ -221,6 +219,7 @@ static char *plural(int n); static char *pluraly(int n); static char *plurales(int n); static void process_filter(char *arg); +static char *ifindex2str(uint_t, char *); static boolean_t family_selected(int family); static void usage(char *); @@ -680,8 +679,14 @@ mibget(int sd) tor->OPT_offset = sizeof (struct T_optmgmt_req); tor->OPT_length = sizeof (struct opthdr); tor->MGMT_flags = T_CURRENT; + + + /* + * Note: we use the special level value below so that IP will return + * us information concerning IRE_MARK_TESTHIDDEN routes. + */ req = (struct opthdr *)&tor[1]; - req->level = MIB2_IP; /* any MIB2_xxx value ok here */ + req->level = EXPER_IP_AND_TESTHIDDEN; req->name = 0; req->len = 0; @@ -712,7 +717,7 @@ mibget(int sd) stderr); i = 0; for (last_item = first_item; last_item; - last_item = last_item->next_item) + last_item = last_item->next_item) (void) printf("%d %4d %5d %d\n", ++i, last_item->group, @@ -1707,19 +1712,19 @@ mib_get_constants(mib_item_t *item) ipRouteAttributeSize = ip->ipRouteAttributeSize; transportMLPSize = ip->transportMLPSize; assert(IS_P2ALIGNED(ipAddrEntrySize, - sizeof (mib2_ipAddrEntry_t *)) && - IS_P2ALIGNED(ipRouteEntrySize, - sizeof (mib2_ipRouteEntry_t *)) && - IS_P2ALIGNED(ipNetToMediaEntrySize, - sizeof (mib2_ipNetToMediaEntry_t *)) && - IS_P2ALIGNED(ipMemberEntrySize, - sizeof (ip_member_t *)) && - IS_P2ALIGNED(ipGroupSourceEntrySize, - sizeof (ip_grpsrc_t *)) && - IS_P2ALIGNED(ipRouteAttributeSize, - sizeof (mib2_ipAttributeEntry_t *)) && - IS_P2ALIGNED(transportMLPSize, - sizeof (mib2_transportMLPEntry_t *))); + sizeof (mib2_ipAddrEntry_t *))); + assert(IS_P2ALIGNED(ipRouteEntrySize, + sizeof (mib2_ipRouteEntry_t *))); + assert(IS_P2ALIGNED(ipNetToMediaEntrySize, + sizeof (mib2_ipNetToMediaEntry_t *))); + assert(IS_P2ALIGNED(ipMemberEntrySize, + sizeof 
(ip_member_t *))); + assert(IS_P2ALIGNED(ipGroupSourceEntrySize, + sizeof (ip_grpsrc_t *))); + assert(IS_P2ALIGNED(ipRouteAttributeSize, + sizeof (mib2_ipAttributeEntry_t *))); + assert(IS_P2ALIGNED(transportMLPSize, + sizeof (mib2_transportMLPEntry_t *))); break; } case EXPER_DVMRP: { @@ -1728,8 +1733,9 @@ mib_get_constants(mib_item_t *item) vifctlSize = mrts->mrts_vifctlSize; mfcctlSize = mrts->mrts_mfcctlSize; assert(IS_P2ALIGNED(vifctlSize, - sizeof (struct vifclt *)) && - IS_P2ALIGNED(mfcctlSize, sizeof (struct mfcctl *))); + sizeof (struct vifclt *))); + assert(IS_P2ALIGNED(mfcctlSize, + sizeof (struct mfcctl *))); break; } case MIB2_IP6: { @@ -1745,17 +1751,17 @@ mib_get_constants(mib_item_t *item) ipv6GroupSourceEntrySize = ip6->ipv6GroupSourceEntrySize; assert(IS_P2ALIGNED(ipv6IfStatsEntrySize, - sizeof (mib2_ipv6IfStatsEntry_t *)) && - IS_P2ALIGNED(ipv6AddrEntrySize, - sizeof (mib2_ipv6AddrEntry_t *)) && - IS_P2ALIGNED(ipv6RouteEntrySize, - sizeof (mib2_ipv6RouteEntry_t *)) && - IS_P2ALIGNED(ipv6NetToMediaEntrySize, - sizeof (mib2_ipv6NetToMediaEntry_t *)) && - IS_P2ALIGNED(ipv6MemberEntrySize, - sizeof (ipv6_member_t *)) && - IS_P2ALIGNED(ipv6GroupSourceEntrySize, - sizeof (ipv6_grpsrc_t *))); + sizeof (mib2_ipv6IfStatsEntry_t *))); + assert(IS_P2ALIGNED(ipv6AddrEntrySize, + sizeof (mib2_ipv6AddrEntry_t *))); + assert(IS_P2ALIGNED(ipv6RouteEntrySize, + sizeof (mib2_ipv6RouteEntry_t *))); + assert(IS_P2ALIGNED(ipv6NetToMediaEntrySize, + sizeof (mib2_ipv6NetToMediaEntry_t *))); + assert(IS_P2ALIGNED(ipv6MemberEntrySize, + sizeof (ipv6_member_t *))); + assert(IS_P2ALIGNED(ipv6GroupSourceEntrySize, + sizeof (ipv6_grpsrc_t *))); break; } case MIB2_ICMP6: { @@ -1774,9 +1780,9 @@ mib_get_constants(mib_item_t *item) tcpConnEntrySize = tcp->tcpConnTableSize; tcp6ConnEntrySize = tcp->tcp6ConnTableSize; assert(IS_P2ALIGNED(tcpConnEntrySize, - sizeof (mib2_tcpConnEntry_t *)) && - IS_P2ALIGNED(tcp6ConnEntrySize, - sizeof (mib2_tcp6ConnEntry_t *))); + sizeof 
(mib2_tcpConnEntry_t *))); + assert(IS_P2ALIGNED(tcp6ConnEntrySize, + sizeof (mib2_tcp6ConnEntry_t *))); break; } case MIB2_UDP: { @@ -1785,9 +1791,9 @@ mib_get_constants(mib_item_t *item) udpEntrySize = udp->udpEntrySize; udp6EntrySize = udp->udp6EntrySize; assert(IS_P2ALIGNED(udpEntrySize, - sizeof (mib2_udpEntry_t *)) && - IS_P2ALIGNED(udp6EntrySize, - sizeof (mib2_udp6Entry_t *))); + sizeof (mib2_udpEntry_t *))); + assert(IS_P2ALIGNED(udp6EntrySize, + sizeof (mib2_udp6Entry_t *))); break; } case MIB2_SCTP: { @@ -1843,7 +1849,6 @@ stat_report(mib_item_t *item) { int jtemp = 0; char ifname[LIFNAMSIZ + 1]; - char *ifnamep; /* 'for' loop 1: */ for (; item; item = item->next_item) { @@ -1891,12 +1896,10 @@ stat_report(mib_item_t *item) bzero(&sum6, sizeof (sum6)); /* 'for' loop 2a: */ for (ip6 = (mib2_ipv6IfStatsEntry_t *)item->valp; - (char *)ip6 < (char *)item->valp - + item->length; + (char *)ip6 < (char *)item->valp + item->length; /* LINTED: (note 1) */ ip6 = (mib2_ipv6IfStatsEntry_t *)((char *)ip6 + ipv6IfStatsEntrySize)) { - if (ip6->ipv6IfIndex == 0) { /* * The "unknown interface" ip6 @@ -1905,19 +1908,10 @@ stat_report(mib_item_t *item) sum_ip6_stats(ip6, &sum6); continue; /* 'for' loop 2a */ } - ifnamep = if_indextoname( - ip6->ipv6IfIndex, - ifname); - if (ifnamep == NULL) { - (void) printf( - "Invalid ifindex %d\n", - ip6->ipv6IfIndex); - continue; /* 'for' loop 2a */ - } - if (Aflag) { (void) printf("\nIPv6 for %s\n", - ifnamep); + ifindex2str(ip6->ipv6IfIndex, + ifname)); print_ip6_stats(ip6); } sum_ip6_stats(ip6, &sum6); @@ -1935,15 +1929,10 @@ stat_report(mib_item_t *item) break; bzero(&sum6, sizeof (sum6)); /* 'for' loop 2b: */ - for (icmp6 = - (mib2_ipv6IfIcmpEntry_t *)item->valp; - (char *)icmp6 < (char *)item->valp - + item->length; - icmp6 = - /* LINTED: (note 1) */ - (mib2_ipv6IfIcmpEntry_t *)((char *)icmp6 - + ipv6IfIcmpEntrySize)) { - + for (icmp6 = (mib2_ipv6IfIcmpEntry_t *)item->valp; + (char *)icmp6 < (char *)item->valp + item->length; + 
icmp6 = (void *)((char *)icmp6 + + ipv6IfIcmpEntrySize)) { if (icmp6->ipv6IfIcmpIfIndex == 0) { /* * The "unknown interface" icmp6 @@ -1952,19 +1941,10 @@ stat_report(mib_item_t *item) sum_icmp6_stats(icmp6, &sum6); continue; /* 'for' loop 2b: */ } - ifnamep = if_indextoname( - icmp6->ipv6IfIcmpIfIndex, ifname); - if (ifnamep == NULL) { - (void) printf( - "Invalid ifindex %d\n", - icmp6->ipv6IfIcmpIfIndex); - continue; /* 'for' loop 2b: */ - } - if (Aflag) { - (void) printf( - "\nICMPv6 for %s\n", - ifnamep); + (void) printf("\nICMPv6 for %s\n", + ifindex2str( + icmp6->ipv6IfIcmpIfIndex, ifname)); print_icmp6_stats(icmp6); } sum_icmp6_stats(icmp6, &sum6); @@ -2369,51 +2349,49 @@ print_mrt_stats(struct mrtstat *mrts) { (void) puts("DVMRP multicast routing:"); (void) printf(" %10u hit%s - kernel forwarding cache hits\n", - mrts->mrts_mfc_hits, PLURAL(mrts->mrts_mfc_hits)); + mrts->mrts_mfc_hits, PLURAL(mrts->mrts_mfc_hits)); (void) printf(" %10u miss%s - kernel forwarding cache misses\n", - mrts->mrts_mfc_misses, PLURALES(mrts->mrts_mfc_misses)); + mrts->mrts_mfc_misses, PLURALES(mrts->mrts_mfc_misses)); (void) printf(" %10u packet%s potentially forwarded\n", - mrts->mrts_fwd_in, PLURAL(mrts->mrts_fwd_in)); + mrts->mrts_fwd_in, PLURAL(mrts->mrts_fwd_in)); (void) printf(" %10u packet%s actually sent out\n", - mrts->mrts_fwd_out, PLURAL(mrts->mrts_fwd_out)); + mrts->mrts_fwd_out, PLURAL(mrts->mrts_fwd_out)); (void) printf(" %10u upcall%s - upcalls made to mrouted\n", - mrts->mrts_upcalls, PLURAL(mrts->mrts_upcalls)); + mrts->mrts_upcalls, PLURAL(mrts->mrts_upcalls)); (void) printf(" %10u packet%s not sent out due to lack of resources\n", - mrts->mrts_fwd_drop, PLURAL(mrts->mrts_fwd_drop)); + mrts->mrts_fwd_drop, PLURAL(mrts->mrts_fwd_drop)); (void) printf(" %10u datagram%s with malformed tunnel options\n", - mrts->mrts_bad_tunnel, PLURAL(mrts->mrts_bad_tunnel)); + mrts->mrts_bad_tunnel, PLURAL(mrts->mrts_bad_tunnel)); (void) printf(" %10u datagram%s with no room for 
tunnel options\n", - mrts->mrts_cant_tunnel, PLURAL(mrts->mrts_cant_tunnel)); + mrts->mrts_cant_tunnel, PLURAL(mrts->mrts_cant_tunnel)); (void) printf(" %10u datagram%s arrived on wrong interface\n", - mrts->mrts_wrong_if, PLURAL(mrts->mrts_wrong_if)); + mrts->mrts_wrong_if, PLURAL(mrts->mrts_wrong_if)); (void) printf(" %10u datagram%s dropped due to upcall Q overflow\n", - mrts->mrts_upq_ovflw, PLURAL(mrts->mrts_upq_ovflw)); + mrts->mrts_upq_ovflw, PLURAL(mrts->mrts_upq_ovflw)); (void) printf(" %10u datagram%s cleaned up by the cache\n", - mrts->mrts_cache_cleanups, PLURAL(mrts->mrts_cache_cleanups)); + mrts->mrts_cache_cleanups, PLURAL(mrts->mrts_cache_cleanups)); (void) printf(" %10u datagram%s dropped selectively by ratelimiter\n", - mrts->mrts_drop_sel, PLURAL(mrts->mrts_drop_sel)); + mrts->mrts_drop_sel, PLURAL(mrts->mrts_drop_sel)); (void) printf(" %10u datagram%s dropped - bucket Q overflow\n", - mrts->mrts_q_overflow, PLURAL(mrts->mrts_q_overflow)); + mrts->mrts_q_overflow, PLURAL(mrts->mrts_q_overflow)); (void) printf(" %10u datagram%s dropped - larger than bkt size\n", - mrts->mrts_pkt2large, PLURAL(mrts->mrts_pkt2large)); + mrts->mrts_pkt2large, PLURAL(mrts->mrts_pkt2large)); (void) printf("\nPIM multicast routing:\n"); (void) printf(" %10u datagram%s dropped - bad version number\n", - mrts->mrts_pim_badversion, PLURAL(mrts->mrts_pim_badversion)); + mrts->mrts_pim_badversion, PLURAL(mrts->mrts_pim_badversion)); (void) printf(" %10u datagram%s dropped - bad checksum\n", - mrts->mrts_pim_rcv_badcsum, PLURAL(mrts->mrts_pim_rcv_badcsum)); + mrts->mrts_pim_rcv_badcsum, PLURAL(mrts->mrts_pim_rcv_badcsum)); (void) printf(" %10u datagram%s dropped - bad register packets\n", - mrts->mrts_pim_badregisters, - PLURAL(mrts->mrts_pim_badregisters)); + mrts->mrts_pim_badregisters, PLURAL(mrts->mrts_pim_badregisters)); (void) printf( - " %10u datagram%s potentially forwarded - register packets\n", - mrts->mrts_pim_regforwards, PLURAL(mrts->mrts_pim_regforwards)); + " 
%10u datagram%s potentially forwarded - register packets\n", + mrts->mrts_pim_regforwards, PLURAL(mrts->mrts_pim_regforwards)); (void) printf(" %10u datagram%s dropped - register send drops\n", - mrts->mrts_pim_regsend_drops, - PLURAL(mrts->mrts_pim_regsend_drops)); + mrts->mrts_pim_regsend_drops, PLURAL(mrts->mrts_pim_regsend_drops)); (void) printf(" %10u datagram%s dropped - packet malformed\n", - mrts->mrts_pim_malformed, PLURAL(mrts->mrts_pim_malformed)); + mrts->mrts_pim_malformed, PLURAL(mrts->mrts_pim_malformed)); (void) printf(" %10u datagram%s dropped - no memory to forward\n", - mrts->mrts_pim_nomemory, PLURAL(mrts->mrts_pim_nomemory)); + mrts->mrts_pim_nomemory, PLURAL(mrts->mrts_pim_nomemory)); } static void @@ -2674,7 +2652,7 @@ if_report(mib_item_t *item, char *matchname, "Ierrs", "Opkts", "Oerrs", "Collis", "Queue"); - first = B_FALSE; + first = B_FALSE; } if_report_ip4(ap, ifname, logintname, &stat, B_TRUE); @@ -2717,7 +2695,7 @@ if_report(mib_item_t *item, char *matchname, + item->length; ap++) { (void) octetstr(&ap->ipAdEntIfIndex, - 'a', ifname, sizeof (ifname)); + 'a', ifname, sizeof (ifname)); (void) strtok(ifname, ":"); if (matchname) { @@ -3387,7 +3365,7 @@ dhcp_walk_interfaces(uint_t flags_on, uint_t flags_off, int af, */ (void) memset(&lifn, 0, sizeof (lifn)); lifn.lifn_family = af; - lifn.lifn_flags = LIFC_ALLZONES | LIFC_NOXMIT; + lifn.lifn_flags = LIFC_ALLZONES | LIFC_NOXMIT | LIFC_UNDER_IPMP; if (ioctl(sock_fd, SIOCGLIFNUM, &lifn) == -1) n_ifs = LIFN_GUARD_VALUE; else @@ -3471,7 +3449,6 @@ group_report(mib_item_t *item) ip_grpsrc_t *ips; ipv6_member_t *ipmp6; ipv6_grpsrc_t *ips6; - char *ifnamep; boolean_t first, first_src; /* 'for' loop 1: */ @@ -3604,7 +3581,7 @@ group_report(mib_item_t *item) (char *)ipmp6 < (char *)v6grp->valp + v6grp->length; /* LINTED: (note 1) */ ipmp6 = (ipv6_member_t *)((char *)ipmp6 + - ipv6MemberEntrySize)) { + ipv6MemberEntrySize)) { if (first) { (void) puts("Group Memberships: " "IPv6"); @@ -3615,15 +3592,8 
@@ group_report(mib_item_t *item) first = B_FALSE; } - ifnamep = if_indextoname( - ipmp6->ipv6GroupMemberIfIndex, ifname); - if (ifnamep == NULL) { - (void) printf("Invalid ifindex %d\n", - ipmp6->ipv6GroupMemberIfIndex); - continue; - } (void) printf("%-5s %-27s %5u\n", - ifnamep, + ifindex2str(ipmp6->ipv6GroupMemberIfIndex, ifname), pr_addr6(&ipmp6->ipv6GroupMemberAddress, abuf, sizeof (abuf)), ipmp6->ipv6GroupMemberRefCnt); @@ -3784,7 +3754,6 @@ ndp_report(mib_item_t *item) char xbuf[STR_EXPAND * OCTET_LENGTH + 1]; mib2_ipv6NetToMediaEntry_t *np6; char ifname[LIFNAMSIZ + 1]; - char *ifnamep; boolean_t first; if (!(family_selected(AF_INET6))) @@ -3820,13 +3789,6 @@ ndp_report(mib_item_t *item) first = B_FALSE; } - ifnamep = if_indextoname(np6->ipv6NetToMediaIfIndex, - ifname); - if (ifnamep == NULL) { - (void) printf("Invalid ifindex %d\n", - np6->ipv6NetToMediaIfIndex); - continue; /* 'for' loop 2 */ - } switch (np6->ipv6NetToMediaState) { case ND_INCOMPLETE: state = "INCOMPLETE"; @@ -3865,7 +3827,7 @@ ndp_report(mib_item_t *item) break; } (void) printf("%-5s %-17s %-7s %-12s %-27s\n", - ifnamep, + ifindex2str(np6->ipv6NetToMediaIfIndex, ifname), octetstr(&np6->ipv6NetToMediaPhysAddress, 'h', xbuf, sizeof (xbuf)), type, @@ -4472,7 +4434,7 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, (void) printf("%-27s %-27s %-5s %5u%c %5u %3u " "%-5s %6u %6u %s\n", pr_prefix6(&rp6->ipv6RouteDest, - rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)), + rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)), IN6_IS_ADDR_UNSPECIFIED(&rp6->ipv6RouteNextHop) ? " --" : pr_addr6(&rp6->ipv6RouteNextHop, gwbuf, sizeof (gwbuf)), @@ -4489,7 +4451,7 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, } else { (void) printf("%-27s %-27s %-5s %3u %7u %-5s %s\n", pr_prefix6(&rp6->ipv6RouteDest, - rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)), + rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)), IN6_IS_ADDR_UNSPECIFIED(&rp6->ipv6RouteNextHop) ? 
" --" : pr_addr6(&rp6->ipv6RouteNextHop, gwbuf, sizeof (gwbuf)), @@ -4690,9 +4652,9 @@ tcp_report_item_v4(const mib2_tcpConnEntry_t *tp, boolean_t first, (void) printf("%-20s\n%-20s %5u %08x %08x %5u %08x %08x " "%5u %5u %s\n", pr_ap(tp->tcpConnLocalAddress, - tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)), + tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)), pr_ap(tp->tcpConnRemAddress, - tp->tcpConnRemPort, "tcp", fname, sizeof (fname)), + tp->tcpConnRemPort, "tcp", fname, sizeof (fname)), tp->tcpConnEntryInfo.ce_swnd, tp->tcpConnEntryInfo.ce_snxt, tp->tcpConnEntryInfo.ce_suna, @@ -4710,9 +4672,9 @@ tcp_report_item_v4(const mib2_tcpConnEntry_t *tp, boolean_t first, (void) printf("%-20s %-20s %5u %6d %5u %6d %s\n", pr_ap(tp->tcpConnLocalAddress, - tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)), + tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)), pr_ap(tp->tcpConnRemAddress, - tp->tcpConnRemPort, "tcp", fname, sizeof (fname)), + tp->tcpConnRemPort, "tcp", fname, sizeof (fname)), tp->tcpConnEntryInfo.ce_swnd, (sq >= 0) ? 
sq : 0, tp->tcpConnEntryInfo.ce_rwnd, @@ -4756,9 +4718,9 @@ tcp_report_item_v6(const mib2_tcp6ConnEntry_t *tp6, boolean_t first, (void) printf("%-33s\n%-33s %5u %08x %08x %5u %08x %08x " "%5u %5u %-11s %s\n", pr_ap6(&tp6->tcp6ConnLocalAddress, - tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)), + tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)), pr_ap6(&tp6->tcp6ConnRemAddress, - tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)), + tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)), tp6->tcp6ConnEntryInfo.ce_swnd, tp6->tcp6ConnEntryInfo.ce_snxt, tp6->tcp6ConnEntryInfo.ce_suna, @@ -4777,9 +4739,9 @@ tcp_report_item_v6(const mib2_tcp6ConnEntry_t *tp6, boolean_t first, (void) printf("%-33s %-33s %5u %6d %5u %6d %-11s %s\n", pr_ap6(&tp6->tcp6ConnLocalAddress, - tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)), + tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)), pr_ap6(&tp6->tcp6ConnRemAddress, - tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)), + tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)), tp6->tcp6ConnEntryInfo.ce_swnd, (sq >= 0) ? sq : 0, tp6->tcp6ConnEntryInfo.ce_rwnd, @@ -5112,7 +5074,7 @@ sctp_pr_addr(int type, char *name, int namelen, const in6_addr_t *addr, * displaying. 
*/ switch (type) { - case MIB2_SCTP_ADDR_V4: + case MIB2_SCTP_ADDR_V4: /* v4 */ v6addr = *addr; @@ -5124,7 +5086,7 @@ sctp_pr_addr(int type, char *name, int namelen, const in6_addr_t *addr, } break; - case MIB2_SCTP_ADDR_V6: + case MIB2_SCTP_ADDR_V6: /* v6 */ if (port > 0) { (void) pr_ap6(addr, port, "sctp", name, namelen); @@ -5133,7 +5095,7 @@ sctp_pr_addr(int type, char *name, int namelen, const in6_addr_t *addr, } break; - default: + default: (void) snprintf(name, namelen, "<unknown addr type>"); break; } @@ -5379,7 +5341,7 @@ mrt_report(mib_item_t *item) case EXPER_DVMRP_MRT: if (Dflag) (void) printf("%u records for ipMfcTable:\n", - item->length/sizeof (struct vifctl)); + item->length/sizeof (struct vifctl)); if (item->length/sizeof (struct vifctl) == 0) { (void) puts("\nMulticast Forwarding Cache is " "empty"); @@ -5402,10 +5364,10 @@ mrt_report(mib_item_t *item) abuf, sizeof (abuf))); (void) printf("%-15.15s %6s %3u ", pr_net(mfccp->mfcc_mcastgrp.s_addr, - mfccp->mfcc_mcastgrp.s_addr, - abuf, sizeof (abuf)), + mfccp->mfcc_mcastgrp.s_addr, + abuf, sizeof (abuf)), pktscale((int)mfccp->mfcc_pkt_cnt), - mfccp->mfcc_parent); + mfccp->mfcc_parent); for (vifi = 0; vifi < MAXVIFS; ++vifi) { if (mfccp->mfcc_ttls[vifi]) { @@ -5468,7 +5430,7 @@ kmem_cache_stats(char *title, char *name, int prefix, int64_t *total_bytes) strncmp(ksp->ks_name, "streams_dblk", 12) == 0) { (void) safe_kstat_read(kc, ksp, NULL); total_buf_inuse -= - kstat_named_value(ksp, "buf_constructed"); + kstat_named_value(ksp, "buf_constructed"); continue; /* 'for' loop 1 */ } @@ -5501,7 +5463,7 @@ kmem_cache_stats(char *title, char *name, int prefix, int64_t *total_bytes) if (buf_size == 0) { (void) printf("%-22s [couldn't find statistics for %s]\n", - title, name); + title, name); return; } @@ -5511,7 +5473,7 @@ kmem_cache_stats(char *title, char *name, int prefix, int64_t *total_bytes) (void) snprintf(buf, sizeof (buf), "%s", title); (void) printf("%-22s %6d %9d %11lld %11d\n", buf, - 
total_buf_inuse, total_buf_max, total_alloc, total_alloc_fail); + total_buf_inuse, total_buf_max, total_alloc, total_alloc_fail); } static void @@ -5534,7 +5496,7 @@ m_report(void) kmem_cache_stats("qband", "qband_cache", 0, &total_bytes); (void) printf("\n%lld Kbytes allocated for streams data\n", - total_bytes / 1024); + total_bytes / 1024); (void) putchar('\n'); (void) fflush(stdout); @@ -5967,7 +5929,7 @@ portname(uint_t port, char *proto, char *dst, uint_t dstlen) sp = getservbyport(htons(port), proto); if (sp || port == 0) (void) snprintf(dst, dstlen, "%.*s", MAXHOSTNAMELEN, - sp ? sp->s_name : "*"); + sp ? sp->s_name : "*"); else (void) snprintf(dst, dstlen, "%d", port); dst[dstlen - 1] = 0; @@ -6161,8 +6123,8 @@ process_filter(char *arg) */ if (hp->h_addr_list[0] != NULL && /* LINTED: (note 1) */ - IN6_IS_ADDR_V4MAPPED((in6_addr_t - *)hp->h_addr_list[0])) { + IN6_IS_ADDR_V4MAPPED((in6_addr_t *) + hp->h_addr_list[0])) { maxv = IP_ABITS; } else { maxv = IPV6_ABITS; @@ -6226,6 +6188,21 @@ family_selected(int family) } /* + * Convert the interface index to a string using the buffer `ifname', which + * must be at least LIFNAMSIZ bytes. We first try to map it to name. If that + * fails (e.g., because we're inside a zone and it does not have access to + * interface for the index in question), just return "if#<num>". + */ +static char * +ifindex2str(uint_t ifindex, char *ifname) +{ + if (if_indextoname(ifindex, ifname) == NULL) + (void) snprintf(ifname, LIFNAMSIZ, "if#%d", ifindex); + + return (ifname); +} + +/* * print the usage line */ static void diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile index f0c4c03250..f3ce9fae4b 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile @@ -19,51 +19,58 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
# Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# - -PROG = in.mpathd -OBJS = mpd_tables.o mpd_main.o mpd_probe.o -SRCS = $(OBJS:%.o=%.c) -DEFAULTFILES = mpathd.dfl +PROG = in.mpathd +ROOTFS_PROG = $(PROG) +OBJS = mpd_tables.o mpd_main.o mpd_probe.o +SRCS = $(OBJS:%.o=%.c) +DEFAULTFILES = mpathd.dfl include ../../../Makefile.cmd -POFILE = $(PROG).po -POFILES = $(SRCS:%.c=%.po) +ROOTCMDDIR = $(ROOT)/lib/inet + +POFILE = $(PROG).po +POFILES = $(SRCS:%.c=%.po) -C99MODE= $(C99_ENABLE) +C99MODE = $(C99_ENABLE) # # We need access to the ancillary data features which are only available # via the SUS standards. Further, C99 support requires SUSv3 or higher. # CPPFLAGS += -D_XOPEN_SOURCE=600 -D__EXTENSIONS__ -LDLIBS += -lsocket -lnsl -lsysevent -lnvpair -lipmp -lc +LDLIBS += -lsocket -lnsl -lsysevent -lnvpair -lipmp -linetutil -ldlpi +LINTFLAGS += -erroff=E_INCONS_ARG_DECL2 -erroff=E_INCONS_ARG_USED2 -LINTFLAGS += -erroff=E_FUNC_DECL_VAR_ARG2 -erroff=E_INCONS_VAL_TYPE_DECL2 \ - -erroff=E_FUNC_USED_VAR_ARG2 -erroff=E_INCONS_ARG_DECL2 \ - -erroff=E_NAME_USED_NOT_DEF2 -erroff=E_INCONS_ARG_USED2 \ - -errtags=yes +# +# Instrument in.mpathd with CTF data to ease debugging. 
+# +CTFCONVERT_HOOK = && $(CTFCONVERT_O) +CTFMERGE_HOOK = && $(CTFMERGE) -L VERSION -o $@ $(OBJS) +$(OBJS) := CFLAGS += $(CTF_FLAGS) .KEEP_STATE: all: $(PROG) $(PROG): $(OBJS) - $(LINK.c) -o $@ $(OBJS) $(LDLIBS) + $(LINK.c) -o $@ $(OBJS) $(LDLIBS) $(CTFMERGE_HOOK) $(POST_PROCESS) include ../Makefile.lib +$(ROOTLIBINETPROG): + $(RM) $@; $(SYMLINK) ../../../lib/inet/$(PROG) $@ + $(ROOTSBINPROG): - $(RM) $@; $(SYMLINK) ../usr/lib/inet/$(PROG) $@ + $(RM) $@; $(SYMLINK) ../lib/inet/$(PROG) $@ -install: all $(ROOTLIBINETPROG) $(ROOTSBINPROG) $(ROOTETCDEFAULTFILES) +install: all $(ROOTLIBINETPROG) $(ROOTSBINPROG) $(ROOTCMD) \ + $(ROOTETCDEFAULTFILES) clean: $(RM) $(OBJS) diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h index 9b07e2a7a3..e7cb096bf7 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _MPD_DEFS_H #define _MPD_DEFS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -78,12 +76,13 @@ extern "C" { #include <locale.h> #include <deflt.h> +#include <libdlpi.h> +#include <libinetutil.h> #include <libnvpair.h> #include <libsysevent.h> #include <sys/sysevent.h> #include <sys/sysevent/eventdefs.h> #include <sys/sysevent/ipmp.h> -#include <zone.h> #include <ipmp_mpathd.h> #include <ipmp_query_impl.h> @@ -92,7 +91,7 @@ extern "C" { /* Debug flags */ #define D_ALL 0xffff /* enable all debug */ #define D_PROBE 0x0001 /* probe mechanism */ -#define D_FAILOVER 0x0002 /* failover mechanism */ +#define D_FAILREP 0x0002 /* failure/repair mechanism */ #define D_PHYINT 0x0004 /* phyint table */ #define D_LOGINT 0x0008 /* logint table */ #define D_TARGET 0x0010 /* target table */ @@ -199,10 +198,8 @@ extern int user_failure_detection_time; /* User specified fdt */ extern int ifsock_v4; /* IPv4 socket for ioctls */ extern int ifsock_v6; /* IPv6 socket for ioctls */ -extern boolean_t full_scan_required; /* Do full scans */ - extern int debug; /* debug option */ - +extern boolean_t cleanup_started; /* true if we're shutting down */ extern boolean_t handle_link_notifications; /* @@ -212,6 +209,7 @@ extern void timer_schedule(uint_t delay); extern void logmsg(int pri, const char *fmt, ...); extern void logperror(const char *str); extern int poll_add(int fd); +extern int poll_remove(int fd); extern uint64_t getcurrentsec(void); extern uint_t getcurrenttime(void); diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c index aa6a99fb9c..e1e22e12d4 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mpd_defs.h" #include "mpd_tables.h" @@ -46,7 +44,6 @@ static int lsock_v6; /* Listen socket to detect mpathd */ static int mibfd = -1; /* fd to get mib info */ static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */ -boolean_t full_scan_required = _B_FALSE; static uint_t last_initifs_time; /* Time when initifs was last run */ static char **argv0; /* Saved for re-exec on SIGHUP */ boolean_t handle_link_notifications = _B_TRUE; @@ -58,10 +55,6 @@ static void check_if_removed(struct phyint_instance *pii); static void select_test_ifs(void); static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len); static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len); -static void router_add_v4(mib2_ipRouteEntry_t *rp1, - struct in_addr nexthop_v4); -static void router_add_v6(mib2_ipv6RouteEntry_t *rp1, - struct in6_addr nexthop_v6); static void router_add_common(int af, char *ifname, struct in6_addr nexthop); static void init_router_targets(); @@ -74,17 +67,17 @@ static void check_addr_unique(struct phyint_instance *, static void init_host_targets(void); static void dup_host_targets(struct phyint_instance *desired_pii); static void loopback_cmd(int sock, int family); -static int poll_remove(int fd); static boolean_t daemonize(void); static int closefunc(void *, int); static unsigned int process_cmd(int newfd, union mi_commands *mpi); static unsigned int process_query(int fd, mi_query_t *miq); +static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop); static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); static unsigned int send_result(int fd, unsigned int error, int syserror); -struct local_addr *laddr_list = NULL; +addrlist_t *localaddrs; /* * Return the current time in milliseconds (from an arbitrary reference) @@ -153,7 +146,7 @@ retry: 
/* * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. */ -static int +int poll_remove(int fd) { int i; @@ -205,17 +198,11 @@ pii_process(int af, char *name, struct phyint_instance **pii_p) break; case PI_GROUP_CHANGED: - /* - * The phyint has changed group. - */ - restore_phyint(pii->pii_phyint); - /* FALLTHRU */ - case PI_IFINDEX_CHANGED: /* - * Interface index has changed. Delete and - * recreate the phyint as it is quite likely - * the interface has been unplumbed and replumbed. + * Interface index or group membership has changed. + * Delete the old state and recreate based on the new + * state (it may no longer be in a group). */ pii_other = phyint_inst_other(pii); if (pii_other != NULL) @@ -249,51 +236,26 @@ pii_process(int af, char *name, struct phyint_instance **pii_p) } /* - * This phyint is leaving the group. Try to restore the phyint to its - * initial state. Return the addresses that belong to other group members, - * to the group, and take back any addresses owned by this phyint - */ -void -restore_phyint(struct phyint *pi) -{ - if (pi->pi_group == phyint_anongroup) - return; - - /* - * Move everthing to some other member in the group. - * The phyint has changed group in the kernel. But we - * have yet to do it in our tables. 
- */ - if (!pi->pi_empty) - (void) try_failover(pi, FAILOVER_TO_ANY); - /* - * Move all addresses owned by 'pi' back to pi, from each - * of the other members of the group - */ - (void) try_failback(pi); -} - -/* * Scan all interfaces to detect changes as well as new and deleted interfaces */ static void initifs() { - int n; + int i, nlifr; int af; char *cp; char *buf; - int numifs; + int sockfd; + uint64_t flags; struct lifnum lifn; struct lifconf lifc; + struct lifreq lifreq; struct lifreq *lifr; struct logint *li; struct phyint_instance *pii; struct phyint_instance *next_pii; - char pi_name[LIFNAMSIZ + 1]; - boolean_t exists; - struct phyint *pi; - struct local_addr *next; + struct phyint_group *pg, *next_pg; + char pi_name[LIFNAMSIZ + 1]; if (debug & D_PHYINT) logdebug("initifs: Scanning interfaces\n"); @@ -301,13 +263,9 @@ initifs() last_initifs_time = getcurrenttime(); /* - * Free the laddr_list before collecting the local addresses. + * Free the existing local address list; we'll build a new list below. */ - while (laddr_list != NULL) { - next = laddr_list->next; - free(laddr_list); - laddr_list = next; - } + addrlist_free(&localaddrs); /* * Mark the interfaces so that we can find phyints and logints @@ -326,122 +284,142 @@ initifs() } } + /* + * As above, mark groups so that we can detect IPMP interfaces which + * have been removed from the kernel. Also, delete the group address + * list since we'll iteratively recreate it below. 
+ */ + for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { + pg->pg_in_use = _B_FALSE; + addrlist_free(&pg->pg_addrs); + } + lifn.lifn_family = AF_UNSPEC; - lifn.lifn_flags = LIFC_ALLZONES; + lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP; +again: if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { - logperror("initifs: ioctl (get interface numbers)"); + logperror("initifs: ioctl (get interface count)"); return; } - numifs = lifn.lifn_count; + /* + * Pad the interface count to detect when additional interfaces have + * been configured between SIOCGLIFNUM and SIOCGLIFCONF. + */ + lifn.lifn_count += 4; - buf = (char *)calloc(numifs, sizeof (struct lifreq)); - if (buf == NULL) { + if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) { logperror("initifs: calloc"); return; } lifc.lifc_family = AF_UNSPEC; - lifc.lifc_flags = LIFC_ALLZONES; - lifc.lifc_len = numifs * sizeof (struct lifreq); + lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP; + lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq); lifc.lifc_buf = buf; if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { - /* - * EINVAL is commonly encountered, when things change - * underneath us rapidly, (eg. at boot, when new interfaces - * are plumbed successively) and the kernel finds the buffer - * size we passed as too small. We will retry again - * when we see the next routing socket msg, or at worst after - * IF_SCAN_INTERVAL ms. - */ - if (errno != EINVAL) { - logperror("initifs: ioctl" - " (get interface configuration)"); - } + logperror("initifs: ioctl (get interface configuration)"); free(buf); return; } - lifr = (struct lifreq *)lifc.lifc_req; - /* - * For each lifreq returned by SIOGGLIFCONF, call pii_process() - * and get the state of the corresponding phyint_instance. If it is - * successful, then call logint_init_from_k() to get the state of the - * logint. 
+ * If every lifr_req slot is taken, then additional interfaces must + * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF. + * Recalculate to make sure we didn't miss any interfaces. */ - for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) { - int sockfd; - struct local_addr *taddr; - struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; - struct lifreq lifreq; + nlifr = lifc.lifc_len / sizeof (struct lifreq); + if (nlifr >= lifn.lifn_count) { + free(buf); + goto again; + } + /* + * Walk through the lifreqs returned by SIOGGLIFCONF, and refresh the + * global list of addresses, phyint groups, phyints, and logints. + */ + for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) { af = lifr->lifr_addr.ss_family; - - /* - * Collect all local addresses. - */ sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6; - (void) memset(&lifreq, 0, sizeof (lifreq)); - (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, - sizeof (lifreq.lifr_name)); + (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ); if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) { if (errno != ENXIO) logperror("initifs: ioctl (SIOCGLIFFLAGS)"); continue; } + flags = lifreq.lifr_flags; + + /* + * If the address is IFF_UP, add it to the local address list. + * (We ignore addresses that aren't IFF_UP since another node + * might legitimately have that address IFF_UP.) + */ + if (flags & IFF_UP) { + (void) addrlist_add(&localaddrs, lifr->lifr_name, flags, + &lifr->lifr_addr); + } /* - * Add the interface address to laddr_list. - * Another node might have the same IP address which is up. - * In that case, it is appropriate to use the address as a - * target, even though it is also configured (but not up) on - * the local system. - * Hence,the interface address is not added to laddr_list - * unless it is IFF_UP. 
+ * If this address is on an IPMP meta-interface, update our + * phyint_group information (either by recording that group + * still exists or creating a new group), and track what + * group the address is part of. */ - if (lifreq.lifr_flags & IFF_UP) { - taddr = malloc(sizeof (struct local_addr)); - if (taddr == NULL) { - logperror("initifs: malloc"); + if (flags & IFF_IPMP) { + if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) { + if (errno != ENXIO) + logperror("initifs: ioctl " + "(SIOCGLIFGROUPNAME)"); continue; } - if (af == AF_INET) { - sin = (struct sockaddr_in *)&lifr->lifr_addr; - IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, - &taddr->addr); - } else { - sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr; - taddr->addr = sin6->sin6_addr; + + pg = phyint_group_lookup(lifreq.lifr_groupname); + if (pg == NULL) { + pg = phyint_group_create(lifreq.lifr_groupname); + if (pg == NULL) { + logerr("initifs: cannot create group " + "%s\n", lifreq.lifr_groupname); + continue; + } + phyint_group_insert(pg); + } + pg->pg_in_use = _B_TRUE; + + /* + * Add this to the group's list of data addresses. + */ + if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags, + &lifr->lifr_addr)) { + logerr("initifs: insufficient memory to track " + "data address information for %s\n", + lifr->lifr_name); } - taddr->next = laddr_list; - laddr_list = taddr; + continue; } /* - * Need to pass a phyint name to pii_process. Insert the - * null where the ':' IF_SEPARATOR is found in the logical - * name. + * This isn't an address on an IPMP meta-interface, so it's + * either on an underlying interface or not related to any + * group. Update our phyint and logint information (via + * pii_process() and logint_init_from_k()) -- but first, + * convert the logint name to a phyint name so we can call + * pii_process(). 
*/ (void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name)); if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) *cp = '\0'; - exists = pii_process(af, pi_name, &pii); - if (exists) { + if (pii_process(af, pi_name, &pii)) { /* The phyint is fine. So process the logint */ logint_init_from_k(pii, lifr->lifr_name); check_addr_unique(pii, &lifr->lifr_addr); } - } - free(buf); /* - * Scan for phyints and logints that have disappeared from the + * Scan for groups, phyints and logints that have disappeared from the * kernel, and delete them. */ for (pii = phyint_instances; pii != NULL; pii = next_pii) { @@ -449,70 +427,31 @@ initifs() check_if_removed(pii); } + for (pg = phyint_groups; pg != NULL; pg = next_pg) { + next_pg = pg->pg_next; + if (!pg->pg_in_use) { + phyint_group_delete(pg); + continue; + } + /* + * Refresh the group's state. This is necessary since the + * group's state is defined by the set of usable interfaces in + * the group, and an interface is considered unusable if all + * of its addresses are down. When an address goes down/up, + * the RTM_DELADDR/RTM_NEWADDR brings us through here. + */ + phyint_group_refresh_state(pg); + } + /* * Select a test address for sending probes on each phyint instance */ select_test_ifs(); /* - * Handle link up/down notifications from the NICs. + * Handle link up/down notifications. */ process_link_state_changes(); - - for (pi = phyints; pi != NULL; pi = pi->pi_next) { - /* - * If this is a case of group failure, we don't have much - * to do until the group recovers again. - */ - if (GROUP_FAILED(pi->pi_group)) - continue; - - /* - * Try/Retry any pending failovers / failbacks, that did not - * not complete, or that could not be initiated previously. 
- * This implements the 3 invariants described in the big block - * comment at the beginning of probe.c - */ - if (pi->pi_flags & IFF_INACTIVE) { - if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) - (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); - } else { - struct phyint_instance *pii; - - /* - * Skip LINK UP interfaces which are not capable - * of probing. - */ - pii = pi->pi_v4; - if (pii == NULL || - (LINK_UP(pi) && !PROBE_CAPABLE(pii))) { - pii = pi->pi_v6; - if (pii == NULL || - (LINK_UP(pi) && !PROBE_CAPABLE(pii))) - continue; - } - - /* - * It is possible that the phyint has started - * receiving packets, after it has been marked - * PI_FAILED. Don't initiate failover, if the - * phyint has started recovering. failure_state() - * captures this check. A similar logic is used - * for failback/repair case. - */ - if (pi->pi_state == PI_FAILED && !pi->pi_empty && - (failure_state(pii) == PHYINT_FAILURE)) { - (void) try_failover(pi, FAILOVER_NORMAL); - } else if (pi->pi_state == PI_RUNNING && !pi->pi_full) { - if (try_failback(pi) != IPMP_FAILURE) { - (void) change_lif_flags(pi, IFF_FAILED, - _B_FALSE); - /* Per state diagram */ - pi->pi_empty = 0; - } - } - } - } } /* @@ -569,7 +508,7 @@ check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss) * The probe socket is closed on each interface instance, and the * interface state set to PI_OFFLINE. */ -static void +void stop_probing(struct phyint *pi) { struct phyint_instance *pii; @@ -631,7 +570,6 @@ select_test_ifs(void) struct logint *li; struct logint *probe_logint; boolean_t target_scan_reqd = _B_FALSE; - struct target *tg; int rating; if (debug & D_PHYINT) @@ -645,8 +583,8 @@ select_test_ifs(void) probe_logint = NULL; /* - * An interface that is offline, should not be probed. - * Offline interfaces should always in PI_OFFLINE state, + * An interface that is offline should not be probed. 
+ * IFF_OFFLINE interfaces should always be PI_OFFLINE * unless some other entity has set the offline flag. */ if (pii->pii_phyint->pi_flags & IFF_OFFLINE) { @@ -659,6 +597,15 @@ select_test_ifs(void) stop_probing(pii->pii_phyint); } continue; + } else { + /* + * If something cleared IFF_OFFLINE (e.g., by accident + * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is + * inherently racy), the phyint may still be offline. + * Just ignore it. + */ + if (pii->pii_phyint->pi_state == PI_OFFLINE) + continue; } li = pii->pii_probe_logint; @@ -776,17 +723,6 @@ select_test_ifs(void) phyint_chstate(pii->pii_phyint, PI_NOTARGETS); } - if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) { - tg = pii->pii_targets; - if (tg != NULL) - target_delete(tg); - assert(pii->pii_targets == NULL); - assert(pii->pii_target_next == NULL); - assert(pii->pii_ntargets == 0); - target_create(pii, probe_logint->li_dstaddr, - _B_TRUE); - } - /* * If no targets are currently known for this phyint * we need to call init_router_targets. Since @@ -806,15 +742,16 @@ select_test_ifs(void) } /* - * Check the interface list for any interfaces that are marked - * PI_FAILED but no longer enabled to send probes, and call - * phyint_check_for_repair() to see if the link now indicates that the - * interface should be repaired. Also see the state diagram in + * Scan the interface list for any interfaces that are PI_FAILED or + * PI_NOTARGETS but no longer enabled to send probes, and call + * phyint_check_for_repair() to see if the link state indicates that + * the interface should be repaired. Also see the state diagram in * mpd_probe.c. 
*/ for (pi = phyints; pi != NULL; pi = pi->pi_next) { - if (pi->pi_state == PI_FAILED && - !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { + if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) && + (pi->pi_state == PI_FAILED || + pi->pi_state == PI_NOTARGETS)) { phyint_check_for_repair(pi); } } @@ -875,15 +812,14 @@ check_testconfig(void) pi->pi_v6->pii_probe_logint->li_dupaddr) li = pi->pi_v6->pii_probe_logint; - if (li != NULL) { - if (!pi->pi_duptaddrmsg_printed) { - (void) pr_addr(li->li_phyint_inst->pii_af, - li->li_addr, abuf, sizeof (abuf)); - logerr("Test address %s is not unique in " - "group; disabling probe-based failure " - "detection on %s\n", abuf, pi->pi_name); - pi->pi_duptaddrmsg_printed = 1; - } + if (li != NULL && li->li_dupaddr) { + if (pi->pi_duptaddrmsg_printed) + continue; + logerr("Test address %s is not unique in group; " + "disabling probe-based failure detection on %s\n", + pr_addr(li->li_phyint_inst->pii_af, + li->li_addr, abuf, sizeof (abuf)), pi->pi_name); + pi->pi_duptaddrmsg_printed = 1; continue; } @@ -915,10 +851,10 @@ check_config(void) boolean_t v6_in_group; /* - * All phyints of a group must be homogenous to ensure that - * failover or failback can be done. If any phyint in a group - * has IPv4 plumbed, check that all phyints have IPv4 plumbed. - * Do a similar check for IPv6. + * All phyints of a group must be homogeneous to ensure that they can + * take over for one another. If any phyint in a group has IPv4 + * plumbed, check that all phyints have IPv4 plumbed. Do a similar + * check for IPv6. 
*/ for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { if (pg == phyint_anongroup) @@ -949,9 +885,9 @@ check_config(void) if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) { if (!pi->pi_cfgmsg_printed) { - logerr("NIC %s of group %s is" - " not plumbed for IPv4 and may" - " affect failover capability\n", + logerr("IP interface %s in group %s is" + " not plumbed for IPv4, affecting" + " IPv4 connectivity\n", pi->pi_name, pi->pi_group->pg_name); pi->pi_cfgmsg_printed = 1; @@ -959,9 +895,9 @@ check_config(void) } else if (v6_in_group == _B_TRUE && pi->pi_v6 == NULL) { if (!pi->pi_cfgmsg_printed) { - logerr("NIC %s of group %s is" - " not plumbed for IPv6 and may" - " affect failover capability\n", + logerr("IP interface %s in group %s is" + " not plumbed for IPv6, affecting" + " IPv6 connectivity\n", pi->pi_name, pi->pi_group->pg_name); pi->pi_cfgmsg_printed = 1; @@ -974,10 +910,10 @@ check_config(void) * error recovery message */ if (pi->pi_cfgmsg_printed) { - logerr("NIC %s is now consistent with " - "group %s and failover capability " - "is restored\n", pi->pi_name, - pi->pi_group->pg_name); + logerr("IP interface %s is now" + " consistent with group %s " + " and connectivity is restored\n", + pi->pi_name, pi->pi_group->pg_name); pi->pi_cfgmsg_printed = 0; } } @@ -1117,8 +1053,8 @@ run_timeouts(void) static int eventpipe_read = -1; /* Used for synchronous signal delivery */ static int eventpipe_write = -1; -static boolean_t cleanup_started = _B_FALSE; - /* Don't write to eventpipe if in cleanup */ +boolean_t cleanup_started = _B_FALSE; /* true if we're going away */ + /* * Ensure that signals are processed synchronously with the rest of * the code by just writing a one character signal number on the pipe. 
@@ -1228,7 +1164,7 @@ in_signal(int fd) "Number of probes sent %lld\n" "Number of probe acks received %lld\n" "Number of probes/acks lost %lld\n" - "Number of valid unacknowled probes %lld\n" + "Number of valid unacknowledged probes %lld\n" "Number of ambiguous probe acks received %lld\n", AF_STR(pii->pii_af), pii->pii_name, sent, acked, lost, unacked, unknown); @@ -1321,12 +1257,20 @@ setup_rtsock(int af) { int s; int flags; + int aware = RTAW_UNDER_IPMP; s = socket(PF_ROUTE, SOCK_RAW, af); if (s == -1) { logperror("setup_rtsock: socket PF_ROUTE"); exit(1); } + + if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) { + logperror("setup_rtsock: setsockopt RT_AWARE"); + (void) close(s); + exit(1); + } + if ((flags = fcntl(s, F_GETFL, 0)) < 0) { logperror("setup_rtsock: fcntl F_GETFL"); (void) close(s); @@ -1347,8 +1291,7 @@ setup_rtsock(int af) /* * Process an RTM_IFINFO message received on a routing socket. * The return value indicates whether a full interface scan is required. - * Link up/down notifications from the NICs are reflected in the - * IFF_RUNNING flag. + * Link up/down notifications are reflected in the IFF_RUNNING flag. * If just the state of the IFF_RUNNING interface flag has changed, a * a full interface scan isn't required. */ @@ -1400,7 +1343,7 @@ process_rtm_ifinfo(if_msghdr_t *ifm, int type) /* * We want to try and avoid doing a full interface scan for - * link state notifications from the NICs, as indicated + * link state notifications from the datalink layer, as indicated * by the state of the IFF_RUNNING flag. If just the * IFF_RUNNING flag has changed state, the link state changes * are processed without a full scan. @@ -1441,25 +1384,7 @@ process_rtm_ifinfo(if_msghdr_t *ifm, int type) * types. */ if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) - phyint_newtype(pi); - - /* - * If IFF_INACTIVE has been set, then no data addresses should be - * hosted on the interface. 
If IFF_INACTIVE has been cleared, then - * move previously failed-over addresses back to it, provided it is - * not failed. For details, see the state diagram in mpd_probe.c. - */ - if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) { - if (pii->pii_flags & IFF_INACTIVE) { - if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) - (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); - } else { - if (pi->pi_state == PI_RUNNING && !pi->pi_full) { - pi->pi_empty = 0; - (void) try_failback(pi); - } - } - } + phyint_changed(pi); /* Has just the IFF_RUNNING flag changed state ? */ if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) { @@ -1620,22 +1545,24 @@ update_router_list(int fd) t_scalar_t prim; tor = (struct T_optmgmt_req *)&buf; - tor->PRIM_type = T_SVR4_OPTMGMT_REQ; tor->OPT_offset = sizeof (struct T_optmgmt_req); tor->OPT_length = sizeof (struct opthdr); tor->MGMT_flags = T_CURRENT; + /* + * Note: we use the special level value below so that IP will return + * us information concerning IRE_MARK_TESTHIDDEN routes. 
+ */ req = (struct opthdr *)&tor[1]; - req->level = MIB2_IP; /* any MIB2_xxx value ok here */ + req->level = EXPER_IP_AND_TESTHIDDEN; req->name = 0; req->len = 0; ctlbuf.buf = (char *)&buf; ctlbuf.len = tor->OPT_length + tor->OPT_offset; ctlbuf.maxlen = sizeof (buf); - flags = 0; - if (putmsg(fd, &ctlbuf, NULL, flags) == -1) { + if (putmsg(fd, &ctlbuf, NULL, 0) == -1) { logperror("update_router_list: putmsg(ctl)"); return (_B_FALSE); } @@ -1689,7 +1616,8 @@ update_router_list(int fd) case T_OPTMGMT_ACK: toa = &buf.uprim.optmgmt_ack; optp = (struct opthdr *)&toa[1]; - if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) { + if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) + + sizeof (struct opthdr))) { logerr("update_router_list: ctlbuf.len %d\n", ctlbuf.len); return (_B_FALSE); @@ -1707,7 +1635,7 @@ update_router_list(int fd) return (_B_FALSE); } - /* Process the T_OPGMGMT_ACK below */ + /* Process the T_OPTMGMT_ACK below */ assert(prim == T_OPTMGMT_ACK); switch (status) { @@ -1717,9 +1645,8 @@ update_router_list(int fd) * message. If this is the last message i.e EOD, * return, else process the next T_OPTMGMT_ACK msg. */ - if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) + - sizeof (struct opthdr)) && optp->len == 0 && - optp->name == 0 && optp->level == 0) { + if (optp->len == 0 && optp->name == 0 && + optp->level == 0) { /* * This is the EOD message. 
Return */ @@ -1747,17 +1674,14 @@ update_router_list(int fd) databuf.len = 0; flags = 0; for (;;) { - status = getmsg(fd, NULL, &databuf, &flags); - if (status >= 0) { + if (getmsg(fd, NULL, &databuf, &flags) >= 0) break; - } else if (errno == EINTR) { + if (errno == EINTR) continue; - } else { - logperror("update_router_list:" - " getmsg(data)"); - free(databuf.buf); - return (_B_FALSE); - } + + logperror("update_router_list: getmsg(data)"); + free(databuf.buf); + return (_B_FALSE); } if (optp->level == MIB2_IP && @@ -1777,18 +1701,35 @@ update_router_list(int fd) /* NOTREACHED */ } + +/* + * Convert octet `octp' to a phyint name and store in `ifname' + */ +static void +oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize) +{ + char *cp; + size_t len = MIN(octp->o_length, ifsize - 1); + + (void) strncpy(ifname, octp->o_bytes, len); + ifname[len] = '\0'; + + if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL) + *cp = '\0'; +} + /* - * Examine the IPv4 routing table, for default routers. For each default - * router, populate the list of targets of each phyint that is on the same - * link as the default router + * Examine the IPv4 routing table `buf' for possible targets. For each + * possible target, if it's on the same subnet an interface route, pass + * it to router_add_common() for further consideration. */ static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) { - mib2_ipRouteEntry_t *rp; - mib2_ipRouteEntry_t *rp1; - struct in_addr nexthop_v4; - mib2_ipRouteEntry_t *endp; + char ifname[LIFNAMSIZ]; + mib2_ipRouteEntry_t *rp, *rp1, *endp; + struct in_addr nexthop_v4; + struct in6_addr nexthop; if (len == 0) return; @@ -1797,75 +1738,40 @@ ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) endp = buf + (len / sizeof (mib2_ipRouteEntry_t)); /* - * Loop thru the routing table entries. Process any IRE_DEFAULT, - * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. - * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 
- * This is a potential target for probing, which we try to add - * to the list of probe targets. + * Scan the routing table entries for any IRE_OFFSUBNET entries, and + * cross-reference them with the interface routes to determine if + * they're possible probe targets. */ for (rp = buf; rp < endp; rp++) { if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) continue; - /* Get the nexthop address. */ + /* Get the nexthop address. */ nexthop_v4.s_addr = rp->ipRouteNextHop; /* - * Get the nexthop address. Then determine the outgoing - * interface, by examining all interface IREs, and picking the - * match. We don't look at the interface specified in the route - * because we need to add the router target on all matching - * interfaces anyway; the goal is to avoid falling back to - * multicast when some interfaces are in the same subnet but - * not in the same group. + * Rescan the routing table looking for interface routes that + * are on the same subnet, and try to add them. If they're + * not relevant (e.g., the interface route isn't part of an + * IPMP group, router_add_common() will discard). */ for (rp1 = buf; rp1 < endp; rp1++) { - if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) { + if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) || + rp1->ipRouteIfIndex.o_length == 0) continue; - } - /* - * Determine the interface IRE that matches the nexthop. - * i.e. 
(IRE addr & IRE mask) == (nexthop & IRE mask) - */ - if ((rp1->ipRouteDest & rp1->ipRouteMask) == - (nexthop_v4.s_addr & rp1->ipRouteMask)) { - /* - * We found the interface ire - */ - router_add_v4(rp1, nexthop_v4); - } + if ((rp1->ipRouteDest & rp1->ipRouteMask) != + (nexthop_v4.s_addr & rp1->ipRouteMask)) + continue; + + oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ); + IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); + router_add_common(AF_INET, ifname, nexthop); } } } void -router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4) -{ - char *cp; - char ifname[LIFNAMSIZ + 1]; - struct in6_addr nexthop; - int len; - - if (debug & D_TARGET) - logdebug("router_add_v4()\n"); - - len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1); - (void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len); - ifname[len] = '\0'; - - if (ifname[0] == '\0') - return; - - cp = strchr(ifname, IF_SEPARATOR); - if (cp != NULL) - *cp = '\0'; - - IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); - router_add_common(AF_INET, ifname, nexthop); -} - -void router_add_common(int af, char *ifname, struct in6_addr nexthop) { struct phyint_instance *pii; @@ -1906,16 +1812,17 @@ router_add_common(int af, char *ifname, struct in6_addr nexthop) } /* - * Examine the IPv6 routing table, for default routers. For each default - * router, populate the list of targets of each phyint that is on the same - * link as the default router + * Examine the IPv6 routing table `buf' for possible link-local targets, and + * pass any contenders to router_add_common() for further consideration. 
*/ static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) { - mib2_ipv6RouteEntry_t *rp; - mib2_ipv6RouteEntry_t *endp; - struct in6_addr nexthop_v6; + struct lifreq lifr; + char ifname[LIFNAMSIZ]; + char grname[LIFGRNAMSIZ]; + mib2_ipv6RouteEntry_t *rp, *rp1, *endp; + struct in6_addr nexthop_v6; if (debug & D_TARGET) logdebug("ire_process_v6(len %d)\n", len); @@ -1927,62 +1834,51 @@ ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t)); /* - * Loop thru the routing table entries. Process any IRE_DEFAULT, - * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. - * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. - * This is a potential target for probing, which we try to add - * to the list of probe targets. + * Scan the routing table entries for any IRE_OFFSUBNET entries, and + * cross-reference them with the interface routes to determine if + * they're possible probe targets. */ for (rp = buf; rp < endp; rp++) { - if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET)) + if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) || + !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop)) continue; - /* - * We have the outgoing interface in ipv6RouteIfIndex - * if ipv6RouteIfindex.o_length is non-zero. The outgoing - * interface must be present for link-local addresses. Since - * we use only link-local addreses for probing, we don't - * consider the case when the outgoing interface is not - * known and we need to scan interface ires - */ + /* Get the nexthop address. */ nexthop_v6 = rp->ipv6RouteNextHop; - if (rp->ipv6RouteIfIndex.o_length != 0) { - /* - * We already have the outgoing interface - * in ipv6RouteIfIndex. 
- */ - router_add_v6(rp, nexthop_v6); - } - } -} - -void -router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6) -{ - char ifname[LIFNAMSIZ + 1]; - char *cp; - int len; - - if (debug & D_TARGET) - logdebug("router_add_v6()\n"); - - len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1); - (void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len); - ifname[len] = '\0'; + /* + * The interface name should always exist for link-locals; + * we use it to map this entry to an IPMP group name. + */ + if (rp->ipv6RouteIfIndex.o_length == 0) + continue; - if (ifname[0] == '\0') - return; + oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ); + if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 || + strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) { + continue; + } - cp = strchr(ifname, IF_SEPARATOR); - if (cp != NULL) - *cp = '\0'; + /* + * Rescan the list of routes for interface routes, and add the + * above target to any interfaces in the same IPMP group. + */ + for (rp1 = buf; rp1 < endp; rp1++) { + if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) || + rp1->ipv6RouteIfIndex.o_length == 0) { + continue; + } + oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ); + (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); - router_add_common(AF_INET6, ifname, nexthop_v6); + if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 && + strcmp(lifr.lifr_groupname, grname) == 0) { + router_add_common(AF_INET6, ifname, nexthop_v6); + } + } + } } - - /* * Build a list of target routers, by scanning the routing tables. * It is assumed that interface routes exist, to reach the routers. @@ -2001,11 +1897,9 @@ init_router_targets(void) for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { pi = pii->pii_phyint; /* - * Exclude ptp and host targets. Set tg_in_use to false, - * only for router targets. + * Set tg_in_use to false only for router targets. 
*/ - if (!pii->pii_targets_are_routers || - (pi->pi_flags & IFF_POINTOPOINT)) + if (!pii->pii_targets_are_routers) continue; for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) @@ -2026,15 +1920,21 @@ init_router_targets(void) } for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { - if (!pii->pii_targets_are_routers || - (pi->pi_flags & IFF_POINTOPOINT)) + pi = pii->pii_phyint; + if (!pii->pii_targets_are_routers) continue; for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { next_tg = tg->tg_next; - if (!tg->tg_in_use) { + /* + * If the group has failed, it's likely the route was + * removed by an application affected by that failure. + * In that case, we keep the target so that we can + * reliably repair, at which point we'll refresh the + * target list again. + */ + if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group)) target_delete(tg); - } } } } @@ -2140,7 +2040,7 @@ getdefault(char *name) * Command line options below */ boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ -boolean_t track_all_phyints = _B_FALSE; /* option to track all NICs */ +boolean_t track_all_phyints = _B_FALSE; /* track all IP interfaces */ static boolean_t adopt = _B_FALSE; static boolean_t foreground = _B_FALSE; @@ -2149,6 +2049,7 @@ main(int argc, char *argv[]) { int i; int c; + struct phyint *pi; struct phyint_instance *pii; char *value; @@ -2173,14 +2074,15 @@ main(int argc, char *argv[]) if (user_failure_detection_time <= 0) { user_failure_detection_time = FAILURE_DETECTION_TIME; logerr("Invalid failure detection time %s, assuming " - "default %d\n", value, user_failure_detection_time); + "default of %d ms\n", value, + user_failure_detection_time); } else if (user_failure_detection_time < MIN_FAILURE_DETECTION_TIME) { user_failure_detection_time = MIN_FAILURE_DETECTION_TIME; logerr("Too small failure detection time of %s, " - "assuming minimum %d\n", value, + "assuming minimum of %d ms\n", value, user_failure_detection_time); } free(value); @@ 
-2211,9 +2113,9 @@ main(int argc, char *argv[]) */ value = getdefault("FAILBACK"); if (value != NULL) { - if (strncasecmp(value, "yes", 3) == 0) + if (strcasecmp(value, "yes") == 0) failback_enabled = _B_TRUE; - else if (strncasecmp(value, "no", 2) == 0) + else if (strcasecmp(value, "no") == 0) failback_enabled = _B_FALSE; else logerr("Invalid value for FAILBACK %s\n", value); @@ -2229,9 +2131,9 @@ main(int argc, char *argv[]) */ value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); if (value != NULL) { - if (strncasecmp(value, "yes", 3) == 0) + if (strcasecmp(value, "yes") == 0) track_all_phyints = _B_FALSE; - else if (strncasecmp(value, "no", 2) == 0) + else if (strcasecmp(value, "no") == 0) track_all_phyints = _B_TRUE; else logerr("Invalid value for " @@ -2340,12 +2242,6 @@ main(int argc, char *argv[]) initifs(); - /* Inform kernel whether failback is enabled or disabled */ - if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) { - logperror("main: ioctl (SIOCSIPMPFAILBACK)"); - exit(1); - } - /* * If we're operating in "adopt" mode and no interfaces need to be * tracked, shut down (ifconfig(1M) will restart us on demand if @@ -2379,6 +2275,7 @@ main(int argc, char *argv[]) process_rtsock(rtsock_v4, rtsock_v6); break; } + for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { if (pollfds[i].fd == pii->pii_probe_sock) { @@ -2389,15 +2286,21 @@ main(int argc, char *argv[]) break; } } + + for (pi = phyints; pi != NULL; pi = pi->pi_next) { + if (pi->pi_notes != 0 && + pollfds[i].fd == dlpi_fd(pi->pi_dh)) { + (void) dlpi_recv(pi->pi_dh, NULL, NULL, + NULL, NULL, 0, NULL); + break; + } + } + if (pollfds[i].fd == lsock_v4) loopback_cmd(lsock_v4, AF_INET); else if (pollfds[i].fd == lsock_v6) loopback_cmd(lsock_v6, AF_INET6); } - if (full_scan_required) { - initifs(); - full_scan_required = _B_FALSE; - } } /* NOTREACHED */ return (EXIT_SUCCESS); @@ -2481,29 +2384,23 @@ static struct { { "MI_PING", sizeof (uint32_t) }, { "MI_OFFLINE", sizeof 
(mi_offline_t) }, { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) }, - { "MI_SETOINDEX", sizeof (mi_setoindex_t) }, { "MI_QUERY", sizeof (mi_query_t) } }; /* - * Commands received over the loopback interface come here. Currently - * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP - * module. ifconfig only makes a connection, and closes it to check if - * in.mpathd is running. - * if_mpadm sends commands in the format specified by the mpathd_interface - * structure. + * Commands received over the loopback interface come here (via libipmp). */ static void loopback_cmd(int sock, int family) { int newfd; ssize_t len; + boolean_t is_priv = _B_FALSE; struct sockaddr_storage peer; struct sockaddr_in *peer_sin; struct sockaddr_in6 *peer_sin6; socklen_t peerlen; union mi_commands mpi; - struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; char abuf[INET6_ADDRSTRLEN]; uint_t cmd; int retval; @@ -2528,10 +2425,11 @@ loopback_cmd(int sock, int family) return; } peer_sin = (struct sockaddr_in *)&peer; - if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) || - (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) { - (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, - abuf, sizeof (abuf)); + is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED; + (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, + abuf, sizeof (abuf)); + + if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) { logerr("Attempt to connect from addr %s port %d\n", abuf, ntohs(peer_sin->sin_port)); (void) close(newfd); @@ -2551,11 +2449,10 @@ loopback_cmd(int sock, int family) * talking to us. 
*/ peer_sin6 = (struct sockaddr_in6 *)&peer; - if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) || - (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr, - &loopback_addr))) { - (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, - sizeof (abuf)); + is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED; + (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, + sizeof (abuf)); + if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) { logerr("Attempt to connect from addr %s port %d\n", abuf, ntohs(peer_sin6->sin6_port)); (void) close(newfd); @@ -2575,15 +2472,6 @@ loopback_cmd(int sock, int family) len = read(newfd, &mpi, sizeof (mpi)); /* - * ifconfig does not send any data. Just tests to see if mpathd - * is already running. - */ - if (len <= 0) { - (void) close(newfd); - return; - } - - /* * In theory, we can receive any sized message for a stream socket, * but we don't expect that to happen for a small message over a * loopback connection. @@ -2591,6 +2479,8 @@ loopback_cmd(int sock, int family) if (len < sizeof (uint32_t)) { logerr("loopback_cmd: bad command format or read returns " "partial data %d\n", len); + (void) close(newfd); + return; } cmd = mpi.mi_command; @@ -2600,6 +2490,16 @@ loopback_cmd(int sock, int family) return; } + /* + * Only MI_PING and MI_QUERY can come from unprivileged sources. + */ + if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) { + logerr("Unprivileged request from %s for privileged " + "command %s\n", abuf, commands[cmd].name); + (void) close(newfd); + return; + } + if (len < commands[cmd].size) { logerr("loopback_cmd: short %s command (expected %d, got %d)\n", commands[cmd].name, commands[cmd].size, len); @@ -2615,179 +2515,46 @@ loopback_cmd(int sock, int family) (void) close(newfd); } -extern int global_errno; /* set by failover() or failback() */ - /* - * Process the offline, undo offline and set original index commands, - * received from if_mpadm(1M) + * Process the commands received via libipmp. 
*/ static unsigned int process_cmd(int newfd, union mi_commands *mpi) { - uint_t nif = 0; - uint32_t cmd; struct phyint *pi; - struct phyint *pi2; - struct phyint_group *pg; - boolean_t success; - int error; struct mi_offline *mio; struct mi_undo_offline *miu; - struct lifreq lifr; - int ifsock; - struct mi_setoindex *mis; + unsigned int retval; - cmd = mpi->mi_command; + switch (mpi->mi_command) { + case MI_PING: + return (send_result(newfd, IPMP_SUCCESS, 0)); - switch (cmd) { case MI_OFFLINE: mio = &mpi->mi_ocmd; - /* - * Lookup the interface that needs to be offlined. - * If it does not exist, return a suitable error. - */ + pi = phyint_lookup(mio->mio_ifname); if (pi == NULL) - return (send_result(newfd, IPMP_FAILURE, EINVAL)); - - /* - * Verify that the minimum redundancy requirements are met. - * The multipathing group must have at least the specified - * number of functional interfaces after offlining the - * requested interface. Otherwise return a suitable error. - */ - pg = pi->pi_group; - nif = 0; - if (pg != phyint_anongroup) { - for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL; - pi2 = pi2->pi_pgnext) { - if ((pi2->pi_state == PI_RUNNING) || - (pg->pg_groupfailed && - !(pi2->pi_flags & IFF_OFFLINE))) - nif++; - } - } - if (nif < mio->mio_min_redundancy) - return (send_result(newfd, IPMP_EMINRED, 0)); + return (send_result(newfd, IPMP_EUNKIF, 0)); - /* - * The order of operation is to set IFF_OFFLINE, followed by - * failover. Setting IFF_OFFLINE ensures that no new ipif's - * can be created. Subsequent failover moves everything on - * the OFFLINE interface to some other functional interface. - */ - success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE); - if (success) { - if (!pi->pi_empty) { - error = try_failover(pi, FAILOVER_NORMAL); - if (error != 0) { - if (!change_lif_flags(pi, IFF_OFFLINE, - _B_FALSE)) { - logerr("process_cmd: couldn't" - " clear OFFLINE flag on" - " %s\n", pi->pi_name); - /* - * Offline interfaces should - * not be probed. 
- */ - stop_probing(pi); - } - return (send_result(newfd, error, - global_errno)); - } - } - } else { + retval = phyint_offline(pi, mio->mio_min_redundancy); + if (retval == IPMP_FAILURE) return (send_result(newfd, IPMP_FAILURE, errno)); - } - /* - * The interface is now Offline, so stop probing it. - * Note that if_mpadm(1M) will down the test addresses, - * after receiving a success reply from us. The routing - * socket message will then make us close the socket used - * for sending probes. But it is more logical that an - * offlined interface must not be probed, even if it has - * test addresses. - */ - stop_probing(pi); - return (send_result(newfd, IPMP_SUCCESS, 0)); + return (send_result(newfd, retval, 0)); case MI_UNDO_OFFLINE: miu = &mpi->mi_ucmd; - /* - * Undo the offline command. As usual lookup the interface. - * Send an error if it does not exist or is not offline. - */ - pi = phyint_lookup(miu->miu_ifname); - if (pi == NULL || pi->pi_state != PI_OFFLINE) - return (send_result(newfd, IPMP_FAILURE, EINVAL)); - - /* - * Reset the state of the interface based on the current link - * state; if this phyint subsequently acquires a test address, - * the state will be updated later as a result of the probes. - */ - if (LINK_UP(pi)) - phyint_chstate(pi, PI_RUNNING); - else - phyint_chstate(pi, PI_FAILED); - - if (pi->pi_state == PI_RUNNING) { - /* - * Note that the success of MI_UNDO_OFFLINE is not - * contingent on actually failing back; in the odd - * case where we cannot do it here, we will try again - * in initifs() since pi->pi_full will still be zero. - */ - if (do_failback(pi) != IPMP_SUCCESS) { - logdebug("process_cmd: cannot failback from " - "%s during MI_UNDO_OFFLINE\n", pi->pi_name); - } - } - - /* - * Clear the IFF_OFFLINE flag. We have to do this last - * because do_failback() relies on it being set to decide - * when to display messages. 
- */ - (void) change_lif_flags(pi, IFF_OFFLINE, _B_FALSE); - - /* - * Give the requestor time to configure test addresses - * before complaining that they're missing. - */ - pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME; - - return (send_result(newfd, IPMP_SUCCESS, 0)); - - case MI_SETOINDEX: - mis = &mpi->mi_scmd; - /* Get the socket for doing ioctls */ - ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6; - - /* - * Get index of new original interface. - * The index is returned in lifr.lifr_index. - */ - (void) strlcpy(lifr.lifr_name, mis->mis_new_pifname, - sizeof (lifr.lifr_name)); + pi = phyint_lookup(miu->miu_ifname); + if (pi == NULL) + return (send_result(newfd, IPMP_EUNKIF, 0)); - if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) + retval = phyint_undo_offline(pi); + if (retval == IPMP_FAILURE) return (send_result(newfd, IPMP_FAILURE, errno)); - /* - * Set new original interface index. - * The new index was put into lifr.lifr_index by the - * SIOCGLIFINDEX ioctl. 
- */ - (void) strlcpy(lifr.lifr_name, mis->mis_lifname, - sizeof (lifr.lifr_name)); - - if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0) - return (send_result(newfd, IPMP_FAILURE, errno)); - - return (send_result(newfd, IPMP_SUCCESS, 0)); + return (send_result(newfd, retval, 0)); case MI_QUERY: return (process_query(newfd, &mpi->mi_qcmd)); @@ -2806,6 +2573,8 @@ process_cmd(int newfd, union mi_commands *mpi) static unsigned int process_query(int fd, mi_query_t *miq) { + ipmp_addrinfo_t *adinfop; + ipmp_addrinfolist_t *adlp; ipmp_groupinfo_t *grinfop; ipmp_groupinfolist_t *grlp; ipmp_grouplist_t *grlistp; @@ -2815,6 +2584,19 @@ process_query(int fd, mi_query_t *miq) unsigned int retval; switch (miq->miq_inforeq) { + case IPMP_ADDRINFO: + retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr, + &adinfop); + if (retval != IPMP_SUCCESS) + return (send_result(fd, retval, errno)); + + retval = send_result(fd, IPMP_SUCCESS, 0); + if (retval == IPMP_SUCCESS) + retval = send_addrinfo(fd, adinfop); + + ipmp_freeaddrinfo(adinfop); + return (retval); + case IPMP_GROUPLIST: retval = getgrouplist(&grlistp); if (retval != IPMP_SUCCESS) @@ -2829,7 +2611,7 @@ process_query(int fd, mi_query_t *miq) case IPMP_GROUPINFO: miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; - retval = getgroupinfo(miq->miq_ifname, &grinfop); + retval = getgroupinfo(miq->miq_grname, &grinfop); if (retval != IPMP_SUCCESS) return (send_result(fd, retval, errno)); @@ -2854,6 +2636,11 @@ process_query(int fd, mi_query_t *miq) return (retval); case IPMP_SNAP: + /* + * Before taking the snapshot, sync with the kernel. 
+ */ + initifs(); + retval = getsnap(&snap); if (retval != IPMP_SUCCESS) return (send_result(fd, retval, errno)); @@ -2883,6 +2670,13 @@ process_query(int fd, mi_query_t *miq) if (retval != IPMP_SUCCESS) goto out; } + + adlp = snap->sn_adinfolistp; + for (; adlp != NULL; adlp = adlp->adl_next) { + retval = send_addrinfo(fd, adlp->adl_adinfop); + if (retval != IPMP_SUCCESS) + goto out; + } out: ipmp_snap_free(snap); return (retval); @@ -2902,14 +2696,20 @@ static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) { ipmp_iflist_t *iflistp = grinfop->gr_iflistp; + ipmp_addrlist_t *adlistp = grinfop->gr_adlistp; unsigned int retval; retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop); if (retval != IPMP_SUCCESS) return (retval); - return (ipmp_writetlv(fd, IPMP_IFLIST, - IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp)); + retval = ipmp_writetlv(fd, IPMP_IFLIST, + IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp); + if (retval != IPMP_SUCCESS) + return (retval); + + return (ipmp_writetlv(fd, IPMP_ADDRLIST, + IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp)); } /* @@ -2919,7 +2719,31 @@ send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) { - return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop)); + ipmp_addrlist_t *adlist4p = ifinfop->if_targinfo4.it_targlistp; + ipmp_addrlist_t *adlist6p = ifinfop->if_targinfo6.it_targlistp; + unsigned int retval; + + retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop); + if (retval != IPMP_SUCCESS) + return (retval); + + retval = ipmp_writetlv(fd, IPMP_ADDRLIST, + IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p); + if (retval != IPMP_SUCCESS) + return (retval); + + return (ipmp_writetlv(fd, IPMP_ADDRLIST, + IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p)); +} + +/* + * Send the address information pointed to by `adinfop' on file descriptor + * `fd'. Returns an IPMP error code. 
+ */ +static unsigned int +send_addrinfo(int fd, ipmp_addrinfo_t *adinfop) +{ + return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop)); } /* @@ -3109,3 +2933,32 @@ close_probe_socket(struct phyint_instance *pii, boolean_t polled) pii->pii_probe_sock = -1; pii->pii_basetime_inited = 0; } + +boolean_t +addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags, + struct sockaddr_storage *ssp) +{ + addrlist_t *addrp; + + if ((addrp = malloc(sizeof (addrlist_t))) == NULL) + return (_B_FALSE); + + (void) strlcpy(addrp->al_name, name, LIFNAMSIZ); + addrp->al_flags = flags; + addrp->al_addr = *ssp; + addrp->al_next = *addrsp; + *addrsp = addrp; + return (_B_TRUE); +} + +void +addrlist_free(addrlist_t **addrsp) +{ + addrlist_t *addrp, *next_addrp; + + for (addrp = *addrsp; addrp != NULL; addrp = next_addrp) { + next_addrp = addrp->al_next; + free(addrp); + } + *addrsp = NULL; +} diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c index a2ff76a983..cf327fbaff 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -20,8 +20,6 @@ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mpd_defs.h" #include "mpd_tables.h" @@ -45,7 +43,7 @@ struct pr_icmp uint16_t pr_icmp_cksum; /* checksum field */ uint16_t pr_icmp_id; /* Identification */ uint16_t pr_icmp_seq; /* sequence number */ - uint32_t pr_icmp_timestamp; /* Time stamp */ + uint64_t pr_icmp_timestamp; /* Time stamp (in ns) */ uint32_t pr_icmp_mtype; /* Message type */ }; @@ -58,11 +56,12 @@ static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } }; static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */ -static void *find_ancillary(struct msghdr *msg, int cmsg_type); -static void pi_set_crtt(struct target *tg, int m, +static void *find_ancillary(struct msghdr *msg, int cmsg_level, + int cmsg_type); +static void pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni); static void incoming_echo_reply(struct phyint_instance *pii, - struct pr_icmp *reply, struct in6_addr fromaddr); + struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp); static void incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, struct in6_addr fromaddr); static void incoming_mcast_reply(struct phyint_instance *pii, @@ -78,13 +77,11 @@ static void probe_success_info(struct phyint_instance *pii, struct target *cur_tg, struct probe_success_count *psinfo); static boolean_t phyint_repaired(struct phyint *pi); -static int failover(struct phyint *from, struct phyint *to); -static int failback(struct phyint *from, struct phyint *to); -static struct phyint *get_failover_dst(struct phyint *pi, int failover_type); - static boolean_t highest_ack_tg(uint16_t seq, struct target *tg); static int in_cksum(ushort_t *addr, int len); static void reset_snxt_basetimes(void); +static int ns2ms(int64_t ns); +static int64_t tv2ns(struct timeval *); /* * CRTT - Conservative Round Trip Time Estimate @@ -104,7 +101,7 @@ static void reset_snxt_basetimes(void); * Phyint state diagram * * The state of a phyint that is 
capable of being probed, is completely - * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>. + * specified by the 3-tuple <pi_state, pg_state, I>. * * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state * of the link (according to the driver). If the phyint is also configured @@ -117,8 +114,8 @@ static void reset_snxt_basetimes(void); * state, which indicates that the link is apparently functional but that * in.mpathd is unable to send probes to verify functionality (in this case, * in.mpathd makes the optimistic assumption that the interface is working - * correctly and thus does not perform a failover, but reports the interface - * as IPMP_IF_UNKNOWN through the async events and query interfaces). + * correctly and thus does not mark the interface FAILED, but reports it as + * IPMP_IF_UNKNOWN through the async events and query interfaces). * * At any point, a phyint may be administratively marked offline via if_mpadm. * In this case, the interface always transitions to PI_OFFLINE, regardless @@ -131,8 +128,11 @@ static void reset_snxt_basetimes(void); * PI_RUNNING: The failure detection logic says the phyint is good. * PI_FAILED: The failure detection logic says the phyint has failed. * - * pg_groupfailed - Group failure, all interfaces in the group have failed. - * The pi_state may be either PI_FAILED or PI_NOTARGETS. + * pg_state - PG_OK, PG_DEGRADED, or PG_FAILED. + * PG_OK: All interfaces in the group are OK. + * PG_DEGRADED: Some interfaces in the group are unusable. + * PG_FAILED: All interfaces in the group are unusable. + * * In the case of router targets, we assume that the current list of * targets obtained from the routing table, is still valid, so the * phyint state is PI_FAILED. In the case of host targets, we delete the @@ -140,144 +140,46 @@ static void reset_snxt_basetimes(void); * target list. So the phyints are in the PI_NOTARGETS state. 
* * I - value of (pi_flags & IFF_INACTIVE) - * IFF_INACTIVE: No failovers have been done to this phyint, from - * other phyints. This phyint is inactive. Phyint can be a Standby. - * When failback has been disabled (FAILOVER=no configured), - * phyint can also be a non-STANDBY. In this case IFF_INACTIVE - * is set when phyint subsequently recovers after a failure. - * - * pi_empty - * This phyint has failed over successfully to another phyint, and - * this phyint is currently "empty". It does not host any addresses or - * multicast membership etc. This is the state of a phyint after a - * failover from the phyint has completed successfully and no subsequent - * 'failover to' or 'failback to' has occurred on the phyint. - * IP guarantees that no new logicals will be hosted nor any multicast - * joins permitted on the phyint, since the phyint is either failed or - * inactive. pi_empty is set implies the phyint is either failed or - * inactive. - * - * pi_full - * The phyint hosts all of its own addresses that it "owns". If the - * phyint was previously failed or inactive, failbacks to the phyint - * has completed successfully. i.e. No more failbacks to this phyint - * can produce any change in system state whatsoever. - * - * Not all 32 possible combinations of the above 5-tuple are possible. - * Furthermore some of the above combinations are transient. They may occur - * only because the failover or failback did not complete successfully. The - * failover/failback will be retried and eventually a stable state will be - * reached. - * - * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd. - * The following are the state machines. 
'from' and 'to' are the src and - * dst of the failover/failback, below - * - * pi_empty state machine - * --------------------------------------------------------------------------- - * Event State -> New State - * --------------------------------------------------------------------------- - * successful completion from.pi_empty = 0 -> from.pi_empty = 1 - * of failover + * IFF_INACTIVE: This phyint will not send or receive packets. + * Usually, inactive is tied to standby interfaces that are not yet + * needed (e.g., no non-standby interfaces in the group have failed). + * When failback has been disabled (FAILBACK=no configured), phyint can + * also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint + * subsequently recovers after a failure. * - * Initiate failover to.pi_empty = X -> to.pi_empty = 0 + * Not all 9 possible combinations of the above 3-tuple are possible. * - * Initiate failback to.pi_empty = X -> to.pi_empty = 0 - * - * group failure pi_empty = X -> pi_empty = 0 - * --------------------------------------------------------------------------- - * - * pi_full state machine - * --------------------------------------------------------------------------- - * Event State -> New State - * --------------------------------------------------------------------------- - * successful completion to.pi_full = 0 -> to.pi_full = 1 - * of failback from - * each of the other phyints - * - * Initiate failover from.pi_full = X -> from.pi_full = 0 - * - * group failure pi_full = X -> pi_full = 0 - * --------------------------------------------------------------------------- + * I is tracked by IP. pi_state is tracked by mpathd. 
* * pi_state state machine * --------------------------------------------------------------------------- * Event State New State * Action: * --------------------------------------------------------------------------- - * NIC failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) + * IP interface failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) * detection : set IFF_FAILED on this phyint - * : failover from this phyint to another * - * NIC failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) + * IP interface failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) * detection : set IFF_FAILED on this phyint * - * NIC repair (PI_FAILED, I == 0, FAILBACK=yes) + * IP interface repair (PI_FAILED, I == 0, FAILBACK=yes) * detection -> (PI_RUNNING, I == 0) - * : to.pi_empty = 0 * : clear IFF_FAILED on this phyint - * : failback to this phyint if enabled * - * NIC repair (PI_FAILED, I == 0, FAILBACK=no) + * IP interface repair (PI_FAILED, I == 0, FAILBACK=no) * detection -> (PI_RUNNING, I == 1) - * : to.pi_empty = 0 * : clear IFF_FAILED on this phyint * : if failback is disabled set I == 1 * * Group failure (perform on all phyints in the group) * detection PI_RUNNING PI_FAILED * (Router targets) : set IFF_FAILED - * : clear pi_empty and pi_full * * Group failure (perform on all phyints in the group) * detection PI_RUNNING PI_NOTARGETS * (Host targets) : set IFF_FAILED - * : clear pi_empty and pi_full * : delete the target list on all phyints * --------------------------------------------------------------------------- - * - * I state machine - * --------------------------------------------------------------------------- - * Event State Action: - * --------------------------------------------------------------------------- - * Turn on I pi_empty == 0, STANDBY : failover from standby - * - * Turn off I PI_RUNNING, STANDBY : pi_empty = 0 - * pi_full == 0 : failback to this if enabled - * --------------------------------------------------------------------------- - * - * Assertions: 
(Read '==>' as implies) - * - * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED) - * (pi_empty == 1) ==> (pi_full == 0) - * (pi_full == 1) ==> (pi_empty == 0) - * - * Invariants - * - * pg_groupfailed = 0 && - * 1. (I == 1, pi_empty == 0) ==> initiate failover from standby - * 2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint - * 3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint - * - * 1. says that an inactive standby, that is not empty, has to be failed - * over. For a standby to be truly inactive, it should not host any - * addresses. So we move them to some other phyint. Usually we catch the - * turn on of IFF_INACTIVE, and perform this action. However if the failover - * did not complete successfully, then subsequently we have lost the edge - * trigger, and this invariant kicks in and completes the action. - * - * 2. says that any failed phyint that is not empty must be failed over. - * Usually we do the failover when we detect NIC failure. However if the - * failover does not complete successfully, this invariant kicks in and - * completes the failover. We exclude inactive standby which is covered by 1. - * - * 3. says that any running phyint that is not full must be failed back. - * Usually we do the failback when we detect NIC repair. However if the - * failback does not complete successfully, this invariant kicks in and - * completes the failback. Note that we don't want to failback to an inactive - * standby. - * - * The invariants 1 - 3 and the actions are in initifs(). */ struct probes_missed probes_missed; @@ -295,7 +197,7 @@ struct probes_missed probes_missed; * not less than the current CRTT. pii_probes[] stores data * about these probes. These packets consume sequence number space. * - * PROBE_RTT: This type is used to make only rtt measurments. Normally these + * PROBE_RTT: This type is used to make only rtt measurements. Normally these * are not used. 
Under heavy network load, the rtt may go up very high, * due to a spike, or may appear to go high, due to extreme scheduling * delays. Once the network stress is removed, mpathd takes long time to @@ -310,17 +212,19 @@ struct probes_missed probes_missed; * no targets are known. The packet is multicast to the all hosts addr. */ static void -probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) +probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime) { + hrtime_t sent_hrtime; + struct timeval sent_tv; struct pr_icmp probe_pkt; /* Probe packet */ - struct sockaddr_in6 whereto6; /* target address IPv6 */ - struct sockaddr_in whereto; /* target address IPv4 */ + struct sockaddr_storage targ; /* target address */ + uint_t targaddrlen; /* target address length */ int pr_ndx; /* probe index in pii->pii_probes[] */ boolean_t sent = _B_TRUE; if (debug & D_TARGET) { - logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af), - pii->pii_name, probe_type, cur_time); + logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af), + pii->pii_name, probe_type, start_hrtime); } assert(pii->pii_probe_sock != -1); @@ -339,7 +243,7 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) * network byte order at initialization itself. 
*/ probe_pkt.pr_icmp_id = pii->pii_icmpid; - probe_pkt.pr_icmp_timestamp = htonl(cur_time); + probe_pkt.pr_icmp_timestamp = htonll(start_hrtime); probe_pkt.pr_icmp_mtype = htonl(probe_type); /* @@ -349,38 +253,34 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && pii->pii_rtt_target_next != NULL)); + bzero(&targ, sizeof (targ)); + targ.ss_family = pii->pii_af; + if (pii->pii_af == AF_INET6) { - bzero(&whereto6, sizeof (whereto6)); - whereto6.sin6_family = AF_INET6; + struct in6_addr *addr6; + + addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr; + targaddrlen = sizeof (struct sockaddr_in6); if (probe_type == PROBE_MULTI) { - whereto6.sin6_addr = all_nodes_mcast_v6; + *addr6 = all_nodes_mcast_v6; } else if (probe_type == PROBE_UNI) { - whereto6.sin6_addr = pii->pii_target_next->tg_address; - } else { - /* type is PROBE_RTT */ - whereto6.sin6_addr = - pii->pii_rtt_target_next->tg_address; - } - if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, - sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6, - sizeof (whereto6)) != sizeof (probe_pkt)) { - logperror_pii(pii, "probe: probe sendto"); - sent = _B_FALSE; + *addr6 = pii->pii_target_next->tg_address; + } else { /* type is PROBE_RTT */ + *addr6 = pii->pii_rtt_target_next->tg_address; } } else { - bzero(&whereto, sizeof (whereto)); - whereto.sin_family = AF_INET; + struct in_addr *addr4; + + addr4 = &((struct sockaddr_in *)&targ)->sin_addr; + targaddrlen = sizeof (struct sockaddr_in); if (probe_type == PROBE_MULTI) { - whereto.sin_addr = all_nodes_mcast_v4; + *addr4 = all_nodes_mcast_v4; } else if (probe_type == PROBE_UNI) { IN6_V4MAPPED_TO_INADDR( - &pii->pii_target_next->tg_address, - &whereto.sin_addr); - } else { - /* type is PROBE_RTT */ + &pii->pii_target_next->tg_address, addr4); + } else { /* type is PROBE_RTT */ IN6_V4MAPPED_TO_INADDR( - &pii->pii_rtt_target_next->tg_address, - &whereto.sin_addr); + 
&pii->pii_rtt_target_next->tg_address, addr4); } /* @@ -388,12 +288,18 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) */ probe_pkt.pr_icmp_cksum = in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); - if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, - sizeof (probe_pkt), 0, (struct sockaddr *)&whereto, - sizeof (whereto)) != sizeof (probe_pkt)) { - logperror_pii(pii, "probe: probe sendto"); - sent = _B_FALSE; - } + } + + /* + * Use the current time as the time we sent. Not atomic, but the best + * we can do from here. + */ + sent_hrtime = gethrtime(); + (void) gettimeofday(&sent_tv, NULL); + if (sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0, + (struct sockaddr *)&targ, targaddrlen) != sizeof (probe_pkt)) { + logperror_pii(pii, "probe: probe sendto"); + sent = _B_FALSE; } /* @@ -415,9 +321,13 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) pii->pii_cum_stats.acked++; pii->pii_cum_stats.sent++; - pii->pii_probes[pr_ndx].pr_status = PR_UNACKED; + pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt; + pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv; + pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime; + pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime; pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; - pii->pii_probes[pr_ndx].pr_time_sent = cur_time; + probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED); + pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); pii->pii_target_next = target_next(pii->pii_target_next); assert(pii->pii_target_next != NULL); @@ -448,33 +358,42 @@ in_data(struct phyint_instance *pii) { struct sockaddr_in from; struct in6_addr fromaddr; - uint_t fromlen; - static uint_t in_packet[(IP_MAXPACKET + 1)/4]; + static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; + static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; struct ip *ip; int iphlen; int len; char abuf[INET_ADDRSTRLEN]; - struct pr_icmp *reply; + struct msghdr msg; + struct iovec iov; + struct 
pr_icmp *reply; + struct timeval *recv_tvp; if (debug & D_PROBE) { logdebug("in_data(%s %s)\n", AF_STR(pii->pii_af), pii->pii_name); } + iov.iov_base = (char *)in_packet; + iov.iov_len = sizeof (in_packet); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_name = (struct sockaddr *)&from; + msg.msg_namelen = sizeof (from); + msg.msg_control = ancillary_data; + msg.msg_controllen = sizeof (ancillary_data); + /* * Poll has already told us that a message is waiting, * on this socket. Read it now. We should not block. */ - fromlen = sizeof (from); - len = recvfrom(pii->pii_probe_sock, (char *)in_packet, - sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen); - if (len < 0) { - logperror_pii(pii, "in_data: recvfrom"); + if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { + logperror_pii(pii, "in_data: recvmsg"); return; } /* - * If the NIC has indicated the link is down, don't go + * If the datalink has indicated the link is down, don't go * any further. */ if (LINK_DOWN(pii->pii_phyint)) @@ -483,6 +402,15 @@ in_data(struct phyint_instance *pii) /* Get the printable address for error reporting */ (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); + /* Ignore packets > 64k or control buffers that don't fit */ + if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { + if (debug & D_PKTBAD) { + logdebug("Truncated message: msg_flags 0x%x from %s\n", + msg.msg_flags, abuf); + } + return; + } + /* Make sure packet contains at least minimum ICMP header */ ip = (struct ip *)in_packet; iphlen = ip->ip_hl << 2; @@ -528,10 +456,17 @@ in_data(struct phyint_instance *pii) return; } + recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); + if (recv_tvp == NULL) { + logtrace("message without timestamp from %s on %s\n", + abuf, pii->pii_name); + return; + } + IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) /* Unicast probe reply */ - incoming_echo_reply(pii, reply, fromaddr); + incoming_echo_reply(pii, reply, 
fromaddr, recv_tvp); else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { /* Multicast reply */ incoming_mcast_reply(pii, reply, fromaddr); @@ -543,7 +478,6 @@ in_data(struct phyint_instance *pii) reply->pr_icmp_mtype, abuf, pii->pii_name); return; } - } /* @@ -559,8 +493,9 @@ in6_data(struct phyint_instance *pii) char abuf[INET6_ADDRSTRLEN]; struct msghdr msg; struct iovec iov; - uchar_t *opt; + void *opt; struct pr_icmp *reply; + struct timeval *recv_tvp; if (debug & D_PROBE) { logdebug("in6_data(%s %s)\n", @@ -577,12 +512,12 @@ in6_data(struct phyint_instance *pii) msg.msg_controllen = sizeof (ancillary_data); if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { - logperror_pii(pii, "in6_data: recvfrom"); + logperror_pii(pii, "in6_data: recvmsg"); return; } /* - * If the NIC has indicated that the link is down, don't go + * If the datalink has indicated that the link is down, don't go * any further. */ if (LINK_DOWN(pii->pii_phyint)) @@ -623,13 +558,14 @@ in6_data(struct phyint_instance *pii) "%s on %s\n", abuf, pii->pii_name); return; } - opt = find_ancillary(&msg, IPV6_RTHDR); + opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR); if (opt != NULL) { /* Can't allow routing headers in probe replies */ logtrace("message with routing header from %s on %s\n", abuf, pii->pii_name); return; } + if (reply->pr_icmp_code != 0) { logtrace("probe reply code: %d from %s on %s\n", reply->pr_icmp_code, abuf, pii->pii_name); @@ -640,8 +576,16 @@ in6_data(struct phyint_instance *pii) len, abuf, pii->pii_name); return; } + + recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); + if (recv_tvp == NULL) { + logtrace("message without timestamp from %s on %s\n", + abuf, pii->pii_name); + return; + } + if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { - incoming_echo_reply(pii, reply, from.sin6_addr); + incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp); } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { incoming_mcast_reply(pii, reply, from.sin6_addr); } 
else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { @@ -663,11 +607,9 @@ static void incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, struct in6_addr fromaddr) { - int m; /* rtt measurment in ms */ - uint32_t cur_time; /* in ms from some arbitrary point */ + int64_t m; /* rtt measurement in ns */ char abuf[INET6_ADDRSTRLEN]; struct target *target; - uint32_t pr_icmp_timestamp; struct phyint_group *pg; /* Get the printable address for error reporting */ @@ -683,10 +625,7 @@ incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, if (target == NULL) return; - pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); - cur_time = getcurrenttime(); - m = (int)(cur_time - pr_icmp_timestamp); - + m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp)); /* Invalid rtt. It has wrapped around */ if (m < 0) return; @@ -754,29 +693,30 @@ incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, */ static void incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, - struct in6_addr fromaddr) + struct in6_addr fromaddr, struct timeval *recv_tvp) { - int m; /* rtt measurment in ms */ - uint32_t cur_time; /* in ms from some arbitrary point */ + int64_t m; /* rtt measurement in ns */ + hrtime_t cur_hrtime; /* in ns from some arbitrary point */ char abuf[INET6_ADDRSTRLEN]; int pr_ndx; struct target *target; boolean_t exception; - uint32_t pr_icmp_timestamp; + uint64_t pr_icmp_timestamp; uint16_t pr_icmp_seq; + struct probe_stats *pr_statp; struct phyint_group *pg = pii->pii_phyint->pi_group; /* Get the printable address for error reporting */ (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); if (debug & D_PROBE) { - logdebug("incoming_echo_reply: %s %s %s seq %u\n", + logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n", AF_STR(pii->pii_af), pii->pii_name, abuf, - ntohs(reply->pr_icmp_seq)); + ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp)); } - pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 
- pr_icmp_seq = ntohs(reply->pr_icmp_seq); + pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp); + pr_icmp_seq = ntohs(reply->pr_icmp_seq); /* Reject out of window probe replies */ if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) || @@ -786,15 +726,16 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, pii->pii_cum_stats.unknown++; return; } - cur_time = getcurrenttime(); - m = (int)(cur_time - pr_icmp_timestamp); + + cur_hrtime = gethrtime(); + m = (int64_t)(cur_hrtime - pr_icmp_timestamp); if (m < 0) { /* * This is a ridiculously high value of rtt. rtt has wrapped * around. Log a message, and ignore the rtt. */ - logerr("incoming_echo_reply: rtt wraparound cur_time %u reply " - "timestamp %u\n", cur_time, pr_icmp_timestamp); + logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld " + "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp); } /* @@ -868,10 +809,10 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, * debugger, or the system was hung or too busy for a * substantial time that we didn't get a chance to run. */ - if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) { + if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) { /* - * If the probe corresponding to this receieved response - * was truly sent 'm' ms. ago, then this response must + * If the probe corresponding to this received response + * was truly sent 'm' ns. ago, then this response must * have been rejected by the sequence number checks. The * fact that it has passed the sequence number checks * means that the measured rtt is wrong. 
We were probably @@ -947,7 +888,7 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, * adjusts pii->pii_target_next */ target_delete(target); - probe(pii, PROBE_MULTI, cur_time); + probe(pii, PROBE_MULTI, cur_hrtime); } } else { /* @@ -999,8 +940,12 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, } } out: - pii->pii_probes[pr_ndx].pr_status = PR_ACKED; - pii->pii_probes[pr_ndx].pr_time_acked = cur_time; + pr_statp = &pii->pii_probes[pr_ndx]; + pr_statp->pr_hrtime_ackproc = cur_hrtime; + pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent + + (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent)); + + probe_chstate(pr_statp, pii, PR_ACKED); /* * Update pii->pii_rack, i.e. the sequence number of the last received @@ -1240,13 +1185,13 @@ incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, * * New scaled average and deviation are passed back via sap and svp */ -static int -compute_crtt(int *sap, int *svp, int m) +static int64_t +compute_crtt(int64_t *sap, int64_t *svp, int64_t m) { - int sa = *sap; - int sv = *svp; - int crtt; - int saved_m = m; + int64_t sa = *sap; + int64_t sv = *svp; + int64_t crtt; + int64_t saved_m = m; assert(*sap >= -1); assert(*svp >= 0); @@ -1285,8 +1230,8 @@ compute_crtt(int *sap, int *svp, int m) crtt = (sa >> 3) + sv; if (debug & D_PROBE) { - logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = " - "%d\n", saved_m, sa, sv, crtt); + logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> " + "crtt = %lld\n", saved_m, sa, sv, crtt); } *sap = sa; @@ -1300,22 +1245,22 @@ compute_crtt(int *sap, int *svp, int m) } static void -pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) +pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni) { struct phyint_instance *pii = tg->tg_phyint_inst; int probe_interval = pii->pii_phyint->pi_group->pg_probeint; - int sa = tg->tg_rtt_sa; - int sv = tg->tg_rtt_sd; + int64_t sa = tg->tg_rtt_sa; + int64_t sv = tg->tg_rtt_sd; 
int new_crtt; int i; if (debug & D_PROBE) - logdebug("pi_set_crtt: target - m %d\n", m); + logdebug("pi_set_crtt: target - m %lld\n", m); /* store the round trip time, in case we need to defer computation */ tg->tg_deferred[tg->tg_num_deferred] = m; - new_crtt = compute_crtt(&sa, &sv, m); + new_crtt = ns2ms(compute_crtt(&sa, &sv, m)); /* * If this probe's round trip time would singlehandedly cause an @@ -1342,8 +1287,8 @@ pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) } for (i = 0; i <= tg->tg_num_deferred; i++) { - tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa, - &tg->tg_rtt_sd, tg->tg_deferred[i]); + tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa, + &tg->tg_rtt_sd, tg->tg_deferred[i])); } tg->tg_num_deferred = 0; @@ -1373,13 +1318,13 @@ pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) * If not found return NULL. */ static void * -find_ancillary(struct msghdr *msg, int cmsg_type) +find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type) { struct cmsghdr *cmsg; for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { - if (cmsg->cmsg_level == IPPROTO_IPV6 && + if (cmsg->cmsg_level == cmsg_level && cmsg->cmsg_type == cmsg_type) { return (CMSG_DATA(cmsg)); } @@ -1388,107 +1333,194 @@ find_ancillary(struct msghdr *msg, int cmsg_type) } /* - * See if a previously failed interface has started working again. + * Try to activate another INACTIVE interface in the same group as `pi'. + * Prefer STANDBY INACTIVE to just INACTIVE. 
*/ void -phyint_check_for_repair(struct phyint *pi) +phyint_activate_another(struct phyint *pi) { - if (phyint_repaired(pi)) { - if (pi->pi_group == phyint_anongroup) { - logerr("NIC repair detected on %s\n", pi->pi_name); - } else { - logerr("NIC repair detected on %s of group %s\n", - pi->pi_name, pi->pi_group->pg_name); - } + struct phyint *pi2; + struct phyint *inactivepi = NULL; - /* - * If the interface is offline, just clear the FAILED flag, - * delaying the state change and failback operation until it - * is brought back online. - */ - if (pi->pi_state == PI_OFFLINE) { - (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); - return; - } + if (pi->pi_group == phyint_anongroup) + return; - if (pi->pi_flags & IFF_STANDBY) { - (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); - } else { - if (try_failback(pi) != IPMP_FAILURE) { - (void) change_lif_flags(pi, - IFF_FAILED, _B_FALSE); - /* Per state diagram */ - pi->pi_empty = 0; + for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { + if (pi == pi2 || pi2->pi_state != PI_RUNNING || + !(pi2->pi_flags & IFF_INACTIVE)) + continue; + + inactivepi = pi2; + if (pi2->pi_flags & IFF_STANDBY) + break; + } + + if (inactivepi != NULL) + (void) change_pif_flags(inactivepi, 0, IFF_INACTIVE); +} + +/* + * Transition a phyint back to PI_RUNNING (from PI_FAILED or PI_OFFLINE). The + * caller must ensure that the transition is appropriate. Clears IFF_OFFLINE + * or IFF_FAILED, as appropriate. Also sets IFF_INACTIVE on this or other + * interfaces as appropriate (see comment below). Finally, also updates the + * phyint's group state to account for the change. + */ +void +phyint_transition_to_running(struct phyint *pi) +{ + struct phyint *pi2; + struct phyint *actstandbypi = NULL; + uint_t nactive = 0, nnonstandby = 0; + boolean_t onlining = (pi->pi_state == PI_OFFLINE); + uint64_t set, clear; + + /* + * The interface is running again, but should it or another interface + * in the group end up INACTIVE? 
There are three cases: + * + * 1. If it's a STANDBY interface, it should end up INACTIVE if + * the group is operating at capacity (i.e., there are at least as + * many active interfaces as non-STANDBY interfaces in the group). + * No other interfaces should be changed. + * + * 2. If it's a non-STANDBY interface and we're onlining it or + * FAILBACK is enabled, then it should *not* end up INACTIVE. + * Further, if the group is above capacity as a result of this + * interface, then an active STANDBY interface in the group should + * end up INACTIVE. + * + * 3. If it's a non-STANDBY interface, we're repairing it, and + * FAILBACK is disabled, then it should end up INACTIVE *unless* + * the group was failed (in which case we have no choice but to + * use it). No other interfaces should be changed. + */ + if (pi->pi_group != phyint_anongroup) { + pi2 = pi->pi_group->pg_phyint; + for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { + if (!(pi2->pi_flags & IFF_STANDBY)) + nnonstandby++; + + if (pi2->pi_state == PI_RUNNING) { + if (!(pi2->pi_flags & IFF_INACTIVE)) { + nactive++; + if (pi2->pi_flags & IFF_STANDBY) + actstandbypi = pi2; + } } } + } - phyint_chstate(pi, PI_RUNNING); + set = 0; + clear = (onlining ? IFF_OFFLINE : IFF_FAILED); - if (GROUP_FAILED(pi->pi_group)) { - /* - * This is the 1st phyint to receive a response - * after group failure. 
- */ - logerr("At least 1 interface (%s) of group %s has " - "repaired\n", pi->pi_name, pi->pi_group->pg_name); - phyint_group_chstate(pi->pi_group, PG_RUNNING); - } + if (pi->pi_flags & IFF_STANDBY) { /* case 1 */ + if (nactive >= nnonstandby) + set |= IFF_INACTIVE; + else + clear |= IFF_INACTIVE; + } else if (onlining || failback_enabled) { /* case 2 */ + if (nactive >= nnonstandby && actstandbypi != NULL) + (void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0); + } else if (!GROUP_FAILED(pi->pi_group)) { /* case 3 */ + set |= IFF_INACTIVE; + } + (void) change_pif_flags(pi, set, clear); + + phyint_chstate(pi, PI_RUNNING); + + /* + * Update the group state to account for the change. + */ + phyint_group_refresh_state(pi->pi_group); +} + +/* + * See if a previously failed interface has started working again. + */ +void +phyint_check_for_repair(struct phyint *pi) +{ + if (!phyint_repaired(pi)) + return; + + if (pi->pi_group == phyint_anongroup) { + logerr("IP interface repair detected on %s\n", pi->pi_name); + } else { + logerr("IP interface repair detected on %s of group %s\n", + pi->pi_name, pi->pi_group->pg_name); } + + /* + * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet. + * So just clear IFF_FAILED and defer phyint_transition_to_running() + * until it is brought back online. + */ + if (pi->pi_state == PI_OFFLINE) { + (void) change_pif_flags(pi, 0, IFF_FAILED); + return; + } + + phyint_transition_to_running(pi); /* calls phyint_chstate() */ } /* - * See if a previously functioning interface has failed, or if the - * whole group of interfaces has failed. + * See if an interface has failed, or if the whole group of interfaces has + * failed. 
*/ static void phyint_inst_check_for_failure(struct phyint_instance *pii) { - struct phyint *pi; - struct phyint *pi2; - - pi = pii->pii_phyint; + struct phyint *pi = pii->pii_phyint; + struct phyint *pi2; + boolean_t was_active; switch (failure_state(pii)) { case PHYINT_FAILURE: - (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); + was_active = ((pi->pi_flags & IFF_INACTIVE) == 0); + + (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE); if (pi->pi_group == phyint_anongroup) { - logerr("NIC failure detected on %s\n", pii->pii_name); + logerr("IP interface failure detected on %s\n", + pii->pii_name); } else { - logerr("NIC failure detected on %s of group %s\n", - pii->pii_name, pi->pi_group->pg_name); + logerr("IP interface failure detected on %s of group" + " %s\n", pii->pii_name, pi->pi_group->pg_name); } + /* - * Do the failover, unless the interface is offline (in - * which case we've already failed over). + * If the interface is offline, the state change will be + * noted when it comes back online. */ if (pi->pi_state != PI_OFFLINE) { + /* + * If the failed interface was active, activate + * another INACTIVE interface in the group if + * possible. (If the interface is PI_OFFLINE, + * we already activated another.) 
+ */ + if (was_active) + phyint_activate_another(pi); + phyint_chstate(pi, PI_FAILED); reset_crtt_all(pi); - if (!(pi->pi_flags & IFF_INACTIVE)) - (void) try_failover(pi, FAILOVER_NORMAL); } break; case GROUP_FAILURE: - logerr("All Interfaces in group %s have failed\n", - pi->pi_group->pg_name); - for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; - pi2 = pi2->pi_pgnext) { - if (pi2->pi_flags & IFF_OFFLINE) + pi2 = pi->pi_group->pg_phyint; + for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { + (void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE); + if (pi2->pi_state == PI_OFFLINE) /* see comment above */ continue; - (void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE); - reset_crtt_all(pi2); + reset_crtt_all(pi2); /* - * In the case of host targets, we - * would have flushed the targets, - * and gone to PI_NOTARGETS state. + * In the case of host targets, we would have flushed + * the targets, and gone to PI_NOTARGETS state. */ if (pi2->pi_state == PI_RUNNING) phyint_chstate(pi2, PI_FAILED); - - pi2->pi_empty = 0; - pi2->pi_full = 0; } break; @@ -1519,7 +1551,8 @@ phyint_inst_timer(struct phyint_instance *pii) hrtime_t cur_hrtime; int probe_interval = pii->pii_phyint->pi_group->pg_probeint; - cur_time = getcurrenttime(); + cur_hrtime = gethrtime(); + cur_time = ns2ms(cur_hrtime); if (debug & D_TIMER) { logdebug("phyint_inst_timer(%s %s)\n", @@ -1621,7 +1654,7 @@ phyint_inst_timer(struct phyint_instance *pii) * the failure detection (fd) probe timer has not yet fired. * Need to send only an rtt probe. The probe type is PROBE_RTT. */ - probe(pii, PROBE_RTT, cur_time); + probe(pii, PROBE_RTT, cur_hrtime); return (interval); } /* @@ -1651,7 +1684,7 @@ phyint_inst_timer(struct phyint_instance *pii) * We can have at most, the latest 2 probes that we sent, in * the PR_UNACKED state. All previous probes sent, are either * PR_LOST or PR_ACKED. An unacknowledged probe is considered - * timed out if the probe's time_sent + the CRTT < currenttime. 
+ * timed out if the probe's time_start + the CRTT < currenttime. * For each of the last 2 probes, examine whether it has timed * out. If so, mark it PR_LOST. The probe stats is a circular array. */ @@ -1686,16 +1719,15 @@ phyint_inst_timer(struct phyint_instance *pii) * not available use group's probe interval, * which is a worst case estimate. */ + timeout = ns2ms(pr_statp->pr_hrtime_start); if (cur_tg->tg_crtt != 0) { - timeout = pr_statp->pr_time_sent + - cur_tg->tg_crtt; + timeout += cur_tg->tg_crtt; } else { - timeout = pr_statp->pr_time_sent + - probe_interval; + timeout += probe_interval; } if (TIME_LT(timeout, cur_time)) { - pr_statp->pr_status = PR_LOST; pr_statp->pr_time_lost = timeout; + probe_chstate(pr_statp, pii, PR_LOST); } else if (i == 1) { /* * We are forced to consider this probe @@ -1711,8 +1743,8 @@ phyint_inst_timer(struct phyint_instance *pii) * when the timer fires, we find 2 valid * unacked probes, and they are yet to timeout */ - pr_statp->pr_status = PR_LOST; pr_statp->pr_time_lost = cur_time; + probe_chstate(pr_statp, pii, PR_LOST); } else { /* * Only the most recent probe can enter @@ -1740,16 +1772,15 @@ phyint_inst_timer(struct phyint_instance *pii) * The timer has fired. Take appropriate action depending * on the current state of the phyint. * - * PI_RUNNING state - Failure detection and failover - * PI_FAILED state - Repair detection and failback + * PI_RUNNING state - Failure detection + * PI_FAILED state - Repair detection */ switch (pii->pii_phyint->pi_state) { case PI_FAILED: /* * If the most recent probe (excluding unacked probes that * are yet to time out) has been acked, check whether the - * phyint is now repaired. If the phyint is repaired, then - * attempt failback, unless it is an inactive standby. + * phyint is now repaired. 
*/ if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { phyint_check_for_repair(pii->pii_phyint); @@ -1760,10 +1791,8 @@ phyint_inst_timer(struct phyint_instance *pii) /* * It's possible our probes have been lost because of a * spanning-tree mandated quiet period on the switch. If so, - * ignore the lost probes and consider the interface to still - * be functioning. + * ignore the lost probes. */ - cur_hrtime = gethrtime(); if (pii->pii_fd_hrtime - cur_hrtime > 0) break; @@ -1771,8 +1800,7 @@ phyint_inst_timer(struct phyint_instance *pii) /* * We have 1 or more failed probes (excluding unacked * probes that are yet to time out). Determine if the - * phyint has failed. If so attempt a failover, - * unless it is an inactive standby + * phyint has failed. */ phyint_inst_check_for_failure(pii); } @@ -1790,16 +1818,16 @@ phyint_inst_timer(struct phyint_instance *pii) * was called, the target list may be empty. */ if (pii->pii_target_next != NULL) { - probe(pii, PROBE_UNI, cur_time); + probe(pii, PROBE_UNI, cur_hrtime); /* * If we have just the one probe target, and we're not using * router targets, try to find another as we presently have * no resilience. */ if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) - probe(pii, PROBE_MULTI, cur_time); + probe(pii, PROBE_MULTI, cur_hrtime); } else { - probe(pii, PROBE_MULTI, cur_time); + probe(pii, PROBE_MULTI, cur_hrtime); } return (interval); } @@ -1859,8 +1887,8 @@ process_link_state_down(struct phyint *pi) /* * Clear the probe statistics arrays, we don't want the repair - * detection logic relying on probes that were succesful prior - * to the link going down. + * detection logic relying on probes that were successful prior + * to the link going down. 
*/ if (PROBE_CAPABLE(pi->pi_v4)) clear_pii_probe_stats(pi->pi_v4); @@ -2016,7 +2044,7 @@ phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) pii->pii_target_next = target_next(cur_tg); } else { target_delete(cur_tg); - probe(pii, PROBE_MULTI, getcurrenttime()); + probe(pii, PROBE_MULTI, gethrtime()); } return (PHYINT_OK); } @@ -2065,13 +2093,13 @@ failure_state(struct phyint_instance *pii) struct probe_success_count psinfo; uint_t pi2_tls; /* time last success */ uint_t pi_tff; /* time first fail */ - struct phyint *pi2; + struct phyint *pi2; struct phyint *pi; struct phyint_instance *pii2; struct phyint_group *pg; - boolean_t alone; + int retval; - if (debug & D_FAILOVER) + if (debug & D_FAILREP) logdebug("phyint_failed(%s)\n", pii->pii_name); pi = pii->pii_phyint; @@ -2082,24 +2110,13 @@ failure_state(struct phyint_instance *pii) return (PHYINT_OK); /* - * At this point, the link is down, or the phyint is suspect, - * as it has lost NUM_PROBE_FAILS or more probes. If the phyint - * does not belong to any group, or is the only member of the - * group capable of being probed, return PHYINT_FAILURE. + * At this point, the link is down, or the phyint is suspect, as it + * has lost NUM_PROBE_FAILS or more probes. If the phyint does not + * belong to any group, this is a PHYINT_FAILURE. Otherwise, continue + * on to determine whether this should be considered a PHYINT_FAILURE + * or GROUP_FAILURE. */ - alone = _B_TRUE; - if (pg != phyint_anongroup) { - for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { - if (pi2 == pi) - continue; - if (PROBE_CAPABLE(pi2->pi_v4) || - PROBE_CAPABLE(pi2->pi_v6)) { - alone = _B_FALSE; - break; - } - } - } - if (alone) + if (pg == phyint_anongroup) return (PHYINT_FAILURE); /* @@ -2116,6 +2133,7 @@ failure_state(struct phyint_instance *pii) * after it was received, so there is no point looking at the tls * of other phyints. 
*/ + retval = GROUP_FAILURE; for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { /* Exclude ourself from comparison */ if (pi2 == pi) @@ -2123,76 +2141,86 @@ failure_state(struct phyint_instance *pii) if (LINK_DOWN(pi)) { /* - * We use FLAGS_TO_LINK_STATE() to test the - * flags directly, rather then LINK_UP() or - * LINK_DOWN(), as we may not have got round - * to processing the link state for the other - * phyints in the group yet. + * We use FLAGS_TO_LINK_STATE() to test the flags + * directly, rather then LINK_UP() or LINK_DOWN(), as + * we may not have got round to processing the link + * state for the other phyints in the group yet. * - * The check for PI_RUNNING and group - * failure handles the case when the - * group begins to recover. The first - * phyint to recover should not trigger - * a failover from the soon-to-recover - * other phyints to the first recovered - * phyint. PI_RUNNING will be set, and - * pg_groupfailed cleared only after - * receipt of NUM_PROBE_REPAIRS, by - * which time the other phyints should - * have received at least 1 packet, - * and so will not have NUM_PROBE_FAILS. + * The check for PI_RUNNING and group failure handles + * the case when the group begins to recover. + * PI_RUNNING will be set, and group failure cleared + * only after receipt of NUM_PROBE_REPAIRS, by which + * time the other phyints should have received at + * least 1 packet, and so will not have NUM_PROBE_FAILS. */ if ((pi2->pi_state == PI_RUNNING) && - !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) - return (PHYINT_FAILURE); - } else { - /* - * Need to compare against both IPv4 and - * IPv6 instances. - */ - pii2 = pi2->pi_v4; - if (pii2 != NULL) { - probe_success_info(pii2, NULL, &psinfo); - if (psinfo.ps_tls_valid) { - pi2_tls = psinfo.ps_tls; - /* - * See comment above regarding check - * for PI_RUNNING and group failure. 
- */ - if (TIME_GT(pi2_tls, pi_tff) && - (pi2->pi_state == PI_RUNNING) && - !GROUP_FAILED(pg) && - FLAGS_TO_LINK_STATE(pi2)) - return (PHYINT_FAILURE); + !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) { + retval = PHYINT_FAILURE; + break; + } + continue; + } + + if (LINK_DOWN(pi2)) + continue; + + /* + * If there's no probe-based failure detection on this + * interface, and its link is still up, then it's still + * working and thus the group has not failed. + */ + if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) { + retval = PHYINT_FAILURE; + break; + } + + /* + * Need to compare against both IPv4 and IPv6 instances. + */ + pii2 = pi2->pi_v4; + if (pii2 != NULL) { + probe_success_info(pii2, NULL, &psinfo); + if (psinfo.ps_tls_valid) { + pi2_tls = psinfo.ps_tls; + /* + * See comment above regarding check + * for PI_RUNNING and group failure. + */ + if (TIME_GT(pi2_tls, pi_tff) && + (pi2->pi_state == PI_RUNNING) && + !GROUP_FAILED(pg) && + FLAGS_TO_LINK_STATE(pi2)) { + retval = PHYINT_FAILURE; + break; } } + } - pii2 = pi2->pi_v6; - if (pii2 != NULL) { - probe_success_info(pii2, NULL, &psinfo); - if (psinfo.ps_tls_valid) { - pi2_tls = psinfo.ps_tls; - /* - * See comment above regarding check - * for PI_RUNNING and group failure. - */ - if (TIME_GT(pi2_tls, pi_tff) && - (pi2->pi_state == PI_RUNNING) && - !GROUP_FAILED(pg) && - FLAGS_TO_LINK_STATE(pi2)) - return (PHYINT_FAILURE); + pii2 = pi2->pi_v6; + if (pii2 != NULL) { + probe_success_info(pii2, NULL, &psinfo); + if (psinfo.ps_tls_valid) { + pi2_tls = psinfo.ps_tls; + /* + * See comment above regarding check + * for PI_RUNNING and group failure. + */ + if (TIME_GT(pi2_tls, pi_tff) && + (pi2->pi_state == PI_RUNNING) && + !GROUP_FAILED(pg) && + FLAGS_TO_LINK_STATE(pi2)) { + retval = PHYINT_FAILURE; + break; } } } } /* - * Change the group state to PG_FAILED if it's not already. + * Update the group state to account for the changes. 
*/ - if (!GROUP_FAILED(pg)) - phyint_group_chstate(pg, PG_FAILED); - - return (GROUP_FAILURE); + phyint_group_refresh_state(pg); + return (retval); } /* @@ -2215,7 +2243,7 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg, uint_t timeout; struct target *tg; - if (debug & D_FAILOVER) + if (debug & D_FAILREP) logdebug("probe_success_info(%s)\n", pii->pii_name); bzero(psinfo, sizeof (*psinfo)); @@ -2248,10 +2276,11 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg, * not available use the value of the group's probe * interval which is a worst case estimate. */ + timeout = ns2ms(pr_statp->pr_hrtime_start); if (tg->tg_crtt != 0) { - timeout = pr_statp->pr_time_sent + tg->tg_crtt; + timeout += tg->tg_crtt; } else { - timeout = pr_statp->pr_time_sent + + timeout += pii->pii_phyint->pi_group->pg_probeint; } @@ -2261,7 +2290,7 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg, * recent consecutive successes. */ pr_statp->pr_time_lost = timeout; - pr_statp->pr_status = PR_LOST; + probe_chstate(pr_statp, pii, PR_LOST); pi_found_failure = _B_TRUE; if (cur_tg != NULL && tg == cur_tg) { /* @@ -2292,7 +2321,8 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg, * the most recent probe success. */ if (!psinfo->ps_tls_valid) { - psinfo->ps_tls = pr_statp->pr_time_acked; + psinfo->ps_tls = + ns2ms(pr_statp->pr_hrtime_ackproc); psinfo->ps_tls_valid = _B_TRUE; } break; @@ -2339,7 +2369,7 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, uint_t timeout; struct target *tg; - if (debug & D_FAILOVER) + if (debug & D_FAILREP) logdebug("probe_fail_info(%s)\n", pii->pii_name); bzero(pfinfo, sizeof (*pfinfo)); @@ -2377,10 +2407,11 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, * not available use the group's probe interval, * which is a worst case estimate. 
*/ + timeout = ns2ms(pr_statp->pr_hrtime_start); if (tg->tg_crtt != 0) { - timeout = pr_statp->pr_time_sent + tg->tg_crtt; + timeout += tg->tg_crtt; } else { - timeout = pr_statp->pr_time_sent + + timeout += pii->pii_phyint->pi_group->pg_probeint; } @@ -2388,7 +2419,7 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, break; pr_statp->pr_time_lost = timeout; - pr_statp->pr_status = PR_LOST; + probe_chstate(pr_statp, pii, PR_LOST); /* FALLTHRU */ case PR_LOST: @@ -2421,6 +2452,19 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, } /* + * Change the state of probe `pr' on phyint_instance `pii' to state `state'. + */ +void +probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state) +{ + if (pr->pr_status == state) + return; + + pr->pr_status = state; + (void) probe_state_event(pr, pii); +} + +/* * Check if the phyint has been repaired. If no test address has been * configured, then consider the interface repaired if the link is up (unless * the link is flapping; see below). Otherwise, look for proof of probes @@ -2436,7 +2480,7 @@ phyint_repaired(struct phyint *pi) int pr_ndx; uint_t cur_time; - if (debug & D_FAILOVER) + if (debug & D_FAILREP) logdebug("phyint_repaired(%s)\n", pi->pi_name); if (LINK_DOWN(pi)) @@ -2458,7 +2502,7 @@ phyint_repaired(struct phyint *pi) } if (!pi->pi_lfmsg_printed) { logerr("The link has come up on %s more than %d times " - "in the last minute; disabling failback until it " + "in the last minute; disabling repair until it " "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); pi->pi_lfmsg_printed = 1; } @@ -2490,354 +2534,41 @@ phyint_repaired(struct phyint *pi) } /* - * Try failover from phyint 'pi' to a suitable destination. - */ -int -try_failover(struct phyint *pi, int failover_type) -{ - struct phyint *dst; - int err; - - if (debug & D_FAILOVER) - logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type); - - /* - * Attempt to find a failover destination 'dst'. 
- * dst will be null if any of the following is true - * Phyint is not part of a group OR - * Phyint is the only member of a group OR - * No suitable failover dst was available - */ - dst = get_failover_dst(pi, failover_type); - if (dst == NULL) - return (IPMP_EMINRED); - - dst->pi_empty = 0; /* Per state diagram */ - pi->pi_full = 0; /* Per state diagram */ - - err = failover(pi, dst); - - if (debug & D_FAILOVER) { - logdebug("failed over from %s to %s ret %d\n", - pi->pi_name, dst->pi_name, err); - } - if (err == 0) { - pi->pi_empty = 1; /* Per state diagram */ - /* - * we don't want to print out this message if a - * phyint is leaving the group, nor for failover from - * standby - */ - if (failover_type == FAILOVER_NORMAL) { - logerr("Successfully failed over from NIC %s to NIC " - "%s\n", pi->pi_name, dst->pi_name); - } - return (0); - } else { - /* - * The failover did not succeed. We must retry the failover - * only after resyncing our state based on the kernel's. - * For eg. either the src or the dst might have been unplumbed - * causing this failure. initifs() will be called again, - * from main, since full_scan_required has been set to true - * by failover(); - */ - return (IPMP_FAILURE); - } -} - -/* - * global_errno captures the errno value, if failover() or failback() - * fails. This is sent to if_mpadm(1M). - */ -int global_errno; - -/* - * Attempt failover from phyint 'from' to phyint 'to'. - * IP moves everything from phyint 'from' to phyint 'to'. - */ -static int -failover(struct phyint *from, struct phyint *to) -{ - struct lifreq lifr; - int ret; - - if (debug & D_FAILOVER) { - logdebug("failing over from %s to %s\n", - from->pi_name, to->pi_name); - } - - /* - * Perform the failover. Both IPv4 and IPv6 are failed over - * using a single ioctl by passing in AF_UNSPEC family. 
- */ - lifr.lifr_addr.ss_family = AF_UNSPEC; - (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_movetoindex = to->pi_ifindex; - - ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr); - if (ret < 0) { - global_errno = errno; - logperror("failover: ioctl (failover)"); - } - - /* - * Set full_scan_required to true. This will make us read - * the state from the kernel in initifs() and update our tables, - * to reflect the current state after the failover. If the - * failover has failed it will then reissue the failover. - */ - full_scan_required = _B_TRUE; - return (ret); -} - -/* - * phyint 'pi' has recovered. Attempt failback from every phyint in the same - * group as phyint 'pi' that is a potential failback source, to phyint 'pi'. - * Return values: - * IPMP_SUCCESS: Failback successful from each of the other - * phyints in the group. - * IPMP_EFBPARTIAL: Failback successful from some of the other - * phyints in the group. - * IPMP_FAILURE: Failback syscall failed with some error. - * - * Note that failback is attempted regardless of the setting of the - * failback_enabled flag. - */ -int -do_failback(struct phyint *pi) -{ - struct phyint *from; - boolean_t done; - boolean_t partial; - boolean_t attempted_failback = _B_FALSE; - - if (debug & D_FAILOVER) - logdebug("do_failback(%s)\n", pi->pi_name); - - /* If this phyint is not part of a named group, return. */ - if (pi->pi_group == phyint_anongroup) { - pi->pi_full = 1; - return (IPMP_SUCCESS); - } - - /* - * Attempt failback from every phyint in the group to 'pi'. - * The reason for doing this, instead of only from the - * phyint to which we did the failover is given below. - * - * After 'pi' failed, if any app. tries to join on a multicast - * address (IPv6), on the failed phyint, IP picks any arbitrary - * non-failed phyint in the group, instead of the failed phyint, - * in.mpathd is not aware of this. 
Thus failing back only from the - * interface to which 'pi' failed over, will failback the ipif's - * but not the ilm's. So we need to failback from all members of - * the phyint group - */ - done = _B_TRUE; - partial = _B_FALSE; - for (from = pi->pi_group->pg_phyint; from != NULL; - from = from->pi_pgnext) { - /* Exclude ourself as a failback src */ - if (from == pi) - continue; - - /* - * If the 'from' phyint has IPv4 plumbed, the 'to' - * phyint must also have IPv4 plumbed. Similar check - * for IPv6. IP makes the same check. Otherwise the - * failback will fail. - */ - if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) || - (from->pi_v6 != NULL && pi->pi_v6 == NULL)) { - partial = _B_TRUE; - continue; - } - - pi->pi_empty = 0; /* Per state diagram */ - attempted_failback = _B_TRUE; - if (failback(from, pi) != 0) { - done = _B_FALSE; - break; - } - } - - /* - * We are done. No more phyint from which we can src the failback - */ - if (done) { - if (!partial) - pi->pi_full = 1; /* Per state diagram */ - /* - * Don't print out a message unless there is a - * transition from FAILED to RUNNING. For eg. - * we don't want to print out this message if a - * phyint is leaving the group, or at startup - */ - if (attempted_failback && (pi->pi_flags & - (IFF_FAILED | IFF_OFFLINE))) { - logerr("Successfully failed back to NIC %s\n", - pi->pi_name); - } - return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS); - } - - return (IPMP_FAILURE); -} - -/* - * This function is similar to do_failback() above, but respects the - * failback_enabled flag for phyints in named groups. - */ -int -try_failback(struct phyint *pi) -{ - if (debug & D_FAILOVER) - logdebug("try_failback(%s)\n", pi->pi_name); - - if (pi->pi_group != phyint_anongroup && !failback_enabled) - return (IPMP_EFBDISABLED); - - return (do_failback(pi)); -} - -/* - * Failback everything from phyint 'from' that has the same ifindex - * as phyint to's ifindex. 
- */ -static int -failback(struct phyint *from, struct phyint *to) -{ - struct lifreq lifr; - int ret; - - if (debug & D_FAILOVER) - logdebug("failback(%s %s)\n", from->pi_name, to->pi_name); - - lifr.lifr_addr.ss_family = AF_UNSPEC; - (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_movetoindex = to->pi_ifindex; - - ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr); - if (ret < 0) { - global_errno = errno; - logperror("failback: ioctl (failback)"); - } - - /* - * Set full_scan_required to true. This will make us read - * the state from the kernel in initifs() and update our tables, - * to reflect the current state after the failback. If the - * failback has failed it will then reissue the failback. - */ - full_scan_required = _B_TRUE; - - return (ret); -} - -/* - * Select a target phyint for failing over from 'pi'. - * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred - * target phyint is chosen as follows, - * 1. Pick any inactive standby interface. - * 2. If no inactive standby is available, select any phyint in the - * same group that has the least number of logints, (excluding - * IFF_NOFAILOVER and !IFF_UP logints) - * If we are failing over from a standby, failover_type is - * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination. - * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY, - * and we won't return NULL, as long as there is at least 1 other phyint - * in the group. - */ -static struct phyint * -get_failover_dst(struct phyint *pi, int failover_type) -{ - struct phyint *maybe = NULL; - struct phyint *pi2; - struct phyint *last_choice = NULL; - - if (pi->pi_group == phyint_anongroup) - return (NULL); - - /* - * Loop thru the phyints in the group, and pick the preferred - * phyint for the target. 
- */ - for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { - /* Exclude ourself and offlined interfaces */ - if (pi2 == pi || pi2->pi_state == PI_OFFLINE) - continue; - - /* - * The chosen target phyint must have IPv4 instance - * plumbed, if the src phyint has IPv4 plumbed. Similarly - * for IPv6. - */ - if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) || - (pi2->pi_v6 == NULL && pi->pi_v6 != NULL)) - continue; - - /* The chosen target must be PI_RUNNING. */ - if (pi2->pi_state != PI_RUNNING) { - last_choice = pi2; - continue; - } - - if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) && - (failover_type != FAILOVER_TO_NONSTANDBY)) { - return (pi2); - } else { - if (maybe == NULL) - maybe = pi2; - else if (logint_upcount(pi2) < logint_upcount(maybe)) - maybe = pi2; - } - } - if (maybe == NULL && failover_type == FAILOVER_TO_ANY) - return (last_choice); - else - return (maybe); -} - -/* * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. */ boolean_t -change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) +change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear) { int ifsock; struct lifreq lifr; uint64_t old_flags; - if (debug & D_FAILOVER) { - logdebug("change_lif_flags(%s): flags %llx setfl %d\n", - pi->pi_name, flags, (int)setfl); + if (debug & D_FAILREP) { + logdebug("change_pif_flags(%s): set %llx clear %llx\n", + pi->pi_name, set, clear); } - if (pi->pi_v4 != NULL) { + if (pi->pi_v4 != NULL) ifsock = ifsock_v4; - } else { + else ifsock = ifsock_v6; - } /* * Get the current flags from the kernel, and set/clear the * desired phyint flags. Since we set only phyint flags, we can * do it on either IPv4 or IPv6 instance. 
*/ - (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; + (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); + if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) - logperror("change_lif_flags: ioctl (get flags)"); + logperror("change_pif_flags: ioctl (get flags)"); return (_B_FALSE); } old_flags = lifr.lifr_flags; - if (setfl) - lifr.lifr_flags |= flags; - else - lifr.lifr_flags &= ~flags; + lifr.lifr_flags |= set; + lifr.lifr_flags &= ~clear; if (old_flags == lifr.lifr_flags) { /* No change in the flags. No need to send ioctl */ @@ -2846,7 +2577,7 @@ change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) - logperror("change_lif_flags: ioctl (set flags)"); + logperror("change_pif_flags: ioctl (set flags)"); return (_B_FALSE); } @@ -2854,15 +2585,13 @@ change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) * Keep pi_flags in synch. with actual flags. Assumes flags are * phyint flags. */ - if (setfl) - pi->pi_flags |= flags; - else - pi->pi_flags &= ~flags; + pi->pi_flags |= set; + pi->pi_flags &= ~clear; - if (pi->pi_v4) + if (pi->pi_v4 != NULL) pi->pi_v4->pii_flags = pi->pi_flags; - if (pi->pi_v6) + if (pi->pi_v6 != NULL) pi->pi_v6->pii_flags = pi->pi_flags; return (_B_TRUE); @@ -2928,18 +2657,31 @@ reset_snxt_basetimes(void) * and it is up, it is not possible to detect the interface failure. * SIOCTMYADDR also doesn't consider local zone address as own address. * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they - * are stored in laddr_list. + * are stored in `localaddrs' */ - boolean_t own_address(struct in6_addr addr) { - struct local_addr *taddr = laddr_list; + addrlist_t *addrp; + struct sockaddr_storage ss; + int af = IN6_IS_ADDR_V4MAPPED(&addr) ? 
AF_INET : AF_INET6; - for (; taddr != NULL; taddr = taddr->next) { - if (IN6_ARE_ADDR_EQUAL(&addr, &taddr->addr)) { + addr2storage(af, &addr, &ss); + for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) { + if (sockaddrcmp(&ss, &addrp->al_addr)) return (_B_TRUE); - } } return (_B_FALSE); } + +static int +ns2ms(int64_t ns) +{ + return (ns / (NANOSEC / MILLISEC)); +} + +static int64_t +tv2ns(struct timeval *tvp) +{ + return (tvp->tv_sec * NANOSEC + tvp->tv_usec * 1000); +} diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c index b56648cf12..def08d39ce 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mpd_defs.h" #include "mpd_tables.h" @@ -47,11 +45,7 @@ static void phyint_inst_print(struct phyint_instance *pii); static void phyint_insert(struct phyint *pi, struct phyint_group *pg); static void phyint_delete(struct phyint *pi); - -static void phyint_group_insert(struct phyint_group *pg); -static void phyint_group_delete(struct phyint_group *pg); -static struct phyint_group *phyint_group_lookup(const char *pg_name); -static struct phyint_group *phyint_group_create(const char *pg_name); +static boolean_t phyint_is_usable(struct phyint *pi); static void logint_print(struct logint *li); static void logint_insert(struct phyint_instance *pii, struct logint *li); @@ -68,16 +62,13 @@ static void reset_pii_probes(struct phyint_instance *pii, struct target *tg); static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii); static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii); -static void ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask); 
-static boolean_t prefix_equal(struct in6_addr p1, struct in6_addr p2, - int prefix_len); - static int phyint_state_event(struct phyint_group *pg, struct phyint *pi); static int phyint_group_state_event(struct phyint_group *pg); static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t); static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi, ipmp_if_op_t op); +static int logint_upcount(struct phyint *pi); static uint64_t gensig(void); /* Initialize any per-file global state. Returns 0 on success, -1 on failure */ @@ -110,6 +101,183 @@ phyint_lookup(const char *name) return (pi); } +/* + * Lookup a phyint in the group that has the same hardware address as `pi', or + * NULL if there's none. If `online_only' is set, then only online phyints + * are considered when matching. Otherwise, phyints that had been offlined + * due to a duplicate hardware address will also be considered. + */ +static struct phyint * +phyint_lookup_hwaddr(struct phyint *pi, boolean_t online_only) +{ + struct phyint *pi2; + + if (pi->pi_group == phyint_anongroup) + return (NULL); + + for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { + if (pi2 == pi) + continue; + + /* + * NOTE: even when online_only is B_FALSE, we ignore phyints + * that are administratively offline (rather than offline + * because they're dups); when they're brought back online, + * they'll be flagged as dups if need be. + */ + if (pi2->pi_state == PI_OFFLINE && + (online_only || !pi2->pi_hwaddrdup)) + continue; + + if (pi2->pi_hwaddrlen == pi->pi_hwaddrlen && + bcmp(pi2->pi_hwaddr, pi->pi_hwaddr, pi->pi_hwaddrlen) == 0) + return (pi2); + } + return (NULL); +} + +/* + * Respond to DLPI notifications. Currently, this only processes physical + * address changes for the phyint passed via `arg' by onlining or offlining + * phyints in the group. 
+ */ +/* ARGSUSED */ +static void +phyint_link_notify(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg) +{ + struct phyint *pi = arg; + struct phyint *oduppi = NULL, *duppi = NULL; + + assert((dnip->dni_note & pi->pi_notes) != 0); + + if (dnip->dni_note != DL_NOTE_PHYS_ADDR) + return; + + assert(dnip->dni_physaddrlen <= DLPI_PHYSADDR_MAX); + + /* + * If our hardware address hasn't changed, there's nothing to do. + */ + if (pi->pi_hwaddrlen == dnip->dni_physaddrlen && + bcmp(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen) == 0) + return; + + oduppi = phyint_lookup_hwaddr(pi, _B_FALSE); + pi->pi_hwaddrlen = dnip->dni_physaddrlen; + (void) memcpy(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen); + duppi = phyint_lookup_hwaddr(pi, _B_FALSE); + + if (oduppi != NULL || pi->pi_hwaddrdup) { + /* + * Our old hardware address was a duplicate. If we'd been + * offlined because of it, and our new hardware address is not + * a duplicate, then bring us online. Otherwise, `oduppi' + * must've been the one brought offline; bring it online. + */ + if (pi->pi_hwaddrdup) { + if (duppi == NULL) + (void) phyint_undo_offline(pi); + } else { + assert(oduppi->pi_hwaddrdup); + (void) phyint_undo_offline(oduppi); + } + } + + if (duppi != NULL && !pi->pi_hwaddrdup) { + /* + * Our new hardware address was a duplicate and we're not + * yet flagged as a duplicate; bring us offline. + */ + pi->pi_hwaddrdup = _B_TRUE; + (void) phyint_offline(pi, 0); + } +} + +/* + * Initialize information about the underlying link for `pi', and set us + * up to be notified about future changes. Returns _B_TRUE on success. 
+ */ +boolean_t +phyint_link_init(struct phyint *pi) +{ + int retval; + uint_t notes; + const char *errmsg; + dlpi_notifyid_t id; + + pi->pi_notes = 0; + retval = dlpi_open(pi->pi_name, &pi->pi_dh, 0); + if (retval != DLPI_SUCCESS) { + pi->pi_dh = NULL; + errmsg = "cannot open"; + goto failed; + } + + pi->pi_hwaddrlen = DLPI_PHYSADDR_MAX; + retval = dlpi_get_physaddr(pi->pi_dh, DL_CURR_PHYS_ADDR, pi->pi_hwaddr, + &pi->pi_hwaddrlen); + if (retval != DLPI_SUCCESS) { + errmsg = "cannot get hardware address"; + goto failed; + } + + retval = dlpi_bind(pi->pi_dh, DLPI_ANY_SAP, NULL); + if (retval != DLPI_SUCCESS) { + errmsg = "cannot bind to DLPI_ANY_SAP"; + goto failed; + } + + /* + * Check if the link supports DLPI link state notifications. For + * historical reasons, the actual changes are tracked through routing + * sockets, so we immediately disable the notification upon success. + */ + notes = DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN; + retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id); + if (retval == DLPI_SUCCESS) { + (void) dlpi_disabnotify(pi->pi_dh, id, NULL); + pi->pi_notes |= notes; + } + + /* + * Enable notification of hardware address changes to keep pi_hwaddr + * up-to-date and track if we need to offline/undo-offline phyints. + */ + notes = DL_NOTE_PHYS_ADDR; + retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id); + if (retval == DLPI_SUCCESS && poll_add(dlpi_fd(pi->pi_dh)) == 0) + pi->pi_notes |= notes; + + return (_B_TRUE); +failed: + logerr("%s: %s: %s\n", pi->pi_name, errmsg, dlpi_strerror(retval)); + if (pi->pi_dh != NULL) { + dlpi_close(pi->pi_dh); + pi->pi_dh = NULL; + } + return (_B_FALSE); +} + +/* + * Close use of link on `pi'. 
+ */ +void +phyint_link_close(struct phyint *pi) +{ + if (pi->pi_notes & DL_NOTE_PHYS_ADDR) { + (void) poll_remove(dlpi_fd(pi->pi_dh)); + pi->pi_notes &= ~DL_NOTE_PHYS_ADDR; + } + + /* + * NOTE: we don't clear pi_notes here so that iflinkstate() can still + * properly report the link state even when offline (which is possible + * since we use IFF_RUNNING to track link state). + */ + dlpi_close(pi->pi_dh); + pi->pi_dh = NULL; +} + /* Return the phyint instance with the given name and the given family */ struct phyint_instance * phyint_inst_lookup(int af, char *name) @@ -128,7 +296,7 @@ phyint_inst_lookup(int af, char *name) return (PHYINT_INSTANCE(pi, af)); } -static struct phyint_group * +struct phyint_group * phyint_group_lookup(const char *pg_name) { struct phyint_group *pg; @@ -173,6 +341,9 @@ phyint_insert(struct phyint *pi, struct phyint_group *pg) pi->pi_pgnext->pi_pgprev = pi; pg->pg_phyint = pi; + /* Refresh the group state now that this phyint has been added */ + phyint_group_refresh_state(pg); + pg->pg_sig++; (void) phyint_group_member_event(pg, pi, IPMP_IF_ADD); } @@ -214,24 +385,24 @@ phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex, } /* - * Record the phyint values. Also insert the phyint into the - * phyint group by calling phyint_insert(). + * Record the phyint values. */ (void) strlcpy(pi->pi_name, pi_name, sizeof (pi->pi_name)); pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME; pi->pi_ifindex = ifindex; - pi->pi_icmpid = - htons(((getpid() & 0xFF) << 8) | (pi->pi_ifindex & 0xFF)); + pi->pi_icmpid = htons(((getpid() & 0xFF) << 8) | (ifindex & 0xFF)); + /* - * We optimistically start in the PI_RUNNING state. Later (in - * process_link_state_changes()), we will readjust this to match the + * If the interface is offline, we set the state to PI_OFFLINE. + * Otherwise, we optimistically start in the PI_RUNNING state. Later + * (in process_link_state_changes()), we will adjust this to match the * current state of the link. 
Further, if test addresses are * subsequently assigned, we will transition to PI_NOTARGETS and then - * either PI_RUNNING or PI_FAILED, depending on the result of the test - * probes. + * to either PI_RUNNING or PI_FAILED depending on the probe results. */ - pi->pi_state = PI_RUNNING; + pi->pi_state = (flags & IFF_OFFLINE) ? PI_OFFLINE : PI_RUNNING; pi->pi_flags = PHYINT_FLAGS(flags); + /* * Initialise the link state. The link state is initialised to * up, so that if the link is down when IPMP starts monitoring @@ -241,19 +412,17 @@ phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex, */ INIT_LINK_STATE(pi); + if (!phyint_link_init(pi)) { + free(pi); + return (NULL); + } + /* * Insert the phyint in the list of all phyints, and the * list of phyint group members */ phyint_insert(pi, pg); - /* - * If we are joining a failed group, mark the interface as - * failed. - */ - if (GROUP_FAILED(pg)) - (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); - return (pi); } @@ -313,15 +482,14 @@ phyint_chstate(struct phyint *pi, enum pi_state state) return; pi->pi_state = state; - pi->pi_group->pg_sig++; - (void) phyint_state_event(pi->pi_group, pi); + phyint_changed(pi); } /* - * Note that the type of phyint `pi' has changed. + * Note that `pi' has changed state. */ void -phyint_newtype(struct phyint *pi) +phyint_changed(struct phyint *pi) { pi->pi_group->pg_sig++; (void) phyint_state_event(pi->pi_group, pi); @@ -331,7 +499,7 @@ phyint_newtype(struct phyint *pi) * Insert the phyint group in the linked list of all phyint groups * at the head of the list */ -static void +void phyint_group_insert(struct phyint_group *pg) { pg->pg_next = phyint_groups; @@ -347,7 +515,7 @@ phyint_group_insert(struct phyint_group *pg) /* * Create a new phyint group called 'name'. 
*/ -static struct phyint_group * +struct phyint_group * phyint_group_create(const char *name) { struct phyint_group *pg; @@ -363,9 +531,16 @@ phyint_group_create(const char *name) (void) strlcpy(pg->pg_name, name, sizeof (pg->pg_name)); pg->pg_sig = gensig(); - pg->pg_fdt = user_failure_detection_time; pg->pg_probeint = user_probe_interval; + pg->pg_in_use = _B_TRUE; + + /* + * Normal groups always start in the PG_FAILED state since they + * have no active interfaces. In contrast, anonymous groups are + * heterogeneous and thus always PG_OK. + */ + pg->pg_state = (name[0] == '\0' ? PG_OK : PG_FAILED); return (pg); } @@ -378,10 +553,20 @@ phyint_group_chstate(struct phyint_group *pg, enum pg_state state) { assert(pg != phyint_anongroup); + /* + * To simplify things, some callers always set a given state + * regardless of the previous state of the group (e.g., setting + * PG_DEGRADED when it's already set). We shouldn't bother + * generating an event or consuming a signature for these, since + * the actual state of the group is unchanged. + */ + if (pg->pg_state == state) + return; + + pg->pg_state = state; + switch (state) { case PG_FAILED: - pg->pg_groupfailed = 1; - /* * We can never know with certainty that a group has * failed. It is possible that all known targets have @@ -392,16 +577,15 @@ phyint_group_chstate(struct phyint_group *pg, enum pg_state state) * hosts, we have to discover it by multicast. So flush * all the host targets. The next probe will send out a * multicast echo request. If this is a group failure, we - * will still not see any response, otherwise we will - * clear the pg_groupfailed flag after we get - * NUM_PROBE_REPAIRS consecutive unicast replies on any - * phyint. + * will still not see any response, otherwise the group + * will be repaired after we get NUM_PROBE_REPAIRS + * consecutive unicast replies on any phyint. 
*/ target_flush_hosts(pg); break; - case PG_RUNNING: - pg->pg_groupfailed = 0; + case PG_OK: + case PG_DEGRADED: break; default: @@ -432,7 +616,6 @@ phyint_inst_init_from_k(int af, char *pi_name) struct lifreq lifr; struct phyint *pi; struct phyint_instance *pii; - boolean_t pg_created; boolean_t pi_created; struct phyint_group *pg; @@ -441,7 +624,6 @@ retry: pi = NULL; pg = NULL; pi_created = _B_FALSE; - pg_created = _B_FALSE; if (debug & D_PHYINT) { logdebug("phyint_inst_init_from_k(%s %s)\n", @@ -454,11 +636,11 @@ retry: ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6; /* - * Get the interface flags. Ignore loopback and multipoint - * interfaces. + * Get the interface flags. Ignore virtual interfaces, IPMP + * meta-interfaces, point-to-point interfaces, and interfaces + * that can't support multicast. */ - (void) strncpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; + (void) strlcpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name)); if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) { logperror("phyint_inst_init_from_k:" @@ -467,7 +649,8 @@ retry: return (NULL); } flags = lifr.lifr_flags; - if (!(flags & IFF_MULTICAST) || (flags & IFF_LOOPBACK)) + if (!(flags & IFF_MULTICAST) || + (flags & (IFF_VIRTUAL|IFF_IPMP|IFF_POINTOPOINT))) return (NULL); /* @@ -493,8 +676,7 @@ retry: } return (NULL); } - (void) strncpy(pg_name, lifr.lifr_groupname, sizeof (pg_name)); - pg_name[sizeof (pg_name) - 1] = '\0'; + (void) strlcpy(pg_name, lifr.lifr_groupname, sizeof (pg_name)); /* * If the phyint is not part of any group, pg_name is the @@ -503,12 +685,13 @@ retry: */ if (pg_name[0] == '\0' && !track_all_phyints) { /* - * If the IFF_FAILED or IFF_OFFLINE flags are set, reset - * them. These flags shouldn't be set if IPMP isn't - * tracking the interface. + * If the IFF_FAILED, IFF_INACTIVE, or IFF_OFFLINE flags are + * set, reset them. 
These flags shouldn't be set if in.mpathd + * isn't tracking the interface. */ - if ((flags & (IFF_FAILED | IFF_OFFLINE)) != 0) { - lifr.lifr_flags = flags & ~(IFF_FAILED | IFF_OFFLINE); + if ((flags & (IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE))) { + lifr.lifr_flags = flags & + ~(IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE); if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) { logperror("phyint_inst_init_from_k:" @@ -520,21 +703,20 @@ retry: } /* - * We need to create a new phyint instance. A phyint instance - * belongs to a phyint, and the phyint belongs to a phyint group. - * So we first lookup the 'parents' and if they don't exist then - * we create them. + * We need to create a new phyint instance. We may also need to + * create the group if e.g. the SIOCGLIFCONF loop in initifs() found + * an underlying interface before it found its IPMP meta-interface. + * Note that we keep any created groups even if + * phyint_inst_init_from_k() fails since a group's existence is not + * dependent on the ability of in.mpathd to track the group's + * interfaces. */ - pg = phyint_group_lookup(pg_name); - if (pg == NULL) { - pg = phyint_group_create(pg_name); - if (pg == NULL) { - logerr("phyint_inst_init_from_k:" - " unable to create group %s\n", pg_name); + if ((pg = phyint_group_lookup(pg_name)) == NULL) { + if ((pg = phyint_group_create(pg_name)) == NULL) { + logerr("phyint_inst_init_from_k: cannot create group " + "%s\n", pg_name); return (NULL); } phyint_group_insert(pg); - pg_created = _B_TRUE; } /* @@ -546,8 +728,6 @@ retry: if (pi == NULL) { logerr("phyint_inst_init_from_k:" " unable to create phyint %s\n", pi_name); - if (pg_created) - phyint_group_delete(pg); return (NULL); } pi_created = _B_TRUE; @@ -564,8 +744,6 @@ retry: * while we are yet to update our tables. Do it now. 
*/ if (pi->pi_ifindex != ifindex) { - if (pg_created) - phyint_group_delete(pg); phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af))); goto retry; } @@ -577,9 +755,6 @@ retry: * changed, while we are yet to update our tables. Do it now. */ if (strcmp(pi->pi_group->pg_name, pg_name) != 0) { - if (pg_created) - phyint_group_delete(pg); - restore_phyint(pi); phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af))); goto retry; @@ -594,16 +769,25 @@ retry: if (pii == NULL) { logerr("phyint_inst_init_from_k: unable to create" "phyint inst %s\n", pi->pi_name); - if (pi_created) { - /* - * Deleting the phyint will delete the phyint group - * if this is the last phyint in the group. - */ + if (pi_created) phyint_delete(pi); - } + return (NULL); } + if (pi_created) { + /* + * If this phyint does not have a unique hardware address in its + * group, offline it. (The change_pif_flags() implementation + * requires that we defer this until after the phyint_instance + * is created.) + */ + if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) { + pi->pi_hwaddrdup = _B_TRUE; + (void) phyint_offline(pi, 0); + } + } + return (pii); } @@ -677,16 +861,16 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) { icmp6_filter_t filter; int hopcount = 1; - int int_op; + int off = 0; + int on = 1; struct sockaddr_in6 testaddr; /* * Open a raw socket with ICMPv6 protocol. * - * Use IPV6_DONTFAILOVER_IF to make sure that probes go out - * on the specified phyint only, and are not subject to load - * balancing. Bind to the src address chosen will ensure that - * the responses are received only on the specified phyint. + * Use IPV6_BOUND_IF to make sure that probes are sent and received on + * the specified phyint only. Bind to the test address to ensure that + * the responses are sent to the specified phyint. * * Set the hopcount to 1 so that probe packets are not routed. * Disable multicast loopback. 
Set the receive filter to @@ -696,7 +880,7 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) if (pii->pii_probe_sock < 0) { logperror_pii(pii, "phyint_inst_v6_sockinit: socket"); return (_B_FALSE); -} + } bzero(&testaddr, sizeof (testaddr)); testaddr.sin6_family = AF_INET6; @@ -709,14 +893,17 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - /* - * IPV6_DONTFAILOVER_IF option takes precedence over setting - * IP_MULTICAST_IF. So we don't set IPV6_MULTICAST_IF again. - */ - if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_DONTFAILOVER_IF, + if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_IF, (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) { logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" - " IPV6_DONTFAILOVER_IF"); + " IPV6_MULTICAST_IF"); + return (_B_FALSE); + } + + if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_BOUND_IF, + &pii->pii_ifindex, sizeof (uint_t)) < 0) { + logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" + " IPV6_BOUND_IF"); return (_B_FALSE); } @@ -734,9 +921,8 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - int_op = 0; /* used to turn off option */ if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, - (char *)&int_op, sizeof (int_op)) < 0) { + (char *)&off, sizeof (off)) < 0) { logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" " IPV6_MULTICAST_LOOP"); return (_B_FALSE); @@ -755,15 +941,22 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - /* Enable receipt of ancillary data */ - int_op = 1; + /* Enable receipt of hoplimit */ if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT, - (char *)&int_op, sizeof (int_op)) < 0) { + &on, sizeof (on)) < 0) { logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" " IPV6_RECVHOPLIMIT"); return (_B_FALSE); } + /* Enable receipt of timestamp */ + if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP, + &on, sizeof (on)) < 0) { 
+ logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" + " SO_TIMESTAMP"); + return (_B_FALSE); + } + return (_B_TRUE); } @@ -775,20 +968,20 @@ static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii) { struct sockaddr_in testaddr; - char char_op; + char char_off = 0; int ttl = 1; char char_ttl = 1; + int on = 1; /* * Open a raw socket with ICMPv4 protocol. * - * Use IP_DONTFAILOVER_IF to make sure that probes go out - * on the specified phyint only, and are not subject to load - * balancing. Bind to the src address chosen will ensure that - * the responses are received only on the specified phyint. + * Use IP_BOUND_IF to make sure that probes are sent and received on + * the specified phyint only. Bind to the test address to ensure that + * the responses are sent to the specified phyint. * * Set the ttl to 1 so that probe packets are not routed. - * Disable multicast loopback. + * Disable multicast loopback. Enable receipt of timestamp. */ pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP); if (pii->pii_probe_sock < 0) { @@ -808,14 +1001,17 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - /* - * IP_DONTFAILOVER_IF option takes precedence over setting - * IP_MULTICAST_IF. So we don't set IP_MULTICAST_IF again. 
- */ - if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_DONTFAILOVER_IF, + if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_BOUND_IF, + &pii->pii_ifindex, sizeof (uint_t)) < 0) { + logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" + " IP_BOUND_IF"); + return (_B_FALSE); + } + + if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_IF, (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) { logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" - " IP_DONTFAILOVER"); + " IP_MULTICAST_IF"); return (_B_FALSE); } @@ -826,9 +1022,8 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - char_op = 0; /* used to turn off option */ if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP, - (char *)&char_op, sizeof (char_op)) == -1) { + (char *)&char_off, sizeof (char_off)) == -1) { logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" " IP_MULTICAST_LOOP"); return (_B_FALSE); @@ -841,6 +1036,13 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii) return (_B_FALSE); } + if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP, &on, + sizeof (on)) < 0) { + logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" + " SO_TIMESTAMP"); + return (_B_FALSE); + } + return (_B_TRUE); } @@ -848,7 +1050,7 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii) * Remove the phyint group from the list of 'all phyint groups' * and free it. */ -static void +void phyint_group_delete(struct phyint_group *pg) { /* @@ -881,10 +1083,69 @@ phyint_group_delete(struct phyint_group *pg) phyint_grouplistsig++; (void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE); + addrlist_free(&pg->pg_addrs); free(pg); } /* + * Refresh the state of `pg' based on its current members. + */ +void +phyint_group_refresh_state(struct phyint_group *pg) +{ + enum pg_state state; + enum pg_state origstate = pg->pg_state; + struct phyint *pi, *usablepi; + uint_t nif = 0, nusable = 0; + + /* + * Anonymous groups never change state. 
+ */ + if (pg == phyint_anongroup) + return; + + for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { + nif++; + if (phyint_is_usable(pi)) { + nusable++; + usablepi = pi; + } + } + + if (nusable == 0) + state = PG_FAILED; + else if (nif == nusable) + state = PG_OK; + else + state = PG_DEGRADED; + + phyint_group_chstate(pg, state); + + /* + * If we're shutting down, skip logging messages since otherwise our + * shutdown housecleaning will make us report that groups are unusable. + */ + if (cleanup_started) + return; + + /* + * NOTE: We use pg_failmsg_printed rather than origstate since + * otherwise at startup we'll log a "now usable" message when the + * first usable phyint is added to an empty group. + */ + if (state != PG_FAILED && pg->pg_failmsg_printed) { + assert(origstate == PG_FAILED); + logerr("At least 1 IP interface (%s) in group %s is now " + "usable\n", usablepi->pi_name, pg->pg_name); + pg->pg_failmsg_printed = _B_FALSE; + } else if (origstate != PG_FAILED && state == PG_FAILED) { + logerr("All IP interfaces in group %s are now unusable\n", + pg->pg_name); + pg->pg_failmsg_printed = _B_TRUE; + } +} + +/* * Extract information from the kernel about the desired phyint. * Look only for properties of the phyint and not properties of logints. * Take appropriate action on the changes. @@ -998,28 +1259,16 @@ phyint_inst_update_from_k(struct phyint_instance *pii) if (pi->pi_v6 != NULL) pi->pi_v6->pii_flags = pi->pi_flags; + /* + * Make sure the IFF_FAILED flag is set if and only if we think + * the interface should be failed. + */ if (pi->pi_flags & IFF_FAILED) { - /* - * If we are in the running and full state, we have - * completed failbacks successfully and we would have - * expected IFF_FAILED to have been clear. That it is - * set means there was a race condition. Some other - * process turned on the IFF_FAILED flag. Since the - * flag setting is not atomic, i.e. 
a get ioctl followed - * by a set ioctl, and since there is no way to set an - * individual flag bit, this could have occurred. - */ - if (pi->pi_state == PI_RUNNING && pi->pi_full) - (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); + if (pi->pi_state == PI_RUNNING) + (void) change_pif_flags(pi, 0, IFF_FAILED); } else { - /* - * If we are in the failed state, there was a race. - * we have completed failover successfully because our - * state is failed and empty. Some other process turned - * off the IFF_FAILED flag. Same comment as above - */ - if (pi->pi_state == PI_FAILED && pi->pi_empty) - (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); + if (pi->pi_state == PI_FAILED) + (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE); } /* No change in phyint status */ @@ -1028,12 +1277,12 @@ phyint_inst_update_from_k(struct phyint_instance *pii) /* * Delete the phyint. Remove it from the list of all phyints, and the - * list of phyint group members. If the group becomes empty, delete the - * group also. + * list of phyint group members. */ static void phyint_delete(struct phyint *pi) { + struct phyint *pi2; struct phyint_group *pg = pi->pi_group; if (debug & D_PHYINT) @@ -1065,6 +1314,9 @@ phyint_delete(struct phyint *pi) pi->pi_pgnext = NULL; pi->pi_pgprev = NULL; + /* Refresh the group state now that this phyint has been removed */ + phyint_group_refresh_state(pg); + /* Remove the phyint from the global list of phyints */ if (pi->pi_prev == NULL) { /* Phyint is the 1st in the list */ @@ -1077,11 +1329,153 @@ phyint_delete(struct phyint *pi) pi->pi_next = NULL; pi->pi_prev = NULL; + /* + * See if another phyint in the group had been offlined because + * it was a dup of `pi' -- and if so, online it. 
+ */ + if (!pi->pi_hwaddrdup && + (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) { + assert(pi2->pi_hwaddrdup); + (void) phyint_undo_offline(pi2); + } + phyint_link_close(pi); free(pi); +} + +/* + * Offline phyint `pi' if at least `minred' usable interfaces remain in the + * group. Returns an IPMP error code. + */ +int +phyint_offline(struct phyint *pi, uint_t minred) +{ + unsigned int nusable = 0; + struct phyint *pi2; + struct phyint_group *pg = pi->pi_group; + + /* + * Verify that enough usable interfaces in the group would remain. + * As a special case, if the group has failed, allow any non-offline + * phyints to be offlined. + */ + if (pg != phyint_anongroup) { + for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { + if (pi2 == pi) + continue; + if (phyint_is_usable(pi2) || + (GROUP_FAILED(pg) && pi2->pi_state != PI_OFFLINE)) + nusable++; + } + } + if (nusable < minred) + return (IPMP_EMINRED); + + if (!change_pif_flags(pi, IFF_OFFLINE, 0)) + return (IPMP_FAILURE); + + /* + * The interface is now offline, so stop probing it. Note that + * if_mpadm(1M) will down the test addresses, after receiving a + * success reply from us. The routing socket message will then make us + * close the socket used for sending probes. But it is more logical + * that an offlined interface must not be probed, even if it has test + * addresses. + * + * NOTE: stop_probing() also sets PI_OFFLINE. + */ + stop_probing(pi); + + /* + * If we're offlining the phyint because it has a duplicate hardware + * address, print a warning -- and leave the link open so that we can + * be notified of hardware address changes that make it usable again. + * Otherwise, close the link so that we won't prevent a detach. 
+ */ + if (pi->pi_hwaddrdup) { + logerr("IP interface %s has a hardware address which is not " + "unique in group %s; offlining\n", pi->pi_name, + pg->pg_name); + } else { + phyint_link_close(pi); + } + + /* + * If this phyint was preventing another phyint with a duplicate + * hardware address from being online, bring that one online now. + */ + if (!pi->pi_hwaddrdup && + (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) { + assert(pi2->pi_hwaddrdup); + (void) phyint_undo_offline(pi2); + } - /* Delete the phyint_group if the last phyint has been deleted */ - if (pg->pg_phyint == NULL) - phyint_group_delete(pg); + /* + * If this interface was active, try to activate another INACTIVE + * interface in the group. + */ + if (!(pi->pi_flags & IFF_INACTIVE)) + phyint_activate_another(pi); + + return (IPMP_SUCCESS); +} + +/* + * Undo a previous offline of `pi'. Returns an IPMP error code. + */ +int +phyint_undo_offline(struct phyint *pi) +{ + if (pi->pi_state != PI_OFFLINE) { + errno = EINVAL; + return (IPMP_FAILURE); + } + + /* + * If necessary, reinitialize our link information and verify that its + * hardware address is still unique across the group. + */ + if (pi->pi_dh == NULL && !phyint_link_init(pi)) { + errno = EIO; + return (IPMP_FAILURE); + } + + if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) { + pi->pi_hwaddrdup = _B_TRUE; + return (IPMP_EHWADDRDUP); + } + + if (pi->pi_hwaddrdup) { + logerr("IP interface %s now has a unique hardware address in " + "group %s; onlining\n", pi->pi_name, pi->pi_group->pg_name); + pi->pi_hwaddrdup = _B_FALSE; + } + + if (!change_pif_flags(pi, 0, IFF_OFFLINE)) + return (IPMP_FAILURE); + + /* + * While the interface was offline, it may have failed (e.g. the link + * may have gone down). phyint_inst_check_for_failure() will have + * already set pi_flags with IFF_FAILED, so we can use that to decide + * whether the phyint should transition to running. 
Note that after + * we transition to running, we will start sending probes again (if + * test addresses are configured), which may also reveal that the + * interface is in fact failed. + */ + if (pi->pi_flags & IFF_FAILED) { + phyint_chstate(pi, PI_FAILED); + } else { + /* calls phyint_chstate() */ + phyint_transition_to_running(pi); + } + + /* + * Give the requestor time to configure test addresses before + * complaining that they're missing. + */ + pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME; + + return (IPMP_SUCCESS); } /* @@ -1166,11 +1560,10 @@ phyint_inst_print(struct phyint_instance *pii) } logdebug("\nPhyint instance: %s %s index %u state %x flags %llx " - "sock %x in_use %d empty %x full %x\n", + "sock %x in_use %d\n", AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex, pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock, - pii->pii_in_use, pii->pii_phyint->pi_empty, - pii->pii_phyint->pi_full); + pii->pii_in_use); for (li = pii->pii_logint; li != NULL; li = li->li_next) logint_print(li); @@ -1211,9 +1604,11 @@ phyint_inst_print(struct phyint_instance *pii) } else { logdebug("#%d target NULL ", i); } - logdebug("time_sent %u status %d time_ack/lost %u\n", - pii->pii_probes[i].pr_time_sent, + logdebug("time_start %lld status %d " + "time_ackproc %lld time_lost %u", + pii->pii_probes[i].pr_hrtime_start, pii->pii_probes[i].pr_status, + pii->pii_probes[i].pr_hrtime_ackproc, pii->pii_probes[i].pr_time_lost); i = PROBE_INDEX_PREV(i); } while (i != most_recent); @@ -1293,7 +1688,6 @@ logint_init_from_k(struct phyint_instance *pii, char *li_name) struct logint *li; struct lifreq lifr; struct in6_addr test_subnet; - struct in6_addr test_subnet_mask; struct in6_addr testaddr; int test_subnet_len; struct sockaddr_in6 *sin6; @@ -1373,55 +1767,21 @@ logint_init_from_k(struct phyint_instance *pii, char *li_name) testaddr = sin6->sin6_addr; } - if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) { - ptp = _B_TRUE; - if (ioctl(ifsock, 
SIOCGLIFDSTADDR, (char *)&lifr) < 0) { - if (errno != ENXIO) { - logperror_li(li, "logint_init_from_k:" - " (get dstaddr)"); - } - goto error; - } - if (pii->pii_af == AF_INET) { - sin = (struct sockaddr_in *)&lifr.lifr_addr; - IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &tgaddr); - } else { - sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr; - tgaddr = sin6->sin6_addr; - } - } else { - if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) { - /* Interface may have vanished */ - if (errno != ENXIO) { - logperror_li(li, "logint_init_from_k:" - " (get subnet)"); - } - goto error; - } - if (lifr.lifr_subnet.ss_family == AF_INET6) { - sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet; - test_subnet = sin6->sin6_addr; - test_subnet_len = lifr.lifr_addrlen; - } else { - sin = (struct sockaddr_in *)&lifr.lifr_subnet; - IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet); - test_subnet_len = lifr.lifr_addrlen + - (IPV6_ABITS - IP_ABITS); - } - (void) ip_index_to_mask_v6(test_subnet_len, &test_subnet_mask); - } - - /* - * Also record the OINDEX for completeness. This information is - * not used. 
- */ - if (ioctl(ifsock, SIOCGLIFOINDEX, (char *)&lifr) < 0) { - if (errno != ENXIO) { - logperror_li(li, "logint_init_from_k:" - " (get lifoindex)"); - } + if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) { + /* Interface may have vanished */ + if (errno != ENXIO) + logperror_li(li, "logint_init_from_k: (get subnet)"); goto error; } + if (lifr.lifr_subnet.ss_family == AF_INET6) { + sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet; + test_subnet = sin6->sin6_addr; + test_subnet_len = lifr.lifr_addrlen; + } else { + sin = (struct sockaddr_in *)&lifr.lifr_subnet; + IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet); + test_subnet_len = lifr.lifr_addrlen + (IPV6_ABITS - IP_ABITS); + } /* * If this is the logint corresponding to the test address used for @@ -1454,7 +1814,6 @@ logint_init_from_k(struct phyint_instance *pii, char *li_name) /* Update the logint with the values obtained from the kernel. */ li->li_addr = testaddr; li->li_in_use = 1; - li->li_oifindex = lifr.lifr_index; if (ptp) { li->li_dstaddr = tgaddr; li->li_subnet_len = (pii->pii_af == AF_INET) ? @@ -1530,15 +1889,12 @@ static void logint_print(struct logint *li) { char abuf[INET6_ADDRSTRLEN]; - int af; - - af = li->li_phyint_inst->pii_af; + int af = li->li_phyint_inst->pii_af; logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name, pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len); - logdebug("\tFlags: %llx in_use %d oifindex %d\n", - li->li_flags, li->li_in_use, li->li_oifindex); + logdebug("\tFlags: %llx in_use %d\n", li->li_flags, li->li_in_use); } char * @@ -1555,6 +1911,33 @@ pr_addr(int af, struct in6_addr addr, char *abuf, int len) return (abuf); } +/* + * Fill in the sockaddr_storage pointed to by `ssp' with the IP address + * represented by the [`af',`addr'] pair. Needed because in.mpathd internally + * stores all addresses as in6_addrs, but we don't want to expose that. 
+ */ +void +addr2storage(int af, const struct in6_addr *addr, struct sockaddr_storage *ssp) +{ + struct sockaddr_in *sinp = (struct sockaddr_in *)ssp; + struct sockaddr_in6 *sin6p = (struct sockaddr_in6 *)ssp; + + assert(af == AF_INET || af == AF_INET6); + + switch (af) { + case AF_INET: + (void) memset(sinp, 0, sizeof (*sinp)); + sinp->sin_family = AF_INET; + IN6_V4MAPPED_TO_INADDR(addr, &sinp->sin_addr); + break; + case AF_INET6: + (void) memset(sin6p, 0, sizeof (*sin6p)); + sin6p->sin6_family = AF_INET6; + sin6p->sin6_addr = *addr; + break; + } +} + /* Lookup target on its address */ struct target * target_lookup(struct phyint_instance *pii, struct in6_addr addr) @@ -1686,7 +2069,7 @@ target_select_best(struct phyint_instance *pii) if (tg->tg_latime + MIN_RECOVERY_TIME < now) { slow_recovered = tg; /* - * Promote the slow_recoverd to unused + * Promote the slow_recovered to unused */ tg->tg_status = TG_UNUSED; } else { @@ -1698,7 +2081,7 @@ target_select_best(struct phyint_instance *pii) if (tg->tg_latime + MIN_RECOVERY_TIME < now) { dead_recovered = tg; /* - * Promote the dead_recoverd to slow + * Promote the dead_recovered to slow */ tg->tg_status = TG_SLOW; tg->tg_latime = now; @@ -1798,11 +2181,9 @@ target_create(struct phyint_instance *pii, struct in6_addr addr, /* * If there are multiple subnets associated with an interface, then - * add the target to this phyint instance, only if it belongs to the - * same subnet as the test address. The reason is that interface - * routes derived from non-test-addresses i.e. non-IFF_NOFAILOVER - * addresses, will disappear after failover, and the targets will not - * be reachable from this interface. + * add the target to this phyint instance only if it belongs to the + * same subnet as the test address. This assures us that we will + * be able to reach this target through our routing table. 
*/ if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len)) return; @@ -1906,11 +2287,12 @@ target_add(struct phyint_instance *pii, struct in6_addr addr, /* * If the target does not exist, create it; target_create() will set - * tg_in_use to true. If it exists already, and it is a router - * target, set tg_in_use to to true, so that init_router_targets() - * won't delete it + * tg_in_use to true. Even if it exists already, if it's a router + * target and we'd previously learned of it through multicast, then we + * need to recreate it as a router target. Otherwise, just set + * tg_in_use to true so that init_router_targets() won't delete it. */ - if (tg == NULL) + if (tg == NULL || (is_router && !pii->pii_targets_are_routers)) target_create(pii, addr, is_router); else if (is_router) tg->tg_in_use = 1; @@ -2034,16 +2416,17 @@ target_delete(struct target *tg) * relevant any longer. */ assert(pii->pii_targets == NULL); + pii->pii_targets_are_routers = _B_FALSE; clear_pii_probe_stats(pii); pii_other = phyint_inst_other(pii); /* - * If there are no targets on both instances and the interface is - * online, go back to PI_NOTARGETS state, since we cannot probe this - * phyint any more. For more details, please see phyint state - * diagram in mpd_probe.c. + * If there are no targets on both instances and the interface would + * otherwise be considered PI_RUNNING, go back to PI_NOTARGETS state, + * since we cannot probe this phyint any more. For more details, + * please see phyint state diagram in mpd_probe.c. 
*/ - if (!PROBE_CAPABLE(pii_other) && + if (!PROBE_CAPABLE(pii_other) && LINK_UP(pii->pii_phyint) && pii->pii_phyint->pi_state != PI_OFFLINE) phyint_chstate(pii->pii_phyint, PI_NOTARGETS); } @@ -2101,9 +2484,11 @@ reset_pii_probes(struct phyint_instance *pii, struct target *tg) for (i = 0; i < PROBE_STATS_COUNT; i++) { if (pii->pii_probes[i].pr_target == tg) { + if (pii->pii_probes[i].pr_status == PR_UNACKED) { + probe_chstate(&pii->pii_probes[i], pii, + PR_LOST); + } pii->pii_probes[i].pr_target = NULL; - if (pii->pii_probes[i].pr_status == PR_UNACKED) - pii->pii_probes[i].pr_status = PR_LOST; } } @@ -2132,7 +2517,7 @@ target_print(struct target *tg) af = tg->tg_phyint_inst->pii_af; logdebug("Target on %s %s addr %s\n" - "status %d rtt_sa %d rtt_sd %d crtt %d tg_in_use %d\n", + "status %d rtt_sa %lld rtt_sd %lld crtt %d tg_in_use %d\n", AF_STR(af), tg->tg_phyint_inst->pii_name, pr_addr(af, tg->tg_address, abuf, sizeof (abuf)), tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd, @@ -2158,35 +2543,16 @@ phyint_inst_print_all(void) } /* - * Convert length for a mask to the mask. - */ -static void -ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask) -{ - int j; - - assert(masklen <= IPV6_ABITS); - bzero((char *)bitmask, sizeof (*bitmask)); - - /* Make the 'masklen' leftmost bits one */ - for (j = 0; masklen > 8; masklen -= 8, j++) - bitmask->s6_addr[j] = 0xff; - - bitmask->s6_addr[j] = 0xff << (8 - masklen); - -} - -/* * Compare two prefixes that have the same prefix length. * Fails if the prefix length is unreasonable. 
*/ -static boolean_t -prefix_equal(struct in6_addr p1, struct in6_addr p2, int prefix_len) +boolean_t +prefix_equal(struct in6_addr p1, struct in6_addr p2, uint_t prefix_len) { uchar_t mask; int j; - if (prefix_len < 0 || prefix_len > IPV6_ABITS) + if (prefix_len > IPV6_ABITS) return (_B_FALSE); for (j = 0; prefix_len > 8; prefix_len -= 8, j++) @@ -2202,35 +2568,25 @@ prefix_equal(struct in6_addr p1, struct in6_addr p2, int prefix_len) } /* - * Get the number of UP logints (excluding IFF_NOFAILOVERs), on both - * IPv4 and IPv6 put together. The phyint with the least such number - * will be used as the failover destination, if no standby interface is - * available + * Get the number of UP logints on phyint `pi'. */ -int +static int logint_upcount(struct phyint *pi) { struct logint *li; - struct phyint_instance *pii; int count = 0; - pii = pi->pi_v4; - if (pii != NULL) { - for (li = pii->pii_logint; li != NULL; li = li->li_next) { - if ((li->li_flags & - (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) { + if (pi->pi_v4 != NULL) { + for (li = pi->pi_v4->pii_logint; li != NULL; li = li->li_next) { + if (li->li_flags & IFF_UP) count++; - } } } - pii = pi->pi_v6; - if (pii != NULL) { - for (li = pii->pii_logint; li != NULL; li = li->li_next) { - if ((li->li_flags & - (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) { + if (pi->pi_v6 != NULL) { + for (li = pi->pi_v6->pii_logint; li != NULL; li = li->li_next) { + if (li->li_flags & IFF_UP) count++; - } } } @@ -2250,6 +2606,28 @@ phyint_inst_other(struct phyint_instance *pii) } /* + * Check whether a phyint is functioning. + */ +static boolean_t +phyint_is_functioning(struct phyint *pi) +{ + if (pi->pi_state == PI_RUNNING) + return (_B_TRUE); + return (pi->pi_state == PI_NOTARGETS && !(pi->pi_flags & IFF_FAILED)); +} + +/* + * Check whether a phyint is usable. 
+ */ +static boolean_t +phyint_is_usable(struct phyint *pi) +{ + if (logint_upcount(pi) == 0) + return (_B_FALSE); + return (phyint_is_functioning(pi)); +} + +/* * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'. * Before sending the event, it prepends the current version of the IPMP * sysevent API. Returns 0 on success, -1 on failure (in either case, @@ -2258,16 +2636,18 @@ phyint_inst_other(struct phyint_instance *pii) static int post_event(const char *subclass, nvlist_t *nvl) { - sysevent_id_t eid; + static evchan_t *evchp = NULL; /* - * Since sysevents don't work yet in non-global zones, there cannot - * possibly be any consumers yet, so don't bother trying to generate - * them. (Otherwise, we'll spew warnings.) + * Initialize the event channel if we haven't already done so. */ - if (getzoneid() != GLOBAL_ZONEID) { - nvlist_free(nvl); - return (0); + if (evchp == NULL) { + errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &evchp, EVCH_CREAT); + if (errno != 0) { + logerr("cannot create event channel `%s': %s\n", + IPMP_EVENT_CHAN, strerror(errno)); + goto failed; + } } errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION, @@ -2278,8 +2658,9 @@ post_event(const char *subclass, nvlist_t *nvl) goto failed; } - if (sysevent_post_event(EC_IPMP, (char *)subclass, SUNW_VENDOR, - "in.mpathd", nvl, &eid) == -1) { + errno = sysevent_evc_publish(evchp, EC_IPMP, subclass, "com.sun", + "in.mpathd", nvl, EVCH_NOSLEEP); + if (errno != 0) { logerr("cannot send `%s' event: %s\n", subclass, strerror(errno)); goto failed; @@ -2300,6 +2681,8 @@ ifstate(struct phyint *pi) { switch (pi->pi_state) { case PI_NOTARGETS: + if (pi->pi_flags & IFF_FAILED) + return (IPMP_IF_FAILED); return (IPMP_IF_UNKNOWN); case PI_OFFLINE: @@ -2330,12 +2713,203 @@ iftype(struct phyint *pi) } /* + * Return the external IPMP link state associated with phyint `pi'. 
+ */ +static ipmp_if_linkstate_t +iflinkstate(struct phyint *pi) +{ + if (!(pi->pi_notes & (DL_NOTE_LINK_UP|DL_NOTE_LINK_DOWN))) + return (IPMP_LINK_UNKNOWN); + + return (LINK_DOWN(pi) ? IPMP_LINK_DOWN : IPMP_LINK_UP); +} + +/* + * Return the external IPMP probe state associated with phyint `pi'. + */ +static ipmp_if_probestate_t +ifprobestate(struct phyint *pi) +{ + if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) + return (IPMP_PROBE_DISABLED); + + if (pi->pi_state == PI_FAILED) + return (IPMP_PROBE_FAILED); + + if (!PROBE_CAPABLE(pi->pi_v4) && !PROBE_CAPABLE(pi->pi_v6)) + return (IPMP_PROBE_UNKNOWN); + + return (IPMP_PROBE_OK); +} + +/* + * Return the external IPMP target mode associated with phyint instance `pii'. + */ +static ipmp_if_targmode_t +iftargmode(struct phyint_instance *pii) +{ + if (!PROBE_ENABLED(pii)) + return (IPMP_TARG_DISABLED); + else if (pii->pii_targets_are_routers) + return (IPMP_TARG_ROUTES); + else + return (IPMP_TARG_MULTICAST); +} + +/* + * Return the external IPMP flags associated with phyint `pi'. + */ +static ipmp_if_flags_t +ifflags(struct phyint *pi) +{ + ipmp_if_flags_t flags = 0; + + if (logint_upcount(pi) == 0) + flags |= IPMP_IFFLAG_DOWN; + if (pi->pi_flags & IFF_INACTIVE) + flags |= IPMP_IFFLAG_INACTIVE; + if (pi->pi_hwaddrdup) + flags |= IPMP_IFFLAG_HWADDRDUP; + if (phyint_is_functioning(pi) && flags == 0) + flags |= IPMP_IFFLAG_ACTIVE; + + return (flags); +} + +/* + * Store the test address used on phyint instance `pii' in `ssp'. If there's + * no test address, 0.0.0.0 is stored. + */ +static struct sockaddr_storage * +iftestaddr(struct phyint_instance *pii, struct sockaddr_storage *ssp) +{ + if (PROBE_ENABLED(pii)) + addr2storage(pii->pii_af, &pii->pii_probe_logint->li_addr, ssp); + else + addr2storage(AF_INET6, &in6addr_any, ssp); + + return (ssp); +} + +/* * Return the external IPMP group state associated with phyint group `pg'. 
*/ static ipmp_group_state_t groupstate(struct phyint_group *pg) { - return (GROUP_FAILED(pg) ? IPMP_GROUP_FAILED : IPMP_GROUP_OK); + switch (pg->pg_state) { + case PG_FAILED: + return (IPMP_GROUP_FAILED); + case PG_DEGRADED: + return (IPMP_GROUP_DEGRADED); + case PG_OK: + return (IPMP_GROUP_OK); + } + + logerr("groupstate: unknown state %d; aborting\n", pg->pg_state); + abort(); + /* NOTREACHED */ +} + +/* + * Return the external IPMP probe state associated with probe `ps'. + */ +static ipmp_probe_state_t +probestate(struct probe_stats *ps) +{ + switch (ps->pr_status) { + case PR_UNUSED: + case PR_LOST: + return (IPMP_PROBE_LOST); + case PR_UNACKED: + return (IPMP_PROBE_SENT); + case PR_ACKED: + return (IPMP_PROBE_ACKED); + } + + logerr("probestate: unknown state %d; aborting\n", ps->pr_status); + abort(); + /* NOTREACHED */ +} + +/* + * Generate an ESC_IPMP_PROBE_STATE sysevent for the probe described by `pr' + * on phyint instance `pii'. Returns 0 on success, -1 on failure. + */ +int +probe_state_event(struct probe_stats *pr, struct phyint_instance *pii) +{ + nvlist_t *nvl; + hrtime_t proc_time = 0, recv_time = 0; + struct sockaddr_storage ss; + struct target *tg = pr->pr_target; + + errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0); + if (errno != 0) { + logperror("cannot create `interface change' event"); + return (-1); + } + + errno = nvlist_add_uint32(nvl, IPMP_PROBE_ID, pr->pr_id); + if (errno != 0) + goto failed; + + errno = nvlist_add_string(nvl, IPMP_IF_NAME, pii->pii_phyint->pi_name); + if (errno != 0) + goto failed; + + errno = nvlist_add_uint32(nvl, IPMP_PROBE_STATE, probestate(pr)); + if (errno != 0) + goto failed; + + errno = nvlist_add_hrtime(nvl, IPMP_PROBE_START_TIME, + pr->pr_hrtime_start); + if (errno != 0) + goto failed; + + errno = nvlist_add_hrtime(nvl, IPMP_PROBE_SENT_TIME, + pr->pr_hrtime_sent); + if (errno != 0) + goto failed; + + if (pr->pr_status == PR_ACKED) { + recv_time = pr->pr_hrtime_ackrecv; + proc_time = pr->pr_hrtime_ackproc; + } + 
+ errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKRECV_TIME, recv_time); + if (errno != 0) + goto failed; + + errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKPROC_TIME, proc_time); + if (errno != 0) + goto failed; + + if (tg != NULL) + addr2storage(pii->pii_af, &tg->tg_address, &ss); + else + addr2storage(pii->pii_af, &in6addr_any, &ss); + + errno = nvlist_add_byte_array(nvl, IPMP_PROBE_TARGET, (uchar_t *)&ss, + sizeof (ss)); + if (errno != 0) + goto failed; + + errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTAVG, + tg->tg_rtt_sa / 8); + if (errno != 0) + goto failed; + + errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTDEV, + tg->tg_rtt_sd / 4); + if (errno != 0) + goto failed; + + return (post_event(ESC_IPMP_PROBE_STATE, nvl)); +failed: + logperror("cannot create `probe state' event"); + nvlist_free(nvl); + return (-1); } /* @@ -2529,10 +3103,15 @@ gensig(void) unsigned int getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp) { - struct phyint_group *pg; struct phyint *pi; + struct phyint_group *pg; char (*ifs)[LIFNAMSIZ]; - unsigned int nif, i; + unsigned int i, j; + unsigned int nif = 0, naddr = 0; + lifgroupinfo_t lifgr; + addrlist_t *addrp; + struct sockaddr_storage *addrs; + int fdt = 0; pg = phyint_group_lookup(grname); if (pg == NULL) @@ -2540,39 +3119,143 @@ getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp) /* * Tally up the number of interfaces, allocate an array to hold them, - * and insert their names into the array. + * and insert their names into the array. While we're at it, if any + * interface is actually enabled to send probes, save the group fdt. 
*/ - for (nif = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) + for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) nif++; ifs = alloca(nif * sizeof (*ifs)); for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) { assert(i < nif); (void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ); + if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) + fdt = pg->pg_fdt; } assert(i == nif); - *grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, - groupstate(pg), nif, ifs); + /* + * If this is the anonymous group, there's no other information to + * collect (since there's no IPMP interface). + */ + if (pg == phyint_anongroup) { + *grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt, + groupstate(pg), nif, ifs, "", "", "", "", 0, NULL); + return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); + } + + /* + * Grab some additional information about the group from the kernel. + * (NOTE: since SIOCGLIFGROUPINFO does not look up by interface name, + * we can use ifsock_v4 even for a V6-only group.) + */ + (void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ); + if (ioctl(ifsock_v4, SIOCGLIFGROUPINFO, &lifgr) == -1) { + if (errno == ENOENT) + return (IPMP_EUNKGROUP); + + logperror("getgroupinfo: SIOCGLIFGROUPINFO"); + return (IPMP_FAILURE); + } + + /* + * Tally up the number of data addresses, allocate an array to hold + * them, and insert their values into the array. + */ + for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) + naddr++; + + addrs = alloca(naddr * sizeof (*addrs)); + i = 0; + for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) { + /* + * It's possible to have duplicate addresses (if some are + * down). Weed the dups out to avoid confusing consumers. + * (If groups start having tons of addresses, we'll need a + * better algorithm here.) 
+ */ + for (j = 0; j < i; j++) { + if (sockaddrcmp(&addrs[j], &addrp->al_addr)) + break; + } + if (j == i) { + assert(i < naddr); + addrs[i++] = addrp->al_addr; + } + } + naddr = i; + + *grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt, + groupstate(pg), nif, ifs, lifgr.gi_grifname, lifgr.gi_m4ifname, + lifgr.gi_m6ifname, lifgr.gi_bcifname, naddr, addrs); return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); } /* + * Store the target information associated with phyint instance `pii' into a + * dynamically allocated structure pointed to by `*targinfopp'. Returns an + * IPMP error code. + */ +unsigned int +gettarginfo(struct phyint_instance *pii, const char *name, + ipmp_targinfo_t **targinfopp) +{ + uint_t ntarg = 0; + struct target *tg; + struct sockaddr_storage ss; + struct sockaddr_storage *targs = NULL; + + if (PROBE_CAPABLE(pii)) { + targs = alloca(pii->pii_ntargets * sizeof (*targs)); + tg = pii->pii_target_next; + do { + if (tg->tg_status == TG_ACTIVE) { + assert(ntarg < pii->pii_ntargets); + addr2storage(pii->pii_af, &tg->tg_address, + &targs[ntarg++]); + } + if ((tg = tg->tg_next) == NULL) + tg = pii->pii_targets; + } while (tg != pii->pii_target_next); + + assert(ntarg == pii->pii_ntargets); + } + + *targinfopp = ipmp_targinfo_create(name, iftestaddr(pii, &ss), + iftargmode(pii), ntarg, targs); + return (*targinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); +} + +/* * Store the information associated with interface `ifname' into a dynamically * allocated structure pointed to by `*ifinfopp'. Returns an IPMP error code. 
*/ unsigned int getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp) { + int retval; struct phyint *pi; + ipmp_targinfo_t *targinfo4; + ipmp_targinfo_t *targinfo6; pi = phyint_lookup(ifname); if (pi == NULL) return (IPMP_EUNKIF); + if ((retval = gettarginfo(pi->pi_v4, pi->pi_name, &targinfo4)) != 0 || + (retval = gettarginfo(pi->pi_v6, pi->pi_name, &targinfo6)) != 0) + goto out; + *ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name, - ifstate(pi), iftype(pi)); - return (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); + ifstate(pi), iftype(pi), iflinkstate(pi), ifprobestate(pi), + ifflags(pi), targinfo4, targinfo6); + retval = (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); +out: + if (targinfo4 != NULL) + ipmp_freetarginfo(targinfo4); + if (targinfo6 != NULL) + ipmp_freetarginfo(targinfo6); + return (retval); } /* @@ -2605,6 +3288,54 @@ getgrouplist(ipmp_grouplist_t **grlistpp) } /* + * Store the address information for `ssp' (in group `grname') into a + * dynamically allocated structure pointed to by `*adinfopp'. Returns an IPMP + * error code. (We'd call this function getaddrinfo(), but it would conflict + * with getaddrinfo(3SOCKET)). + */ +unsigned int +getgraddrinfo(const char *grname, struct sockaddr_storage *ssp, + ipmp_addrinfo_t **adinfopp) +{ + int ifsock; + addrlist_t *addrp, *addrmatchp = NULL; + ipmp_addr_state_t state; + const char *binding = ""; + struct lifreq lifr; + struct phyint_group *pg; + + if ((pg = phyint_group_lookup(grname)) == NULL) + return (IPMP_EUNKADDR); + + /* + * Walk through the data addresses, and find a match. Note that since + * some of the addresses may be down, more than one may match. We + * prefer an up address (if one exists). 
+ */ + for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) { + if (sockaddrcmp(ssp, &addrp->al_addr)) { + addrmatchp = addrp; + if (addrmatchp->al_flags & IFF_UP) + break; + } + } + + if (addrmatchp == NULL) + return (IPMP_EUNKADDR); + + state = (addrmatchp->al_flags & IFF_UP) ? IPMP_ADDR_UP : IPMP_ADDR_DOWN; + if (state == IPMP_ADDR_UP) { + ifsock = (ssp->ss_family == AF_INET) ? ifsock_v4 : ifsock_v6; + (void) strlcpy(lifr.lifr_name, addrmatchp->al_name, LIFNAMSIZ); + if (ioctl(ifsock, SIOCGLIFBINDING, &lifr) >= 0) + binding = lifr.lifr_binding; + } + + *adinfopp = ipmp_addrinfo_create(ssp, state, pg->pg_name, binding); + return (*adinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); +} + +/* * Store a snapshot of the IPMP subsystem into a dynamically allocated * structure pointed to by `*snapp'. Returns an IPMP error code. */ @@ -2613,10 +3344,12 @@ getsnap(ipmp_snap_t **snapp) { ipmp_grouplist_t *grlistp; ipmp_groupinfo_t *grinfop; + ipmp_addrinfo_t *adinfop; + ipmp_addrlist_t *adlistp; ipmp_ifinfo_t *ifinfop; ipmp_snap_t *snap; struct phyint *pi; - unsigned int i; + unsigned int i, j; int retval; snap = ipmp_snap_create(); @@ -2627,26 +3360,37 @@ getsnap(ipmp_snap_t **snapp) * Add group list. */ retval = getgrouplist(&snap->sn_grlistp); - if (retval != IPMP_SUCCESS) { - ipmp_snap_free(snap); - return (retval); - } + if (retval != IPMP_SUCCESS) + goto failed; /* - * Add information for each group in the list. + * Add information for each group in the list, along with all of its + * data addresses. 
*/ grlistp = snap->sn_grlistp; for (i = 0; i < grlistp->gl_ngroup; i++) { retval = getgroupinfo(grlistp->gl_groups[i], &grinfop); - if (retval != IPMP_SUCCESS) { - ipmp_snap_free(snap); - return (retval); - } + if (retval != IPMP_SUCCESS) + goto failed; + retval = ipmp_snap_addgroupinfo(snap, grinfop); if (retval != IPMP_SUCCESS) { ipmp_freegroupinfo(grinfop); - ipmp_snap_free(snap); - return (retval); + goto failed; + } + + adlistp = grinfop->gr_adlistp; + for (j = 0; j < adlistp->al_naddr; j++) { + retval = getgraddrinfo(grinfop->gr_name, + &adlistp->al_addrs[j], &adinfop); + if (retval != IPMP_SUCCESS) + goto failed; + + retval = ipmp_snap_addaddrinfo(snap, adinfop); + if (retval != IPMP_SUCCESS) { + ipmp_freeaddrinfo(adinfop); + goto failed; + } } } @@ -2655,18 +3399,19 @@ getsnap(ipmp_snap_t **snapp) */ for (pi = phyints; pi != NULL; pi = pi->pi_next) { retval = getifinfo(pi->pi_name, &ifinfop); - if (retval != IPMP_SUCCESS) { - ipmp_snap_free(snap); - return (retval); - } + if (retval != IPMP_SUCCESS) + goto failed; + retval = ipmp_snap_addifinfo(snap, ifinfop); if (retval != IPMP_SUCCESS) { ipmp_freeifinfo(ifinfop); - ipmp_snap_free(snap); - return (retval); + goto failed; } } *snapp = snap; return (IPMP_SUCCESS); +failed: + ipmp_snap_free(snap); + return (retval); } diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h index e4be3ccb30..39da2c3f1b 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _MPD_TABLES_H #define _MPD_TABLES_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -47,20 +45,11 @@ extern "C" { * switch AND * (ii) share the same phyint group name. 
* Load spreading and failover occur across members of the same phyint group. - * phyint group members must be homogenous. i.e. if a phyint belonging to a + * phyint group members must be homogeneous. i.e. if a phyint belonging to a * phyint group has a IPv6 protocol instance, then all members of the phyint * group, must have IPv6 protocol instances. (struct phyint_group) */ -/* - * Parameter passed to try_failover(), indicating the type of failover - * that is requested. - */ -#define FAILOVER_NORMAL 1 /* Failover to another phyint */ - /* that is preferably a standby */ -#define FAILOVER_TO_NONSTANDBY 2 /* Failover to non-standby phyint */ -#define FAILOVER_TO_ANY 3 /* Failover to any available phyint */ - #define MAXDEFERREDRTT 1 /* Maximum number of deferred rtts */ /* @@ -79,15 +68,9 @@ extern "C" { #define PI_IOCTL_ERROR 4 /* Some ioctl error */ #define PI_GROUP_CHANGED 5 /* The phyint has changed group. */ -/* - * Though IFF_POINTOPOINT is a logint property, for the purpose of - * failover, we treat it as a phyint property. Note that we cannot failover - * individual logints. - */ #define PHYINT_FLAGS(flags) \ - (((flags) & (IFF_STANDBY | IFF_INACTIVE | IFF_FAILED | IFF_OFFLINE | \ - IFF_POINTOPOINT | IFF_RUNNING)) | (handle_link_notifications ? \ - 0 : IFF_RUNNING)) + (((flags) & (IFF_STANDBY | IFF_INACTIVE | IFF_FAILED | IFF_OFFLINE | \ + IFF_RUNNING)) | (handle_link_notifications ? 0 : IFF_RUNNING)) /* A Phyint can have up to 2 instances, the IPv4 and the IPv6 instance */ #define PHYINT_INSTANCE(pi, af) \ @@ -152,29 +135,32 @@ extern "C" { * Phyint group states; see below for the phyint group definition. 
*/ enum pg_state { - PG_RUNNING = 1, /* at least one interface in group is working */ - PG_FAILED = 2 /* group has failed completely */ + PG_OK = 1, /* all interfaces in the group are working */ + PG_DEGRADED, /* some interfaces in the group are unusable */ + PG_FAILED /* all interfaces in the group are unusable */ }; /* * Convenience macro to check if the whole group has failed. */ -#define GROUP_FAILED(pg) ((pg)->pg_groupfailed) +#define GROUP_FAILED(pg) ((pg)->pg_state == PG_FAILED) /* * A doubly linked list of all phyint groups in the system. * A phyint group is identified by its group name. */ struct phyint_group { - char pg_name[LIFNAMSIZ + 1]; /* Phyint group name */ + char pg_name[LIFGRNAMSIZ]; /* Phyint group name */ struct phyint *pg_phyint; /* List of phyints in this group */ struct phyint_group *pg_next; /* Next phyint group */ struct phyint_group *pg_prev; /* Prev phyint group */ - uint64_t pg_sig; /* Current signature of this group */ - int pg_probeint; /* Interval between probes */ - int pg_fdt; /* Time needed to detect failure */ - uint_t - pg_groupfailed : 1; /* The whole group has failed */ + uint64_t pg_sig; /* Current signature of this group */ + int pg_probeint; /* Interval between probes */ + int pg_fdt; /* Time needed to detect failure */ + enum pg_state pg_state; /* Current group state */ + boolean_t pg_in_use; /* To detect removed groups */ + struct addrlist *pg_addrs; /* Data addresses in this group */ + boolean_t pg_failmsg_printed; /* Group failure msg printed */ }; /* @@ -207,6 +193,11 @@ struct phyint { uint16_t pi_icmpid; /* icmp id in icmp echo request */ uint64_t pi_taddrthresh; /* time (in secs) to delay logging */ /* about missing test addresses */ + dlpi_handle_t pi_dh; /* DLPI handle to underlying link */ + uint_t pi_notes; /* enabled DLPI notifications */ + uchar_t pi_hwaddr[DLPI_PHYSADDR_MAX]; /* phyint's hw address */ + size_t pi_hwaddrlen; /* phyint's hw address length */ + /* * The pi_whenup array is a circular buffer of 
the most recent * times (in milliseconds since some arbitrary point of time in @@ -217,14 +208,12 @@ struct phyint { unsigned int pi_whendx; uint_t - pi_empty : 1, /* failover done, empty */ - pi_full : 1, /* failback done, full */ - /* More details in probe.c */ pi_taddrmsg_printed : 1, /* testaddr msg printed */ pi_duptaddrmsg_printed : 1, /* dup testaddr msg printed */ pi_cfgmsg_printed : 1, /* bad config msg printed */ pi_lfmsg_printed : 1, /* link-flapping msg printed */ - pi_link_state : 1; /* interface link state */ + pi_link_state : 1, /* interface link state */ + pi_hwaddrdup : 1; /* disabled due to dup hw address */ }; /* @@ -260,19 +249,19 @@ struct phyint_instance { uint64_t pii_flags; /* Phyint flags from kernel */ struct probe_stats { - struct target *pr_target; /* Probe Target */ - uint_t pr_time_sent; /* Time probe was sent */ + uint_t pr_id; /* Full ID of probe */ + struct target *pr_target; /* Probe Target */ + uint_t pr_time_lost; /* Time probe declared lost */ + struct timeval pr_tv_sent; /* Wall time probe was sent */ + hrtime_t pr_hrtime_start; /* hrtime probe op started */ + hrtime_t pr_hrtime_sent; /* hrtime probe was sent */ + hrtime_t pr_hrtime_ackrecv; /* hrtime probe ack received */ + hrtime_t pr_hrtime_ackproc; /* hrtime probe ack processed */ uint_t pr_status; /* probe status as below */ #define PR_UNUSED 0 /* Probe slot unused */ #define PR_UNACKED 1 /* Probe is unacknowledged */ #define PR_ACKED 2 /* Probe has been acknowledged */ #define PR_LOST 3 /* Probe is declared lost */ - union { - uint_t tl; /* time probe is declared lost */ - uint_t ta; /* time probe is acked */ - } prt; -#define pr_time_lost prt.tl -#define pr_time_acked prt.ta } pii_probes[PROBE_STATS_COUNT]; uint_t @@ -319,7 +308,6 @@ struct logint { struct in6_addr li_subnet; /* prefix / subnet */ uint_t li_subnet_len; /* prefix / subnet length */ uint64_t li_flags; /* IFF_* flags */ - uint_t li_oifindex; /* original ifindex (SIOCGLIFOINDEX) */ uint_t li_in_use : 1, /* 
flag to detect deleted logints */ li_dupaddr : 1; /* test address is not unique */ @@ -345,12 +333,12 @@ struct target { #define TG_DEAD 4 /* Target is not responding */ hrtime_t tg_latime; /* Target's last active time */ - int tg_rtt_sa; /* Scaled round trip time(RTT) avg. */ - int tg_rtt_sd; /* Scaled RTT deviation */ - int tg_crtt; /* Conservative RTT = A + 4D */ + int64_t tg_rtt_sa; /* Scaled RTT average (in ns) */ + int64_t tg_rtt_sd; /* Scaled RTT deviation (in ns) */ + int tg_crtt; /* Conservative RTT = A + 4D (in ms) */ uint32_t tg_in_use : 1; /* In use flag */ - int tg_deferred[MAXDEFERREDRTT + 1]; + int64_t tg_deferred[MAXDEFERREDRTT + 1]; /* Deferred rtt data points */ int tg_num_deferred; /* Number of deferred rtt data points */ @@ -393,19 +381,20 @@ struct probe_success_count struct probes_missed { uint_t pm_nprobes; /* Cumulative number of missed probes */ - uint_t pm_ntimes; /* Total number of occassions */ + uint_t pm_ntimes; /* Total number of occasions */ }; -struct local_addr -{ - struct in6_addr addr; - struct local_addr *next; -}; +typedef struct addrlist { + struct addrlist *al_next; /* next address */ + char al_name[LIFNAMSIZ]; /* address lif name */ + uint64_t al_flags; /* address flags */ + struct sockaddr_storage al_addr; /* address */ +} addrlist_t; /* * Globals */ -extern struct local_addr *laddr_list; +extern addrlist_t *localaddrs; /* List of all local addresses, including local zones */ extern struct phyint *phyints; /* List of all phyints */ extern struct phyint_group *phyint_groups; /* List of all phyint groups */ @@ -428,10 +417,19 @@ extern void phyint_inst_delete(struct phyint_instance *pii); extern uint_t phyint_inst_timer(struct phyint_instance *pii); extern boolean_t phyint_inst_sockinit(struct phyint_instance *pii); -extern void phyint_newtype(struct phyint *pi); +extern void phyint_changed(struct phyint *pi); extern void phyint_chstate(struct phyint *pi, enum pi_state state); extern void phyint_group_chstate(struct 
phyint_group *pg, enum pg_state state); +extern struct phyint_group *phyint_group_create(const char *pg_name); +extern struct phyint_group *phyint_group_lookup(const char *pg_name); +extern void phyint_group_insert(struct phyint_group *pg); +extern void phyint_group_delete(struct phyint_group *pg); +extern void phyint_group_refresh_state(struct phyint_group *pg); extern void phyint_check_for_repair(struct phyint *pi); +extern void phyint_transition_to_running(struct phyint *pi); +extern void phyint_activate_another(struct phyint *pi); +extern int phyint_offline(struct phyint *pi, unsigned int); +extern int phyint_undo_offline(struct phyint *pi); extern void logint_init_from_k(struct phyint_instance *pii, char *li_name); extern void logint_delete(struct logint *li); @@ -448,34 +446,40 @@ extern void target_add(struct phyint_instance *pii, struct in6_addr addr, extern void in_data(struct phyint_instance *pii); extern void in6_data(struct phyint_instance *pii); -extern int try_failover(struct phyint *pi, int failover_type); -extern int try_failback(struct phyint *pi); -extern int do_failback(struct phyint *pi); -extern boolean_t change_lif_flags(struct phyint *pi, uint64_t flags, - boolean_t setfl); - extern void logperror_pii(struct phyint_instance *pii, const char *str); extern void logperror_li(struct logint *li, const char *str); extern char *pr_addr(int af, struct in6_addr addr, char *abuf, int len); +extern void addr2storage(int af, const struct in6_addr *addr, + struct sockaddr_storage *ssp); extern void phyint_inst_print_all(void); +extern boolean_t prefix_equal(struct in6_addr, struct in6_addr, uint_t); -extern int logint_upcount(struct phyint *pi); -extern void restore_phyint(struct phyint *pi); extern void reset_crtt_all(struct phyint *pi); extern int failure_state(struct phyint_instance *pii); extern void process_link_state_changes(void); extern void clear_pii_probe_stats(struct phyint_instance *pii); extern void start_timer(struct phyint_instance *pii); 
+extern void stop_probing(struct phyint *pi); extern boolean_t own_address(struct in6_addr addr); +extern boolean_t change_pif_flags(struct phyint *pi, uint64_t set, + uint64_t clear); extern void close_probe_socket(struct phyint_instance *pii, boolean_t flag); +extern int probe_state_event(struct probe_stats *, struct phyint_instance *); +extern void probe_chstate(struct probe_stats *, struct phyint_instance *, int); +extern unsigned int getgraddrinfo(const char *, struct sockaddr_storage *, + ipmp_addrinfo_t **); extern unsigned int getifinfo(const char *, ipmp_ifinfo_t **); extern unsigned int getgroupinfo(const char *, ipmp_groupinfo_t **); extern unsigned int getgrouplist(ipmp_grouplist_t **); extern unsigned int getsnap(ipmp_snap_t **); +extern boolean_t addrlist_add(addrlist_t **, const char *, uint64_t, + struct sockaddr_storage *); +extern void addrlist_free(addrlist_t **); + #ifdef __cplusplus } #endif diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c index 27716cabce..703ddcfaad 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c @@ -17,14 +17,11 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "defs.h" #include "tables.h" #include <fcntl.h> @@ -122,7 +119,7 @@ sendpacket(struct sockaddr_in6 *sin6, int sock, int size, int flags) char abuf[INET6_ADDRSTRLEN]; cc = sendto(sock, (char *)packet, size, flags, - (struct sockaddr *)sin6, sizeof (*sin6)); + (struct sockaddr *)sin6, sizeof (*sin6)); if (cc < 0 || cc != size) { if (cc < 0) { logperror("sendpacket: sendto"); @@ -135,6 +132,32 @@ sendpacket(struct sockaddr_in6 *sin6, int sock, int size, int flags) } } +/* + * If possible, place an ND_OPT_SOURCE_LINKADDR option at `optp'. + * Return the number of bytes placed in the option. + */ +static uint_t +add_opt_lla(struct phyint *pi, struct nd_opt_lla *optp) +{ + uint_t optlen; + uint_t hwaddrlen; + struct lifreq lifr; + + /* If this phyint doesn't have a link-layer address, bail */ + if (phyint_get_lla(pi, &lifr) == -1) + return (0); + + hwaddrlen = lifr.lifr_nd.lnr_hdw_len; + /* roundup to multiple of 8 and make padding zero */ + optlen = ((sizeof (struct nd_opt_hdr) + hwaddrlen + 7) / 8) * 8; + bzero(optp, optlen); + optp->nd_opt_lla_type = ND_OPT_SOURCE_LINKADDR; + optp->nd_opt_lla_len = optlen / 8; + bcopy(lifr.lifr_nd.lnr_hdw_addr, optp->nd_opt_lla_hdw_addr, hwaddrlen); + + return (optlen); +} + /* Send a Router Solicitation */ static void solicit(struct sockaddr_in6 *sin6, struct phyint *pi) @@ -151,24 +174,8 @@ solicit(struct sockaddr_in6 *sin6, struct phyint *pi) packetlen += sizeof (*rs); pptr += sizeof (*rs); - /* Attach any options */ - if (pi->pi_hdw_addr_len != 0) { - struct nd_opt_lla *lo = (struct nd_opt_lla *)pptr; - int optlen; - - /* roundup to multiple of 8 and make padding zero */ - optlen = ((sizeof (struct nd_opt_hdr) + - pi->pi_hdw_addr_len + 7) / 8) * 8; - bzero(pptr, optlen); - - lo->nd_opt_lla_type = ND_OPT_SOURCE_LINKADDR; - lo->nd_opt_lla_len = optlen / 8; - bcopy((char *)pi->pi_hdw_addr, - (char *)lo->nd_opt_lla_hdw_addr, - pi->pi_hdw_addr_len); - packetlen += optlen; - pptr += 
optlen; - } + /* add options */ + packetlen += add_opt_lla(pi, (struct nd_opt_lla *)pptr); if (debug & D_PKTOUT) { print_route_sol("Sending solicitation to ", pi, rs, packetlen, @@ -224,24 +231,9 @@ advertise(struct sockaddr_in6 *sin6, struct phyint *pi, boolean_t no_prefixes) return; } - /* Attach any options */ - if (pi->pi_hdw_addr_len != 0) { - struct nd_opt_lla *lo = (struct nd_opt_lla *)pptr; - int optlen; - - /* roundup to multiple of 8 and make padding zero */ - optlen = ((sizeof (struct nd_opt_hdr) + - pi->pi_hdw_addr_len + 7) / 8) * 8; - bzero(pptr, optlen); - - lo->nd_opt_lla_type = ND_OPT_SOURCE_LINKADDR; - lo->nd_opt_lla_len = optlen / 8; - bcopy((char *)pi->pi_hdw_addr, - (char *)lo->nd_opt_lla_hdw_addr, - pi->pi_hdw_addr_len); - packetlen += optlen; - pptr += optlen; - } + /* add options */ + packetlen += add_opt_lla(pi, (struct nd_opt_lla *)pptr); + pptr = (char *)packet + packetlen; if (pi->pi_AdvLinkMTU != 0) { struct nd_opt_mtu *mo = (struct nd_opt_mtu *)pptr; @@ -1671,10 +1663,10 @@ process_rtsock(int rtsock) return; } - if (ifm->ifm_flags != pi->pi_flags) { + if (ifm->ifm_flags != (uint_t)pi->pi_flags) { if (debug & D_IFSCAN) { logmsg(LOG_DEBUG, "process_rtsock: clr for " - "%s old flags 0x%x new flags 0x%x\n", + "%s old flags 0x%llx new flags 0x%x\n", pi->pi_name, pi->pi_flags, ifm->ifm_flags); } } @@ -1825,141 +1817,67 @@ process_mibsock(int mibsock) } /* - * Check whether the address formed by pr->pr_prefix and pi_token - * exists in the kernel. Cannot call SIOCTMYADDR/ONLINK as it - * does not check for down addresses. This function should not - * be called for onlink prefixes. 
- */ -static boolean_t -is_address_present(struct phyint *pi, struct prefix *pr, uint64_t flags) -{ - int s; - in6_addr_t addr, *token; - int i; - int ret; - struct sockaddr_in6 sin6; - - s = socket(AF_INET6, SOCK_DGRAM, 0); - if (s < 0) { - logperror("is_address_present: socket"); - /* - * By returning B_TRUE, we make the caller delete - * the prefix from the internal table. In the worst - * case the next RA will create the prefix. - */ - return (_B_TRUE); - } - if (flags & IFF_TEMPORARY) - token = &pi->pi_tmp_token; - else - token = &pi->pi_token; - for (i = 0; i < 16; i++) { - /* - * prefix_create ensures that pr_prefix has all-zero - * bits after prefixlen. - */ - addr.s6_addr[i] = pr->pr_prefix.s6_addr[i] | token->s6_addr[i]; - } - (void) memset(&sin6, 0, sizeof (struct sockaddr_in6)); - sin6.sin6_family = AF_INET6; - sin6.sin6_addr = addr; - ret = bind(s, (struct sockaddr *)&sin6, sizeof (struct sockaddr_in6)); - (void) close(s); - if (ret < 0 && errno == EADDRNOTAVAIL) - return (_B_FALSE); - else - return (_B_TRUE); -} - -/* * Look if the phyint or one of its prefixes have been removed from * the kernel and take appropriate action. - * Uses {pi,pr}_in_use. + * Uses pr_in_use and pi{,_kernel}_state. */ static void check_if_removed(struct phyint *pi) { - struct prefix *pr; - struct prefix *next_pr; + struct prefix *pr, *next_pr; /* - * Detect phyints that have been removed from the kernel. - * Since we can't recreate it here (would require ifconfig plumb - * logic) we just terminate use of that phyint. - */ - if (!(pi->pi_kernel_state & PI_PRESENT) && - (pi->pi_state & PI_PRESENT)) { - logmsg(LOG_ERR, "Interface %s has been removed from kernel. " - "in.ndpd will no longer use it\n", pi->pi_name); - /* - * Clear state so that should the phyint reappear - * we will start with initial advertisements or - * solicitations. - */ - phyint_cleanup(pi); - } - /* * Detect prefixes which are removed. 
- * - * We remove the prefix in all of the following cases : - * - * 1) Static prefixes are not the ones we create. So, - * just remove it from our tables. - * - * 2) On-link prefixes potentially move to a different - * phyint during failover. As it does not have - * an address, we can't use the logic in is_address_present - * to detect whether it is present in the kernel or not. - * Thus when it is manually removed we don't recreate it. - * - * 3) If there is a token mis-match and this prefix is not - * in the kernel, it means we don't need this prefix on - * this interface anymore. It must have been moved to a - * different interface by in.mpathd. This normally - * happens after a failover followed by a failback (or - * another failover) and we re-read the network - * configuration. For the failover from A to B, we would - * have created state on B about A's address, which will - * not be in use after the subsequent failback. So, we - * remove that prefix here. - * - * 4) If the physical interface is not present, then remove - * the prefix. In the cases where we are advertising - * prefixes, the state is kept in advertisement prefix and - * hence we can delete the prefix. - * - * 5) Similar to case (3), when we failover from A to B, the - * prefix in A will not be in use as it has been moved to B. - * We will delete it from our tables and recreate it when - * it fails back. is_address_present makes sure that the - * address is still valid in kernel. - * - * If none of the above is true, we recreate the prefix as it - * has been manually removed. We do it only when the interface - * is not FAILED or INACTIVE or OFFLINE. + * Static prefixes are just removed from our tables. + * Non-static prefixes are recreated i.e. in.ndpd takes precedence + * over manually removing prefixes via ifconfig. 
*/ for (pr = pi->pi_prefix_list; pr != NULL; pr = next_pr) { next_pr = pr->pr_next; if (!pr->pr_in_use) { - /* Clear PR_AUTO and PR_ONLINK */ + /* Clear everything except PR_STATIC */ pr->pr_kernel_state &= PR_STATIC; - if ((pr->pr_state & PR_STATIC) || - !(pr->pr_state & PR_AUTO) || - !(prefix_token_match(pi, pr, pr->pr_flags)) || - (!(pi->pi_kernel_state & PI_PRESENT)) || - (is_address_present(pi, pr, pr->pr_flags))) { + pr->pr_name[0] = '\0'; + if (pr->pr_state & PR_STATIC) { prefix_delete(pr); - } else if (!(pi->pi_flags & - (IFF_FAILED|IFF_INACTIVE|IFF_OFFLINE)) && - pr->pr_state != pr->pr_kernel_state) { - pr->pr_name[0] = '\0'; + } else if (!(pi->pi_kernel_state & PI_PRESENT)) { + /* + * Ensure that there are no future attempts to + * run prefix_update_k since the phyint is gone. + */ + pr->pr_state = pr->pr_kernel_state; + } else if (pr->pr_state != pr->pr_kernel_state) { logmsg(LOG_INFO, "Prefix manually removed " - "on %s - recreating it!\n", - pi->pi_name); + "on %s; recreating\n", pi->pi_name); prefix_update_k(pr); } } } + + /* + * Detect phyints that have been removed from the kernel, and tear + * down any prefixes we created that are associated with that phyint. + * (NOTE: IPMP depends on in.ndpd tearing down these prefixes so an + * administrator can easily place an IP interface with ADDRCONF'd + * addresses into an IPMP group.) + */ + if (!(pi->pi_kernel_state & PI_PRESENT) && + (pi->pi_state & PI_PRESENT)) { + logmsg(LOG_ERR, "Interface %s has been removed from kernel. " + "in.ndpd will no longer use it\n", pi->pi_name); + + for (pr = pi->pi_prefix_list; pr != NULL; pr = next_pr) { + next_pr = pr->pr_next; + if (pr->pr_state & PR_AUTO) + prefix_delete(pr); + } + + /* + * Clear state so that should the phyint reappear we will + * start with initial advertisements or solicitations. 
+ */ + phyint_cleanup(pi); + } } diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c index 5d64a9303d..0a9e1e6a13 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -383,29 +383,12 @@ incoming_ra(struct phyint *pi, struct nd_router_advert *ra, int len, if (no_loopback && loopback) return; - /* - * If the interface is FAILED or INACTIVE or OFFLINE, don't - * create any addresses on them. in.mpathd assumes that no new - * addresses will appear on these. This implies that we - * won't create any new prefixes advertised by the router - * on FAILED/INACTIVE/OFFLINE interfaces. When the state changes, - * the next RA will create the prefix on this interface. - */ - if (pi->pi_flags & (IFF_FAILED|IFF_INACTIVE|IFF_OFFLINE)) - return; + bzero(&lifr, sizeof (lifr)); + (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; - if (ioctl(pi->pi_sock, SIOCGLIFLNKINFO, (char *)&lifr) < 0) { - if (errno == ENXIO) - return; - logperror_pi(pi, "incoming_ra: SIOCGLIFLNKINFO"); - return; - } if (ra->nd_ra_curhoplimit != CURHOP_UNSPECIFIED && ra->nd_ra_curhoplimit != pi->pi_CurHopLimit) { pi->pi_CurHopLimit = ra->nd_ra_curhoplimit; - lifr.lifr_ifinfo.lir_maxhops = pi->pi_CurHopLimit; set_needed = _B_TRUE; } @@ -460,7 +443,7 @@ incoming_ra(struct phyint *pi, struct nd_router_advert *ra, int len, logmsg(LOG_DEBUG, "incoming_ra: trigger dhcp %s on %s\n", (ra->nd_ra_flags_reserved & ~pi->pi_ra_flags & - ND_RA_FLAG_MANAGED) ? "MANAGED" : "OTHER", + ND_RA_FLAG_MANAGED) ? 
"MANAGED" : "OTHER", pi->pi_name); } pi->pi_ra_flags |= ra->nd_ra_flags_reserved; @@ -999,11 +982,9 @@ incoming_prefix_addrconf_process(struct phyint *pi, struct prefix *pr, * Delete this prefix structure as kernel * does not allow duplicated addresses */ - logmsg(LOG_ERR, "incoming_prefix_addrconf_process: " - "Duplicate prefix %s received on interface %s\n", - inet_ntop(AF_INET6, - (void *)&po->nd_opt_pi_prefix, abuf, + "Duplicate prefix %s received on interface %s\n", + inet_ntop(AF_INET6, &po->nd_opt_pi_prefix, abuf, sizeof (abuf)), pi->pi_name); logmsg(LOG_ERR, "incoming_prefix_addrconf_process: " "Prefix already exists in interface %s\n", @@ -1129,12 +1110,8 @@ incoming_mtu_opt(struct phyint *pi, uchar_t *opt, } pi->pi_LinkMTU = mtu; - (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; - if (ioctl(pi->pi_sock, SIOCGLIFLNKINFO, (char *)&lifr) < 0) { - logperror_pi(pi, "incoming_mtu_opt: SIOCGLIFLNKINFO"); - return; - } + bzero(&lifr, sizeof (lifr)); + (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); lifr.lifr_ifinfo.lir_maxmtu = pi->pi_LinkMTU; if (ioctl(pi->pi_sock, SIOCSLIFLNKINFO, (char *)&lifr) < 0) { logperror_pi(pi, "incoming_mtu_opt: SIOCSLIFLNKINFO"); @@ -1155,33 +1132,33 @@ incoming_lla_opt(struct phyint *pi, uchar_t *opt, struct sockaddr_in6 *sin6; int max_content_len; - if (pi->pi_hdw_addr_len == 0) + /* + * Get our link-layer address length. We may not have one, in which + * case we can just bail. + */ + if (phyint_get_lla(pi, &lifr) != 0) return; /* * Can't remove padding since it is link type specific. - * However, we check against the length of our link-layer - * address. - * Note: assumes that all links have a fixed lengh address. + * However, we check against the length of our link-layer address. + * Note: assumes that all links have a fixed length address. 
*/ max_content_len = lo->nd_opt_lla_len * 8 - sizeof (struct nd_opt_hdr); - if (max_content_len < pi->pi_hdw_addr_len || + if (max_content_len < lifr.lifr_nd.lnr_hdw_len || (max_content_len >= 8 && - max_content_len - 7 > pi->pi_hdw_addr_len)) { + max_content_len - 7 > lifr.lifr_nd.lnr_hdw_len)) { char abuf[INET6_ADDRSTRLEN]; (void) inet_ntop(AF_INET6, (void *)&from->sin6_addr, abuf, sizeof (abuf)); logmsg(LOG_INFO, "lla option from %s on %s too long with bad " - "physaddr length (%d vs. %d bytes)\n", - abuf, pi->pi_name, - max_content_len, pi->pi_hdw_addr_len); + "physaddr length (%d vs. %d bytes)\n", abuf, pi->pi_name, + max_content_len, lifr.lifr_nd.lnr_hdw_len); return; } - lifr.lifr_nd.lnr_hdw_len = pi->pi_hdw_addr_len; - bcopy((char *)lo->nd_opt_lla_hdw_addr, - (char *)lifr.lifr_nd.lnr_hdw_addr, + bcopy(lo->nd_opt_lla_hdw_addr, lifr.lifr_nd.lnr_hdw_addr, lifr.lifr_nd.lnr_hdw_len); sin6 = (struct sockaddr_in6 *)&lifr.lifr_nd.lnr_addr; @@ -1196,8 +1173,7 @@ incoming_lla_opt(struct phyint *pi, uchar_t *opt, lifr.lifr_nd.lnr_state_same_lla = ND_UNCHANGED; lifr.lifr_nd.lnr_state_diff_lla = ND_STALE; lifr.lifr_nd.lnr_flags = isrouter; - (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; + (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); if (ioctl(pi->pi_sock, SIOCLIFSETND, (char *)&lifr) < 0) { logperror_pi(pi, "incoming_lla_opt: SIOCLIFSETND"); return; diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c index c8fc6381b7..09e6137965 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "defs.h" #include "tables.h" @@ -171,6 +169,7 @@ phyint_init_from_k(struct phyint *pi) struct ipv6_mreq v6mcastr; struct lifreq lifr; int fd; + int save_errno; boolean_t newsock; uint_t ttl; struct sockaddr_in6 *sin6; @@ -297,30 +296,6 @@ start_over: pi->pi_dst_token = in6addr_any; } - /* Get link-layer address */ - if (!(pi->pi_flags & IFF_MULTICAST) || - (pi->pi_flags & IFF_POINTOPOINT)) { - pi->pi_hdw_addr_len = 0; - } else { - sin6 = (struct sockaddr_in6 *)&lifr.lifr_nd.lnr_addr; - bzero(sin6, sizeof (struct sockaddr_in6)); - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = pi->pi_ifaddr; - - if (ioctl(fd, SIOCLIFGETND, (char *)&lifr) < 0) { - logperror_pi(pi, "phyint_init_from_k: SIOCLIFGETND"); - goto error; - } - - pi->pi_hdw_addr_len = lifr.lifr_nd.lnr_hdw_len; - - if (lifr.lifr_nd.lnr_hdw_len != 0) { - bcopy((char *)lifr.lifr_nd.lnr_hdw_addr, - (char *)pi->pi_hdw_addr, - lifr.lifr_nd.lnr_hdw_len); - } - } - if (newsock) { icmp6_filter_t filter; int on = 1; @@ -360,8 +335,21 @@ start_over: v6mcastr.ipv6mr_interface = pi->pi_index; if (setsockopt(fd, IPPROTO_IPV6, IPV6_JOIN_GROUP, (char *)&v6mcastr, sizeof (v6mcastr)) < 0) { - logperror_pi(pi, "phyint_init_from_k: " - "setsockopt IPV6_JOIN_GROUP"); + /* + * One benign reason IPV6_JOIN_GROUP could fail is + * when `pi' has been placed into an IPMP group and we + * haven't yet processed the routing socket message + * informing us of its disappearance. As such, if + * it's now in a group, don't print an error. 
+ */ + save_errno = errno; + (void) strlcpy(lifr.lifr_name, pi->pi_name, LIFNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) == -1 || + lifr.lifr_groupname[0] == '\0') { + errno = save_errno; + logperror_pi(pi, "phyint_init_from_k: " + "setsockopt IPV6_JOIN_GROUP"); + } goto error; } pi->pi_state |= PI_JOINED_ALLNODES; @@ -403,8 +391,17 @@ start_over: v6mcastr.ipv6mr_interface = pi->pi_index; if (setsockopt(fd, IPPROTO_IPV6, IPV6_JOIN_GROUP, (char *)&v6mcastr, sizeof (v6mcastr)) < 0) { - logperror_pi(pi, "phyint_init_from_k: setsockopt " - "IPV6_JOIN_GROUP"); + /* + * See IPV6_JOIN_GROUP comment above. + */ + save_errno = errno; + (void) strlcpy(lifr.lifr_name, pi->pi_name, LIFNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) == -1 || + lifr.lifr_groupname[0] == '\0') { + errno = save_errno; + logperror_pi(pi, "phyint_init_from_k: " + "setsockopt IPV6_JOIN_GROUP"); + } goto error; } pi->pi_state |= PI_JOINED_ALLROUTERS; @@ -569,22 +566,16 @@ phyint_print(struct phyint *pi) struct adv_prefix *adv_pr; struct router *dr; char abuf[INET6_ADDRSTRLEN]; - char llabuf[BUFSIZ]; logmsg(LOG_DEBUG, "Phyint %s index %d state %x, kernel %x, " "num routers %d\n", pi->pi_name, pi->pi_index, pi->pi_state, pi->pi_kernel_state, pi->pi_num_k_routers); - logmsg(LOG_DEBUG, "\taddress: %s flags %x\n", + logmsg(LOG_DEBUG, "\taddress: %s flags %llx\n", inet_ntop(AF_INET6, (void *)&pi->pi_ifaddr, abuf, sizeof (abuf)), pi->pi_flags); - logmsg(LOG_DEBUG, "\tsock %d mtu %d hdw_addr len %d <%s>\n", - pi->pi_sock, pi->pi_mtu, pi->pi_hdw_addr_len, - ((pi->pi_hdw_addr_len != 0) ? 
- fmt_lla(llabuf, sizeof (llabuf), pi->pi_hdw_addr, - pi->pi_hdw_addr_len) : "none")); - logmsg(LOG_DEBUG, "\ttoken: len %d %s\n", - pi->pi_token_length, + logmsg(LOG_DEBUG, "\tsock %d mtu %d\n", pi->pi_sock, pi->pi_mtu); + logmsg(LOG_DEBUG, "\ttoken: len %d %s\n", pi->pi_token_length, inet_ntop(AF_INET6, (void *)&pi->pi_token, abuf, sizeof (abuf))); if (pi->pi_TmpAddrsEnabled) { @@ -632,6 +623,43 @@ phyint_print(struct phyint *pi) logmsg(LOG_DEBUG, "\n"); } + +/* + * Store the LLA for the phyint `pi' in `lifrp'. Returns 0 on success, or + * -1 on failure. + * + * Note that we do not cache the hardware address since there's no reliable + * mechanism to determine when it's become stale. + */ +int +phyint_get_lla(struct phyint *pi, struct lifreq *lifrp) +{ + struct sockaddr_in6 *sin6; + + /* If this phyint doesn't have a link-layer address, bail */ + if (!(pi->pi_flags & IFF_MULTICAST) || + (pi->pi_flags & IFF_POINTOPOINT)) { + return (-1); + } + + (void) strlcpy(lifrp->lifr_name, pi->pi_name, LIFNAMSIZ); + sin6 = (struct sockaddr_in6 *)&(lifrp->lifr_nd.lnr_addr); + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = pi->pi_ifaddr; + if (ioctl(pi->pi_sock, SIOCLIFGETND, lifrp) < 0) { + /* + * For IPMP interfaces, don't report ESRCH errors since that + * merely indicates that there are no active interfaces in the + * IPMP group (and thus there's no working hardware address), + * and the packet will thus never make it out anyway. + */ + if (!(pi->pi_flags & IFF_IPMP) || errno != ESRCH) + logperror_pi(pi, "phyint_get_lla: SIOCLIFGETND"); + return (-1); + } + return (0); +} + /* * Randomize pi->pi_ReachableTime. 
* Done periodically when there are no RAs and at a maximum frequency when @@ -642,20 +670,14 @@ phyint_print(struct phyint *pi) void phyint_reach_random(struct phyint *pi, boolean_t set_needed) { + struct lifreq lifr; + pi->pi_ReachableTime = GET_RANDOM( (int)(ND_MIN_RANDOM_FACTOR * pi->pi_BaseReachableTime), (int)(ND_MAX_RANDOM_FACTOR * pi->pi_BaseReachableTime)); if (set_needed) { - struct lifreq lifr; - - (void) strncpy(lifr.lifr_name, pi->pi_name, - sizeof (lifr.lifr_name)); - pi->pi_name[sizeof (pi->pi_name) - 1] = '\0'; - if (ioctl(pi->pi_sock, SIOCGLIFLNKINFO, (char *)&lifr) < 0) { - logperror_pi(pi, - "phyint_reach_random: SIOCGLIFLNKINFO"); - return; - } + bzero(&lifr, sizeof (lifr)); + (void) strlcpy(lifr.lifr_name, pi->pi_name, LIFNAMSIZ); lifr.lifr_ifinfo.lir_reachtime = pi->pi_ReachableTime; if (ioctl(pi->pi_sock, SIOCSLIFLNKINFO, (char *)&lifr) < 0) { logperror_pi(pi, @@ -1386,12 +1408,12 @@ prefix_modify_flags(struct prefix *pr, uint64_t onflags, uint64_t offflags) (void) strncpy(lifr.lifr_name, pr->pr_name, sizeof (lifr.lifr_name)); lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; if (ioctl(pi->pi_sock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { - logperror_pr(pr, "prefix_modify_flags: SIOCGLIFFLAGS"); - logmsg(LOG_ERR, "prefix_modify_flags(%s, %s) old 0x%llx " - "on 0x%llx off 0x%llx\n", - pr->pr_physical->pi_name, - pr->pr_name, - pr->pr_flags, onflags, offflags); + if (errno != ENXIO) { + logperror_pr(pr, "prefix_modify_flags: SIOCGLIFFLAGS"); + logmsg(LOG_ERR, "prefix_modify_flags(%s, %s) old 0x%llx" + " on 0x%llx off 0x%llx\n", pr->pr_physical->pi_name, + pr->pr_name, pr->pr_flags, onflags, offflags); + } return (-1); } old_flags = lifr.lifr_flags; @@ -1399,12 +1421,13 @@ prefix_modify_flags(struct prefix *pr, uint64_t onflags, uint64_t offflags) lifr.lifr_flags &= ~offflags; pr->pr_flags = lifr.lifr_flags; if (ioctl(pi->pi_sock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { - logperror_pr(pr, "prefix_modify_flags: SIOCSLIFFLAGS"); - logmsg(LOG_ERR, 
"prefix_modify_flags(%s, %s) old 0x%llx " - "new 0x%llx on 0x%llx off 0x%llx\n", - pr->pr_physical->pi_name, - pr->pr_name, - old_flags, lifr.lifr_flags, onflags, offflags); + if (errno != ENXIO) { + logperror_pr(pr, "prefix_modify_flags: SIOCSLIFFLAGS"); + logmsg(LOG_ERR, "prefix_modify_flags(%s, %s) old 0x%llx" + " new 0x%llx on 0x%llx off 0x%llx\n", + pr->pr_physical->pi_name, pr->pr_name, + old_flags, lifr.lifr_flags, onflags, offflags); + } return (-1); } return (0); @@ -1540,7 +1563,8 @@ prefix_update_k(struct prefix *pr) /* Remove logical interface based on pr_name */ lifr.lifr_addr.ss_family = AF_UNSPEC; - if (ioctl(pi->pi_sock, SIOCLIFREMOVEIF, (char *)&lifr) < 0) { + if (ioctl(pi->pi_sock, SIOCLIFREMOVEIF, (char *)&lifr) < 0 && + errno != ENXIO) { logperror_pr(pr, "prefix_update_k: SIOCLIFREMOVEIF"); } pr->pr_kernel_state = 0; @@ -1865,36 +1889,6 @@ prefix_print(struct prefix *pr) } /* - * Does the address formed by pr->pr_prefix and pi->pi_token match - * pr->pr_address. It does not match if a failover has happened - * earlier (done by in.mpathd) from a different pi. Should not - * be called for onlink prefixes. - */ -boolean_t -prefix_token_match(struct phyint *pi, struct prefix *pr, uint64_t flags) -{ - int i; - in6_addr_t addr, *token; - - if (flags & IFF_TEMPORARY) - token = &pi->pi_tmp_token; - else - token = &pi->pi_token; - for (i = 0; i < 16; i++) { - /* - * prefix_create ensures that pr_prefix has all-zero - * bits after prefixlen. - */ - addr.s6_addr[i] = pr->pr_prefix.s6_addr[i] | token->s6_addr[i]; - } - if (IN6_ARE_ADDR_EQUAL(&pr->pr_address, &addr)) { - return (_B_TRUE); - } else { - return (_B_FALSE); - } -} - -/* * Lookup advertisement prefix structure that matches the prefix and * prefix length. * Assumes that the bits after prefixlen might not be zero. 
@@ -2305,8 +2299,7 @@ phyint_print_all(void) } void -phyint_cleanup(pi) - struct phyint *pi; +phyint_cleanup(struct phyint *pi) { pi->pi_state = 0; pi->pi_kernel_state = 0; diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h index 409600a402..dfc5414d5d 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h +++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _NDPD_TABLES_H #define _NDPD_TABLES_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -58,9 +56,7 @@ struct phyint { char pi_name[LIFNAMSIZ]; /* Used to identify it */ int pi_sock; /* For sending and receiving */ struct in6_addr pi_ifaddr; /* Local address */ - uint_t pi_flags; /* IFF_* flags */ - uint_t pi_hdw_addr_len; - uchar_t pi_hdw_addr[ND_MAX_HDW_LEN]; + uint64_t pi_flags; /* IFF_* flags */ uint_t pi_mtu; /* From SIOCGLIFMTU */ struct in6_addr pi_token; uint_t pi_token_length; @@ -256,6 +252,7 @@ extern int phyint_init_from_k(struct phyint *pi); extern void phyint_delete(struct phyint *pi); extern uint_t phyint_timer(struct phyint *pi, uint_t elapsed); extern void phyint_print_all(void); +extern int phyint_get_lla(struct phyint *pi, struct lifreq *lifrp); extern void phyint_reach_random(struct phyint *pi, boolean_t set_needed); extern void phyint_cleanup(struct phyint *pi); @@ -280,8 +277,6 @@ extern void prefix_update_k(struct prefix *pr); extern uint_t prefix_timer(struct prefix *pr, uint_t elapsed); extern uint_t adv_prefix_timer(struct adv_prefix *adv_pr, uint_t elapsed); -extern boolean_t prefix_token_match(struct phyint *pi, - struct prefix *pr, uint64_t flags); extern struct prefix *prefix_lookup_addr(struct phyint *pi, struct in6_addr prefix); diff --git 
a/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c b/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c index 15db1b7539..b76341e303 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c +++ b/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c @@ -1,3 +1,7 @@ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ /* -*- Mode: C; tab-width: 4 -*- * * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved. @@ -130,8 +134,6 @@ First checkin */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mDNSUNP.h" #include "mDNSDebug.h" @@ -398,13 +400,11 @@ select_src_ifi_info_solaris(int sockfd, int numifs, continue; /* * Avoid address if any of the following flags are set: - * IFF_NOFAILOVER: IPMP test address for use only by in.mpathd * IFF_NOXMIT: no packets transmitted over interface * IFF_NOLOCAL: no address * IFF_PRIVATE: is not advertised */ - if (ifflags & (IFF_NOFAILOVER | IFF_NOXMIT - | IFF_NOLOCAL | IFF_PRIVATE)) + if (ifflags & (IFF_NOXMIT | IFF_NOLOCAL | IFF_PRIVATE)) continue; if (*best_lifr != NULL) { diff --git a/usr/src/cmd/cmd-inet/usr.sbin/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/Makefile index d91d113347..e29c1765ec 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/Makefile +++ b/usr/src/cmd/cmd-inet/usr.sbin/Makefile @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# @@ -65,12 +65,13 @@ K5TELNETOBJS= in.telnetd.o SRCS= $(PROGSRCS) $(OTHERSRC) SUBDIRS= bootconfchk htable ifconfig in.ftpd in.rdisc in.routed \ - in.talkd inetadm inetconv ipqosconf kssl/kssladm kssl/ksslcfg \ - ping routeadm snoop sppptun traceroute wificonfig ipsecutils + in.talkd inetadm inetconv ipmpstat ipqosconf ipsecutils \ + kssl/kssladm kssl/ksslcfg ping routeadm snoop sppptun \ + traceroute wificonfig MSGSUBDIRS= bootconfchk htable ifconfig in.ftpd in.routed in.talkd inetadm \ - inetconv ipqosconf kssl/ksslcfg routeadm sppptun snoop \ - wificonfig ipsecutils + inetconv ipmpstat ipqosconf ipsecutils kssl/ksslcfg routeadm \ + sppptun snoop wificonfig # As programs get lint-clean, add them here and to the 'lint' target. # Eventually this hack should go away, and all in PROG should be @@ -83,7 +84,8 @@ LINTCLEAN= 6to4relay arp in.rlogind in.rshd in.telnetd in.tftpd \ # with SUBDIRS. Also (sigh) deal with the commented-out build lines # for the lint rule. LINTSUBDIRS= bootconfchk in.rdisc in.routed in.talkd inetadm inetconv \ - ipqosconf ping routeadm sppptun traceroute wificonfig ipsecutils + ipmpstat ipqosconf ipsecutils ping routeadm sppptun traceroute \ + wificonfig # And as programs are verified not to attempt to write into constants, # -xstrconst should be used to ensure they stay that way. 
CONSTCLEAN= @@ -144,6 +146,8 @@ LDLIBS += $(K5LIBS) $(TSNETPROG) := LDLIBS += -ltsnet in.rarpd := LDLIBS += -linetutil -ldlpi +if_mpadm := LDLIBS += -linetutil -lipmp +if_mpadm.po := XGETFLAGS += -a route := CPPFLAGS += -DNDEBUG ndd := LDLIBS += -ldladm gettable in.comsat := LDFLAGS += $(MAPFILE.NGB:%=-M%) @@ -245,7 +249,7 @@ lint: $(LINTSUBDIRS) -I$(SRC)/lib/gss_mechs/mech_krb5/include \ -I$(SRC)/lib/pam_modules/krb5 \ in.telnetd.c $(LDLIBS) -lbsm -lpam -lsocket -lnsl - $(LINT.c) if_mpadm.c $(LDLIBS) -lsocket -lnsl -lipmp + $(LINT.c) if_mpadm.c $(LDLIBS) -lsocket -lnsl -lipmp -linetutil $(LINT.c) ipaddrsel.c $(LDLIBS) -lsocket -lnsl $(LINT.c) route.c $(LDLIBS) -lsocket -lnsl -ltsnet $(LINT.c) syncinit.c $(LDLIBS) -ldlpi diff --git a/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c b/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c index d4874135fd..7c5d73c796 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,660 +19,250 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <unistd.h> -#include <stdlib.h> +#include <errno.h> +#include <ipmp_admin.h> +#include <libinetutil.h> +#include <locale.h> +#include <net/if.h> +#include <stdarg.h> #include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> #include <sys/socket.h> -#include <netinet/in.h> -#include <netinet/tcp.h> #include <sys/sockio.h> -#include <net/if.h> -#include <errno.h> -#include <strings.h> -#include <ipmp_mpathd.h> -#include <libintl.h> +#include <sys/types.h> -static int if_down(int ifsock, struct lifreq *lifr); -static int if_up(int ifsock, struct lifreq *lifr); -static void send_cmd(int cmd, char *ifname); -static int connect_to_mpathd(sa_family_t family); -static void do_offline(char *ifname); -static void undo_offline(char *ifname); -static boolean_t offline_set(char *ifname); +typedef void offline_func_t(const char *, ipmp_handle_t); -#define IF_SEPARATOR ':' -#define MAX_RETRIES 3 +static const char *progname; +static int sioc4fd, sioc6fd; +static offline_func_t do_offline, undo_offline; +static boolean_t set_lifflags(const char *, uint64_t); +static boolean_t is_offline(const char *); +static void warn(const char *, ...); +static void die(const char *, ...); static void usage() { - (void) fprintf(stderr, "Usage : if_mpadm [-d | -r] <interface_name>\n"); + (void) fprintf(stderr, "Usage: %s [-d | -r] <interface>\n", progname); + exit(1); } -static void -print_mpathd_error_msg(uint32_t error) +static const char * +mpadm_errmsg(uint32_t error) { switch (error) { - case MPATHD_MIN_RED_ERROR: - (void) fprintf(stderr, gettext( - "Offline failed as there is no other functional " - "interface available in the multipathing group " - "for failing over the network access.\n")); - break; - - case MPATHD_FAILBACK_PARTIAL: - (void) fprintf(stderr, gettext( - "Offline cannot be undone because multipathing " - "configuration is not consistent across all the " - "interfaces in the 
group.\n")); - break; - + case IPMP_EUNKIF: + return ("not a physical interface or not in an IPMP group"); + case IPMP_EMINRED: + return ("no other functioning interfaces are in its IPMP " + "group"); default: - /* - * We shouldn't get here. All errors should have a - * meaningful error message, as shown in the above - * cases. If we get here, someone has made a mistake. - */ - (void) fprintf(stderr, gettext( - "Operation returned an unrecognized error: %u\n"), - error); - break; + return (ipmp_errmsg(error)); } } int main(int argc, char **argv) { - char *ifname; - int cmd = 0; + int retval; + ipmp_handle_t handle; + offline_func_t *ofuncp = NULL; + const char *ifname; int c; -#if !defined(TEXT_DOMAIN) -#define TEXT_DOMAIN "SYS_TEST" -#endif + if ((progname = strrchr(argv[0], '/')) != NULL) + progname++; + else + progname = argv[0]; + + (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); while ((c = getopt(argc, argv, "d:r:")) != EOF) { switch (c) { case 'd': ifname = optarg; - cmd = MI_OFFLINE; - if (offline_set(ifname)) { - (void) fprintf(stderr, gettext("Interface " - "already offlined\n")); - exit(1); - } + ofuncp = do_offline; break; case 'r': ifname = optarg; - cmd = MI_UNDO_OFFLINE; - if (!offline_set(ifname)) { - (void) fprintf(stderr, gettext("Interface not " - "offlined\n")); - exit(1); - } + ofuncp = undo_offline; break; default : usage(); - exit(1); } } - if (cmd == 0) { + if (ofuncp == NULL) usage(); - exit(1); - } /* - * Send the command to in.mpathd which is generic to - * both the commands. send_cmd returns only if there - * is no error. + * Create the global V4 and V6 socket ioctl descriptors. 
*/ - send_cmd(cmd, ifname); - if (cmd == MI_OFFLINE) { - do_offline(ifname); - } else { - undo_offline(ifname); - } + sioc4fd = socket(AF_INET, SOCK_DGRAM, 0); + sioc6fd = socket(AF_INET6, SOCK_DGRAM, 0); + if (sioc4fd == -1 || sioc6fd == -1) + die("cannot create sockets"); - return (0); -} + if ((retval = ipmp_open(&handle)) != IPMP_SUCCESS) + die("cannot create ipmp handle: %s\n", ipmp_errmsg(retval)); -/* - * Is IFF_OFFLINE set ? - * Returns B_FALSE on failure and B_TRUE on success. - */ -boolean_t -offline_set(char *ifname) -{ - struct lifreq lifr; - int s4; - int s6; - int ret; - - s4 = socket(AF_INET, SOCK_DGRAM, 0); - if (s4 < 0) { - perror("socket"); - exit(1); - } - s6 = socket(AF_INET6, SOCK_DGRAM, 0); - if (s6 < 0) { - perror("socket"); - exit(1); - } - (void) strncpy(lifr.lifr_name, ifname, sizeof (lifr.lifr_name)); - ret = ioctl(s4, SIOCGLIFFLAGS, (caddr_t)&lifr); - if (ret < 0) { - if (errno != ENXIO) { - perror("ioctl: SIOCGLIFFLAGS"); - exit(1); - } - ret = ioctl(s6, SIOCGLIFFLAGS, (caddr_t)&lifr); - if (ret < 0) { - perror("ioctl: SIOCGLIFFLAGS"); - exit(1); - } - } - (void) close(s4); - (void) close(s6); - if (lifr.lifr_flags & IFF_OFFLINE) - return (B_TRUE); - else - return (B_FALSE); + (*ofuncp)(ifname, handle); + + ipmp_close(handle); + (void) close(sioc4fd); + (void) close(sioc6fd); + + return (EXIT_SUCCESS); } /* - * Sends the command to in.mpathd. If not successful, prints - * an error message and exits. + * Checks whether IFF_OFFLINE is set on `ifname'. 
*/ -void -send_cmd(int cmd, char *ifname) +boolean_t +is_offline(const char *ifname) { - struct mi_offline mio; - struct mi_undo_offline miu; - struct mi_result me; - int ret; - int cmd_len; - int i; - int s; - - for (i = 0; i < MAX_RETRIES; i++) { - s = connect_to_mpathd(AF_INET); - if (s == -1) { - s = connect_to_mpathd(AF_INET6); - if (s == -1) { - (void) fprintf(stderr, gettext("Cannot " - "establish communication with " - "in.mpathd.\n")); - exit(1); - } - } - switch (cmd) { - case MI_OFFLINE : - cmd_len = sizeof (struct mi_offline); - bzero(&mio, cmd_len); - mio.mio_command = cmd; - (void) strncpy(mio.mio_ifname, ifname, LIFNAMSIZ); - mio.mio_min_redundancy = 1; - ret = write(s, &mio, cmd_len); - if (ret != cmd_len) { - /* errno is set only when ret is -1 */ - if (ret == -1) - perror("write"); - (void) fprintf(stderr, gettext("Failed to " - "successfully send command to " - "in.mpathd.\n")); - exit(1); - } - break; - case MI_UNDO_OFFLINE: - cmd_len = sizeof (struct mi_undo_offline); - bzero(&miu, cmd_len); - miu.miu_command = cmd; - (void) strncpy(miu.miu_ifname, ifname, LIFNAMSIZ); - ret = write(s, &miu, cmd_len); - if (ret != cmd_len) { - /* errno is set only when ret is -1 */ - if (ret == -1) - perror("write"); - (void) fprintf(stderr, gettext("Failed to " - "successfully send command to " - "in.mpathd.\n")); - exit(1); - } - break; - default : - (void) fprintf(stderr, "Unknown command \n"); - exit(1); - } + struct lifreq lifr = { 0 }; - /* Read the result from mpathd */ - ret = read(s, &me, sizeof (me)); - if (ret != sizeof (me)) { - /* errno is set only when ret is -1 */ - if (ret == -1) - perror("read"); - (void) fprintf(stderr, gettext("Failed to successfully " - "read result from in.mpathd.\n")); - exit(1); + (void) strlcpy(lifr.lifr_name, ifname, sizeof (lifr.lifr_name)); + if (ioctl(sioc4fd, SIOCGLIFFLAGS, &lifr) == -1) { + if (errno != ENXIO || + ioctl(sioc6fd, SIOCGLIFFLAGS, &lifr) == -1) { + die("cannot get interface flags on %s", ifname); } - if 
(me.me_mpathd_error == 0) { - if (i != 0) { - /* - * We retried at least once. Tell the user - * that things succeeded now. - */ - (void) fprintf(stderr, - gettext("Retry Successful.\n")); - } - return; /* Successful */ - } - - if (me.me_mpathd_error == MPATHD_SYS_ERROR) { - if (me.me_sys_error == EAGAIN) { - (void) close(s); - (void) sleep(1); - (void) fprintf(stderr, - gettext("Retrying ...\n")); - continue; /* Retry */ - } - errno = me.me_sys_error; - perror("if_mpadm"); - } else { - print_mpathd_error_msg(me.me_mpathd_error); - } - exit(1); } - /* - * We come here only if we retry the operation multiple - * times and did not succeed. Let the user try it again - * later. - */ - (void) fprintf(stderr, - gettext("Device busy. Retry the operation later.\n")); - exit(1); + + return ((lifr.lifr_flags & IFF_OFFLINE) != 0); } static void -do_offline(char *ifname) +do_offline(const char *ifname, ipmp_handle_t handle) { - struct lifreq lifr; - struct lifreq *lifcr; - struct lifnum lifn; - struct lifconf lifc; - char *buf; - int numifs; - int n; - char pi_name[LIFNAMSIZ + 1]; - char *cp; - int ifsock_v4; - int ifsock_v6; - int af; - int ret; + ifaddrlistx_t *ifaddrp, *ifaddrs; + int retval; + + if (is_offline(ifname)) + die("interface %s is already offline\n", ifname); + + if ((retval = ipmp_offline(handle, ifname, 1)) != IPMP_SUCCESS) + die("cannot offline %s: %s\n", ifname, mpadm_errmsg(retval)); /* - * Verify whether IFF_OFFLINE is not set as a sanity check. - */ - if (!offline_set(ifname)) { - (void) fprintf(stderr, gettext("Operation failed : in.mpathd " - "has not set IFF_OFFLINE on %s\n"), ifname); - exit(1); - } - /* - * Get both the sockets as we may need to bring both - * IPv4 and IPv6 interfaces down. 
- */ - ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); - if (ifsock_v4 < 0) { - perror("socket"); - exit(1); - } - ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); - if (ifsock_v6 < 0) { - perror("socket"); - exit(1); - } - /* - * Get all the logicals for "ifname" and mark them down. - * There is no easy way of doing this. We get all the - * interfaces in the system using SICGLIFCONF and mark the - * ones matching the name down. + * Get all the up addresses for `ifname' and bring them down. */ - lifn.lifn_family = AF_UNSPEC; - lifn.lifn_flags = 0; - if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { - perror("ioctl : SIOCGLIFNUM"); - exit(1); - } - numifs = lifn.lifn_count; - - buf = calloc(numifs, sizeof (struct lifreq)); - if (buf == NULL) { - perror("calloc"); - exit(1); - } + if (ifaddrlistx(ifname, IFF_UP, 0, &ifaddrs) == -1) + die("cannot get addresses on %s", ifname); - lifc.lifc_family = AF_UNSPEC; - lifc.lifc_flags = 0; - lifc.lifc_len = numifs * sizeof (struct lifreq); - lifc.lifc_buf = buf; + for (ifaddrp = ifaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!(ifaddrp->ia_flags & IFF_OFFLINE)) + warn("IFF_OFFLINE vanished on %s\n", ifaddrp->ia_name); - if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { - perror("ioctl : SIOCGLIFCONF"); - exit(1); + if (!set_lifflags(ifaddrp->ia_name, + ifaddrp->ia_flags & ~IFF_UP)) + warn("cannot bring down address on %s\n", + ifaddrp->ia_name); } - lifcr = (struct lifreq *)lifc.lifc_req; - for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifcr++) { - af = lifcr->lifr_addr.ss_family; - (void) strncpy(pi_name, lifcr->lifr_name, - sizeof (pi_name)); - pi_name[sizeof (pi_name) - 1] = '\0'; - if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) - *cp = '\0'; - if (strcmp(pi_name, ifname) == 0) { - /* It matches the interface name that was offlined */ - (void) strncpy(lifr.lifr_name, lifcr->lifr_name, - sizeof (lifr.lifr_name)); - if (af == AF_INET) - ret = if_down(ifsock_v4, &lifr); - else - ret = 
if_down(ifsock_v6, &lifr); - if (ret != 0) { - (void) fprintf(stderr, gettext("Bringing down " - "the interfaces failed.\n")); - exit(1); - } - } - } + ifaddrlistx_free(ifaddrs); } static void -undo_offline(char *ifname) +undo_offline(const char *ifname, ipmp_handle_t handle) { - struct lifreq lifr; - struct lifreq *lifcr; - struct lifnum lifn; - struct lifconf lifc; - char *buf; - int numifs; - int n; - char pi_name[LIFNAMSIZ + 1]; - char *cp; - int ifsock_v4; - int ifsock_v6; - int af; - int ret; + ifaddrlistx_t *ifaddrp, *ifaddrs; + int retval; + + if (!is_offline(ifname)) + die("interface %s is not offline\n", ifname); /* - * Verify whether IFF_OFFLINE is set as a sanity check. - */ - if (offline_set(ifname)) { - (void) fprintf(stderr, gettext("Operation failed : in.mpathd " - "has not cleared IFF_OFFLINE on %s\n"), ifname); - exit(1); - } - /* - * Get both the sockets as we may need to bring both - * IPv4 and IPv6 interfaces UP. - */ - ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); - if (ifsock_v4 < 0) { - perror("socket"); - exit(1); - } - ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); - if (ifsock_v6 < 0) { - perror("socket"); - exit(1); - } - /* - * Get all the logicals for "ifname" and mark them up. - * There is no easy way of doing this. We get all the - * interfaces in the system using SICGLIFCONF and mark the - * ones matching the name up. + * Get all the down addresses for `ifname' and bring them up. 
*/ - lifn.lifn_family = AF_UNSPEC; - lifn.lifn_flags = 0; - if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { - perror("ioctl : SIOCGLIFNUM"); - exit(1); - } - numifs = lifn.lifn_count; - - buf = calloc(numifs, sizeof (struct lifreq)); - if (buf == NULL) { - perror("calloc"); - exit(1); - } + if (ifaddrlistx(ifname, 0, IFF_UP, &ifaddrs) == -1) + die("cannot get addresses for %s", ifname); - lifc.lifc_family = AF_UNSPEC; - lifc.lifc_flags = 0; - lifc.lifc_len = numifs * sizeof (struct lifreq); - lifc.lifc_buf = buf; + for (ifaddrp = ifaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!(ifaddrp->ia_flags & IFF_OFFLINE)) + warn("IFF_OFFLINE vanished on %s\n", ifaddrp->ia_name); - if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { - perror("ioctl : SIOCGLIFCONF"); - exit(1); + if (!set_lifflags(ifaddrp->ia_name, ifaddrp->ia_flags | IFF_UP)) + warn("cannot bring up address on %s\n", + ifaddrp->ia_name); } - lifcr = (struct lifreq *)lifc.lifc_req; - for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifcr++) { - af = lifcr->lifr_addr.ss_family; - (void) strncpy(pi_name, lifcr->lifr_name, - sizeof (pi_name)); - pi_name[sizeof (pi_name) - 1] = '\0'; - if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) - *cp = '\0'; - - if (strcmp(pi_name, ifname) == 0) { - /* It matches the interface name that was offlined */ - (void) strncpy(lifr.lifr_name, lifcr->lifr_name, - sizeof (lifr.lifr_name)); - if (af == AF_INET) - ret = if_up(ifsock_v4, &lifr); - else - ret = if_up(ifsock_v6, &lifr); - if (ret != 0) { - (void) fprintf(stderr, gettext("Bringing up " - "the interfaces failed.\n")); - exit(1); - } - } - } -} + ifaddrlistx_free(ifaddrs); -/* - * Returns -1 on failure. Returns the socket file descriptor on - * success. 
- */ -static int -connect_to_mpathd(sa_family_t family) -{ - int s; - struct sockaddr_storage ss; - struct sockaddr_in *sin = (struct sockaddr_in *)&ss; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss; - struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; - int addrlen; - int ret; - int on; - - s = socket(family, SOCK_STREAM, 0); - if (s < 0) { - perror("socket"); - return (-1); - } - bzero((char *)&ss, sizeof (ss)); - ss.ss_family = family; /* - * Need to bind to a privileged port. For non-root, this - * will fail. in.mpathd verifies that only commands coming - * from privileged ports succeed so that the ordinary user - * can't issue offline commands. + * Undo the offline. */ - on = 1; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - perror("setsockopt : TCP_ANONPRIVBIND"); - exit(1); - } - switch (family) { - case AF_INET: - sin->sin_port = 0; - sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - addrlen = sizeof (struct sockaddr_in); - break; - case AF_INET6: - sin6->sin6_port = 0; - sin6->sin6_addr = loopback_addr; - addrlen = sizeof (struct sockaddr_in6); - break; - } - ret = bind(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - perror("bind"); - return (-1); - } - switch (family) { - case AF_INET: - sin->sin_port = htons(MPATHD_PORT); - break; - case AF_INET6: - sin6->sin6_port = htons(MPATHD_PORT); - break; + if ((retval = ipmp_undo_offline(handle, ifname)) != IPMP_SUCCESS) { + die("cannot undo-offline %s: %s\n", ifname, + mpadm_errmsg(retval)); } - ret = connect(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - perror("connect"); - return (-1); - } - on = 0; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - perror("setsockopt : TCP_ANONPRIVBIND"); - return (-1); - } - return (s); + + /* + * Verify whether IFF_OFFLINE is set as a sanity check. 
+ */ + if (is_offline(ifname)) + warn("in.mpathd has not cleared IFF_OFFLINE on %s\n", ifname); } /* - * Bring down the interface specified by the name lifr->lifr_name. - * - * Returns -1 on failure. Returns 0 on success. + * Change `lifname' to have `flags' set. Returns B_TRUE on success. */ -static int -if_down(int ifsock, struct lifreq *lifr) +static boolean_t +set_lifflags(const char *lifname, uint64_t flags) { - int ret; + struct lifreq lifr = { 0 }; + int fd = (flags & IFF_IPV4) ? sioc4fd : sioc6fd; - ret = ioctl(ifsock, SIOCGLIFFLAGS, (caddr_t)lifr); - if (ret < 0) { - perror("ioctl: SIOCGLIFFLAGS"); - return (-1); - } + (void) strlcpy(lifr.lifr_name, lifname, LIFNAMSIZ); + lifr.lifr_flags = flags; - /* IFF_OFFLINE was set to start with. Is it still there ? */ - if (!(lifr->lifr_flags & (IFF_OFFLINE))) { - (void) fprintf(stderr, gettext("IFF_OFFLINE disappeared on " - "%s\n"), lifr->lifr_name); - return (-1); - } - lifr->lifr_flags &= ~IFF_UP; - ret = ioctl(ifsock, SIOCSLIFFLAGS, (caddr_t)lifr); - if (ret < 0) { - perror("ioctl: SIOCSLIFFLAGS"); - return (-1); - } - return (0); + return (ioctl(fd, SIOCSLIFFLAGS, &lifr) >= 0); } -/* - * Bring up the interface specified by the name lifr->lifr_name. - * - * Returns -1 on failure. Returns 0 on success. - */ -static int -if_up(int ifsock, struct lifreq *lifr) +/* PRINTFLIKE1 */ +static void +die(const char *format, ...) 
{ - int ret; - boolean_t zeroaddr = B_FALSE; - struct sockaddr_in *addr; - - ret = ioctl(ifsock, SIOCGLIFADDR, lifr); - if (ret < 0) { - perror("ioctl: SIOCGLIFADDR"); - return (-1); - } + va_list alist; + char *errstr = strerror(errno); - addr = (struct sockaddr_in *)&lifr->lifr_addr; - switch (addr->sin_family) { - case AF_INET: - zeroaddr = (addr->sin_addr.s_addr == INADDR_ANY); - break; + format = gettext(format); + (void) fprintf(stderr, gettext("%s: fatal: "), progname); - case AF_INET6: - zeroaddr = IN6_IS_ADDR_UNSPECIFIED( - &((struct sockaddr_in6 *)addr)->sin6_addr); - break; + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); - default: - break; - } + if (strchr(format, '\n') == NULL) + (void) fprintf(stderr, ": %s\n", errstr); - ret = ioctl(ifsock, SIOCGLIFFLAGS, lifr); - if (ret < 0) { - perror("ioctl: SIOCGLIFFLAGS"); - return (-1); - } - /* - * Don't affect the state of addresses that failed back. - * - * XXX Link local addresses that are not marked IFF_NOFAILOVER - * will not be brought up. Link local addresses never failover. - * When the interface was offlined, we brought the link local - * address down. We will not bring it up now if IFF_NOFAILOVER - * is not marked. We check for IFF_NOFAILOVER below so that - * we want to maintain the state of all other addresses as it - * was before offline. Normally link local addresses are marked - * IFF_NOFAILOVER and hence this is not an issue. These can - * be fixed in future with RCM and it is beyond the scope - * of if_mpadm to maintain state and do this correctly. - */ - if (!(lifr->lifr_flags & IFF_NOFAILOVER)) - return (0); + exit(EXIT_FAILURE); +} - /* - * When a data address associated with the physical interface itself - * is failed over (e.g., qfe0, rather than qfe0:1), the kernel must - * fill the ipif data structure for qfe0 with a placeholder entry (the - * "replacement ipif"). 
Replacement ipif's cannot be brought IFF_UP - * (nor would it make any sense to do so), so we must be careful to - * skip them; thankfully they can be easily identified since they - * all have a zeroed address. - */ - if (zeroaddr) - return (0); - - /* IFF_OFFLINE was not set to start with. Is it there ? */ - if (lifr->lifr_flags & IFF_OFFLINE) { - (void) fprintf(stderr, - gettext("IFF_OFFLINE set wrongly on %s\n"), - lifr->lifr_name); - return (-1); - } - lifr->lifr_flags |= IFF_UP; - ret = ioctl(ifsock, SIOCSLIFFLAGS, lifr); - if (ret < 0) { - perror("ioctl: SIOCSLIFFLAGS"); - return (-1); - } - return (0); +/* PRINTFLIKE1 */ +static void +warn(const char *format, ...) +{ + va_list alist; + char *errstr = strerror(errno); + + format = gettext(format); + (void) fprintf(stderr, gettext("%s: warning: "), progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + if (strchr(format, '\n') == NULL) + (void) fprintf(stderr, ": %s\n", errstr); } diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile index 69e91758ea..e99f2945a7 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile @@ -19,10 +19,9 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# PROG = ifconfig ROOTFS_PROG = $(PROG) @@ -38,7 +37,7 @@ COMMONSRCS= $(CMDINETCOMMONDIR)/$(COMMONOBJS:%.o=%.c) SRCS= $(LOCALSRCS) $(COMMONSRCS) CPPFLAGS += -I$(CMDINETCOMMONDIR) -I$(SRC)/common/net/dhcp -LDLIBS += -ldhcpagent -linetcfg -ldlpi -ldladm +LDLIBS += -ldhcpagent -ldlpi -linetutil -linetcfg -lipmp -ldladm LINTFLAGS += -m ROOTUSRSBINLINKS = $(PROG:%=$(ROOTUSRSBIN)/%) diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h index c993baeb02..4aa1aa0ed7 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -11,13 +11,12 @@ #ifndef _DEFS_H #define _DEFS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif #include <errno.h> +#include <limits.h> #include <unistd.h> #include <stdlib.h> #include <stdio.h> @@ -54,7 +53,10 @@ extern "C" { #include <assert.h> #include <ipmp_mpathd.h> +#include <ipmp_admin.h> #include <inetcfg.h> +#include <libinetutil.h> +#include <alloca.h> #ifdef __cplusplus } diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c index f49fca249c..d5517a4700 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* @@ -23,6 +23,7 @@ #define TUN_NAME "tun" #define ATUN_NAME "atun" #define TUN6TO4_NAME "6to4tun" +#define IPMPSTUB (void *)-1 typedef struct if_flags { uint64_t iff_value; @@ -67,7 +68,20 @@ static if_flags_t if_flags_tbl[] = { { IFF_TEMPORARY, "TEMPORARY" }, { IFF_FIXEDMTU, "FIXEDMTU" }, { IFF_VIRTUAL, "VIRTUAL" }, - { IFF_DUPLICATE, "DUPLICATE" } + { IFF_DUPLICATE, "DUPLICATE" }, + { IFF_IPMP, "IPMP"} +}; + +typedef struct { + const char *ia_app; + uint64_t ia_flag; + uint_t ia_tries; +} if_appflags_t; + +static const if_appflags_t if_appflags_tbl[] = { + { "dhcpagent(1M)", IFF_DHCPRUNNING, 1 }, + { "in.ndpd(1M)", IFF_ADDRCONF, 3 }, + { NULL, 0, 0 } }; static struct lifreq lifr; @@ -75,7 +89,6 @@ static struct lifreq lifr; static char name[LIFNAMSIZ]; /* foreach interface saved name */ static char origname[LIFNAMSIZ]; -static char savedname[LIFNAMSIZ]; /* For addif */ static int setaddr; /* @@ -89,20 +102,7 @@ static int setaddr; #define NO_ESP_AALG 256 #define NO_ESP_EALG 256 -/* - * iface_t - * used by setifether to create a list of interfaces to mark - * down-up when changing the ethernet address of an interface - */ -typedef struct iface { - struct lifreq lifr; - struct iface *next; /* pointer to the next list element */ -} iface_t; - -static iface_t *logifs = NULL; /* list of logical interfaces */ -static iface_t *phyif = NULL; /* physical interface */ - -int s; +int s, s4, s6; int af = AF_INET; /* default address family */ int debug = 0; int all = 0; /* setifdhcp() needs to know this */ @@ -113,6 +113,7 @@ int v4compat = 0; /* Compatible printing format */ * Function prototypes for command functions. 
*/ static int addif(char *arg, int64_t param); +static int inetipmp(char *arg, int64_t param); static int inetplumb(char *arg, int64_t param); static int inetunplumb(char *arg, int64_t param); static int removeif(char *arg, int64_t param); @@ -141,7 +142,7 @@ static int modinsert(char *arg, int64_t param); static int modremove(char *arg, int64_t param); static int setifgroupname(char *arg, int64_t param); static int configinfo(char *arg, int64_t param); -static void print_config_flags(uint64_t flags); +static void print_config_flags(int af, uint64_t flags); static void print_flags(uint64_t flags); static void print_ifether(char *ifname); static int set_tun_encap_limit(char *arg, int64_t param); @@ -150,6 +151,7 @@ static int set_tun_hop_limit(char *arg, int64_t param); static int setzone(char *arg, int64_t param); static int setallzones(char *arg, int64_t param); static int setifsrc(char *arg, int64_t param); +static int lifnum(const char *ifname); /* * Address family specific function prototypes. 
@@ -179,19 +181,22 @@ static int settaddr(char *, int (*)(icfg_handle_t, static void status(void); static void ifstatus(const char *); static void usage(void); -static int strioctl(int s, int cmd, char *buf, int buflen); +static int strioctl(int s, int cmd, void *buf, int buflen); static int setifdhcp(const char *caller, const char *ifname, int argc, char *argv[]); static int ip_domux2fd(int *, int *, int *, int *, int *); static int ip_plink(int, int, int, int, int); static int modop(char *arg, char op); -static void selectifs(int argc, char *argv[], int af, - struct lifreq *lifrp); -static int updownifs(iface_t *ifs, int up); static int find_all_global_interfaces(struct lifconf *lifcp, char **buf, int64_t lifc_flags); static int find_all_zone_interfaces(struct lifconf *lifcp, char **buf, int64_t lifc_flags); +static int create_ipmp(const char *grname, int af, const char *ifname, + boolean_t implicit); +static int create_ipmp_peer(int af, const char *ifname); +static void start_ipmp_daemon(void); +static boolean_t ifaddr_up(ifaddrlistx_t *ifaddrp); +static boolean_t ifaddr_down(ifaddrlistx_t *ifaddrp); #define max(a, b) ((a) < (b) ? (b) : (a)) @@ -251,6 +256,7 @@ struct cmd { { "index", NEXTARG, setifindex, 0, AF_ANY }, { "broadcast", NEXTARG, setifbroadaddr, 0, AF_INET }, { "auto-revarp", 0, setifrevarp, 1, AF_INET }, + { "ipmp", 0, inetipmp, 1, AF_ANY }, { "plumb", 0, inetplumb, 1, AF_ANY }, { "unplumb", 0, inetunplumb, 0, AF_ANY }, { "subnet", NEXTARG, setifsubnet, 0, AF_ANY }, @@ -297,22 +303,30 @@ struct cmd { typedef struct if_config_cmd { uint64_t iff_flag; + int iff_af; char *iff_name; } if_config_cmd_t; +/* + * NOTE: print_config_flags() processes this table in order, so we put "up" + * last so that we can be sure "-failover" will take effect first. Otherwise, + * IPMP test addresses will erroneously migrate to the IPMP interface. 
+ */ static if_config_cmd_t if_config_cmd_tbl[] = { - { IFF_UP, "up" }, - { IFF_NOTRAILERS, "-trailers" }, - { IFF_PRIVATE, "private" }, - { IFF_NOXMIT, "-xmit" }, - { IFF_ANYCAST, "anycast" }, - { IFF_NOLOCAL, "-local" }, - { IFF_DEPRECATED, "deprecated" }, - { IFF_NOFAILOVER, "-failover" }, - { IFF_STANDBY, "standby" }, - { IFF_FAILED, "failed" }, - { IFF_PREFERRED, "preferred" }, - { 0, 0 }, + { IFF_NOTRAILERS, AF_UNSPEC, "-trailers" }, + { IFF_PRIVATE, AF_UNSPEC, "private" }, + { IFF_NOXMIT, AF_UNSPEC, "-xmit" }, + { IFF_ANYCAST, AF_INET6, "anycast" }, + { IFF_NOLOCAL, AF_UNSPEC, "-local" }, + { IFF_DEPRECATED, AF_UNSPEC, "deprecated" }, + { IFF_NOFAILOVER, AF_UNSPEC, "-failover" }, + { IFF_STANDBY, AF_UNSPEC, "standby" }, + { IFF_FAILED, AF_UNSPEC, "failed" }, + { IFF_PREFERRED, AF_UNSPEC, "preferred" }, + { IFF_NONUD, AF_INET6, "-nud" }, + { IFF_NOARP, AF_INET, "-arp" }, + { IFF_UP, AF_UNSPEC, "up" }, + { 0, 0, NULL }, }; typedef struct ni { @@ -345,10 +359,11 @@ struct afswtch *afp; /* the address family being set or asked about */ int main(int argc, char *argv[]) { - /* Include IFF_NOXMIT, IFF_TEMPORARY and all zone interfaces */ - int64_t lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; + int64_t lifc_flags; char *default_ip_str; + lifc_flags = LIFC_NOXMIT|LIFC_TEMPORARY|LIFC_ALLZONES|LIFC_UNDER_IPMP; + if (argc < 2) { usage(); exit(1); @@ -388,9 +403,10 @@ main(int argc, char *argv[]) } s = socket(SOCKET_AF(af), SOCK_DGRAM, 0); - if (s < 0) { + s4 = socket(AF_INET, SOCK_DGRAM, 0); + s6 = socket(AF_INET6, SOCK_DGRAM, 0); + if (s == -1 || s4 == -1 || s6 == -1) Perror0_exit("socket"); - } /* * Special interface names is any combination of these flags. 
@@ -1441,39 +1457,38 @@ setifdstaddr(char *addr, int64_t param) static int setifflags(char *val, int64_t value) { - int phyintlen, origphyintlen; + struct lifreq lifrl; /* local lifreq struct */ + boolean_t bringup = _B_FALSE; (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0) Perror0_exit("setifflags: SIOCGLIFFLAGS"); - if (value == IFF_NOFAILOVER) { - /* - * Fail if '-failover' is set after a prior addif created the - * alias on a different interface. This can happen when the - * interface is part of an IPMP group. - */ - phyintlen = strcspn(name, ":"); - origphyintlen = strcspn(origname, ":"); - if (phyintlen != origphyintlen || - strncmp(name, origname, phyintlen) != 0) { - (void) fprintf(stderr, "ifconfig: can't set -failover " - "on failed/standby/offlined interface %s\n", - origname); - exit(1); - } - } - if (value < 0) { value = -value; + + if ((value & IFF_NOFAILOVER) && (lifr.lifr_flags & IFF_UP)) { + /* + * The kernel does not allow administratively up test + * addresses to be converted to data addresses. Bring + * the address down first, then bring it up after it's + * been converted to a data address. + */ + lifr.lifr_flags &= ~IFF_UP; + (void) ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr); + bringup = _B_TRUE; + } + lifr.lifr_flags &= ~value; - if ((value & IFF_UP) && (lifr.lifr_flags & IFF_DUPLICATE)) { + if ((value & (IFF_UP | IFF_NOFAILOVER)) && + (lifr.lifr_flags & IFF_DUPLICATE)) { /* * If the user is trying to mark an interface with a - * duplicate address as "down," then fetch the address - * and set it. This will cause IP to clear the - * IFF_DUPLICATE flag and stop the automatic recovery - * timer. + * duplicate address as "down," or convert a duplicate + * test address to a data address, then fetch the + * address and set it. This will cause IP to clear + * the IFF_DUPLICATE flag and stop the automatic + * recovery timer. 
*/ value = lifr.lifr_flags; if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) >= 0) @@ -1483,10 +1498,48 @@ setifflags(char *val, int64_t value) } else { lifr.lifr_flags |= value; } + + /* + * If we're about to bring up an underlying physical IPv6 interface in + * an IPMP group, ensure the IPv6 IPMP interface is also up. This is + * for backward compatibility with legacy configurations in which + * there are no explicit hostname files for IPMP interfaces. (For + * IPv4, this is automatically handled by the kernel when migrating + * the underlying interface's data address to the IPMP interface.) + */ + (void) strlcpy(lifrl.lifr_name, name, LIFNAMSIZ); + + if (lifnum(lifr.lifr_name) == 0 && + (lifr.lifr_flags & (IFF_UP|IFF_IPV6)) == (IFF_UP|IFF_IPV6) && + ioctl(s, SIOCGLIFGROUPNAME, &lifrl) == 0 && + lifrl.lifr_groupname[0] != '\0') { + lifgroupinfo_t lifgr; + + (void) strlcpy(lifgr.gi_grname, lifrl.lifr_groupname, + LIFGRNAMSIZ); + if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == -1) + Perror0_exit("setifflags: SIOCGLIFGROUPINFO"); + + (void) strlcpy(lifrl.lifr_name, lifgr.gi_grifname, LIFNAMSIZ); + if (ioctl(s, SIOCGLIFFLAGS, &lifrl) == -1) + Perror0_exit("setifflags: SIOCGLIFFLAGS"); + if (!(lifrl.lifr_flags & IFF_UP)) { + lifrl.lifr_flags |= IFF_UP; + if (ioctl(s, SIOCSLIFFLAGS, &lifrl) == -1) + Perror0_exit("setifflags: SIOCSLIFFLAGS"); + } + } + (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); - if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) { + if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) Perror0_exit("setifflags: SIOCSLIFFLAGS"); + + if (bringup) { + lifr.lifr_flags |= IFF_UP; + if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) + Perror0_exit("setifflags: SIOCSLIFFLAGS IFF_UP"); } + return (0); } @@ -1524,12 +1577,21 @@ setifindex(char *val, int64_t param) } /* ARGSUSED */ +static void +notifycb(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg) +{ +} + +/* ARGSUSED */ static int setifether(char *addr, int64_t param) { - uchar_t *ea; - iface_t 
*current; - int maclen; + uchar_t *hwaddr; + int hwaddrlen; + int retval; + ifaddrlistx_t *ifaddrp, *ifaddrs = NULL; + dlpi_handle_t dh; + dlpi_notifyid_t id; if (addr == NULL) { ifstatus(name); @@ -1537,9 +1599,6 @@ setifether(char *addr, int64_t param) return (0); } - phyif = NULL; - logifs = NULL; - /* * if the IP interface in the arguments is a logical * interface, exit with an error now. @@ -1550,79 +1609,68 @@ setifether(char *addr, int64_t param) exit(1); } - ea = _link_aton(addr, &maclen); - if (ea == NULL) { - if (maclen == -1) + if ((hwaddr = _link_aton(addr, &hwaddrlen)) == NULL) { + if (hwaddrlen == -1) (void) fprintf(stderr, - "ifconfig: %s: bad address\n", addr); + "ifconfig: %s: bad address\n", hwaddr); else (void) fprintf(stderr, "ifconfig: malloc() failed\n"); exit(1); } - (void) strncpy(savedname, name, sizeof (savedname)); + if ((retval = dlpi_open(name, &dh, 0)) != DLPI_SUCCESS) + Perrdlpi_exit("cannot dlpi_open() link", name, retval); - /* - * Call selectifs only for the IP interfaces that are ipv4. 
- * offflags == IFF_IPV6 because you should not change the - * Ethernet address of an ipv6 interface - */ - foreachinterface(selectifs, 0, (char **)NULL, 0, 0, IFF_IPV6, 0); + if ((retval = dlpi_bind(dh, DLPI_ANY_SAP, NULL)) != DLPI_SUCCESS) + Perrdlpi_exit("cannot dlpi_bind() link", name, retval); - /* If physical interface not found, exit now */ - if (phyif == NULL) { - (void) fprintf(stderr, - "ifconfig: interface %s not found\n", savedname); - exit(1); - } - - /* Restore */ - (void) strncpy(name, savedname, sizeof (name)); - (void) strncpy(origname, savedname, sizeof (origname)); - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); - - /* - * close and reopen the socket - * we don't know which type of socket we have now - */ - (void) close(s); - s = socket(SOCKET_AF(AF_UNSPEC), SOCK_DGRAM, 0); - if (s < 0) { - Perror0_exit("socket"); - } - - /* - * mark down the logical interfaces first, - * and then the physical interface - */ - if (updownifs(logifs, 0) < 0 || updownifs(phyif, 0) < 0) { - Perror0_exit("mark down interface failed"); + retval = dlpi_enabnotify(dh, DL_NOTE_PHYS_ADDR, notifycb, NULL, &id); + if (retval == DLPI_SUCCESS) { + (void) dlpi_disabnotify(dh, id, NULL); + } else { + /* + * This link does not support DL_NOTE_PHYS_ADDR: bring down + * all of the addresses to flush the old hardware address + * information out of IP. + * + * NOTE: Skipping this when DL_NOTE_PHYS_ADDR is supported is + * more than an optimization: in.mpathd will set IFF_OFFLINE + * if it's notified and the new address is a duplicate of + * another in the group -- but the flags manipulation in + * ifaddr_{down,up}() cannot be atomic and thus might clobber + * IFF_OFFLINE, confusing in.mpathd. 
+ */ + if (ifaddrlistx(name, IFF_UP, 0, &ifaddrs) == -1) + Perror2_exit(name, "cannot get address list"); + + ifaddrp = ifaddrs; + for (; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!ifaddr_down(ifaddrp)) { + Perror2_exit(ifaddrp->ia_name, + "cannot bring down"); + } + } } /* - * Change the physical address + * Change the hardware address. */ - if (dlpi_set_address(savedname, ea, maclen) == -1) { + retval = dlpi_set_physaddr(dh, DL_CURR_PHYS_ADDR, hwaddr, hwaddrlen); + if (retval != DLPI_SUCCESS) { (void) fprintf(stderr, - "ifconfig: failed setting mac address on %s\n", - savedname); + "ifconfig: failed setting mac address on %s\n", name); } + dlpi_close(dh); /* - * if any interfaces were marked down before changing the - * ethernet address, put them up again. - * First the physical interface, then the logical ones. + * If any addresses were brought down before changing the hardware + * address, bring them up again. */ - if (updownifs(phyif, 1) < 0 || updownifs(logifs, 1) < 0) { - Perror0_exit("mark down interface failed"); - } - - /* Free the memory allocated by selectifs */ - free(phyif); - for (current = logifs; current != NULL; current = logifs) { - logifs = logifs->next; - free(current); + for (ifaddrp = ifaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!ifaddr_up(ifaddrp)) + Perror2_exit(ifaddrp->ia_name, "cannot bring up"); } + ifaddrlistx_free(ifaddrs); return (0); } @@ -1655,8 +1703,8 @@ print_ifether(char *ifname) } (void) close(fd); - /* Virtual interfaces don't have MAC addresses */ - if (lifr.lifr_flags & IFF_VIRTUAL) + /* VNI and IPMP interfaces don't have MAC addresses */ + if (lifr.lifr_flags & (IFF_VIRTUAL|IFF_IPMP)) return; /* @@ -1685,104 +1733,6 @@ print_ifether(char *ifname) } /* - * static void selectifs(int argc, char *argv[], int af, struct lifreq *rp) - * - * Called inside setifether() to create a list of interfaces to - * mark down/up when changing the Ethernet address. 
- * If the current interface is the physical interface passed - * as an argument to ifconfig, update phyif. - * If the current interface is a logical interface associated - * to the physical interface, add it to the logifs list. - */ -/* ARGSUSED */ -static void -selectifs(int argc, char *argv[], int af, struct lifreq *rp) -{ - char *colonp; - int length; - iface_t *current; - - /* - * savedname= name of the IP interface to which you want to - * change ethernet address - * name= name of the current IP interface - */ - colonp = strchr(name, ':'); - if (colonp == NULL) - length = max(strlen(savedname), strlen(name)); - else - length = max(strlen(savedname), colonp - name); - if (strncmp(savedname, name, length) == 0) { - (void) strcpy(lifr.lifr_name, name); - if (ioctl(s, SIOCGLIFFLAGS, &lifr) < 0) { - Perror0("selectifs: SIOCGLIFFLAGS"); - return; - } - - if ((current = malloc(sizeof (iface_t))) == NULL) { - Perror0_exit("selectifs: malloc failed\n"); - } - - if (colonp == NULL) { - /* this is the physical interface */ - phyif = current; - bcopy(&lifr, &phyif->lifr, sizeof (struct lifreq)); - phyif->next = NULL; - } else { - /* this is a logical interface */ - bcopy(&lifr, &current->lifr, sizeof (struct lifreq)); - current->next = logifs; - logifs = current; - } - } -} - -/* - * static int updownifs(iface_t *ifs, int up) - * - * It takes in input a list of IP interfaces (ifs) - * and a flag (up). - * It marks each interface in the list down (up = 0) - * or up (up > 0). This is done ONLY if the IP - * interface was originally up. 
- * - * Return values: - * 0 = everything OK - * -1 = problem - */ -static int -updownifs(iface_t *ifs, int up) -{ - iface_t *current; - int ret = 0; - int save_errno; - char savename[LIFNAMSIZ]; - uint64_t orig_flags; - - for (current = ifs; current != NULL; current = current->next) { - if (current->lifr.lifr_flags & IFF_UP) { - orig_flags = current->lifr.lifr_flags; - if (!up) - current->lifr.lifr_flags &= ~IFF_UP; - if (ioctl(s, SIOCSLIFFLAGS, &current->lifr) < 0) { - save_errno = errno; - (void) strcpy(savename, - current->lifr.lifr_name); - ret = -1; - } - if (!up) /* restore the original flags */ - current->lifr.lifr_flags = orig_flags; - } - } - - if (ret == -1) { - (void) strcpy(lifr.lifr_name, savename); - errno = save_errno; - } - return (ret); -} - -/* * static int find_all_global_interfaces(struct lifconf *lifcp, char **buf, * int64_t lifc_flags) * @@ -2109,130 +2059,217 @@ setiftoken(char *addr, int64_t param) return (0); } -/* - * Return value: 0 on success, -1 on failure. - */ -static int -connect_to_mpathd(int family) -{ - int s; - struct sockaddr_storage ss; - struct sockaddr_in *sin = (struct sockaddr_in *)&ss; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss; - struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; - int addrlen; - int ret; - int on; - - s = socket(family, SOCK_STREAM, 0); - if (s < 0) { - Perror0_exit("connect_to_mpathd: socket"); - } - (void) bzero((char *)&ss, sizeof (ss)); - ss.ss_family = family; - /* - * Need to bind to a privileged port. For non-root, this - * will fail. 
in.mpathd verifies that only commands coming - * from privileged ports succeed so that ordinary users - * can't connect and start talking to in.mpathd - */ - on = 1; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - Perror0_exit("connect_to_mpathd: setsockopt"); - } - switch (family) { - case AF_INET: - sin->sin_port = 0; - sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - addrlen = sizeof (struct sockaddr_in); - break; - case AF_INET6: - sin6->sin6_port = 0; - sin6->sin6_addr = loopback_addr; - addrlen = sizeof (struct sockaddr_in6); - break; - } - ret = bind(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - (void) close(s); - return (-1); - } - - switch (family) { - case AF_INET: - sin->sin_port = htons(MPATHD_PORT); - break; - case AF_INET6: - sin6->sin6_port = htons(MPATHD_PORT); - break; - } - ret = connect(s, (struct sockaddr *)&ss, addrlen); - (void) close(s); - return (ret); -} - /* ARGSUSED */ static int -setifgroupname(char *grpname, int64_t param) +setifgroupname(char *grname, int64_t param) { + lifgroupinfo_t lifgr; + struct lifreq lifrl; + ifaddrlistx_t *ifaddrp, *nextifaddrp; + ifaddrlistx_t *ifaddrs = NULL, *downaddrs = NULL; + int af; + if (debug) { (void) printf("Setting groupname %s on interface %s\n", - grpname, name); - } - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); - (void) strncpy(lifr.lifr_groupname, grpname, - sizeof (lifr.lifr_groupname)); - if (ioctl(s, SIOCSLIFGROUPNAME, (caddr_t)&lifr) < 0) { - Perror0_exit("setifgroupname: SIOCSLIFGROUPNAME"); + grname, name); } - /* - * If the SUNW_NO_MPATHD environment variable is set then don't - * bother starting up in.mpathd. See PSARC/2002/249 for the - * depressing details on this bit of stupidity. 
- */ - if (getenv("SUNW_NO_MPATHD") != NULL) { - return (0); + (void) strlcpy(lifrl.lifr_name, name, LIFNAMSIZ); + (void) strlcpy(lifrl.lifr_groupname, grname, LIFGRNAMSIZ); + + while (ioctl(s, SIOCSLIFGROUPNAME, &lifrl) == -1) { + switch (errno) { + case ENOENT: + /* + * The group doesn't yet exist; create it and repeat. + */ + af = afp->af_af; + if (create_ipmp(grname, af, NULL, _B_TRUE) == -1) { + if (errno == EEXIST) + continue; + + Perror2(grname, "cannot create IPMP group"); + goto fail; + } + continue; + + case EALREADY: + /* + * The interface is already in another group; must + * remove existing membership first. + */ + lifrl.lifr_groupname[0] = '\0'; + if (ioctl(s, SIOCSLIFGROUPNAME, &lifrl) == -1) { + Perror2(name, "cannot remove existing " + "IPMP group membership"); + goto fail; + } + (void) strlcpy(lifrl.lifr_groupname, grname, + LIFGRNAMSIZ); + continue; + + case EAFNOSUPPORT: + /* + * The group exists, but it's not configured with the + * address families the interface needs. Since only + * two address families are currently supported, just + * configure the "other" address family. Note that we + * may race with group deletion or creation by another + * process (ENOENT or EEXIST); in such cases we repeat + * our original SIOCSLIFGROUPNAME. + */ + (void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ); + if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == -1) { + if (errno == ENOENT) + continue; + + Perror2(grname, "SIOCGLIFGROUPINFO"); + goto fail; + } + + af = lifgr.gi_v4 ? AF_INET6 : AF_INET; + if (create_ipmp(grname, af, lifgr.gi_grifname, + _B_TRUE) == -1) { + if (errno == EEXIST) + continue; + + Perror2(grname, "cannot configure IPMP group"); + goto fail; + } + continue; + + case EADDRINUSE: + /* + * Some addresses are in-use (or under control of DAD). + * Bring them down and retry the group join operation. + * We will bring them back up after the interface has + * been placed in the group. 
+ */ + if (ifaddrlistx(lifrl.lifr_name, IFF_UP|IFF_DUPLICATE, + 0, &ifaddrs) == -1) { + Perror2(grname, "cannot get address list"); + goto fail; + } + + ifaddrp = ifaddrs; + for (; ifaddrp != NULL; ifaddrp = nextifaddrp) { + if (!ifaddr_down(ifaddrp)) { + ifaddrs = ifaddrp; + goto fail; + } + nextifaddrp = ifaddrp->ia_next; + ifaddrp->ia_next = downaddrs; + downaddrs = ifaddrp; + } + ifaddrs = NULL; + continue; + + case EADDRNOTAVAIL: { + /* + * Some data addresses are under application control. + * For some of these (e.g., ADDRCONF), the application + * should remove the address, in which case we retry a + * few times (since the application's action is not + * atomic with respect to us) before bailing out and + * informing the user. + */ + int ntries, nappaddr = 0; + const if_appflags_t *iap = if_appflags_tbl; + + for (; iap->ia_app != NULL; iap++) { + ntries = 0; +again: + if (ifaddrlistx(lifrl.lifr_name, iap->ia_flag, + IFF_NOFAILOVER, &ifaddrs) == -1) { + (void) fprintf(stderr, "ifconfig: %s: " + "cannot get data addresses managed " + "by %s\n", lifrl.lifr_name, + iap->ia_app); + goto fail; + } + + if (ifaddrs == NULL) + continue; + + ifaddrlistx_free(ifaddrs); + ifaddrs = NULL; + + if (++ntries < iap->ia_tries) { + (void) poll(NULL, 0, 100); + goto again; + } + + (void) fprintf(stderr, "ifconfig: cannot join " + "IPMP group: %s has data addresses managed " + "by %s\n", lifrl.lifr_name, iap->ia_app); + nappaddr++; + } + if (nappaddr > 0) + goto fail; + continue; + } + default: + Perror2(name, "SIOCSLIFGROUPNAME"); + goto fail; + } } /* - * Try to connect to in.mpathd using IPv4. If we succeed, - * we conclude that in.mpathd is running, and quit. + * If there were addresses that we had to bring down, it's time to + * bring them up again. As part of bringing them up, the kernel will + * automatically move them to the new IPMP interface. 
*/ - if (connect_to_mpathd(AF_INET) == 0) { - /* connect succeeded, mpathd is already running */ - return (0); + for (ifaddrp = downaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!ifaddr_up(ifaddrp) && errno != ENXIO) { + (void) fprintf(stderr, "ifconfig: cannot bring back up " + "%s: %s\n", ifaddrp->ia_name, strerror(errno)); + } } + ifaddrlistx_free(downaddrs); + return (0); +fail: /* - * Try to connect to in.mpathd using IPv6. If we succeed, - * we conclude that in.mpathd is running, and quit. + * Attempt to bring back up any interfaces that we downed. */ - if (connect_to_mpathd(AF_INET6) == 0) { - /* connect succeeded, mpathd is already running */ - return (0); + for (ifaddrp = downaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!ifaddr_up(ifaddrp) && errno != ENXIO) { + (void) fprintf(stderr, "ifconfig: cannot bring back up " + "%s: %s\n", ifaddrp->ia_name, strerror(errno)); + } } + ifaddrlistx_free(downaddrs); + ifaddrlistx_free(ifaddrs); /* - * in.mpathd may not be running. Start it now. If it is already - * running, in.mpathd will take care of handling multiple incarnations - * of itself. ifconfig only tries to optimize performance by not - * starting another incarnation of in.mpathd. + * We'd return -1, but foreachinterface() doesn't propagate the error + * into the exit status, so we're forced to explicitly exit(). 
*/ - switch (fork()) { + exit(1); + /* NOTREACHED */ +} - case -1: - Perror0_exit("setifgroupname: fork"); - /* NOTREACHED */ - case 0: - (void) execl(MPATHD_PATH, MPATHD_PATH, NULL); - _exit(1); - /* NOTREACHED */ - default: - return (0); +static boolean_t +modcheck(const char *ifname) +{ + (void) strlcpy(lifr.lifr_name, ifname, sizeof (lifr.lifr_name)); + + if (ioctl(s, SIOCGLIFFLAGS, &lifr) < 0) { + Perror0("SIOCGLIFFLAGS"); + return (_B_FALSE); } -} + if (lifr.lifr_flags & IFF_IPMP) { + (void) fprintf(stderr, "ifconfig: %s: module operations not" + " supported on IPMP interfaces\n", ifname); + return (_B_FALSE); + } + if (lifr.lifr_flags & IFF_VIRTUAL) { + (void) fprintf(stderr, "ifconfig: %s: module operations not" + " supported on virtual IP interfaces\n", ifname); + return (_B_FALSE); + } + return (_B_TRUE); +} /* * To list all the modules above a given network interface. @@ -2250,7 +2287,13 @@ modlist(char *null, int64_t param) struct str_list strlist; int orig_arpid; - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + /* + * We'd return -1, but foreachinterface() doesn't propagate the error + * into the exit status, so we're forced to explicitly exit(). + */ + if (!modcheck(name)) + exit(1); + if (ip_domux2fd(&muxfd, &muxid_fd, &ipfd_lowstr, &arpfd_lowstr, &orig_arpid) < 0) { return (-1); @@ -2354,8 +2397,8 @@ open_arp_on_udp(char *udp_dev_name) * Return: * -1 if operation fails, 0 otherwise. * - * Please see the big block comment above plumb_one_device() - * for the logic of the PLINK/PUNLINK + * Please see the big block comment above ifplumb() for the logic of the + * PLINK/PUNLINK */ static int ip_domux2fd(int *muxfd, int *muxid_fd, int *ipfd_lowstr, int *arpfd_lowstr, @@ -2467,8 +2510,8 @@ ip_domux2fd(int *muxfd, int *muxid_fd, int *ipfd_lowstr, int *arpfd_lowstr, * Return: * -1 if operation fails, 0 otherwise. 
* - * Please see the big block comment above plumb_one_device() - * for the logic of the PLINK/PUNLINK + * Please see the big block comment above ifplumb() for the logic of the + * PLINK/PUNLINK */ static int ip_plink(int muxfd, int muxid_fd, int ipfd_lowstr, int arpfd_lowstr, @@ -2530,7 +2573,12 @@ modop(char *arg, char op) char *arg_str; int orig_arpid; - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + /* + * We'd return -1, but foreachinterface() doesn't propagate the error + * into the exit status, so we're forced to explicitly exit(). + */ + if (!modcheck(name)) + exit(1); /* Need to save the original string for -a option. */ if ((arg_str = malloc(strlen(arg) + 1)) == NULL) { @@ -3067,13 +3115,14 @@ status(void) static int configinfo(char *null, int64_t param) { + char *cp; struct afswtch *p = afp; uint64_t flags; - char phydevname[LIFNAMSIZ]; + char lifname[LIFNAMSIZ]; char if_usesrc_name[LIFNAMSIZ]; - char *cp; (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0) { Perror0_exit("status: SIOCGLIFFLAGS"); } @@ -3084,13 +3133,13 @@ configinfo(char *null, int64_t param) name, flags, p != NULL ? p->af_af : -1); } - /* remove LIF component */ - (void) strncpy(phydevname, name, sizeof (phydevname)); - cp = strchr(phydevname, ':'); - if (cp) { - *cp = 0; - } - phydevname[sizeof (phydevname) - 1] = '\0'; + /* + * Build the interface name to print (we can't directly use `name' + * because one cannot "plumb" ":0" interfaces). 
+ */ + (void) strlcpy(lifname, name, LIFNAMSIZ); + if ((cp = strchr(lifname, ':')) != NULL && atoi(cp + 1) == 0) + *cp = '\0'; /* * if the interface is IPv4 @@ -3105,7 +3154,7 @@ configinfo(char *null, int64_t param) if (v4compat) flags &= ~IFF_IPV4; - (void) printf("%s inet plumb", phydevname); + (void) printf("%s inet plumb", lifname); } else if (flags & IFF_IPV6) { /* * else if the interface is IPv6 @@ -3117,7 +3166,7 @@ configinfo(char *null, int64_t param) if (v4compat) return (-1); - (void) printf("%s inet6 plumb", phydevname); + (void) printf("%s inet6 plumb", lifname); } (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); @@ -3131,8 +3180,8 @@ configinfo(char *null, int64_t param) ioctl(s, SIOCGLIFMTU, (caddr_t)&lifr) >= 0) (void) printf(" mtu %d", lifr.lifr_metric); - /* don't print index when in compatibility mode */ - if (!v4compat) { + /* Index only applies to the zeroth interface */ + if (lifnum(name) == 0) { if (ioctl(s, SIOCGLIFINDEX, (caddr_t)&lifr) >= 0) (void) printf(" index %d", lifr.lifr_index); } @@ -3162,7 +3211,6 @@ configinfo(char *null, int64_t param) } (void) printf("\n"); - return (0); } @@ -3398,15 +3446,11 @@ in_status(int force, uint64_t flags) inet_ntoa(sin->sin_addr)); } } - /* If there is a groupname, print it for lun 0 alone */ + /* If there is a groupname, print it for only the physical interface */ if (strchr(name, ':') == NULL) { - (void) memset(lifr.lifr_groupname, 0, - sizeof (lifr.lifr_groupname)); - if (ioctl(s, SIOCGLIFGROUPNAME, (caddr_t)&lifr) >= 0) { - if (strlen(lifr.lifr_groupname) > 0) { - (void) printf("\n\tgroupname %s", - lifr.lifr_groupname); - } + if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) >= 0 && + lifr.lifr_groupname[0] != '\0') { + (void) printf("\n\tgroupname %s", lifr.lifr_groupname); } } (void) putchar('\n'); @@ -3550,11 +3594,7 @@ in_configinfo(int force, uint64_t flags) Perror0_exit("in_configinfo: SIOCGLIFADDR"); } sin = (struct sockaddr_in *)&lifr.lifr_addr; - if (strchr(name, ':') != NULL) { - 
(void) printf(" addif %s ", inet_ntoa(sin->sin_addr)); - } else { - (void) printf(" set %s ", inet_ntoa(sin->sin_addr)); - } + (void) printf(" set %s ", inet_ntoa(sin->sin_addr)); laddr = sin; } @@ -3614,8 +3654,8 @@ in_configinfo(int force, uint64_t flags) } } - /* If there is a groupname, print it for only the physical interface */ - if (strchr(name, ':') == NULL) { + /* If there is a groupname, print it for only the zeroth interface */ + if (lifnum(name) == 0) { if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) >= 0 && lifr.lifr_groupname[0] != '\0') { (void) printf(" group %s ", lifr.lifr_groupname); @@ -3623,12 +3663,7 @@ in_configinfo(int force, uint64_t flags) } /* Print flags to configure */ - print_config_flags(flags); - - /* IFF_NOARP applies to AF_INET only */ - if (flags & IFF_NOARP) { - (void) printf("-arp "); - } + print_config_flags(AF_INET, flags); } static void @@ -3657,17 +3692,9 @@ in6_configinfo(int force, uint64_t flags) Perror0_exit("in6_configinfo: SIOCGLIFADDR"); } sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr; - if (strchr(name, ':') != NULL) { - (void) printf(" addif %s/%d ", - inet_ntop(AF_INET6, (void *)&sin6->sin6_addr, - abuf, sizeof (abuf)), - lifr.lifr_addrlen); - } else { - (void) printf(" set %s/%d ", - inet_ntop(AF_INET6, (void *)&sin6->sin6_addr, - abuf, sizeof (abuf)), - lifr.lifr_addrlen); - } + (void) printf(" set %s/%d ", + inet_ntop(AF_INET6, &sin6->sin6_addr, abuf, sizeof (abuf)), + lifr.lifr_addrlen); laddr6 = sin6; } (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); @@ -3720,8 +3747,8 @@ in6_configinfo(int force, uint64_t flags) lifr.lifr_addrlen); } - /* If there is a groupname, print it for only the physical interface */ - if (strchr(name, ':') == NULL) { + /* If there is a groupname, print it for only the zeroth interface */ + if (lifnum(name) == 0) { if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) >= 0 && lifr.lifr_groupname[0] != '\0') { (void) printf(" group %s ", lifr.lifr_groupname); @@ -3729,12 +3756,7 @@ 
in6_configinfo(int force, uint64_t flags) } /* Print flags to configure */ - print_config_flags(flags); - - /* IFF_NONUD applies to AF_INET6 only */ - if (flags & IFF_NONUD) { - (void) printf("-nud "); - } + print_config_flags(AF_INET6, flags); } /* @@ -3768,31 +3790,41 @@ in6_configinfo(int force, uint64_t flags) * compatibility for other utilities like atmifconfig etc. In this case * the utility must use SIOCSLIFMUXID. */ -static void -plumb_one_device(int af) +static int +ifplumb(const char *linkname, const char *ifname, boolean_t genppa, int af) { int arp_muxid = -1, ip_muxid; int mux_fd, ip_fd, arp_fd; int retval; - uint_t ppa; char *udp_dev_name; - char provider[DLPI_LINKNAME_MAX]; + uint64_t flags; + uint_t dlpi_flags; dlpi_handle_t dh_arp, dh_ip; /* - * We use DLPI_NOATTACH because the ip module will do the attach - * itself for DLPI style-2 devices. + * Always dlpi_open() with DLPI_NOATTACH because the IP and ARP module + * will do the attach themselves for DLPI style-2 links. */ - retval = dlpi_open(name, &dh_ip, DLPI_NOATTACH); - if (retval != DLPI_SUCCESS) - Perrdlpi_exit("cannot open link", name, retval); + dlpi_flags = DLPI_NOATTACH; - if ((retval = dlpi_parselink(name, provider, &ppa)) != DLPI_SUCCESS) - Perrdlpi_exit("dlpi_parselink", name, retval); + /* + * If `linkname' is the special token IPMPSTUB, then this is a request + * to create an IPMP interface atop /dev/ipmpstub0. (We can't simply + * pass "ipmpstub0" as `linkname' since an admin *could* have a normal + * vanity-named link named "ipmpstub0" that they'd like to plumb.) 
+ */ + if (linkname == IPMPSTUB) { + linkname = "ipmpstub0"; + dlpi_flags |= DLPI_DEVONLY; + } + + retval = dlpi_open(linkname, &dh_ip, dlpi_flags); + if (retval != DLPI_SUCCESS) + Perrdlpi_exit("cannot open link", linkname, retval); if (debug) { - (void) printf("ifconfig: plumb_one_device: provider %s," - " ppa %u\n", provider, ppa); + (void) printf("ifconfig: ifplumb: link %s, ifname %s, " + "genppa %u\n", linkname, ifname, genppa); } ip_fd = dlpi_fd(dh_ip); @@ -3812,29 +3844,106 @@ plumb_one_device(int af) Perror2_exit("I_PUSH", ARP_MOD_NAME); /* - * Set IFF_IPV4/IFF_IPV6 flags. - * At this point in time the kernel also allows an - * override of the CANTCHANGE flags. + * Prepare to set IFF_IPV4/IFF_IPV6 flags as part of SIOCSLIFNAME. + * (At this point in time the kernel also allows an override of the + * IFF_CANTCHANGE flags.) */ lifr.lifr_name[0] = '\0'; if (ioctl(ip_fd, SIOCGLIFFLAGS, (char *)&lifr) == -1) - Perror0_exit("plumb_one_device: SIOCGLIFFLAGS"); + Perror0_exit("ifplumb: SIOCGLIFFLAGS"); - /* Set the name string and the IFF_IPV* flag */ if (af == AF_INET6) { - lifr.lifr_flags |= IFF_IPV6; - lifr.lifr_flags &= ~(IFF_BROADCAST | IFF_IPV4); + flags = lifr.lifr_flags | IFF_IPV6; + flags &= ~(IFF_BROADCAST | IFF_IPV4); } else { - lifr.lifr_flags |= IFF_IPV4; - lifr.lifr_flags &= ~IFF_IPV6; + flags = lifr.lifr_flags | IFF_IPV4; + flags &= ~IFF_IPV6; } - /* record the device and module names as interface name */ - lifr.lifr_ppa = ppa; - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + /* + * Set the interface name. If we've been asked to generate the PPA, + * then find the lowest available PPA (only currently used for IPMP + * interfaces). Otherwise, use the interface name as-is. + */ + if (genppa) { + int ppa; + + /* + * We'd like to just set lifr_ppa to UINT_MAX and have the + * kernel pick a PPA. Unfortunately, that would mishandle + * two cases: + * + * 1. 
If the PPA is available but the groupname is taken + * (e.g., the "ipmp2" IP interface name is available + * but the "ipmp2" groupname is taken) then the + * auto-assignment by the kernel will fail. + * + * 2. If we're creating (e.g.) an IPv6-only IPMP + * interface, and there's already an IPv4-only IPMP + * interface, the kernel will allow us to accidentally + * reuse the IPv6 IPMP interface name (since + * SIOCSLIFNAME uniqueness is per-interface-type). + * This will cause administrative confusion. + * + * Thus, we instead take a brute-force approach of checking + * whether the IPv4 or IPv6 name is already in-use before + * attempting the SIOCSLIFNAME. As per (1) above, the + * SIOCSLIFNAME may still fail, in which case we just proceed + * to the next one. If this approach becomes too slow, we + * can add a new SIOC* to handle this case in the kernel. + */ + for (ppa = 0; ppa < UINT_MAX; ppa++) { + (void) snprintf(lifr.lifr_name, LIFNAMSIZ, "%s%d", + ifname, ppa); + + if (ioctl(s4, SIOCGLIFFLAGS, &lifr) != -1 || + errno != ENXIO) + continue; + + if (ioctl(s6, SIOCGLIFFLAGS, &lifr) != -1 || + errno != ENXIO) + continue; + + lifr.lifr_ppa = ppa; + lifr.lifr_flags = flags; + retval = ioctl(ip_fd, SIOCSLIFNAME, &lifr); + if (retval != -1 || errno != EEXIST) + break; + } + } else { + ifspec_t ifsp; + + /* + * The interface name could have come from the command-line; + * check it. + */ + if (!ifparse_ifspec(ifname, &ifsp) || ifsp.ifsp_lunvalid) + Perror2_exit("invalid IP interface name", ifname); + + /* + * Before we call SIOCSLIFNAME, ensure that the IPMP group + * interface for this address family exists. Otherwise, the + * kernel will kick the interface out of the group when we do + * the SIOCSLIFNAME. + * + * Example: suppose bge0 is plumbed for IPv4 and in group "a". + * If we're now plumbing bge0 for IPv6, but the IPMP group + * interface for "a" is not plumbed for IPv6, the SIOCSLIFNAME + * will kick bge0 out of group "a", which is undesired. 
+ */ + if (create_ipmp_peer(af, ifname) == -1) { + (void) fprintf(stderr, "ifconfig: warning: cannot " + "create %s IPMP group; %s will be removed from " + "group\n", af == AF_INET ? "IPv4" : "IPv6", ifname); + } - /* set the interface name */ - if (ioctl(ip_fd, SIOCSLIFNAME, (char *)&lifr) == -1) { + lifr.lifr_ppa = ifsp.ifsp_ppa; + lifr.lifr_flags = flags; + (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); + retval = ioctl(ip_fd, SIOCSLIFNAME, &lifr); + } + + if (retval == -1) { if (errno != EEXIST) Perror0_exit("SIOCSLIFNAME for ip"); /* @@ -3847,15 +3956,15 @@ plumb_one_device(int af) * called for EEXIST. */ Perror0("SIOCSLIFNAME for ip"); - return; + return (-1); } /* Get the full set of existing flags for this stream */ if (ioctl(ip_fd, SIOCGLIFFLAGS, (char *)&lifr) == -1) - Perror0_exit("plumb_one_device: SIOCFLIFFLAGS"); + Perror0_exit("ifplumb: SIOCGLIFFLAGS"); if (debug) { - (void) printf("ifconfig: plumb_one_device: %s got flags:\n", + (void) printf("ifconfig: ifplumb: %s got flags:\n", lifr.lifr_name); print_flags(lifr.lifr_flags); (void) putchar('\n'); @@ -3890,7 +3999,7 @@ plumb_one_device(int af) if ((ip_muxid = ioctl(mux_fd, I_PLINK, ip_fd)) == -1) Perror0_exit("I_PLINK for ip"); (void) close(mux_fd); - return; + return (lifr.lifr_ppa); } /* @@ -3901,15 +4010,11 @@ plumb_one_device(int af) * only on the interface stream, not on the ARP stream. */ if (debug) - (void) printf("ifconfig: plumb_one_device: ifname: %s\n", name); + (void) printf("ifconfig: ifplumb: interface %s", ifname); - /* - * We use DLPI_NOATTACH because the arp module will do the attach - * itself for DLPI style-2 devices. 
- */ - retval = dlpi_open(name, &dh_arp, DLPI_NOATTACH); + retval = dlpi_open(linkname, &dh_arp, dlpi_flags); if (retval != DLPI_SUCCESS) - Perrdlpi_exit("cannot open link", name, retval); + Perrdlpi_exit("cannot open link", linkname, retval); arp_fd = dlpi_fd(dh_arp); if (ioctl(arp_fd, I_PUSH, ARP_MOD_NAME) == -1) @@ -3919,16 +4024,13 @@ plumb_one_device(int af) * Tell ARP the name and unit number for this interface. * Note that arp has no support for transparent ioctls. */ - if (strioctl(arp_fd, SIOCSLIFNAME, (char *)&lifr, - sizeof (lifr)) == -1) { + if (strioctl(arp_fd, SIOCSLIFNAME, &lifr, sizeof (lifr)) == -1) { if (errno != EEXIST) Perror0_exit("SIOCSLIFNAME for arp"); Perror0("SIOCSLIFNAME for arp"); - dlpi_close(dh_arp); - dlpi_close(dh_ip); - (void) close(mux_fd); - return; + goto out; } + /* * PLINK the IP and ARP streams so that ifconfig can exit * without tearing down the stream. @@ -3942,12 +4044,13 @@ plumb_one_device(int af) if (debug) (void) printf("arp muxid = %d\n", arp_muxid); +out: dlpi_close(dh_ip); dlpi_close(dh_arp); (void) close(mux_fd); + return (lifr.lifr_ppa); } - /* * If this is a physical interface then remove it. * If it is a logical interface name use SIOCLIFREMOVEIF to @@ -3965,6 +4068,7 @@ inetunplumb(char *arg, int64_t param) uint64_t flags; boolean_t changed_arp_muxid = _B_FALSE; int save_errno; + boolean_t v6 = (afp->af_af == AF_INET6); strptr = strchr(name, ':'); if (strptr != NULL || strcmp(name, LOOPBACK_IF) == 0) { @@ -3986,7 +4090,7 @@ inetunplumb(char *arg, int64_t param) * We used /dev/udp or udp6 to set up the mux. So we have to use * the same now for PUNLINK also. 
*/ - if (afp->af_af == AF_INET6) + if (v6) udp_dev_name = UDP6_DEV_NAME; else udp_dev_name = UDP_DEV_NAME; @@ -4002,6 +4106,50 @@ inetunplumb(char *arg, int64_t param) Perror0_exit("unplumb: SIOCGLIFFLAGS"); } flags = lifr.lifr_flags; + + if (flags & IFF_IPMP) { + lifgroupinfo_t lifgr; + ifaddrlistx_t *ifaddrs, *ifaddrp; + + /* + * The kernel will fail the I_PUNLINK if the group still has + * members, but check now to provide a better error message. + */ + if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) == -1) + Perror0_exit("unplumb: SIOCGLIFGROUPNAME"); + + (void) strlcpy(lifgr.gi_grname, lifr.lifr_groupname, + LIFGRNAMSIZ); + if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == -1) + Perror0_exit("unplumb: SIOCGLIFGROUPINFO"); + + if ((v6 && lifgr.gi_nv6 != 0) || (!v6 && lifgr.gi_nv4 != 0)) { + (void) fprintf(stderr, "ifconfig: %s: cannot unplumb:" + " IPMP group is not empty\n", name); + exit(1); + } + + /* + * The kernel will fail the I_PUNLINK if the IPMP interface + * has administratively up addresses; bring 'em down. + */ + if (ifaddrlistx(name, IFF_UP|IFF_DUPLICATE, 0, &ifaddrs) == -1) + Perror2_exit(name, "cannot get address list"); + + ifaddrp = ifaddrs; + for (; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (((ifaddrp->ia_flags & IFF_IPV6) && !v6) || + (!(ifaddrp->ia_flags & IFF_IPV6) && v6)) + continue; + + if (!ifaddr_down(ifaddrp)) { + Perror2_exit(ifaddrp->ia_name, + "cannot bring down"); + } + } + ifaddrlistx_free(ifaddrs); + } + if (ioctl(muxid_fd, SIOCGLIFMUXID, (caddr_t)&lifr) < 0) { Perror0_exit("unplumb: SIOCGLIFMUXID"); } @@ -4098,12 +4246,6 @@ inetplumb(char *arg, int64_t param) Perror2_exit("plumb: SIOCLIFADDIF", name); } } - /* - * IP can create the new logical interface on a different - * physical interface in the same IPMP group. Take the new - * interface into account for further operations. 
- */ - (void) strncpy(name, lifr.lifr_name, sizeof (name)); return (0); } @@ -4131,10 +4273,229 @@ inetplumb(char *arg, int64_t param) if (debug) (void) printf("inetplumb: %s af %d\n", name, afp->af_af); - plumb_one_device(afp->af_af); + (void) ifplumb(name, name, _B_FALSE, afp->af_af); + return (0); +} + +/* ARGSUSED */ +static int +inetipmp(char *arg, int64_t param) +{ + int retval; + + /* + * Treat e.g. "ifconfig ipmp0:2 ipmp" as "ifconfig ipmp0:2 plumb". + * Otherwise, try to create the requested IPMP interface. + */ + if (strchr(name, ':') != NULL) + retval = inetplumb(arg, param); + else + retval = create_ipmp(name, afp->af_af, name, _B_FALSE); + + /* + * We'd return -1, but foreachinterface() doesn't propagate the error + * into the exit status, so we're forced to explicitly exit(). + */ + if (retval == -1) + exit(1); return (0); } +/* + * Create an IPMP group `grname' with address family `af'. If `ifname' is + * non-NULL, it specifies the interface name to use. Otherwise, use the name + * ipmpN, where N corresponds to the lowest available integer. If `implicit' + * is set, then the group is being created as a side-effect of placing an + * underlying interface in a group. Also start in.mpathd if necessary. + */ +static int +create_ipmp(const char *grname, int af, const char *ifname, boolean_t implicit) +{ + int ppa; + static int ipmp_daemon_started; + + if (debug) { + (void) printf("create_ipmp: ifname %s grname %s af %d\n", + ifname != NULL ? 
ifname : "NULL", grname, af); + } + + if (ifname != NULL) + ppa = ifplumb(IPMPSTUB, ifname, _B_FALSE, af); + else + ppa = ifplumb(IPMPSTUB, "ipmp", _B_TRUE, af); + + if (ppa == -1) { + Perror2(grname, "cannot create IPMP interface"); + return (-1); + } + + if (ifname != NULL) + (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); + else + (void) snprintf(lifr.lifr_name, LIFNAMSIZ, "ipmp%d", ppa); + + /* + * To preserve backward-compatibility, always bring up the link-local + * address for implicitly-created IPv6 IPMP interfaces. + */ + if (implicit && af == AF_INET6) { + if (ioctl(s6, SIOCGLIFFLAGS, &lifr) == 0) { + lifr.lifr_flags |= IFF_UP; + (void) ioctl(s6, SIOCSLIFFLAGS, &lifr); + } + } + + /* + * If the caller requested a different group name, issue a + * SIOCSLIFGROUPNAME on the new IPMP interface. + */ + if (strcmp(lifr.lifr_name, grname) != 0) { + (void) strlcpy(lifr.lifr_groupname, grname, LIFGRNAMSIZ); + if (ioctl(s, SIOCSLIFGROUPNAME, &lifr) == -1) { + Perror0("SIOCSLIFGROUPNAME"); + return (-1); + } + } + + /* + * If we haven't done so yet, ensure in.mpathd is started. + */ + if (ipmp_daemon_started++ == 0) + start_ipmp_daemon(); + + return (0); +} + +/* + * Check if `ifname' is plumbed and in an IPMP group on its "other" address + * family. If so, create a matching IPMP group for address family `af'. + */ +static int +create_ipmp_peer(int af, const char *ifname) +{ + int fd; + lifgroupinfo_t lifgr; + + assert(af == AF_INET || af == AF_INET6); + + /* + * Get the socket for the "other" address family. + */ + fd = (af == AF_INET) ? s6 : s4; + + (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) != 0) + return (0); + + (void) strlcpy(lifgr.gi_grname, lifr.lifr_groupname, LIFGRNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPINFO, &lifgr) != 0) + return (0); + + /* + * If `ifname' *is* the IPMP group interface, or if the relevant + * address family is already configured, then there's nothing to do. 
+ */ + if (strcmp(lifgr.gi_grifname, ifname) == 0 || + (af == AF_INET && lifgr.gi_v4) || (af == AF_INET6 && lifgr.gi_v6)) + return (0); + + return (create_ipmp(lifgr.gi_grname, af, lifgr.gi_grifname, _B_TRUE)); +} + +/* + * Start in.mpathd if it's not already running. + */ +static void +start_ipmp_daemon(void) +{ + int retval; + ipmp_handle_t ipmp_handle; + + /* + * Ping in.mpathd to see if it's running already. + */ + if ((retval = ipmp_open(&ipmp_handle)) != IPMP_SUCCESS) { + (void) fprintf(stderr, "ifconfig: cannot create IPMP handle: " + "%s\n", ipmp_errmsg(retval)); + return; + } + + retval = ipmp_ping_daemon(ipmp_handle); + ipmp_close(ipmp_handle); + + switch (retval) { + case IPMP_ENOMPATHD: + break; + case IPMP_SUCCESS: + return; + default: + (void) fprintf(stderr, "ifconfig: cannot ping in.mpathd: %s\n", + ipmp_errmsg(retval)); + break; + } + + /* + * Start in.mpathd. Note that in.mpathd will handle multiple + * incarnations (ipmp_ping_daemon() is just an optimization) so we + * don't need to worry about racing with another ifconfig process. + */ + switch (fork()) { + case -1: + Perror0_exit("start_ipmp_daemon: fork"); + /* NOTREACHED */ + case 0: + (void) execl(MPATHD_PATH, MPATHD_PATH, NULL); + _exit(1); + /* NOTREACHED */ + default: + break; + } +} + +/* + * Bring the address named by `ifaddrp' up or down. Doesn't trust any mutable + * values in ia_flags since they may be stale. + */ +static boolean_t +ifaddr_op(ifaddrlistx_t *ifaddrp, boolean_t up) +{ + struct lifreq lifrl; /* Local lifreq struct */ + int fd = (ifaddrp->ia_flags & IFF_IPV4) ? s4 : s6; + + (void) memset(&lifrl, 0, sizeof (lifrl)); + (void) strlcpy(lifrl.lifr_name, ifaddrp->ia_name, LIFNAMSIZ); + if (ioctl(fd, SIOCGLIFFLAGS, &lifrl) == -1) + return (_B_FALSE); + + if (up) { + lifrl.lifr_flags |= IFF_UP; + } else { + /* + * If we've been asked to bring down an IFF_DUPLICATE address, + * then get the address and set it. 
This will cause IP to + * clear IFF_DUPLICATE and stop the automatic recovery timer. + */ + if (lifrl.lifr_flags & IFF_DUPLICATE) { + return (ioctl(fd, SIOCGLIFADDR, &lifrl) != -1 && + ioctl(fd, SIOCSLIFADDR, &lifrl) != -1); + } + lifrl.lifr_flags &= ~IFF_UP; + } + return (ioctl(fd, SIOCSLIFFLAGS, &lifrl) == 0); +} + +static boolean_t +ifaddr_up(ifaddrlistx_t *ifaddrp) +{ + return (ifaddr_op(ifaddrp, _B_TRUE)); +} + +static boolean_t +ifaddr_down(ifaddrlistx_t *ifaddrp) +{ + return (ifaddr_op(ifaddrp, _B_FALSE)); +} + void Perror0(const char *cmd) { @@ -4404,14 +4765,14 @@ print_flags(uint64_t flags) } static void -print_config_flags(uint64_t flags) +print_config_flags(int af, uint64_t flags) { - int cnt, i; + if_config_cmd_t *cmdp; - cnt = sizeof (if_config_cmd_tbl) / sizeof (if_config_cmd_t); - for (i = 0; i < cnt; i++) { - if (flags & if_config_cmd_tbl[i].iff_flag) { - (void) printf("%s ", if_config_cmd_tbl[i].iff_name); + for (cmdp = if_config_cmd_tbl; cmdp->iff_flag != 0; cmdp++) { + if ((flags & cmdp->iff_flag) && + (cmdp->iff_af == AF_UNSPEC || cmdp->iff_af == af)) { + (void) printf("%s ", cmdp->iff_name); } } } @@ -4454,7 +4815,18 @@ in_getmask(struct sockaddr_in *saddr, boolean_t addr_set) } static int -strioctl(int s, int cmd, char *buf, int buflen) +lifnum(const char *ifname) +{ + const char *cp; + + if ((cp = strchr(ifname, ':')) == NULL) + return (0); + else + return (atoi(cp + 1)); +} + +static int +strioctl(int s, int cmd, void *buf, int buflen) { struct strioctl ioc; @@ -4681,6 +5053,7 @@ usage(void) "\t[ modlist ]\n" "\t[ modinsert <module_name@position> ]\n" "\t[ modremove <module_name@position> ]\n" + "\t[ ipmp ]\n" "\t[ group <groupname>] | [ group \"\"]\n" "\t[ deprecated | -deprecated ]\n" "\t[ standby | -standby ]\n" diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h index 0ac600001f..f11f4d0a94 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h +++ 
b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -11,8 +11,6 @@ #ifndef _IFCONFIG_H #define _IFCONFIG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -39,7 +37,6 @@ extern void Perrdlpi_exit(const char *, const char *, int); extern int doifrevarp(const char *, struct sockaddr_in *); -extern int dlpi_set_address(const char *, uchar_t *, uint_t); extern void dlpi_print_address(const char *); #ifdef __cplusplus diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c index 725c8b24c3..aba4794942 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c @@ -19,14 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "defs.h" #include "ifconfig.h" #include <sys/types.h> @@ -88,6 +86,7 @@ doifrevarp(const char *linkname, struct sockaddr_in *laddr) /* don't try to revarp if we know it won't work */ if ((lifr.lifr_flags & IFF_LOOPBACK) || (lifr.lifr_flags & IFF_NOARP) || + (lifr.lifr_flags & IFF_IPMP) || (lifr.lifr_flags & IFF_POINTOPOINT)) { (void) close(s); return (0); @@ -326,28 +325,6 @@ rarp_recv(dlpi_handle_t dh, struct arphdr *ans, size_t msglen, return (DLPI_ETIMEDOUT); } -int -dlpi_set_address(const char *linkname, uchar_t *physaddr, uint_t physaddrlen) -{ - int retval; - dlpi_handle_t dh; - - if ((retval = dlpi_open(linkname, &dh, 0)) != DLPI_SUCCESS) { - Perrdlpi("dlpi_open failed", linkname, retval); - return (-1); - } - - if ((retval = dlpi_set_physaddr(dh, DL_CURR_PHYS_ADDR, physaddr, - physaddrlen)) != DLPI_SUCCESS) { - Perrdlpi("dlpi_set_physaddr failed", linkname, retval); - dlpi_close(dh); - return (-1); - } - - dlpi_close(dh); - return (0); -} - void dlpi_print_address(const char *linkname) { diff --git a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h index 900b5841ed..5cca3ecb2e 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h +++ b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Copyright (c) 1983, 1988, 1993 @@ -414,16 +414,9 @@ struct interface { (IS_REMOTE|IS_PASSIVE)) /* - * Is an IP interface up? Because of the way IPMP uses deprecated - * interfaces, we need to check more than the IFF_UP and IFF_RUNNING - * interface flags here. Basically, we do not want to use IFF_DEPRECATED - * interfaces unless they are also IFF_STANDBY and not IFF_INACTIVE. + * Is an IP interface up? 
*/ -#define IFF_GOOD (IFF_UP|IFF_RUNNING) -#define IS_IFF_UP(f) \ - ((((f) & (IFF_GOOD|IFF_DEPRECATED)) == IFF_GOOD) || \ - (((f) & (IFF_GOOD|IFF_INACTIVE|IFF_STANDBY)) == \ - (IFF_GOOD|IFF_STANDBY))) +#define IS_IFF_UP(f) (((f) & (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING)) /* * This defines interfaces that we should not use for advertising or diff --git a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c index 79ae02e703..a3a26ac2cb 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Copyright (c) 1983, 1988, 1993 @@ -36,8 +36,6 @@ * $FreeBSD: src/sbin/routed/trace.c,v 1.6 2000/08/11 08:24:38 sheldonh Exp $ */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "defs.h" #include "pathnames.h" #include <signal.h> @@ -566,6 +564,7 @@ static struct bits if_bits[] = { { IFF_TEMPORARY, 0, "TEMPORARY" }, { IFF_FIXEDMTU, 0, "FIXEDMTU" }, { IFF_VIRTUAL, 0, "VIRTUAL"}, + { IFF_IPMP, 0, "IPMP"}, { 0, 0, NULL} }; @@ -898,8 +897,8 @@ trace_upslot(struct rt_entry *rt, print_rts(rts, 0, 0, rts->rts_gate != new->rts_gate, rts->rts_tag != new->rts_tag, - rts != rt->rt_spares || AGE_RT(rt->rt_state, - rts->rts_origin, rt->rt_ifp)); + rts != rt->rt_spares || + AGE_RT(rt->rt_state, rts->rts_origin, rt->rt_ifp)); (void) fprintf(ftrace, "\n %19s%-16s ", "", (new->rts_gate != rts->rts_gate ? 
@@ -1173,10 +1172,9 @@ trace_rip(const char *dir1, const char *dir2, if (NA->a_type == RIP_AUTH_PW && n == msg->rip_nets) { (void) fprintf(ftrace, "\tPassword" - " Authentication:" - " \"%s\"\n", + " Authentication: \"%s\"\n", qstring(NA->au.au_pw, - RIP_AUTH_PW_LEN)); + RIP_AUTH_PW_LEN)); continue; } @@ -1186,13 +1184,12 @@ trace_rip(const char *dir1, const char *dir2, "\tMD5 Auth" " pkt_len=%d KeyID=%u" " auth_len=%d" - " seqno=%#lx" - " rsvd=%#x,%#x\n", + " seqno=%#x" + " rsvd=%#hx,%#hx\n", ntohs(NA->au.a_md5.md5_pkt_len), NA->au.a_md5.md5_keyid, NA->au.a_md5.md5_auth_len, - (unsigned long)ntohl(NA->au.a_md5. - md5_seqno), + ntohl(NA->au.a_md5.md5_seqno), ntohs(NA->au.a_md5.rsvd[0]), ntohs(NA->au.a_md5.rsvd[1])); continue; @@ -1217,14 +1214,12 @@ trace_rip(const char *dir1, const char *dir2, inet_ntoa(tmp_mask)); } else if (msg->rip_vers == RIPv1) { (void) fprintf(ftrace, "\t%-18s ", - addrname(n->n_dst, - ntohl(n->n_mask), - n->n_mask == 0 ? 2 : 1)); + addrname(n->n_dst, ntohl(n->n_mask), + n->n_mask == 0 ? 2 : 1)); } else { (void) fprintf(ftrace, "\t%-18s ", - addrname(n->n_dst, - ntohl(n->n_mask), - n->n_mask == 0 ? 2 : 0)); + addrname(n->n_dst, ntohl(n->n_mask), + n->n_mask == 0 ? 2 : 0)); } (void) fprintf(ftrace, "metric=%-2lu ", (unsigned long)ntohl(n->n_metric)); @@ -1242,8 +1237,8 @@ trace_rip(const char *dir1, const char *dir2, break; case RIPCMD_TRACEON: - (void) fprintf(ftrace, "\tfile=\"%.*s\"\n", size-4, - msg->rip_tracefile); + (void) fprintf(ftrace, "\tfile=\"%.*s\"\n", size - 4, + msg->rip_tracefile); break; case RIPCMD_TRACEOFF: diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/Makefile new file mode 100644 index 0000000000..a256cf5f49 --- /dev/null +++ b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/Makefile @@ -0,0 +1,48 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +PROG = ipmpstat +ROOTFS_PROG = $(PROG) +ROOTUSRSBINLINKS = $(PROG:%=$(ROOTUSRSBIN)/%) + +include $(SRC)/cmd/Makefile.cmd + +C99MODE = $(C99_ENABLE) +LDLIBS += -lipmp -lsocket -lsysevent -lnvpair +XGETFLAGS += -a -x $(PROG).xcl + +.KEEP_STATE: + +all: $(PROG) + +install: all $(ROOTSBINPROG) $(ROOTUSRSBINLINKS) + +clean: + +lint: lint_PROG + +$(ROOTUSRSBINLINKS): + -$(RM) $@; $(SYMLINK) ../../sbin/$(@F) $@ + +include $(SRC)/cmd/Makefile.targ diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.c b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.c new file mode 100644 index 0000000000..4620c34a24 --- /dev/null +++ b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.c @@ -0,0 +1,1498 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <alloca.h> +#include <arpa/inet.h> +#include <assert.h> +#include <errno.h> +#include <ipmp_admin.h> +#include <ipmp_query.h> +#include <libintl.h> +#include <libnvpair.h> +#include <libsysevent.h> +#include <locale.h> +#include <netdb.h> +#include <signal.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/ipmp.h> +#include <sys/sysmacros.h> +#include <sys/termios.h> +#include <sys/types.h> + +/* + * ipmpstat -- display IPMP subsystem status. + * + * This utility makes extensive use of libipmp and IPMP sysevents to gather + * and pretty-print the status of the IPMP subsystem. All output formats + * except for -p (probe) use libipmp to create a point-in-time snapshot of the + * IPMP subsystem (unless the test-special -L flag is used), and then output + * the contents of that snapshot in a user-specified manner. Because the + * output format and requested fields aren't known until run-time, three sets + * of function pointers and two core data structures are used. Specifically: + * + * * The ipmpstat_walker_t function pointers (walk_*) iterate through + * all instances of a given IPMP object (group, interface, or address). + * At most one ipmpstat_walker_t is used per ipmpstat invocation. + * Since target information is included with the interface information, + * both -i and -t use the interface walker (walk_if()). 
+ * + * * The ipmpstat_sfunc_t function pointers (sfunc_*) obtain a given + * value for a given IPMP object. Each ipmpstat_sfunc_t is passed a + * buffer to write its result into, the buffer's size, and an + * ipmpstat_sfunc_arg_t state structure. The state structure consists + * of a pointer to the IPMP object to obtain information from + * (sa_data), and an open libipmp handle (sa_ih) which can be used to + * do additional libipmp queries, if necessary (e.g., because the + * object does not have all of the needed information). + * + * * The ipmpstat_field_t structure provides the list of supported fields + * for a given output format, along with output formatting information + * (e.g., field width), and a pointer to an ipmpstat_sfunc_t function + * that can obtain the value for a given IPMP object. For a given + * ipmpstat output format, there's a corresponding array of + * ipmpstat_field_t structures. Thus, one ipmpstat_field_t array is + * used per ipmpstat invocation. + * + * * The ipmpstat_ofmt_t provides an ordered list of the requested + * ipmpstat_field_t's (e.g., via -o) for a given ipmpstat invocation. + * It is built at runtime from the command-line arguments. This + * structure (and a given IPMP object) is used by ofmt_output() to + * output a single line of information about that IPMP object. + * + * * The ipmpstat_cbfunc_t function pointers (*_cbfunc) are called back + * by the walkers. They are used both internally to implement nested + * walks, and by the ipmpstat output logic to provide the glue between + * the IPMP object walkers and the ofmt_output() logic. Usually, a + * single line is output for each IPMP object, and thus ofmt_output() + * can be directly invoked (see info_output_cbfunc()). However, if + * multiple lines need to be output, then a more complex cbfunc is + * needed (see targinfo_output_cbfunc()). At most one cbfunc is used + * per ipmpstat invocation. 
+ */ + +/* + * Data type used by the sfunc callbacks to obtain the requested information + * from the agreed-upon object. + */ +typedef struct ipmpstat_sfunc_arg { + ipmp_handle_t sa_ih; + void *sa_data; +} ipmpstat_sfunc_arg_t; + +typedef void ipmpstat_sfunc_t(ipmpstat_sfunc_arg_t *, char *, uint_t); + +/* + * Data type that describes how to output a field; used by ofmt_output*(). + */ +typedef struct ipmpstat_field { + const char *f_name; /* field name */ + uint_t f_width; /* output width */ + ipmpstat_sfunc_t *f_sfunc; /* value->string function */ +} ipmpstat_field_t; + +/* + * Data type that specifies the output field order; used by ofmt_output*() + */ +typedef struct ipmpstat_ofmt { + const ipmpstat_field_t *o_field; /* current field info */ + struct ipmpstat_ofmt *o_next; /* next field */ +} ipmpstat_ofmt_t; + +/* + * Function pointers used to iterate through IPMP objects. + */ +typedef void ipmpstat_cbfunc_t(ipmp_handle_t, void *, void *); +typedef void ipmpstat_walker_t(ipmp_handle_t, ipmpstat_cbfunc_t *, void *); + +/* + * Data type used to implement nested walks. + */ +typedef struct ipmpstat_walkdata { + ipmpstat_cbfunc_t *iw_func; /* caller-specified callback */ + void *iw_funcarg; /* caller-specified arg */ +} ipmpstat_walkdata_t; + +/* + * Data type used by enum2str() to map an enumerated value to a string. + */ +typedef struct ipmpstat_enum { + const char *e_name; /* string */ + int e_val; /* value */ +} ipmpstat_enum_t; + +/* + * Data type used to pass state between probe_output() and probe_event(). + */ +typedef struct ipmpstat_probe_state { + ipmp_handle_t ps_ih; /* open IPMP handle */ + ipmpstat_ofmt_t *ps_ofmt; /* requested ofmt string */ +} ipmpstat_probe_state_t; + +/* + * Options that modify the output mode; more than one may be lit. + */ +typedef enum { + IPMPSTAT_OPT_NUMERIC = 0x1, + IPMPSTAT_OPT_PARSABLE = 0x2 +} ipmpstat_opt_t; + +/* + * Indices for the FLAGS field of the `-i' output format. 
+ */ +enum { + IPMPSTAT_IFLAG_INDEX, IPMPSTAT_SFLAG_INDEX, IPMPSTAT_M4FLAG_INDEX, + IPMPSTAT_BFLAG_INDEX, IPMPSTAT_M6FLAG_INDEX, IPMPSTAT_DFLAG_INDEX, + IPMPSTAT_HFLAG_INDEX, IPMPSTAT_NUM_FLAGS +}; + +#define IPMPSTAT_NCOL 80 +#define NS2FLOATMS(ns) ((float)(ns) / (NANOSEC / MILLISEC)) +#define MS2FLOATSEC(ms) ((float)(ms) / 1000) + +static const char *progname; +static hrtime_t probe_output_start; +static struct winsize winsize; +static ipmpstat_opt_t opt; +static ipmpstat_enum_t addr_state[], group_state[], if_state[], if_link[]; +static ipmpstat_enum_t if_probe[], targ_mode[]; +static ipmpstat_field_t addr_fields[], group_fields[], if_fields[]; +static ipmpstat_field_t probe_fields[], targ_fields[]; +static ipmpstat_cbfunc_t walk_addr_cbfunc, walk_if_cbfunc; +static ipmpstat_cbfunc_t info_output_cbfunc, targinfo_output_cbfunc; +static ipmpstat_walker_t walk_addr, walk_if, walk_group; + +static int probe_event(sysevent_t *, void *); +static void probe_output(ipmp_handle_t, ipmpstat_ofmt_t *); +static ipmpstat_field_t *field_find(ipmpstat_field_t *, const char *); +static ipmpstat_ofmt_t *ofmt_create(const char *, ipmpstat_field_t []); +static void ofmt_output(const ipmpstat_ofmt_t *, ipmp_handle_t, void *); +static void ofmt_destroy(ipmpstat_ofmt_t *); +static void enum2str(const ipmpstat_enum_t *, int, char *, uint_t); +static void sockaddr2str(const struct sockaddr_storage *, char *, uint_t); +static void sighandler(int); +static void usage(void); +static void die(const char *, ...); +static void die_ipmperr(int, const char *, ...); +static void warn(const char *, ...); +static void warn_ipmperr(int, const char *, ...); + +int +main(int argc, char **argv) +{ + int c; + int err; + const char *ofields = NULL; + ipmp_handle_t ih; + ipmp_qcontext_t qcontext = IPMP_QCONTEXT_SNAP; + ipmpstat_ofmt_t *ofmt; + ipmpstat_field_t *fields = NULL; + ipmpstat_cbfunc_t *cbfunc; + ipmpstat_walker_t *walker; + + if ((progname = strrchr(argv[0], '/')) == NULL) + progname = 
argv[0]; + else + progname++; + + (void) setlocale(LC_ALL, ""); + (void) textdomain(TEXT_DOMAIN); + + while ((c = getopt(argc, argv, "nLPo:agipt")) != EOF) { + if (fields != NULL && strchr("agipt", c) != NULL) + die("only one output format may be specified\n"); + + switch (c) { + case 'n': + opt |= IPMPSTAT_OPT_NUMERIC; + break; + case 'L': + /* Undocumented option: for testing use ONLY */ + qcontext = IPMP_QCONTEXT_LIVE; + break; + case 'P': + opt |= IPMPSTAT_OPT_PARSABLE; + break; + case 'o': + ofields = optarg; + break; + case 'a': + walker = walk_addr; + cbfunc = info_output_cbfunc; + fields = addr_fields; + break; + case 'g': + walker = walk_group; + cbfunc = info_output_cbfunc; + fields = group_fields; + break; + case 'i': + walker = walk_if; + cbfunc = info_output_cbfunc; + fields = if_fields; + break; + case 'p': + fields = probe_fields; + break; + case 't': + walker = walk_if; + cbfunc = targinfo_output_cbfunc; + fields = targ_fields; + break; + default: + usage(); + break; + } + } + + if (argc > optind || fields == NULL) + usage(); + + if (opt & IPMPSTAT_OPT_PARSABLE) { + if (ofields == NULL) { + die("output field list (-o) required in parsable " + "output mode\n"); + } else if (strcasecmp(ofields, "all") == 0) { + die("\"all\" not allowed in parsable output mode\n"); + } + } + + /* + * Obtain the window size and monitor changes to the size. This data + * is used to redisplay the output headers when necessary. + */ + (void) sigset(SIGWINCH, sighandler); + sighandler(SIGWINCH); + + if ((err = ipmp_open(&ih)) != IPMP_SUCCESS) + die_ipmperr(err, "cannot create IPMP handle"); + + if (ipmp_ping_daemon(ih) != IPMP_SUCCESS) + die("cannot contact in.mpathd(1M) -- is IPMP in use?\n"); + + /* + * Create the ofmt linked list that will eventually be passed to + * to ofmt_output() to output the fields. + */ + ofmt = ofmt_create(ofields, fields); + + /* + * If we've been asked to display probes, then call the probe output + * function. 
Otherwise, snapshot IPMP state (or use live state) and + * invoke the specified walker with the specified callback function. + */ + if (fields == probe_fields) { + probe_output(ih, ofmt); + } else { + if ((err = ipmp_setqcontext(ih, qcontext)) != IPMP_SUCCESS) { + if (qcontext == IPMP_QCONTEXT_SNAP) + die_ipmperr(err, "cannot snapshot IPMP state"); + else + die_ipmperr(err, "cannot use live IPMP state"); + } + (*walker)(ih, cbfunc, ofmt); + } + + ofmt_destroy(ofmt); + ipmp_close(ih); + + return (EXIT_SUCCESS); +} + +/* + * Walks all IPMP groups on the system and invokes `cbfunc' on each, passing + * it `ih', the ipmp_groupinfo_t pointer, and `arg'. + */ +static void +walk_group(ipmp_handle_t ih, ipmpstat_cbfunc_t *cbfunc, void *arg) +{ + int err; + uint_t i; + ipmp_groupinfo_t *grinfop; + ipmp_grouplist_t *grlistp; + + if ((err = ipmp_getgrouplist(ih, &grlistp)) != IPMP_SUCCESS) + die_ipmperr(err, "cannot get IPMP group list"); + + for (i = 0; i < grlistp->gl_ngroup; i++) { + err = ipmp_getgroupinfo(ih, grlistp->gl_groups[i], &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for group `%s'", + grlistp->gl_groups[i]); + continue; + } + (*cbfunc)(ih, grinfop, arg); + ipmp_freegroupinfo(grinfop); + } + + ipmp_freegrouplist(grlistp); +} + +/* + * Walks all IPMP interfaces on the system and invokes `cbfunc' on each, + * passing it `ih', the ipmp_ifinfo_t pointer, and `arg'. + */ +static void +walk_if(ipmp_handle_t ih, ipmpstat_cbfunc_t *cbfunc, void *arg) +{ + ipmpstat_walkdata_t iw = { cbfunc, arg }; + + walk_group(ih, walk_if_cbfunc, &iw); +} + +/* + * Walks all IPMP data addresses on the system and invokes `cbfunc' on each. + * passing it `ih', the ipmp_addrinfo_t pointer, and `arg'. + */ +static void +walk_addr(ipmp_handle_t ih, ipmpstat_cbfunc_t *cbfunc, void *arg) +{ + ipmpstat_walkdata_t iw = { cbfunc, arg }; + + walk_group(ih, walk_addr_cbfunc, &iw); +} + +/* + * Nested walker callback function for walk_if(). 
+ */ +static void +walk_if_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + int err; + uint_t i; + ipmp_groupinfo_t *grinfop = infop; + ipmp_ifinfo_t *ifinfop; + ipmp_iflist_t *iflistp = grinfop->gr_iflistp; + ipmpstat_walkdata_t *iwp = arg; + + for (i = 0; i < iflistp->il_nif; i++) { + err = ipmp_getifinfo(ih, iflistp->il_ifs[i], &ifinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for interface `%s'", + iflistp->il_ifs[i]); + continue; + } + (*iwp->iw_func)(ih, ifinfop, iwp->iw_funcarg); + ipmp_freeifinfo(ifinfop); + } +} + +/* + * Nested walker callback function for walk_addr(). + */ +static void +walk_addr_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + int err; + uint_t i; + ipmp_groupinfo_t *grinfop = infop; + ipmp_addrinfo_t *adinfop; + ipmp_addrlist_t *adlistp = grinfop->gr_adlistp; + ipmpstat_walkdata_t *iwp = arg; + char addr[INET6_ADDRSTRLEN]; + struct sockaddr_storage *addrp; + + for (i = 0; i < adlistp->al_naddr; i++) { + addrp = &adlistp->al_addrs[i]; + err = ipmp_getaddrinfo(ih, grinfop->gr_name, addrp, &adinfop); + if (err != IPMP_SUCCESS) { + sockaddr2str(addrp, addr, sizeof (addr)); + warn_ipmperr(err, "cannot get info for `%s'", addr); + continue; + } + (*iwp->iw_func)(ih, adinfop, iwp->iw_funcarg); + ipmp_freeaddrinfo(adinfop); + } +} + +static void +sfunc_nvwarn(const char *nvname, char *buf, uint_t bufsize) +{ + warn("cannot retrieve %s\n", nvname); + (void) strlcpy(buf, "?", bufsize); +} + +static void +sfunc_addr_address(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_addrinfo_t *adinfop = arg->sa_data; + + sockaddr2str(&adinfop->ad_addr, buf, bufsize); +} + +static void +sfunc_addr_group(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + ipmp_addrinfo_t *adinfop = arg->sa_data; + ipmp_groupinfo_t *grinfop; + + err = ipmp_getgroupinfo(arg->sa_ih, adinfop->ad_group, &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for group `%s'", + 
adinfop->ad_group); + (void) strlcpy(buf, "?", bufsize); + return; + } + (void) strlcpy(buf, grinfop->gr_ifname, bufsize); + ipmp_freegroupinfo(grinfop); +} + +static void +sfunc_addr_state(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_addrinfo_t *adinfop = arg->sa_data; + + enum2str(addr_state, adinfop->ad_state, buf, bufsize); +} + +static void +sfunc_addr_inbound(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_addrinfo_t *adinfop = arg->sa_data; + + (void) strlcpy(buf, adinfop->ad_binding, bufsize); +} + +static void +sfunc_addr_outbound(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + uint_t i, nactive = 0; + ipmp_ifinfo_t *ifinfop; + ipmp_iflist_t *iflistp; + ipmp_addrinfo_t *adinfop = arg->sa_data; + ipmp_groupinfo_t *grinfop; + + if (adinfop->ad_state == IPMP_ADDR_DOWN) + return; + + /* + * If there's no inbound interface for this address, there can't + * be any outbound traffic. + */ + if (adinfop->ad_binding[0] == '\0') + return; + + /* + * The address can use any active interface in the group, so + * obtain all of those. 
+ */ + err = ipmp_getgroupinfo(arg->sa_ih, adinfop->ad_group, &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for group `%s'", + adinfop->ad_group); + (void) strlcpy(buf, "?", bufsize); + return; + } + + iflistp = grinfop->gr_iflistp; + for (i = 0; i < iflistp->il_nif; i++) { + err = ipmp_getifinfo(arg->sa_ih, iflistp->il_ifs[i], &ifinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for interface `%s'", + iflistp->il_ifs[i]); + continue; + } + + if (ifinfop->if_flags & IPMP_IFFLAG_ACTIVE) { + if (nactive++ != 0) + (void) strlcat(buf, " ", bufsize); + (void) strlcat(buf, ifinfop->if_name, bufsize); + } + ipmp_freeifinfo(ifinfop); + } + ipmp_freegroupinfo(grinfop); +} + +static void +sfunc_group_name(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_groupinfo_t *grinfop = arg->sa_data; + + (void) strlcpy(buf, grinfop->gr_name, bufsize); +} + +static void +sfunc_group_ifname(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_groupinfo_t *grinfop = arg->sa_data; + + (void) strlcpy(buf, grinfop->gr_ifname, bufsize); +} + +static void +sfunc_group_state(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_groupinfo_t *grinfop = arg->sa_data; + + enum2str(group_state, grinfop->gr_state, buf, bufsize); +} + +static void +sfunc_group_fdt(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_groupinfo_t *grinfop = arg->sa_data; + + if (grinfop->gr_fdt == 0) + return; + + (void) snprintf(buf, bufsize, "%.2fs", MS2FLOATSEC(grinfop->gr_fdt)); +} + +static void +sfunc_group_interfaces(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + uint_t i; + char *active, *inactive, *unusable; + uint_t nactive = 0, ninactive = 0, nunusable = 0; + ipmp_groupinfo_t *grinfop = arg->sa_data; + ipmp_iflist_t *iflistp = grinfop->gr_iflistp; + ipmp_ifinfo_t *ifinfop; + + active = alloca(bufsize); + active[0] = '\0'; + inactive = alloca(bufsize); + inactive[0] = '\0'; + unusable = 
alloca(bufsize); + unusable[0] = '\0'; + + for (i = 0; i < iflistp->il_nif; i++) { + err = ipmp_getifinfo(arg->sa_ih, iflistp->il_ifs[i], &ifinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for interface `%s'", + iflistp->il_ifs[i]); + continue; + } + + if (ifinfop->if_flags & IPMP_IFFLAG_ACTIVE) { + if (nactive++ != 0) + (void) strlcat(active, " ", bufsize); + (void) strlcat(active, ifinfop->if_name, bufsize); + } else if (ifinfop->if_flags & IPMP_IFFLAG_INACTIVE) { + if (ninactive++ != 0) + (void) strlcat(inactive, " ", bufsize); + (void) strlcat(inactive, ifinfop->if_name, bufsize); + } else { + if (nunusable++ != 0) + (void) strlcat(unusable, " ", bufsize); + (void) strlcat(unusable, ifinfop->if_name, bufsize); + } + + ipmp_freeifinfo(ifinfop); + } + + (void) strlcpy(buf, active, bufsize); + + if (ninactive > 0) { + if (nactive != 0) + (void) strlcat(buf, " ", bufsize); + + (void) strlcat(buf, "(", bufsize); + (void) strlcat(buf, inactive, bufsize); + (void) strlcat(buf, ")", bufsize); + } + + if (nunusable > 0) { + if (nactive + ninactive != 0) + (void) strlcat(buf, " ", bufsize); + + (void) strlcat(buf, "[", bufsize); + (void) strlcat(buf, unusable, bufsize); + (void) strlcat(buf, "]", bufsize); + } +} + +static void +sfunc_if_name(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + (void) strlcpy(buf, ifinfop->if_name, bufsize); +} + +static void +sfunc_if_active(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + if (ifinfop->if_flags & IPMP_IFFLAG_ACTIVE) + (void) strlcpy(buf, "yes", bufsize); + else + (void) strlcpy(buf, "no", bufsize); +} + +static void +sfunc_if_group(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + ipmp_ifinfo_t *ifinfop = arg->sa_data; + ipmp_groupinfo_t *grinfop; + + err = ipmp_getgroupinfo(arg->sa_ih, ifinfop->if_group, &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot 
get info for group `%s'", + ifinfop->if_group); + (void) strlcpy(buf, "?", bufsize); + return; + } + + (void) strlcpy(buf, grinfop->gr_ifname, bufsize); + ipmp_freegroupinfo(grinfop); +} + +static void +sfunc_if_flags(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + ipmp_ifinfo_t *ifinfop = arg->sa_data; + ipmp_groupinfo_t *grinfop; + + assert(bufsize > IPMPSTAT_NUM_FLAGS); + + (void) memset(buf, '-', IPMPSTAT_NUM_FLAGS); + buf[IPMPSTAT_NUM_FLAGS] = '\0'; + + if (ifinfop->if_type == IPMP_IF_STANDBY) + buf[IPMPSTAT_SFLAG_INDEX] = 's'; + + if (ifinfop->if_flags & IPMP_IFFLAG_INACTIVE) + buf[IPMPSTAT_IFLAG_INDEX] = 'i'; + + if (ifinfop->if_flags & IPMP_IFFLAG_DOWN) + buf[IPMPSTAT_DFLAG_INDEX] = 'd'; + + if (ifinfop->if_flags & IPMP_IFFLAG_HWADDRDUP) + buf[IPMPSTAT_HFLAG_INDEX] = 'h'; + + err = ipmp_getgroupinfo(arg->sa_ih, ifinfop->if_group, &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get broadcast/multicast info for " + "group `%s'", ifinfop->if_group); + return; + } + + if (strcmp(grinfop->gr_m4ifname, ifinfop->if_name) == 0) + buf[IPMPSTAT_M4FLAG_INDEX] = 'm'; + + if (strcmp(grinfop->gr_m6ifname, ifinfop->if_name) == 0) + buf[IPMPSTAT_M6FLAG_INDEX] = 'M'; + + if (strcmp(grinfop->gr_bcifname, ifinfop->if_name) == 0) + buf[IPMPSTAT_BFLAG_INDEX] = 'b'; + + ipmp_freegroupinfo(grinfop); +} + +static void +sfunc_if_link(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + enum2str(if_link, ifinfop->if_linkstate, buf, bufsize); +} + +static void +sfunc_if_probe(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + enum2str(if_probe, ifinfop->if_probestate, buf, bufsize); +} + +static void +sfunc_if_state(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + enum2str(if_state, ifinfop->if_state, buf, bufsize); +} + +static void +sfunc_probe_id(ipmpstat_sfunc_arg_t *arg, char *buf, 
uint_t bufsize) +{ + uint32_t probe_id; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_uint32(nvl, IPMP_PROBE_ID, &probe_id) != 0) { + sfunc_nvwarn("IPMP_PROBE_ID", buf, bufsize); + return; + } + + (void) snprintf(buf, bufsize, "%u", probe_id); +} + +static void +sfunc_probe_ifname(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + char *ifname; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_string(nvl, IPMP_IF_NAME, &ifname) != 0) { + sfunc_nvwarn("IPMP_IF_NAME", buf, bufsize); + return; + } + + (void) strlcpy(buf, ifname, bufsize); +} + +static void +sfunc_probe_time(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + hrtime_t start; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_START_TIME, &start) != 0) { + sfunc_nvwarn("IPMP_PROBE_START_TIME", buf, bufsize); + return; + } + + (void) snprintf(buf, bufsize, "%.2fs", + (float)(start - probe_output_start) / NANOSEC); +} + +static void +sfunc_probe_target(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + uint_t nelem; + struct sockaddr_storage *target; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_byte_array(nvl, IPMP_PROBE_TARGET, + (uchar_t **)&target, &nelem) != 0) { + sfunc_nvwarn("IPMP_PROBE_TARGET", buf, bufsize); + return; + } + + sockaddr2str(target, buf, bufsize); +} + +static void +sfunc_probe_rtt(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + hrtime_t start, ackproc; + nvlist_t *nvl = arg->sa_data; + uint32_t state; + + if (nvlist_lookup_uint32(nvl, IPMP_PROBE_STATE, &state) != 0) { + sfunc_nvwarn("IPMP_PROBE_STATE", buf, bufsize); + return; + } + + if (state != IPMP_PROBE_ACKED) + return; + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_START_TIME, &start) != 0) { + sfunc_nvwarn("IPMP_PROBE_START_TIME", buf, bufsize); + return; + } + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_ACKPROC_TIME, &ackproc) != 0) { + sfunc_nvwarn("IPMP_PROBE_ACKPROC_TIME", buf, bufsize); + return; + } + + (void) snprintf(buf, bufsize, "%.2fms", 
NS2FLOATMS(ackproc - start)); +} + +static void +sfunc_probe_netrtt(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + hrtime_t sent, ackrecv; + nvlist_t *nvl = arg->sa_data; + uint32_t state; + + if (nvlist_lookup_uint32(nvl, IPMP_PROBE_STATE, &state) != 0) { + sfunc_nvwarn("IPMP_PROBE_STATE", buf, bufsize); + return; + } + + if (state != IPMP_PROBE_ACKED) + return; + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_SENT_TIME, &sent) != 0) { + sfunc_nvwarn("IPMP_PROBE_SENT_TIME", buf, bufsize); + return; + } + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_ACKRECV_TIME, &ackrecv) != 0) { + sfunc_nvwarn("IPMP_PROBE_ACKRECV_TIME", buf, bufsize); + return; + } + + (void) snprintf(buf, bufsize, "%.2fms", NS2FLOATMS(ackrecv - sent)); +} + +static void +sfunc_probe_rttavg(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int64_t rttavg; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_int64(nvl, IPMP_PROBE_TARGET_RTTAVG, &rttavg) != 0) { + sfunc_nvwarn("IPMP_PROBE_TARGET_RTTAVG", buf, bufsize); + return; + } + + if (rttavg != 0) + (void) snprintf(buf, bufsize, "%.2fms", NS2FLOATMS(rttavg)); +} + +static void +sfunc_probe_rttdev(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int64_t rttdev; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_int64(nvl, IPMP_PROBE_TARGET_RTTDEV, &rttdev) != 0) { + sfunc_nvwarn("IPMP_PROBE_TARGET_RTTDEV", buf, bufsize); + return; + } + + if (rttdev != 0) + (void) snprintf(buf, bufsize, "%.2fms", NS2FLOATMS(rttdev)); +} + +/* ARGSUSED */ +static void +probe_enabled_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + uint_t *nenabledp = arg; + ipmp_ifinfo_t *ifinfop = infop; + + if (ifinfop->if_probestate != IPMP_PROBE_DISABLED) + (*nenabledp)++; +} + +static void +probe_output(ipmp_handle_t ih, ipmpstat_ofmt_t *ofmt) +{ + char sub[MAX_SUBID_LEN]; + evchan_t *evch; + ipmpstat_probe_state_t ps = { ih, ofmt }; + uint_t nenabled = 0; + + /* + * Check if any interfaces are enabled for probe-based failure + * 
detection. If not, immediately fail. + */ + walk_if(ih, probe_enabled_cbfunc, &nenabled); + if (nenabled == 0) + die("probe-based failure detection is disabled\n"); + + probe_output_start = gethrtime(); + + /* + * Unfortunately, until 4791900 is fixed, only privileged processes + * can bind and thus receive sysevents. + */ + errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &evch, EVCH_CREAT); + if (errno != 0) { + if (errno == EPERM) + die("insufficient privileges for -p\n"); + die("sysevent_evc_bind to channel %s failed", IPMP_EVENT_CHAN); + } + + /* + * The subscriber must be unique in order for sysevent_evc_subscribe() + * to succeed, so combine our name and pid. + */ + (void) snprintf(sub, sizeof (sub), "%d-%s", getpid(), progname); + + errno = sysevent_evc_subscribe(evch, sub, EC_IPMP, probe_event, &ps, 0); + if (errno != 0) + die("sysevent_evc_subscribe for class %s failed", EC_IPMP); + + for (;;) + (void) pause(); +} + +static int +probe_event(sysevent_t *ev, void *arg) +{ + nvlist_t *nvl; + uint32_t state; + uint32_t version; + ipmpstat_probe_state_t *psp = arg; + + if (strcmp(sysevent_get_subclass_name(ev), ESC_IPMP_PROBE_STATE) != 0) + return (0); + + if (sysevent_get_attr_list(ev, &nvl) != 0) { + warn("sysevent_get_attr_list failed; dropping event"); + return (0); + } + + if (nvlist_lookup_uint32(nvl, IPMP_EVENT_VERSION, &version) != 0) { + warn("dropped event with no IPMP_EVENT_VERSION\n"); + goto out; + } + + if (version != IPMP_EVENT_CUR_VERSION) { + warn("dropped event with unsupported IPMP_EVENT_VERSION %d\n", + version); + goto out; + } + + if (nvlist_lookup_uint32(nvl, IPMP_PROBE_STATE, &state) != 0) { + warn("dropped event with no IPMP_PROBE_STATE\n"); + goto out; + } + + if (state == IPMP_PROBE_ACKED || state == IPMP_PROBE_LOST) + ofmt_output(psp->ps_ofmt, psp->ps_ih, nvl); +out: + nvlist_free(nvl); + return (0); +} + +static void +sfunc_targ_ifname(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_targinfo_t *targinfop = arg->sa_data; + 
+ (void) strlcpy(buf, targinfop->it_name, bufsize); +} + +static void +sfunc_targ_mode(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_targinfo_t *targinfop = arg->sa_data; + + enum2str(targ_mode, targinfop->it_targmode, buf, bufsize); +} + +static void +sfunc_targ_testaddr(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_targinfo_t *targinfop = arg->sa_data; + + if (targinfop->it_targmode != IPMP_TARG_DISABLED) + sockaddr2str(&targinfop->it_testaddr, buf, bufsize); +} + +static void +sfunc_targ_targets(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + uint_t i; + char *targname = alloca(bufsize); + ipmp_targinfo_t *targinfop = arg->sa_data; + ipmp_addrlist_t *targlistp = targinfop->it_targlistp; + + for (i = 0; i < targlistp->al_naddr; i++) { + sockaddr2str(&targlistp->al_addrs[i], targname, bufsize); + (void) strlcat(buf, targname, bufsize); + if ((i + 1) < targlistp->al_naddr) + (void) strlcat(buf, " ", bufsize); + } +} + +static void +info_output_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + ofmt_output(arg, ih, infop); +} + +static void +targinfo_output_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + ipmp_ifinfo_t *ifinfop = infop; + ipmp_if_targmode_t targmode4 = ifinfop->if_targinfo4.it_targmode; + ipmp_if_targmode_t targmode6 = ifinfop->if_targinfo6.it_targmode; + + /* + * Usually, either IPv4 or IPv6 probing will be enabled, but the admin + * may enable both. If only one is enabled, omit the other one so as + * to not encourage the admin to enable both. If neither is enabled, + * we still print one just so the admin can see a MODE of "disabled". + */ + if (targmode4 != IPMP_TARG_DISABLED || targmode6 == IPMP_TARG_DISABLED) + ofmt_output(arg, ih, &ifinfop->if_targinfo4); + if (targmode6 != IPMP_TARG_DISABLED) + ofmt_output(arg, ih, &ifinfop->if_targinfo6); +} + +/* + * Creates an ipmpstat_ofmt_t field list from the comma-separated list of + * user-specified fields passed via `ofields'. 
The table of known fields + * (and their attributes) is passed via `fields'. + */ +static ipmpstat_ofmt_t * +ofmt_create(const char *ofields, ipmpstat_field_t fields[]) +{ + char *token, *lasts, *ofields_dup; + const char *fieldname; + ipmpstat_ofmt_t *ofmt, *ofmt_head = NULL, *ofmt_tail; + ipmpstat_field_t *fieldp; + uint_t cols = 0; + + /* + * If "-o" was omitted or "-o all" was specified, build a list of + * field names. If "-o" was omitted, stop building the list when + * we run out of columns. + */ + if (ofields == NULL || strcasecmp(ofields, "all") == 0) { + for (fieldp = fields; fieldp->f_name != NULL; fieldp++) { + cols += fieldp->f_width; + if (ofields == NULL && cols > IPMPSTAT_NCOL) + break; + + if ((ofmt = calloc(sizeof (*ofmt), 1)) == NULL) + die("cannot allocate output format list"); + + ofmt->o_field = fieldp; + if (ofmt_head == NULL) { + ofmt_head = ofmt; + ofmt_tail = ofmt; + } else { + ofmt_tail->o_next = ofmt; + ofmt_tail = ofmt; + } + } + return (ofmt_head); + } + + if ((ofields_dup = strdup(ofields)) == NULL) + die("cannot allocate output format list"); + + token = ofields_dup; + while ((fieldname = strtok_r(token, ",", &lasts)) != NULL) { + token = NULL; + + if ((fieldp = field_find(fields, fieldname)) == NULL) { + /* + * Since machine parsers are unlikely to be able to + * gracefully handle missing fields, die if we're in + * parsable mode. Otherwise, just print a warning. 
+ */ + if (opt & IPMPSTAT_OPT_PARSABLE) + die("unknown output field `%s'\n", fieldname); + + warn("ignoring unknown output field `%s'\n", fieldname); + continue; + } + + if ((ofmt = calloc(sizeof (*ofmt), 1)) == NULL) + die("cannot allocate output format list"); + + ofmt->o_field = fieldp; + if (ofmt_head == NULL) { + ofmt_head = ofmt; + ofmt_tail = ofmt; + } else { + ofmt_tail->o_next = ofmt; + ofmt_tail = ofmt; + } + } + + free(ofields_dup); + if (ofmt_head == NULL) + die("no valid output fields specified\n"); + + return (ofmt_head); +} + +/* + * Destroys the provided `ofmt' field list. + */ +static void +ofmt_destroy(ipmpstat_ofmt_t *ofmt) +{ + ipmpstat_ofmt_t *ofmt_next; + + for (; ofmt != NULL; ofmt = ofmt_next) { + ofmt_next = ofmt->o_next; + free(ofmt); + } +} + +/* + * Outputs a header for the fields named by `ofmt'. + */ +static void +ofmt_output_header(const ipmpstat_ofmt_t *ofmt) +{ + const ipmpstat_field_t *fieldp; + + for (; ofmt != NULL; ofmt = ofmt->o_next) { + fieldp = ofmt->o_field; + + if (ofmt->o_next == NULL) + (void) printf("%s", fieldp->f_name); + else + (void) printf("%-*s", fieldp->f_width, fieldp->f_name); + } + (void) printf("\n"); +} + +/* + * Outputs one row of values for the fields named by `ofmt'. The values to + * output are obtained through the `ofmt' function pointers, which are + * indirectly passed the `ih' and `arg' structures for state; see the block + * comment at the start of this file for details. + */ +static void +ofmt_output(const ipmpstat_ofmt_t *ofmt, ipmp_handle_t ih, void *arg) +{ + int i; + char buf[1024]; + boolean_t escsep; + static int nrow; + const char *value; + uint_t width, valwidth; + uint_t compress, overflow = 0; + const ipmpstat_field_t *fieldp; + ipmpstat_sfunc_arg_t sfunc_arg; + + /* + * For each screenful of data, display the header. 
+ */ + if ((nrow++ % winsize.ws_row) == 0 && !(opt & IPMPSTAT_OPT_PARSABLE)) { + ofmt_output_header(ofmt); + nrow++; + } + + /* + * Check if we'll be displaying multiple fields per line, and thus + * need to escape the field separator. + */ + escsep = (ofmt != NULL && ofmt->o_next != NULL); + + for (; ofmt != NULL; ofmt = ofmt->o_next) { + fieldp = ofmt->o_field; + + sfunc_arg.sa_ih = ih; + sfunc_arg.sa_data = arg; + + buf[0] = '\0'; + (*fieldp->f_sfunc)(&sfunc_arg, buf, sizeof (buf)); + + if (opt & IPMPSTAT_OPT_PARSABLE) { + for (i = 0; buf[i] != '\0'; i++) { + if (escsep && (buf[i] == ':' || buf[i] == '\\')) + (void) putchar('\\'); + (void) putchar(buf[i]); + } + if (ofmt->o_next != NULL) + (void) putchar(':'); + } else { + value = (buf[0] == '\0') ? "--" : buf; + + /* + * To avoid needless line-wraps, for the last field, + * don't include any trailing whitespace. + */ + if (ofmt->o_next == NULL) { + (void) printf("%s", value); + continue; + } + + /* + * For other fields, grow the width as necessary to + * ensure the value completely fits. However, if + * there's unused whitespace in subsequent fields, + * then "compress" that whitespace to attempt to get + * the columns to line up again. + */ + width = fieldp->f_width; + valwidth = strlen(value); + + if (valwidth + overflow >= width) { + overflow += valwidth - width + 1; + (void) printf("%s ", value); + continue; + } + + if (overflow > 0) { + compress = MIN(overflow, width - valwidth); + overflow -= compress; + width -= compress; + } + (void) printf("%-*s", width, value); + } + } + (void) printf("\n"); + + /* + * In case stdout has been redirected to e.g. a pipe, flush stdout so + * that commands can act on our output immediately. + */ + (void) fflush(stdout); +} + +/* + * Searches the `fields' array for a field matching `fieldname'. Returns + * a pointer to that field on success, or NULL on failure. 
+ */ +static ipmpstat_field_t * +field_find(ipmpstat_field_t *fields, const char *fieldname) +{ + ipmpstat_field_t *fieldp; + + for (fieldp = fields; fieldp->f_name != NULL; fieldp++) { + if (strcasecmp(fieldp->f_name, fieldname) == 0) + return (fieldp); + } + return (NULL); +} + +/* + * Uses `enums' to map `enumval' to a string, and stores at most `bufsize' + * bytes of that string into `buf'. + */ +static void +enum2str(const ipmpstat_enum_t *enums, int enumval, char *buf, uint_t bufsize) +{ + const ipmpstat_enum_t *enump; + + for (enump = enums; enump->e_name != NULL; enump++) { + if (enump->e_val == enumval) { + (void) strlcpy(buf, enump->e_name, bufsize); + return; + } + } + (void) snprintf(buf, bufsize, "<%d>", enumval); +} + +/* + * Stores the stringified value of the sockaddr_storage pointed to by `ssp' + * into at most `bufsize' bytes of `buf'. + */ +static void +sockaddr2str(const struct sockaddr_storage *ssp, char *buf, uint_t bufsize) +{ + int flags = NI_NOFQDN; + socklen_t socklen; + struct sockaddr *sp = (struct sockaddr *)ssp; + + /* + * Sadly, getnameinfo() does not allow the socklen to be oversized for + * a given family -- so we must determine the exact size to pass to it. 
+ */ + switch (ssp->ss_family) { + case AF_INET: + socklen = sizeof (struct sockaddr_in); + break; + case AF_INET6: + socklen = sizeof (struct sockaddr_in6); + break; + default: + (void) strlcpy(buf, "?", bufsize); + return; + } + + if (opt & IPMPSTAT_OPT_NUMERIC) + flags |= NI_NUMERICHOST; + + (void) getnameinfo(sp, socklen, buf, bufsize, NULL, 0, flags); +} + +static void +sighandler(int sig) +{ + assert(sig == SIGWINCH); + + if (ioctl(1, TIOCGWINSZ, &winsize) == -1 || + winsize.ws_col == 0 || winsize.ws_row == 0) { + winsize.ws_col = 80; + winsize.ws_row = 24; + } +} + +static void +usage(void) +{ + const char *argstr = gettext("[-n] [-o <field> [-P]] -a|-g|-i|-p|-t"); + + (void) fprintf(stderr, gettext("usage: %s %s\n"), progname, argstr); + exit(EXIT_FAILURE); +} + +/* PRINTFLIKE1 */ +static void +warn(const char *format, ...) +{ + va_list alist; + int error = errno; + + format = gettext(format); + (void) fprintf(stderr, gettext("%s: warning: "), progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + if (strchr(format, '\n') == NULL) + (void) fprintf(stderr, ": %s\n", strerror(error)); +} + +/* PRINTFLIKE2 */ +static void +warn_ipmperr(int ipmperr, const char *format, ...) +{ + va_list alist; + + format = gettext(format); + (void) fprintf(stderr, gettext("%s: warning: "), progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + (void) fprintf(stderr, ": %s\n", ipmp_errmsg(ipmperr)); +} + +/* PRINTFLIKE1 */ +static void +die(const char *format, ...) +{ + va_list alist; + int error = errno; + + format = gettext(format); + (void) fprintf(stderr, "%s: ", progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + if (strchr(format, '\n') == NULL) + (void) fprintf(stderr, ": %s\n", strerror(error)); + + exit(EXIT_FAILURE); +} + +/* PRINTFLIKE2 */ +static void +die_ipmperr(int ipmperr, const char *format, ...) 
+{ + va_list alist; + + format = gettext(format); + (void) fprintf(stderr, "%s: ", progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + (void) fprintf(stderr, ": %s\n", ipmp_errmsg(ipmperr)); + + exit(EXIT_FAILURE); +} + +static ipmpstat_field_t addr_fields[] = { + { "ADDRESS", 26, sfunc_addr_address }, + { "STATE", 7, sfunc_addr_state }, + { "GROUP", 12, sfunc_addr_group }, + { "INBOUND", 12, sfunc_addr_inbound }, + { "OUTBOUND", 23, sfunc_addr_outbound }, + { NULL, 0, NULL } +}; + +static ipmpstat_field_t group_fields[] = { + { "GROUP", 12, sfunc_group_ifname }, + { "GROUPNAME", 12, sfunc_group_name }, + { "STATE", 10, sfunc_group_state }, + { "FDT", 10, sfunc_group_fdt }, + { "INTERFACES", 30, sfunc_group_interfaces }, + { NULL, 0, NULL } +}; + +static ipmpstat_field_t if_fields[] = { + { "INTERFACE", 12, sfunc_if_name }, + { "ACTIVE", 8, sfunc_if_active }, + { "GROUP", 12, sfunc_if_group }, + { "FLAGS", 10, sfunc_if_flags }, + { "LINK", 10, sfunc_if_link }, + { "PROBE", 10, sfunc_if_probe }, + { "STATE", 10, sfunc_if_state }, + { NULL, 0, NULL } +}; + +static ipmpstat_field_t probe_fields[] = { + { "TIME", 10, sfunc_probe_time }, + { "INTERFACE", 12, sfunc_probe_ifname }, + { "PROBE", 7, sfunc_probe_id }, + { "NETRTT", 10, sfunc_probe_netrtt }, + { "RTT", 10, sfunc_probe_rtt }, + { "RTTAVG", 10, sfunc_probe_rttavg }, + { "TARGET", 20, sfunc_probe_target }, + { "RTTDEV", 10, sfunc_probe_rttdev }, + { NULL, 0, NULL } +}; + +static ipmpstat_field_t targ_fields[] = { + { "INTERFACE", 12, sfunc_targ_ifname }, + { "MODE", 10, sfunc_targ_mode }, + { "TESTADDR", 20, sfunc_targ_testaddr }, + { "TARGETS", 38, sfunc_targ_targets }, + { NULL, 0, NULL } +}; + +static ipmpstat_enum_t addr_state[] = { + { "up", IPMP_ADDR_UP }, + { "down", IPMP_ADDR_DOWN }, + { NULL, 0 } +}; + +static ipmpstat_enum_t group_state[] = { + { "ok", IPMP_GROUP_OK }, + { "failed", IPMP_GROUP_FAILED }, + { "degraded", IPMP_GROUP_DEGRADED }, + { NULL, 0 
} +}; + +static ipmpstat_enum_t if_link[] = { + { "up", IPMP_LINK_UP }, + { "down", IPMP_LINK_DOWN }, + { "unknown", IPMP_LINK_UNKNOWN }, + { NULL, 0 } +}; + +static ipmpstat_enum_t if_probe[] = { + { "ok", IPMP_PROBE_OK }, + { "failed", IPMP_PROBE_FAILED }, + { "unknown", IPMP_PROBE_UNKNOWN }, + { "disabled", IPMP_PROBE_DISABLED }, + { NULL, 0 } +}; + +static ipmpstat_enum_t if_state[] = { + { "ok", IPMP_IF_OK }, + { "failed", IPMP_IF_FAILED }, + { "unknown", IPMP_IF_UNKNOWN }, + { "offline", IPMP_IF_OFFLINE }, + { NULL, 0 } +}; + +static ipmpstat_enum_t targ_mode[] = { + { "disabled", IPMP_TARG_DISABLED }, + { "routes", IPMP_TARG_ROUTES }, + { "multicast", IPMP_TARG_MULTICAST }, + { NULL, 0 } +}; diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.xcl b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.xcl new file mode 100644 index 0000000000..e2398aaf64 --- /dev/null +++ b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.xcl @@ -0,0 +1,106 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +msgid " " +msgid "%-*s" +msgid "%.2fms" +msgid "%.2fs" +msgid "%d-%s" +msgid "%s" +msgid "%s " +msgid "%s: " +msgid "%u" +msgid "(" +msgid ")" +msgid "," +msgid "--" +msgid ": %s\n" +msgid "?" +msgid "[" +msgid "]" +msgid "<%d>" +msgid "\n" +msgid "ACTIVE" +msgid "ADDRESS" +msgid "EC_ipmp" +msgid "ESC_ipmp_probe_state" +msgid "FDT" +msgid "FLAGS" +msgid "GROUP" +msgid "GROUPNAME" +msgid "INBOUND" +msgid "INTERFACE" +msgid "INTERFACES" +msgid "IPMP_IF_NAME" +msgid "IPMP_PROBE_ACKPROC_TIME" +msgid "IPMP_PROBE_ACKRECV_TIME" +msgid "IPMP_PROBE_ID" +msgid "IPMP_PROBE_SENT_TIME" +msgid "IPMP_PROBE_START_TIME" +msgid "IPMP_PROBE_STATE" +msgid "IPMP_PROBE_TARGET" +msgid "IPMP_PROBE_TARGET_RTTAVG" +msgid "IPMP_PROBE_TARGET_RTTDEV" +msgid "LINK" +msgid "MODE" +msgid "NETRTT" +msgid "OUTBOUND" +msgid "PROBE" +msgid "RTT" +msgid "RTTAVG" +msgid "RTTDEV" +msgid "STATE" +msgid "TARGET" +msgid "TARGETS" +msgid "TESTADDR" +msgid "TIME" +msgid "agipt" +msgid "all" +msgid "bufsize > IPMPSTAT_NUM_FLAGS" +msgid "com.sun:ipmp:events" +msgid "degraded" +msgid "disabled" +msgid "down" +msgid "failed" +msgid "ipmp_event_version" +msgid "ipmp_if_name" +msgid "ipmp_probe_ackproc_time" +msgid "ipmp_probe_ackrecv_time" +msgid "ipmp_probe_id" +msgid "ipmp_probe_sent_time" +msgid "ipmp_probe_start_time" +msgid "ipmp_probe_state" +msgid "ipmp_probe_target" +msgid "ipmp_probe_target_rttavg" +msgid "ipmp_probe_target_rttdev" +msgid "ipmpstat.c" +msgid "multicast" +msgid "nLPo:agipt" +msgid "no" +msgid "offline" +msgid "ok" +msgid "routes" +msgid "sig == SIGWINCH" +msgid "unknown" +msgid "up" +msgid "yes" diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types b/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types index bb15199492..e42bc626d8 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types +++ b/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types @@ -1,13 +1,12 @@ # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. # Use is subject to license terms. # # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -23,15 +22,12 @@ # CDDL HEADER END # -#pragma ident "%Z%%M% %I% %E% SMI" - fmt_version 1.0 mod_version 1.0 #PERM_CLASS default filter name string -filter if_groupname string filter user user filter projid int32 filter if_name ifname diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c index 17891ffc78..2a4ff60d57 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c @@ -18,7 +18,7 @@ * * CDDL HEADER END * - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,8 +37,6 @@ * contributors. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdio.h> #include <strings.h> #include <errno.h> @@ -243,7 +241,7 @@ main(int argc, char *argv[]) ushort_t udp_src_port6; /* used to identify replies */ uint_t flowinfo = 0; uint_t class = 0; - char tmp_buf[INET6_ADDRSTRLEN]; + char abuf[INET6_ADDRSTRLEN]; int c; int i; boolean_t has_sys_ip_config; @@ -671,24 +669,18 @@ main(int argc, char *argv[]) Printf("PING %s: %d data bytes\n", targethost, datalen); } else { if (ai_dst->ai_family == AF_INET) { - Printf("PING %s (%s): %d data bytes\n", - targethost, - inet_ntop(AF_INET, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - &((struct sockaddr_in *) - ai_dst->ai_addr)->sin_addr, - tmp_buf, sizeof (tmp_buf)), - datalen); + (void) inet_ntop(AF_INET, + &((struct sockaddr_in *)(void *) + ai_dst->ai_addr)->sin_addr, + abuf, sizeof (abuf)); } else { - Printf("PING %s (%s): %d data bytes\n", - targethost, - inet_ntop(AF_INET6, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - &((struct sockaddr_in6 *) - ai_dst->ai_addr)->sin6_addr, - tmp_buf, sizeof (tmp_buf)), - datalen); + (void) inet_ntop(AF_INET6, + &((struct sockaddr_in6 *)(void *) + ai_dst->ai_addr)->sin6_addr, + abuf, sizeof (abuf)); } + Printf("PING %s (%s): %d data bytes\n", + targethost, abuf, datalen); } } @@ -1074,12 +1066,12 @@ select_all_src_addrs(union any_in_addr **src_addr_list, struct addrinfo *ai, int num_dst = 1; int i; - if (probe_all) - for (aip = ai; aip->ai_next != NULL; - aip = aip->ai_next, num_dst++); + if (probe_all) { + for (aip = ai; aip->ai_next != NULL; aip = aip->ai_next) + num_dst++; + } - list = (union any_in_addr *) - calloc((size_t)num_dst, sizeof (union any_in_addr)); + list = calloc((size_t)num_dst, sizeof (union any_in_addr)); if (list == NULL) { Fprintf(stderr, "%s: calloc: %s\n", progname, strerror(errno)); exit(EXIT_FAILURE); @@ -1472,7 +1464,7 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, int i; /* pull out the interface list */ - num_ifs = ifaddrlist(&al, family, errbuf); 
+ num_ifs = ifaddrlist(&al, family, LIFC_UNDER_IPMP, errbuf); if (num_ifs == -1) { Fprintf(stderr, "%s: %s\n", progname, errbuf); exit(EXIT_FAILURE); @@ -1699,8 +1691,8 @@ send_scheduled_probe() } else { Printf("no answer from %s(%s)\n", targethost, inet_ntop(current_targetaddr->family, - &current_targetaddr->dst_addr, - tmp_buf, sizeof (tmp_buf))); + &current_targetaddr->dst_addr, + tmp_buf, sizeof (tmp_buf))); } } /* @@ -1736,9 +1728,8 @@ send_scheduled_probe() * Each time we move to a new targetaddr, which has * a different target IP address, we update this field. */ - current_targetaddr->starting_seq_num = - use_udp ? dest_port : - (ntransmitted % (MAX_ICMP_SEQ + 1)); + current_targetaddr->starting_seq_num = use_udp ? + dest_port : (ntransmitted % (MAX_ICMP_SEQ + 1)); } } diff --git a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c index f062247997..e5b23fa126 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -139,7 +139,7 @@ check_device(dlpi_handle_t *dhp, char **devicep) if (ioctl(s, SIOCGIFFLAGS, (char *)ifr) < 0) pr_err("ioctl SIOCGIFFLAGS"); if ((ifr->ifr_flags & - (IFF_VIRTUAL|IFF_LOOPBACK|IFF_UP| + (IFF_VIRTUAL|IFF_IPMP|IFF_UP| IFF_RUNNING)) == (IFF_UP|IFF_RUNNING)) break; } diff --git a/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c b/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c index adc6a932b0..cae75df60d 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -27,8 +27,6 @@ * @(#)$Header: traceroute.c,v 1.49 97/06/13 02:30:23 leres Exp $ (LBL) */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/param.h> #include <sys/file.h> #include <sys/ioctl.h> @@ -707,7 +705,7 @@ get_hostinfo(char *host, int family, struct addrinfo **aipp) struct addrinfo hints, *ai; struct in6_addr addr6; struct in_addr addr; - char temp_buf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ + char abuf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ int rc; /* @@ -720,11 +718,10 @@ get_hostinfo(char *host, int family, struct addrinfo **aipp) IN6_V4MAPPED_TO_INADDR(&addr6, &addr); /* convert it back to a string */ - (void) inet_ntop(AF_INET, (void *)&addr, temp_buf, - sizeof (temp_buf)); + (void) inet_ntop(AF_INET, &addr, abuf, sizeof (abuf)); /* now the host is an IPv4 address */ - (void) strcpy(host, temp_buf); + (void) strcpy(host, abuf); /* * If it's a mapped address, we convert it into IPv4 @@ -826,15 +823,19 @@ set_src_addr(struct pr_set *pr, struct ifaddrlist **alp) struct sockaddr_in6 *sin6_from = (struct sockaddr_in6 *)pr->from; struct addrinfo *aip; char errbuf[ERRBUFSIZE]; - char temp_buf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ + char abuf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ int num_ifs; /* all the interfaces */ int num_src_ifs; /* exclude loopback and down */ int i; + uint_t ifaddrflags = 0; source = source_input; + if (device != NULL) + ifaddrflags |= LIFC_UNDER_IPMP; + /* get the interface address list */ - num_ifs = ifaddrlist(&al, pr->family, errbuf); + num_ifs = ifaddrlist(&al, pr->family, ifaddrflags, errbuf); if (num_ifs < 0) { Fprintf(stderr, "%s: ifaddrlist: %s\n", prog, errbuf); exit(EXIT_FAILURE); @@ -881,26 +882,20 @@ set_src_addr(struct pr_set *pr, struct ifaddrlist **alp) if (pr->family == AF_INET) ap = (union any_in_addr *) /* LINTED E_BAD_PTR_CAST_ALIGN */ - &((struct sockaddr_in *) - aip->ai_addr)->sin_addr; + &((struct sockaddr_in *)aip->ai_addr)->sin_addr; else ap = (union any_in_addr *) /* LINTED 
E_BAD_PTR_CAST_ALIGN */ - &((struct sockaddr_in6 *) - aip->ai_addr)->sin6_addr; + &((struct sockaddr_in6 *)aip->ai_addr)->sin6_addr; /* * LBNL bug fixed: used to accept any src address */ tmp2_al = find_ifaddr(al, num_ifs, ap, pr->family); - if (tmp2_al == NULL) { - Fprintf(stderr, - "%s: %s is not a local %s address\n", - prog, inet_ntop(pr->family, ap, - temp_buf, sizeof (temp_buf)), - pr->name); - + (void) inet_ntop(pr->family, ap, abuf, sizeof (abuf)); + Fprintf(stderr, "%s: %s is not a local %s address\n", + prog, abuf, pr->name); free(al); freeaddrinfo(aip); return (0); @@ -928,13 +923,11 @@ set_src_addr(struct pr_set *pr, struct ifaddrlist **alp) set_sin(pr->from, ap, pr->family); if (aip->ai_next != NULL) { - Fprintf(stderr, - "%s: Warning: %s has multiple " - "addresses; using %s\n", - prog, source, - inet_ntop(pr->family, - (const void *)pr->from_sin_addr, - temp_buf, sizeof (temp_buf))); + (void) inet_ntop(pr->family, pr->from_sin_addr, + abuf, sizeof (abuf)); + Fprintf(stderr, "%s: Warning: %s has multiple " + "addresses; using %s\n", prog, source, + abuf); } } else { /* -i and -s used */ /* @@ -1484,7 +1477,7 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr, uchar_t code; /* icmp code */ int reply; int seq = 0; - char temp_buf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ + char abuf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ int longjmp_return; /* return value from longjump */ struct ip *ip = (struct ip *)packet; boolean_t got_there = _B_FALSE; /* we hit the destination */ @@ -1535,13 +1528,11 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr, if (dev_name == NULL) dev_name = "?"; + (void) inet_ntop(pr->family, pr->from_sin_addr, abuf, + sizeof (abuf)); Fprintf(stderr, "%s: Warning: Multiple interfaces found;" - " using %s @ %s\n", - prog, inet_ntop(pr->family, - (const void *)pr->from_sin_addr, - temp_buf, sizeof (temp_buf)), - dev_name); + " using %s @ %s\n", prog, abuf, dev_name); } } 
@@ -1558,8 +1549,7 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr, Fprintf(stderr, "%s to %s", prog, hostname); } else { Fprintf(stderr, "%s to %s (%s)", prog, hostname, - inet_ntop(pr->family, (const void *)ip_addr, temp_buf, - sizeof (temp_buf))); + inet_ntop(pr->family, ip_addr, abuf, sizeof (abuf))); } if (source) @@ -1700,9 +1690,8 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr, } if (pr->family == AF_INET6) { - intp = - (int *)find_ancillary_data(&in_msg, - IPPROTO_IPV6, IPV6_HOPLIMIT); + intp = find_ancillary_data(&in_msg, + IPPROTO_IPV6, IPV6_HOPLIMIT); if (intp == NULL) { Fprintf(stderr, "%s: can't find " @@ -2188,10 +2177,11 @@ static void usage(void) { Fprintf(stderr, "Usage: %s [-adFIlnSvx] [-A address_family] " -"[-c traffic_class] \n" -"\t[-f first_hop] [-g gateway [-g gateway ...]| -r] [-i iface]\n" -"\t[-L flow_label] [-m max_hop] [-P pause_sec] [-p port] [-Q max_timeout]\n" -"\t[-q nqueries] [-s src_addr] [-t tos] [-w wait_time] host [packetlen]\n", - prog); + "[-c traffic_class]\n" + "\t[-f first_hop] [-g gateway [-g gateway ...]| -r] [-i iface]\n" + "\t[-L flow_label] [-m max_hop] [-P pause_sec] [-p port] " + "[-Q max_timeout]\n" + "\t[-q nqueries] [-s src_addr] [-t tos] [-w wait_time] host " + "[packetlen]\n", prog); exit(EXIT_FAILURE); } diff --git a/usr/src/cmd/devfsadm/misc_link.c b/usr/src/cmd/devfsadm/misc_link.c index c72be6be37..44756c3e98 100644 --- a/usr/src/cmd/devfsadm/misc_link.c +++ b/usr/src/cmd/devfsadm/misc_link.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -104,7 +104,7 @@ static devfsadm_create_t misc_cbt[] = { "(^ip$)|(^tcp$)|(^udp$)|(^icmp$)|(^sctp$)|" "(^ip6$)|(^tcp6$)|(^udp6$)|(^icmp6$)|(^sctp6$)|" "(^rts$)|(^arp$)|(^ipsecah$)|(^ipsecesp$)|(^keysock$)|(^spdsock$)|" - "(^nca$)|(^rds$)|(^sdp$)|(^ipnet$)", + "(^nca$)|(^rds$)|(^sdp$)|(^ipnet$)|(^dlpistub$)", TYPE_EXACT | DRV_RE, ILEVEL_1, minor_name }, { "pseudo", "ddi_pseudo", diff --git a/usr/src/cmd/mdb/common/modules/ip/ip.c b/usr/src/cmd/mdb/common/modules/ip/ip.c index f2dadd5261..f064b58d83 100644 --- a/usr/src/cmd/mdb/common/modules/ip/ip.c +++ b/usr/src/cmd/mdb/common/modules/ip/ip.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stropts.h> #include <sys/stream.h> @@ -524,8 +522,7 @@ ire_format(uintptr_t addr, const void *ire_arg, void *ire_cb_arg) static const mdb_bitmask_t mmasks[] = { { "CONDEMNED", IRE_MARK_CONDEMNED, IRE_MARK_CONDEMNED }, - { "NORECV", IRE_MARK_NORECV, IRE_MARK_NORECV }, - { "HIDDEN", IRE_MARK_HIDDEN, IRE_MARK_HIDDEN }, + { "TESTHIDDEN", IRE_MARK_TESTHIDDEN, IRE_MARK_TESTHIDDEN }, { "NOADD", IRE_MARK_NOADD, IRE_MARK_NOADD }, { "TEMPORARY", IRE_MARK_TEMPORARY, IRE_MARK_TEMPORARY }, { "USESRC", IRE_MARK_USESRC_CHECK, IRE_MARK_USESRC_CHECK }, diff --git a/usr/src/cmd/rcm_daemon/Makefile.com b/usr/src/cmd/rcm_daemon/Makefile.com index 365371c45c..dbe3c1f1d1 100644 --- a/usr/src/cmd/rcm_daemon/Makefile.com +++ b/usr/src/cmd/rcm_daemon/Makefile.com @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# @@ -124,7 +124,7 @@ SUNW_network_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_vlan_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_vnic_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_aggr_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm -SUNW_ip_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil -ldladm +SUNW_ip_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil -ldladm -lipmp SUNW_ip_anon_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil LDLIBS += -lgen -lelf -lrcm -lnvpair -ldevinfo -lnsl -lsocket diff --git a/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c b/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c index be9a31f952..6e1fe1bf39 100644 --- a/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c +++ b/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * RCM module to prevent plumbed IP addresses from being removed. 
*/ @@ -177,7 +175,7 @@ ip_anon_register(rcm_handle_t *hdl) if (_cladm(CL_INITIALIZE, CL_GET_BOOTFLAG, &bootflags) != 0) { rcm_log_message(RCM_ERROR, - gettext("unable to check cluster status\n")); + gettext("unable to check cluster status\n")); (void) mutex_unlock(&ip_list_lock); return (RCM_FAILURE); } @@ -199,7 +197,7 @@ ip_anon_register(rcm_handle_t *hdl) else { if ((exclude_addrs.cladm_netaddrs_array = malloc(sizeof (cladm_netaddr_entry_t) * - (num_exclude_addrs))) == NULL) { + (num_exclude_addrs))) == NULL) { rcm_log_message(RCM_ERROR, gettext("out of memory\n")); (void) mutex_unlock(&ip_list_lock); @@ -274,7 +272,7 @@ ip_anon_register(rcm_handle_t *hdl) rcm_log_message(RCM_DEBUG, "ip_anon: obtaining list of IPv4 addresses.\n"); - num_ifs = ifaddrlist(&al, AF_INET, errbuf); + num_ifs = ifaddrlist(&al, AF_INET, LIFC_UNDER_IPMP, errbuf); if (num_ifs == -1) { rcm_log_message(RCM_ERROR, gettext("cannot get IPv4 address list errno=%d (%s)\n"), @@ -286,7 +284,7 @@ ip_anon_register(rcm_handle_t *hdl) rcm_log_message(RCM_DEBUG, "ip_anon: obtaining list of IPv6 addresses.\n"); - num_ifs6 = ifaddrlist(&al6, AF_INET6, errbuf); + num_ifs6 = ifaddrlist(&al6, AF_INET6, LIFC_UNDER_IPMP, errbuf); if (num_ifs6 == -1) { rcm_log_message(RCM_ERROR, gettext("cannot get IPv6 address list errno=%d (%s)\n"), @@ -392,7 +390,7 @@ ip_anon_register(rcm_handle_t *hdl) * currently know about it. */ if (!(tentry->flags & IP_FLAG_CL) && - !(tentry->flags & IP_FLAG_REG)) { + !(tentry->flags & IP_FLAG_REG)) { tentry->flags |= IP_FLAG_REG; rcm_log_message(RCM_DEBUG, "ip_anon: registering interest in %s\n", diff --git a/usr/src/cmd/rcm_daemon/common/ip_rcm.c b/usr/src/cmd/rcm_daemon/common/ip_rcm.c index f62b3dfc19..24be0cafeb 100644 --- a/usr/src/cmd/rcm_daemon/common/ip_rcm.c +++ b/usr/src/cmd/rcm_daemon/common/ip_rcm.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ @@ -38,23 +38,22 @@ #include <errno.h> #include <fcntl.h> #include <sys/types.h> +#include <sys/wait.h> #include <sys/stat.h> #include <sys/socket.h> #include <sys/sockio.h> #include <net/if.h> #include <netinet/in.h> -#include <netinet/tcp.h> #include <arpa/inet.h> #include <stropts.h> #include <strings.h> -#include <libdevinfo.h> -#include <sys/systeminfo.h> -#include <netdb.h> +#include <sys/sysmacros.h> #include <inet/ip.h> #include <libinetutil.h> #include <libdllink.h> +#include <libgen.h> +#include <ipmp_admin.h> -#include <ipmp_mpathd.h> #include "rcm_module.h" /* @@ -75,42 +74,19 @@ #define RCM_LINK_RESOURCE_MAX (13 + LINKID_STR_WIDTH) #define RCM_STR_SUNW_IP "SUNW_ip/" /* IP address export prefix */ -#define RCM_SIZE_SUNW_IP 9 /* strlen("SUNW_ip/") + 1 */ -/* ifconfig(1M) */ -#define USR_SBIN_IFCONFIG "/usr/sbin/ifconfig" /* ifconfig command */ -#define CFGFILE_FMT_IPV4 "/etc/hostname." /* IPV4 config file */ -#define CFGFILE_FMT_IPV6 "/etc/hostname6." /* IPV6 config file */ +#define SBIN_IFCONFIG "/sbin/ifconfig" /* ifconfig command */ +#define SBIN_IFPARSE "/sbin/ifparse" /* ifparse command */ +#define DHCPFILE_FMT "/etc/dhcp.%s" /* DHCP config file */ +#define CFGFILE_FMT_IPV4 "/etc/hostname.%s" /* IPV4 config file */ +#define CFGFILE_FMT_IPV6 "/etc/hostname6.%s" /* IPV6 config file */ #define CFG_CMDS_STD " netmask + broadcast + up" /* Normal config string */ -#define CONFIG_AF_INET 0x1 /* Post-configure IPv4 */ -#define CONFIG_AF_INET6 0x2 /* Post-configure IPv6 */ -#define MAXLINE 1024 /* Max. line length */ -#define MAXARGS 512 /* Max. 
args in ifconfig cmd */ - -/* Physical interface flags mask */ -#define RCM_PIF_FLAGS (IFF_OFFLINE | IFF_INACTIVE | IFF_FAILED | \ - IFF_STANDBY) +#define CFG_DHCP_CMD "dhcp wait 0" /* command to start DHCP */ /* Some useful macros */ -#ifndef MAX -#define MAX(a, b) (((a) > (b))?(a):(b)) -#endif /* MAX */ - -#ifndef ISSPACE #define ISSPACE(c) ((c) == ' ' || (c) == '\t') -#endif - -#ifndef ISEOL #define ISEOL(c) ((c) == '\n' || (c) == '\r' || (c) == '\0') -#endif - -#ifndef STREQ #define STREQ(a, b) (*(a) == *(b) && strcmp((a), (b)) == 0) -#endif - -#ifndef ADDSPACE -#define ADDSPACE(a) ((void) strcat((a), " ")) -#endif /* Interface Cache state flags */ #define CACHE_IF_STALE 0x1 /* stale cached data */ @@ -125,48 +101,20 @@ /* RCM IPMP Module specific property definitions */ #define RCM_IPMP_MIN_REDUNDANCY 1 /* default min. redundancy */ -/* in.mpathd(1M) specifics */ -#define MPATHD_MAX_RETRIES 5 /* Max. offline retries */ - /* Stream module operations */ #define MOD_INSERT 0 /* Insert a mid-stream module */ #define MOD_REMOVE 1 /* Remove a mid-stream module */ #define MOD_CHECK 2 /* Check mid-stream module safety */ /* - * in.mpathd(1M) message passing formats - */ -typedef struct mpathd_cmd { - uint32_t cmd_command; /* message command */ - char cmd_ifname[LIFNAMSIZ]; /* this interface name */ - char cmd_movetoif[LIFNAMSIZ]; /* move to interface */ - uint32_t cmd_min_red; /* min. 
redundancy */ -/* Message passing values for MI_SETOINDEX */ -#define from_lifname cmd_ifname /* current logical interface */ -#define to_pifname cmd_movetoif /* new physical interface */ -#define addr_family cmd_min_red /* address family */ -} mpathd_cmd_t; - -/* This is needed since mpathd checks message size for offline */ -typedef struct mpathd_unoffline { - uint32_t cmd_command; /* offline / undo offline */ - char cmd_ifname[LIFNAMSIZ]; /* this interface name */ -} mpathd_unoffline_t; - -typedef struct mpathd_response { - uint32_t resp_sys_errno; /* system errno */ - uint32_t resp_mpathd_err; /* mpathd error information */ -} mpathd_response_t; - -/* * IP module data types */ /* Physical interface representation */ typedef struct ip_pif { - char pi_ifname[LIFNAMSIZ+1]; /* interface name */ - char pi_grpname[LIFNAMSIZ+1]; /* IPMP group name */ - struct ip_lif *pi_lifs; /* ptr to logical interfaces */ + char pi_ifname[LIFNAMSIZ]; /* interface name */ + char pi_grname[LIFGRNAMSIZ]; /* IPMP group name */ + struct ip_lif *pi_lifs; /* ptr to logical interfaces */ } ip_pif_t; /* Logical interface representation */ @@ -239,7 +187,7 @@ static void free_node(ip_cache_t *); static void cache_insert(ip_cache_t *); static char *ip_usage(ip_cache_t *); static int update_pif(rcm_handle_t *, int, int, struct lifreq *); -static int ip_ipmp_offline(ip_cache_t *, ip_cache_t *); +static int ip_ipmp_offline(ip_cache_t *); static int ip_ipmp_undo_offline(ip_cache_t *); static int if_cfginfo(ip_cache_t *, uint_t); static int if_unplumb(ip_cache_t *); @@ -247,9 +195,6 @@ static int if_replumb(ip_cache_t *); static void ip_log_err(ip_cache_t *, char **, char *); static char *get_link_resource(const char *); static void clr_cfg_state(ip_pif_t *); -static uint64_t if_get_flags(ip_pif_t *); -static int mpathd_send_cmd(mpathd_cmd_t *); -static int connect_to_mpathd(int); static int modop(char *, char *, int, char); static int get_modlist(char *, ip_lif_t *); static int ip_domux2fd(int *, 
int *, int *, struct lifreq *); @@ -262,15 +207,13 @@ static char **ip_get_addrlist(ip_cache_t *); static void ip_free_addrlist(char **); static void ip_consumer_notify(rcm_handle_t *, datalink_id_t, char **, uint_t, rcm_info_t **); +static boolean_t ip_addrstr(ip_lif_t *, char *, size_t); static int if_configure(datalink_id_t); -static int isgrouped(char *); -static int if_ipmp_config(char *, int, int); -static int if_mpathd_configure(char *, char *, int, int); -static char *get_mpathd_dest(char *, int); -static int if_getcount(int); -static void tokenize(char *, char **, char *, int *); - +static boolean_t isgrouped(const char *); +static int if_config_inst(const char *, FILE *, int, boolean_t); +static uint_t ntok(const char *cp); +static boolean_t ifconfig(const char *, const char *, const char *, boolean_t); /* Module-Private data */ static struct rcm_mod_ops ip_ops = @@ -429,9 +372,9 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, { ip_cache_t *node; ip_pif_t *pif; - int detachable = 0; - int nofailover = 0; - int ipmp = 0; + boolean_t detachable = B_FALSE; + boolean_t ipmp; + int retval; rcm_log_message(RCM_TRACE1, "IP: offline(%s)\n", rsrc); @@ -455,25 +398,17 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, pif = node->ip_pif; /* Establish default detachability criteria */ - if (flags & RCM_FORCE) { - detachable++; - } + if (flags & RCM_FORCE) + detachable = B_TRUE; - /* Check if the interface is an IPMP grouped interface */ - if (strcmp(pif->pi_grpname, "")) { - ipmp++; - } - - if (if_get_flags(pif) & IFF_NOFAILOVER) { - nofailover++; - } + /* Check if the interface is under IPMP */ + ipmp = (pif->pi_grname[0] != '\0'); /* - * Even if the interface is not in an IPMP group, it's possible that - * it's still okay to offline it as long as there are higher-level - * failover mechanisms for the addresses it owns (e.g., clustering). - * In this case, ip_offlinelist() will return RCM_SUCCESS, and we - * charge on. 
+ * Even if the interface is not under IPMP, it's possible that it's + * still okay to offline it as long as there are higher-level failover + * mechanisms for the addresses it owns (e.g., clustering). In this + * case, ip_offlinelist() will return RCM_SUCCESS, and we charge on. */ if (!ipmp && !detachable) { /* Inform consumers of IP addresses being offlined */ @@ -489,17 +424,6 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, } } - /* - * Cannot remove an IPMP interface if IFF_NOFAILOVER is set. - */ - if (ipmp && nofailover) { - /* Interface is part of an IPMP group, and cannot failover */ - ip_log_err(node, errorp, "Failover disabled"); - errno = EBUSY; - (void) mutex_unlock(&cache_lock); - return (RCM_FAILURE); - } - /* Check if it's a query */ if (flags & RCM_QUERY) { rcm_log_message(RCM_TRACE1, "IP: offline query success(%s)\n", @@ -534,38 +458,32 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, } /* - * This an IPMP interface that can be failed over. - * Request in.mpathd(1M) to failover the physical interface. + * This is an IPMP interface that can be offlined. + * Request in.mpathd(1M) to offline the physical interface. */ + if ((retval = ip_ipmp_offline(node)) != IPMP_SUCCESS) + ip_log_err(node, errorp, "in.mpathd offline failed"); - /* Failover to "any", let mpathd determine best failover candidate */ - if (ip_ipmp_offline(node, NULL) < 0) { - ip_log_err(node, errorp, "in.mpathd failover failed"); + if (retval == IPMP_EMINRED && !detachable) { /* - * Odds are that in.mpathd(1M) could not offline the device - * because it was the last interface in the group. However, - * it's possible that it's still okay to offline it as long as - * there are higher-level failover mechanisms for the - * addresses it owns (e.g., clustering). In this case, - * ip_offlinelist() will return RCM_SUCCESS, and we charge on. 
- * - * TODO: change ip_ipmp_offline() to return the actual failure - * from in.mpathd so that we can verify that it did indeed - * fail with IPMP_EMINRED. + * in.mpathd(1M) could not offline the device because it was + * the last interface in the group. However, it's possible + * that it's still okay to offline it as long as there are + * higher-level failover mechanisms for the addresses it owns + * (e.g., clustering). In this case, ip_offlinelist() will + * return RCM_SUCCESS, and we charge on. */ - if (!detachable) { - /* Inform consumers of IP addresses being offlined */ - if (ip_offlinelist(hd, node, errorp, flags, - depend_info) == RCM_SUCCESS) { - rcm_log_message(RCM_DEBUG, - "IP: consumers agree on detach"); - } else { - ip_log_err(node, errorp, - "Device consumers prohibit offline"); - (void) mutex_unlock(&cache_lock); - errno = EBUSY; - return (RCM_FAILURE); - } + /* Inform consumers of IP addresses being offlined */ + if (ip_offlinelist(hd, node, errorp, flags, + depend_info) == RCM_SUCCESS) { + rcm_log_message(RCM_DEBUG, + "IP: consumers agree on detach"); + } else { + ip_log_err(node, errorp, + "Device consumers prohibit offline"); + (void) mutex_unlock(&cache_lock); + errno = EBUSY; + return (RCM_FAILURE); } } @@ -574,8 +492,8 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, _("IP: Unplumb failed (%s)\n"), pif->pi_ifname); - /* Request mpathd to undo the offline */ - if (ip_ipmp_undo_offline(node) < 0) { + /* Request in.mpathd to undo the offline */ + if (ip_ipmp_undo_offline(node) != IPMP_SUCCESS) { ip_log_err(node, errorp, "Undo offline failed"); (void) mutex_unlock(&cache_lock); return (RCM_FAILURE); @@ -862,18 +780,16 @@ static char * ip_usage(ip_cache_t *node) { ip_lif_t *lif; - int numifs; - char *buf; - char *linkidstr; + uint_t numup; + char *sep, *buf, *linkidstr; datalink_id_t linkid; - const char *fmt; - char *sep; + const char *msg; char link[MAXLINKNAMELEN]; char addrstr[INET6_ADDRSTRLEN]; char errmsg[DLADM_STRSIZE]; 
dladm_status_t status; - int offline = 0; - size_t bufsz; + boolean_t offline, ipmp; + size_t bufsz = 0; rcm_log_message(RCM_TRACE2, "IP: usage(%s)\n", node->ip_resource); @@ -904,76 +820,53 @@ ip_usage(ip_cache_t *node) /* TRANSLATION_NOTE: separator used between IP addresses */ sep = _(", "); - numifs = 0; - for (lif = node->ip_pif->pi_lifs; lif != NULL; lif = lif->li_next) { - if (lif->li_ifflags & IFF_UP) { - numifs++; - } - } + numup = 0; + for (lif = node->ip_pif->pi_lifs; lif != NULL; lif = lif->li_next) + if (lif->li_ifflags & IFF_UP) + numup++; - if (node->ip_cachestate & CACHE_IF_OFFLINED) { - offline++; - } + ipmp = (node->ip_pif->pi_grname[0] != '\0'); + offline = ((node->ip_cachestate & CACHE_IF_OFFLINED) != 0); - if (!offline && numifs) { - fmt = _("%1$s hosts IP addresses: "); - } else if (offline) { - fmt = _("%1$s offlined"); + if (offline) { + msg = _("offlined"); + } else if (numup == 0) { + msg = _("plumbed but down"); } else { - fmt = _("%1$s plumbed but down"); + if (ipmp) { + msg = _("providing connectivity for IPMP group "); + bufsz += LIFGRNAMSIZ; + } else { + msg = _("hosts IP addresses: "); + bufsz += (numup * (INET6_ADDRSTRLEN + strlen(sep))); + } } - /* space for addresses and separators, plus message */ - bufsz = ((numifs * (INET6_ADDRSTRLEN + strlen(sep))) + - strlen(fmt) + strlen(link) + 1); + bufsz += strlen(link) + strlen(msg) + 1; if ((buf = malloc(bufsz)) == NULL) { rcm_log_message(RCM_ERROR, _("IP: usage(%s) malloc failure(%s)\n"), node->ip_resource, strerror(errno)); return (NULL); } - bzero(buf, bufsz); - (void) sprintf(buf, fmt, link); - - if (offline || (numifs == 0)) { /* Nothing else to do */ - rcm_log_message(RCM_TRACE2, "IP: usage (%s) info = %s\n", - node->ip_resource, buf); - - return (buf); - } - - for (lif = node->ip_pif->pi_lifs; lif != NULL; lif = lif->li_next) { + (void) snprintf(buf, bufsz, "%s: %s", link, msg); - void *addr; - int af; - - if (!(lif->li_ifflags & IFF_UP)) { - /* ignore interfaces not up */ - 
continue; - } - af = lif->li_addr.family; - if (af == AF_INET6) { - addr = &lif->li_addr.ip6.sin6_addr; - } else if (af == AF_INET) { - addr = &lif->li_addr.ip4.sin_addr; + if (!offline && numup > 0) { + if (ipmp) { + (void) strlcat(buf, node->ip_pif->pi_grname, bufsz); } else { - rcm_log_message(RCM_DEBUG, - "IP: unknown addr family %d, assuming AF_INET\n", - af); - af = AF_INET; - addr = &lif->li_addr.ip4.sin_addr; - } - if (inet_ntop(af, addr, addrstr, INET6_ADDRSTRLEN) == NULL) { - rcm_log_message(RCM_ERROR, - _("IP: inet_ntop: %s\n"), strerror(errno)); - continue; - } - rcm_log_message(RCM_DEBUG, "IP addr := %s\n", addrstr); + lif = node->ip_pif->pi_lifs; + for (; lif != NULL; lif = lif->li_next) { + if (!(lif->li_ifflags & IFF_UP)) + continue; + + if (!ip_addrstr(lif, addrstr, sizeof (addrstr))) + continue; - (void) strcat(buf, addrstr); - numifs--; - if (numifs > 0) { - (void) strcat(buf, ", "); + (void) strlcat(buf, addrstr, bufsz); + if (--numup > 0) + (void) strlcat(buf, sep, bufsz); + } } } @@ -983,6 +876,32 @@ ip_usage(ip_cache_t *node) return (buf); } +static boolean_t +ip_addrstr(ip_lif_t *lif, char *addrstr, size_t addrsize) +{ + int af = lif->li_addr.family; + void *addr; + + if (af == AF_INET6) { + addr = &lif->li_addr.ip6.sin6_addr; + } else if (af == AF_INET) { + addr = &lif->li_addr.ip4.sin_addr; + } else { + rcm_log_message(RCM_DEBUG, + "IP: unknown addr family %d, assuming AF_INET\n", af); + af = AF_INET; + addr = &lif->li_addr.ip4.sin_addr; + } + if (inet_ntop(af, addr, addrstr, addrsize) == NULL) { + rcm_log_message(RCM_ERROR, + _("IP: inet_ntop: %s\n"), strerror(errno)); + return (B_FALSE); + } + + rcm_log_message(RCM_DEBUG, "IP addr := %s\n", addrstr); + return (B_TRUE); +} + /* * Cache management routines, all cache management functions should be * be called with cache_lock held. 
@@ -1121,11 +1040,13 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr) ifnumber = ifspec.ifsp_lun; /* Get the interface flags */ - (void) strcpy(lifreq.lifr_name, lifr->lifr_name); + (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ); if (ioctl(sock, SIOCGLIFFLAGS, (char *)&lifreq) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFFLAGS(%s): %s\n"), - pif.pi_ifname, strerror(errno)); + if (errno != ENXIO) { + rcm_log_message(RCM_ERROR, + _("IP: SIOCGLIFFLAGS(%s): %s\n"), + lifreq.lifr_name, strerror(errno)); + } return (-1); } (void) memcpy(&ifflags, &lifreq.lifr_flags, sizeof (ifflags)); @@ -1135,12 +1056,13 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr) * - IFF_VIRTUAL: e.g., loopback and vni * - IFF_POINTOPOINT: e.g., sppp and ip.tun * - !IFF_MULTICAST: e.g., ip.6to4tun + * - IFF_IPMP: IPMP meta-interfaces * * Note: The !IFF_MULTICAST check can be removed once iptun is * implemented as a datalink. */ if (!(ifflags & IFF_MULTICAST) || - (ifflags & (IFF_POINTOPOINT | IFF_VIRTUAL))) { + (ifflags & (IFF_POINTOPOINT | IFF_VIRTUAL | IFF_IPMP))) { rcm_log_message(RCM_TRACE3, "IP: if ignored (%s)\n", pif.pi_ifname); return (0); @@ -1148,23 +1070,26 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr) /* Get the interface group name for this interface */ if (ioctl(sock, SIOCGLIFGROUPNAME, (char *)&lifreq) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFGROUPNAME(%s): %s\n"), - lifreq.lifr_name, strerror(errno)); + if (errno != ENXIO) { + rcm_log_message(RCM_ERROR, + _("IP: SIOCGLIFGROUPNAME(%s): %s\n"), + lifreq.lifr_name, strerror(errno)); + } return (-1); } /* copy the group name */ - (void) memcpy(&pif.pi_grpname, &lifreq.lifr_groupname, - sizeof (pif.pi_grpname)); - pif.pi_grpname[sizeof (pif.pi_grpname) - 1] = '\0'; + (void) strlcpy(pif.pi_grname, lifreq.lifr_groupname, + sizeof (pif.pi_grname)); /* Get the interface address for this interface */ if (ioctl(sock, SIOCGLIFADDR, (char 
*)&lifreq) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFADDR(%s): %s\n"), - lifreq.lifr_name, strerror(errno)); - return (-1); + if (errno != ENXIO) { + rcm_log_message(RCM_ERROR, + _("IP: SIOCGLIFADDR(%s): %s\n"), + lifreq.lifr_name, strerror(errno)); + return (-1); + } } (void) memcpy(&ifaddr, &lifreq.lifr_addr, sizeof (ifaddr)); @@ -1241,9 +1166,9 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr) sizeof (pif.pi_ifname)); } - /* save pif properties */ - (void) memcpy(&probepif->pi_grpname, &pif.pi_grpname, - sizeof (pif.pi_grpname)); + /* save the group name */ + (void) strlcpy(probepif->pi_grname, pif.pi_grname, + sizeof (pif.pi_grname)); /* add lif, if this is a lif and it is not in cache */ if (!lif_listed) { @@ -1304,7 +1229,7 @@ update_ipifs(rcm_handle_t *hd, int af) } lifn.lifn_family = af; - lifn.lifn_flags = 0; + lifn.lifn_flags = LIFC_UNDER_IPMP; if (ioctl(sock, SIOCGLIFNUM, (char *)&lifn) < 0) { rcm_log_message(RCM_ERROR, _("IP: SIOCLGIFNUM failed: %s\n"), @@ -1321,7 +1246,7 @@ update_ipifs(rcm_handle_t *hd, int af) } lifc.lifc_family = af; - lifc.lifc_flags = 0; + lifc.lifc_flags = LIFC_UNDER_IPMP; lifc.lifc_len = sizeof (struct lifreq) * lifn.lifn_count; lifc.lifc_buf = buf; @@ -1480,39 +1405,33 @@ static void ip_log_err(ip_cache_t *node, char **errorp, char *errmsg) { char *ifname = NULL; - int len; + int size; const char *errfmt; - char *error; + char *error = NULL; if ((node != NULL) && (node->ip_pif != NULL) && (node->ip_pif->pi_ifname != NULL)) { ifname = node->ip_pif->pi_ifname; } - if (errorp != NULL) - *errorp = NULL; - if (ifname == NULL) { rcm_log_message(RCM_ERROR, _("IP: %s\n"), errmsg); errfmt = _("IP: %s"); - len = strlen(errfmt) + strlen(errmsg) + 1; - if (error = (char *)calloc(1, len)) { - (void) sprintf(error, errfmt, errmsg); - } + size = strlen(errfmt) + strlen(errmsg) + 1; + if (errorp != NULL && (error = malloc(size)) != NULL) + (void) snprintf(error, size, errfmt, errmsg); } else { 
rcm_log_message(RCM_ERROR, _("IP: %s(%s)\n"), errmsg, ifname); errfmt = _("IP: %s(%s)"); - len = strlen(errfmt) + strlen(errmsg) + strlen(ifname) + 1; - if (error = (char *)calloc(1, len)) { - (void) sprintf(error, errfmt, errmsg, ifname); - } + size = strlen(errfmt) + strlen(errmsg) + strlen(ifname) + 1; + if (errorp != NULL && (error = malloc(size)) != NULL) + (void) snprintf(error, size, errfmt, errmsg, ifname); } if (errorp != NULL) *errorp = error; } - /* * if_cfginfo() - Save off the config info for all interfaces */ @@ -1538,7 +1457,7 @@ if_cfginfo(ip_cache_t *node, uint_t force) rcm_log_message(RCM_ERROR, _("IP: get modlist error (%s) %s\n"), pif->pi_ifname, strerror(errno)); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } @@ -1551,7 +1470,7 @@ if_cfginfo(ip_cache_t *node, uint_t force) rcm_log_message(RCM_ERROR, _("IP: module %s@%d\n"), lif->li_modules[i], i); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } } @@ -1595,11 +1514,11 @@ if_cfginfo(ip_cache_t *node, uint_t force) /* Save reconfiguration information */ if (lif->li_ifflags & IFF_IPV4) { (void) snprintf(syscmd, sizeof (syscmd), - "%s %s:%d configinfo\n", USR_SBIN_IFCONFIG, + "%s %s:%d configinfo\n", SBIN_IFCONFIG, pif->pi_ifname, lif->li_ifnum); } else if (lif->li_ifflags & IFF_IPV6) { (void) snprintf(syscmd, sizeof (syscmd), - "%s %s:%d inet6 configinfo\n", USR_SBIN_IFCONFIG, + "%s %s:%d inet6 configinfo\n", SBIN_IFCONFIG, pif->pi_ifname, lif->li_ifnum); } rcm_log_message(RCM_TRACE2, "IP: %s\n", syscmd); @@ -1609,7 +1528,7 @@ if_cfginfo(ip_cache_t *node, uint_t force) rcm_log_message(RCM_ERROR, _("IP: ifconfig configinfo error (%s:%d) %s\n"), pif->pi_ifname, lif->li_ifnum, strerror(errno)); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } bzero(buf, MAX_RECONFIG_SIZE); @@ -1619,20 +1538,18 @@ if_cfginfo(ip_cache_t *node, uint_t force) _("IP: ifconfig configinfo error (%s:%d) %s\n"), pif->pi_ifname, lif->li_ifnum, strerror(errno)); (void) 
pclose(fp); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } (void) pclose(fp); - lif->li_reconfig = malloc(strlen(buf)+1); - if (lif->li_reconfig == NULL) { + if ((lif->li_reconfig = strdup(buf)) == NULL) { rcm_log_message(RCM_ERROR, _("IP: malloc error (%s) %s\n"), pif->pi_ifname, strerror(errno)); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } - (void) strcpy(lif->li_reconfig, buf); rcm_log_message(RCM_DEBUG, "IP: if_cfginfo: reconfig string(%s:%d) = %s\n", pif->pi_ifname, lif->li_ifnum, lif->li_reconfig); @@ -1654,57 +1571,37 @@ static int if_unplumb(ip_cache_t *node) { ip_lif_t *lif; - ip_pif_t *pif; - int ipv4 = 0, ipv6 = 0; - char syscmd[MAX_RECONFIG_SIZE + LIFNAMSIZ]; + ip_pif_t *pif = node->ip_pif; + boolean_t ipv4 = B_FALSE; + boolean_t ipv6 = B_FALSE; rcm_log_message(RCM_TRACE2, "IP: if_unplumb(%s)\n", node->ip_resource); - pif = node->ip_pif; - lif = pif->pi_lifs; - - while (lif != NULL) { + for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) { if (lif->li_ifflags & IFF_IPV4) { - ipv4++; + ipv4 = B_TRUE; } else if (lif->li_ifflags & IFF_IPV6) { - ipv6++; + ipv6 = B_TRUE; } else { /* Unlikely case */ rcm_log_message(RCM_DEBUG, "IP: Unplumb ignored (%s:%d)\n", pif->pi_ifname, lif->li_ifnum); - lif = lif->li_next; - continue; } - lif = lif->li_next; } - /* Unplumb the physical interface */ - if (ipv4) { - rcm_log_message(RCM_TRACE2, - "IP: if_unplumb: ifconfig %s unplumb\n", pif->pi_ifname); - (void) snprintf(syscmd, sizeof (syscmd), "%s %s unplumb\n", - USR_SBIN_IFCONFIG, pif->pi_ifname); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot unplumb (%s) %s\n"), - pif->pi_ifname, strerror(errno)); - return (-1); - } + if (ipv4 && !ifconfig(pif->pi_ifname, "inet", "unplumb", B_FALSE)) { + rcm_log_message(RCM_ERROR, _("IP: Cannot unplumb (%s) %s\n"), + pif->pi_ifname, strerror(errno)); + return (-1); } - if (ipv6) { - rcm_log_message(RCM_TRACE2, - "IP: if_unplumb: ifconfig %s inet6 
unplumb\n", - pif->pi_ifname); - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s inet6 unplumb\n", USR_SBIN_IFCONFIG, pif->pi_ifname); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot unplumb (%s) %s\n"), - pif->pi_ifname, strerror(errno)); - return (-1); - } + + if (ipv6 && !ifconfig(pif->pi_ifname, "inet6", "unplumb", B_FALSE)) { + rcm_log_message(RCM_ERROR, _("IP: Cannot unplumb (%s) %s\n"), + pif->pi_ifname, strerror(errno)); + return (-1); } + rcm_log_message(RCM_TRACE2, "IP: if_unplumb(%s) success\n", node->ip_resource); @@ -1723,8 +1620,11 @@ if_replumb(ip_cache_t *node) ip_lif_t *lif; ip_pif_t *pif; int i; - char syscmd[LIFNAMSIZ+MAXPATHLEN]; /* must be big enough */ - int max_ipv4 = 0, max_ipv6 = 0; + boolean_t success, ipmp; + const char *fstr; + char lifname[LIFNAMSIZ]; + char buf[MAX_RECONFIG_SIZE]; + int max_lifnum = 0; rcm_log_message(RCM_TRACE2, "IP: if_replumb(%s)\n", node->ip_resource); @@ -1738,100 +1638,103 @@ if_replumb(ip_cache_t *node) */ pif = node->ip_pif; - lif = pif->pi_lifs; + ipmp = (node->ip_pif->pi_grname[0] != '\0'); /* * Make a first pass to plumb in physical interfaces and get a count * of the max logical interfaces */ - while (lif != NULL) { + for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) { + max_lifnum = MAX(lif->li_ifnum, max_lifnum); if (lif->li_ifflags & IFF_IPV4) { - if (lif->li_ifnum > max_ipv4) { - max_ipv4 = lif->li_ifnum; - } + fstr = "inet"; } else if (lif->li_ifflags & IFF_IPV6) { - if (lif->li_ifnum > max_ipv6) { - max_ipv6 = lif->li_ifnum; - } + fstr = "inet6"; } else { /* Unlikely case */ rcm_log_message(RCM_DEBUG, "IP: Re-plumb ignored (%s:%d)\n", pif->pi_ifname, lif->li_ifnum); - lif = lif->li_next; continue; } - if (lif->li_ifnum == 0) { /* physical interface instance */ - if ((lif->li_ifflags & IFF_NOFAILOVER) || - (strcmp(pif->pi_grpname, "") == 0)) { - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s\n", USR_SBIN_IFCONFIG, - lif->li_reconfig); - } else if 
(lif->li_ifflags & IFF_IPV4) { - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s inet plumb group %s\n", - USR_SBIN_IFCONFIG, - pif->pi_ifname, pif->pi_grpname); - } else if (lif->li_ifflags & IFF_IPV6) { - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s inet6 plumb group %s\n", - USR_SBIN_IFCONFIG, - pif->pi_ifname, pif->pi_grpname); - } + /* ignore logical interface instances */ + if (lif->li_ifnum != 0) + continue; + + if ((lif->li_ifflags & IFF_NOFAILOVER) || !ipmp) { + success = ifconfig("", "", lif->li_reconfig, B_FALSE); + } else { + (void) snprintf(buf, sizeof (buf), "plumb group %s", + pif->pi_grname); + success = ifconfig(pif->pi_ifname, fstr, buf, B_FALSE); + } + + if (!success) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot plumb (%s) %s\n"), pif->pi_ifname, + strerror(errno)); + return (-1); + } + + /* + * Restart DHCP if necessary. + */ + if ((lif->li_ifflags & IFF_DHCPRUNNING) && + !ifconfig(pif->pi_ifname, fstr, CFG_DHCP_CMD, B_FALSE)) { + rcm_log_message(RCM_ERROR, _("IP: Cannot start DHCP " + "(%s) %s\n"), pif->pi_ifname, strerror(errno)); + return (-1); + } + rcm_log_message(RCM_TRACE2, + "IP: if_replumb: Modcnt = %d\n", lif->li_modcnt); + /* modinsert modules in order, ignore driver(last) */ + for (i = 0; i < (lif->li_modcnt - 1); i++) { rcm_log_message(RCM_TRACE2, - "IP: if_replumb: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { + "IP: modinsert: Pos = %d Mod = %s\n", + i, lif->li_modules[i]); + if (modop(pif->pi_ifname, lif->li_modules[i], i, + MOD_INSERT) == -1) { rcm_log_message(RCM_ERROR, - _("IP: Cannot plumb (%s) %s\n"), - pif->pi_ifname, strerror(errno)); + _("IP: modinsert error(%s)\n"), + pif->pi_ifname); return (-1); } - - rcm_log_message(RCM_TRACE2, - "IP: if_replumb: Modcnt = %d\n", lif->li_modcnt); - /* modinsert modules in order, ignore driver(last) */ - for (i = 0; i < (lif->li_modcnt - 1); i++) { - rcm_log_message(RCM_TRACE2, - "IP: modinsert: Pos = %d Mod = %s\n", - i, lif->li_modules[i]); - if (modop(pif->pi_ifname, 
lif->li_modules[i], i, - MOD_INSERT) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: modinsert error(%s)\n"), - pif->pi_ifname); - return (-1); - } - } } - - lif = lif->li_next; } /* Now, add all the logical interfaces in the correct order */ - for (i = 1; i <= MAX(max_ipv6, max_ipv4); i++) { + for (i = 1; i <= max_lifnum; i++) { + (void) snprintf(lifname, LIFNAMSIZ, "%s:%d", pif->pi_ifname, i); + /* reset lif through every iteration */ - lif = pif->pi_lifs; - while (lif != NULL) { - if (((lif->li_ifflags & IFF_NOFAILOVER) || - (strcmp(pif->pi_grpname, "") == 0)) && - (lif->li_ifnum == i)) { - /* Plumb in the logical interface */ - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s\n", USR_SBIN_IFCONFIG, - lif->li_reconfig); - rcm_log_message(RCM_TRACE2, - "IP: if_replumb: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot addif (%s:%d) " - "%s\n"), - pif->pi_ifname, i, strerror(errno)); - return (-1); - } + for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) { + /* + * Process entries in order. If the interface is + * using IPMP, only process test addresses. + */ + if (lif->li_ifnum != i || + (ipmp && !(lif->li_ifflags & IFF_NOFAILOVER))) + continue; + + if (!ifconfig("", "", lif->li_reconfig, B_FALSE)) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot addif (%s) %s\n"), lifname, + strerror(errno)); + return (-1); + } + + /* + * Restart DHCP if necessary. + */ + if ((lif->li_ifflags & IFF_DHCPRUNNING) && + !ifconfig(lifname, fstr, CFG_DHCP_CMD, B_FALSE)) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot start DHCP (%s) %s\n"), + lifname, strerror(errno)); + return (-1); } - lif = lif->li_next; } } @@ -1865,71 +1768,64 @@ clr_cfg_state(ip_pif_t *pif) } /* - * ip_ipmp_offline() - Failover from if_from to if_to using a - * minimum redudancy of min_red. This uses IPMPs - * "offline" mechanism to achieve the failover. + * Attempt to offline ip_cache_t `node'; returns an IPMP error code. 
*/ static int -ip_ipmp_offline(ip_cache_t *if_from, ip_cache_t *if_to) +ip_ipmp_offline(ip_cache_t *node) { - mpathd_cmd_t mpdcmd; - - if ((if_from == NULL) || (if_from->ip_pif == NULL) || - (if_from->ip_pif->pi_ifname == NULL)) { - return (-1); - } + int retval; + ipmp_handle_t handle; rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_offline\n"); - mpdcmd.cmd_command = MI_OFFLINE; - (void) strcpy(mpdcmd.cmd_ifname, if_from->ip_pif->pi_ifname); - - if ((if_to != NULL) && (if_to->ip_pif != NULL) && - (if_to->ip_pif->pi_ifname != NULL)) { - rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_offline (%s)->(%s)\n", - if_from->ip_pif->pi_ifname, if_to->ip_pif->pi_ifname); - (void) strncpy(mpdcmd.cmd_movetoif, if_to->ip_pif->pi_ifname, - sizeof (mpdcmd.cmd_movetoif)); - mpdcmd.cmd_movetoif[sizeof (mpdcmd.cmd_movetoif) - 1] = '\0'; - } else { - rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_offline (%s)->(any)\n", - if_from->ip_pif->pi_ifname); - (void) strcpy(mpdcmd.cmd_movetoif, ""); /* signifies any */ + if ((retval = ipmp_open(&handle)) != IPMP_SUCCESS) { + rcm_log_message(RCM_ERROR, + _("IP: cannot create ipmp handle: %s\n"), + ipmp_errmsg(retval)); + return (retval); } - mpdcmd.cmd_min_red = if_from->ip_ifred; - if (mpathd_send_cmd(&mpdcmd) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd offline error: %s\n"), - strerror(errno)); - return (-1); + retval = ipmp_offline(handle, node->ip_pif->pi_ifname, node->ip_ifred); + if (retval != IPMP_SUCCESS) { + rcm_log_message(RCM_ERROR, _("IP: ipmp_offline error: %s\n"), + ipmp_errmsg(retval)); + } else { + rcm_log_message(RCM_TRACE1, "IP: ipmp_offline success\n"); } - rcm_log_message(RCM_TRACE1, "IP: ipmp offline success\n"); - return (0); + ipmp_close(handle); + return (retval); } /* - * ip_ipmp_undo_offline() - Undo prior offline of the interface. - * This uses IPMPs "undo offline" feature. + * Attempt to undo the offline ip_cache_t `node'; returns an IPMP error code. 
*/ static int ip_ipmp_undo_offline(ip_cache_t *node) { - mpathd_cmd_t mpdcmd; + int retval; + ipmp_handle_t handle; - mpdcmd.cmd_command = MI_UNDO_OFFLINE; - (void) strcpy(mpdcmd.cmd_ifname, node->ip_pif->pi_ifname); + rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_undo_offline\n"); - if (mpathd_send_cmd(&mpdcmd) < 0) { + if ((retval = ipmp_open(&handle)) != IPMP_SUCCESS) { rcm_log_message(RCM_ERROR, - _("IP: mpathd error: %s\n"), - strerror(errno)); - return (-1); + _("IP: cannot create ipmp handle: %s\n"), + ipmp_errmsg(retval)); + return (retval); } - rcm_log_message(RCM_TRACE1, "IP: ipmp undo offline success\n"); - return (0); + retval = ipmp_undo_offline(handle, node->ip_pif->pi_ifname); + if (retval != IPMP_SUCCESS) { + rcm_log_message(RCM_ERROR, + _("IP: ipmp_undo_offline error: %s\n"), + ipmp_errmsg(retval)); + } else { + rcm_log_message(RCM_TRACE1, "IP: ipmp_undo_offline success\n"); + } + + ipmp_close(handle); + return (retval); } /* @@ -1946,10 +1842,9 @@ get_link_resource(const char *link) char *resource; dladm_status_t status; - if ((status = dladm_name2info(dld_handle, link, &linkid, &flags, NULL, - NULL)) != DLADM_STATUS_OK) { + status = dladm_name2info(dld_handle, link, &linkid, &flags, NULL, NULL); + if (status != DLADM_STATUS_OK) goto fail; - } if (!(flags & DLADM_OPT_ACTIVE)) { status = DLADM_STATUS_FAILED; @@ -1976,243 +1871,6 @@ fail: } /* - * if_get_flags() - Return the cached physical interface flags - * Call with cache_lock held - */ -static uint64_t -if_get_flags(ip_pif_t *pif) -{ - ip_lif_t *lif; - - for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) { - if (lif->li_ifnum == 0) { - return (lif->li_ifflags & RCM_PIF_FLAGS); - } - } - return (0); -} - -/* - * mpathd_send_cmd() - Sends the command to in.mpathd. 
- */ -static int -mpathd_send_cmd(mpathd_cmd_t *mpd) -{ - mpathd_unoffline_t mpc; - struct mpathd_response mpr; - int i; - int s; - - rcm_log_message(RCM_TRACE1, "IP: mpathd_send_cmd \n"); - - for (i = 0; i < MPATHD_MAX_RETRIES; i++) { - s = connect_to_mpathd(AF_INET); - if (s == -1) { - s = connect_to_mpathd(AF_INET6); - if (s == -1) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot talk to mpathd\n")); - return (-1); - } - } - switch (mpd->cmd_command) { - case MI_OFFLINE : - rcm_log_message(RCM_TRACE1, "IP: MI_OFFLINE: " - "(%s)->(%s) redundancy = %d\n", mpd->cmd_ifname, - mpd->cmd_movetoif, mpd->cmd_min_red); - - if (write(s, mpd, sizeof (mpathd_cmd_t)) != - sizeof (mpathd_cmd_t)) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd write: %s\n"), - strerror(errno)); - (void) close(s); - return (-1); - } - break; - - case MI_SETOINDEX : - rcm_log_message(RCM_TRACE1, "IP: MI_SETOINDEX: " - "(%s)->(%s) family = %d\n", mpd->from_lifname, - mpd->to_pifname, mpd->addr_family); - - if (write(s, mpd, sizeof (mpathd_cmd_t)) != - sizeof (mpathd_cmd_t)) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd write: %s\n"), - strerror(errno)); - (void) close(s); - return (-1); - } - break; - - case MI_UNDO_OFFLINE: - /* mpathd checks for exact size of the message */ - mpc.cmd_command = mpd->cmd_command; - (void) strcpy(mpc.cmd_ifname, mpd->cmd_ifname); - - rcm_log_message(RCM_TRACE1, "IP: MI_UNDO_OFFLINE: " - "(%s)\n", mpd->cmd_ifname); - - if (write(s, &mpc, sizeof (mpathd_unoffline_t)) != - sizeof (mpathd_unoffline_t)) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd write: %s\n"), - strerror(errno)); - (void) close(s); - return (-1); - } - break; - default : - rcm_log_message(RCM_ERROR, - _("IP: unsupported mpathd command\n")); - (void) close(s); - return (-1); - } - - bzero(&mpr, sizeof (struct mpathd_response)); - /* Read the result from mpathd */ - if (read(s, &mpr, sizeof (struct mpathd_response)) != - sizeof (struct mpathd_response)) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd 
read : %s\n"), strerror(errno)); - (void) close(s); - return (-1); - } - - (void) close(s); - if (mpr.resp_mpathd_err == 0) { - rcm_log_message(RCM_TRACE1, - "IP: mpathd_send_cmd success\n"); - return (0); /* Successful */ - } - - if (mpr.resp_mpathd_err == MPATHD_SYS_ERROR) { - if (mpr.resp_sys_errno == EAGAIN) { - (void) sleep(1); - rcm_log_message(RCM_DEBUG, - "IP: mpathd retrying\n"); - continue; /* Retry */ - } - errno = mpr.resp_sys_errno; - rcm_log_message(RCM_WARNING, - _("IP: mpathd_send_cmd error: %s\n"), - strerror(errno)); - } else if (mpr.resp_mpathd_err == MPATHD_MIN_RED_ERROR) { - errno = EIO; - rcm_log_message(RCM_ERROR, _("IP: in.mpathd(1M): " - "Minimum redundancy not met\n")); - } else { - rcm_log_message(RCM_ERROR, - _("IP: mpathd_send_cmd error\n")); - } - /* retry */ - } - - rcm_log_message(RCM_ERROR, - _("IP: mpathd_send_cmd failed %d retries\n"), MPATHD_MAX_RETRIES); - return (-1); -} - -/* - * Returns -1 on failure. Returns the socket file descriptor on - * success. - */ -static int -connect_to_mpathd(int family) -{ - int s; - struct sockaddr_storage ss; - struct sockaddr_in *sin = (struct sockaddr_in *)&ss; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss; - struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; - int addrlen; - int ret; - int on; - - rcm_log_message(RCM_TRACE1, "IP: connect_to_mpathd\n"); - - s = socket(family, SOCK_STREAM, 0); - if (s < 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd socket: %s\n"), strerror(errno)); - return (-1); - } - bzero((char *)&ss, sizeof (ss)); - ss.ss_family = family; - /* - * Need to bind to a privelged port. For non-root, this - * will fail. in.mpathd verifies that only commands coming - * from priveleged ports succeed so that the ordinary user - * can't issue offline commands. 
- */ - on = 1; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd setsockopt: TCP_ANONPRIVBIND: %s\n"), - strerror(errno)); - return (-1); - } - switch (family) { - case AF_INET: - sin->sin_port = 0; - sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - addrlen = sizeof (struct sockaddr_in); - break; - case AF_INET6: - sin6->sin6_port = 0; - sin6->sin6_addr = loopback_addr; - addrlen = sizeof (struct sockaddr_in6); - break; - } - ret = bind(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd bind: %s\n"), strerror(errno)); - return (-1); - } - switch (family) { - case AF_INET: - sin->sin_port = htons(MPATHD_PORT); - break; - case AF_INET6: - sin6->sin6_port = htons(MPATHD_PORT); - break; - } - ret = connect(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - if (errno == ECONNREFUSED) { - /* in.mpathd is not running, start it */ - if (rcm_exec_cmd(MPATHD_PATH) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd exec: %s\n"), - strerror(errno)); - return (-1); - } - ret = connect(s, (struct sockaddr *)&ss, addrlen); - } - if (ret != 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd connect: %s\n"), strerror(errno)); - return (-1); - } - } - on = 0; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd setsockopt TCP_ANONPRIVBIND: %s\n"), - strerror(errno)); - return (-1); - } - - rcm_log_message(RCM_TRACE1, "IP: connect_to_mpathd success\n"); - - return (s); -} - -/* * modop() - Remove/insert a module */ static int @@ -2239,12 +1897,10 @@ modop(char *name, char *arg, int pos, char op) if (op == MOD_REMOVE) { (void) snprintf(syscmd, sizeof (syscmd), - "%s %s modremove %s@%d\n", USR_SBIN_IFCONFIG, name, arg, - pos); + "%s %s modremove %s@%d\n", SBIN_IFCONFIG, name, arg, pos); } else if (op == MOD_INSERT) { (void) snprintf(syscmd, sizeof (syscmd), - "%s %s modinsert %s@%d\n", 
USR_SBIN_IFCONFIG, name, arg, - pos); + "%s %s modinsert %s@%d\n", SBIN_IFCONFIG, name, arg, pos); } else { rcm_log_message(RCM_ERROR, _("IP: modop(%s): unknown operation\n"), name); @@ -2277,11 +1933,11 @@ get_modlist(char *name, ip_lif_t *lif) int i; int num_mods; struct lifreq lifr; - struct str_list strlist; + struct str_list strlist = { 0 }; rcm_log_message(RCM_TRACE1, "IP: getmodlist(%s)\n", name); - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + (void) strlcpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); lifr.lifr_flags = lif->li_ifflags; if (ip_domux2fd(&mux_fd, &muxid_fd, &fd, &lifr) < 0) { rcm_log_message(RCM_ERROR, _("IP: ip_domux2fd(%s)\n"), name); @@ -2292,39 +1948,34 @@ get_modlist(char *name, ip_lif_t *lif) rcm_log_message(RCM_ERROR, _("IP: get_modlist(%s): I_LIST(%s) \n"), name, strerror(errno)); - (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); - return (-1); + goto fail; } strlist.sl_nmods = num_mods; strlist.sl_modlist = malloc(sizeof (struct str_mlist) * num_mods); - if (strlist.sl_modlist == NULL) { rcm_log_message(RCM_ERROR, _("IP: get_modlist(%s): %s\n"), name, strerror(errno)); - (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); - return (-1); + goto fail; } if (ioctl(fd, I_LIST, (caddr_t)&strlist) < 0) { rcm_log_message(RCM_ERROR, _("IP: get_modlist(%s): I_LIST error: %s\n"), name, strerror(errno)); - (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); - return (-1); + goto fail; } for (i = 0; i < strlist.sl_nmods; i++) { - lif->li_modules[i] = - malloc(strlen(strlist.sl_modlist[i].l_name)+1); + lif->li_modules[i] = strdup(strlist.sl_modlist[i].l_name); if (lif->li_modules[i] == NULL) { rcm_log_message(RCM_ERROR, _("IP: get_modlist(%s): %s\n"), name, strerror(errno)); - (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); - return (-1); + while (i > 0) + free(lif->li_modules[--i]); + goto fail; } - (void) strcpy(lif->li_modules[i], strlist.sl_modlist[i].l_name); } lif->li_modcnt = strlist.sl_nmods; @@ -2332,6 +1983,10 @@ get_modlist(char 
*name, ip_lif_t *lif) rcm_log_message(RCM_TRACE1, "IP: getmodlist(%s) success\n", name); return (ip_plink(mux_fd, muxid_fd, fd, &lifr)); +fail: + free(strlist.sl_modlist); + (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); + return (-1); } /* @@ -2436,6 +2091,7 @@ ip_plink(int mux_fd, int muxid_fd, int fd, struct lifreq *lifr) * * Notify online to IP address consumers. */ +/*ARGSUSED*/ static int ip_onlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags, rcm_info_t **depend_info) @@ -2464,6 +2120,7 @@ ip_onlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags, * * Offline IP address consumers. */ +/*ARGSUSED*/ static int ip_offlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags, rcm_info_t **depend_info) @@ -2494,9 +2151,9 @@ ip_offlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags, } /* - * ip_get_addrlist() - Compile list of IP addresses hosted on this NIC (node) - * This routine malloc() required memeory for the list - * Returns list on success, NULL if failed + * ip_get_addrlist() - Get the list of IP addresses on this interface (node); + * This routine malloc()s required memory for the list. + * Returns the list on success, NULL on failure. * Call with cache_lock held. 
*/ static char ** @@ -2504,11 +2161,9 @@ ip_get_addrlist(ip_cache_t *node) { ip_lif_t *lif; char **addrlist = NULL; - int numifs; + int i, numifs; + size_t addrlistsize; char addrstr[INET6_ADDRSTRLEN]; - void *addr; - int af; - int i; rcm_log_message(RCM_TRACE2, "IP: ip_get_addrlist(%s)\n", node->ip_resource); @@ -2532,35 +2187,21 @@ ip_get_addrlist(ip_cache_t *node) for (lif = node->ip_pif->pi_lifs, i = 0; lif != NULL; lif = lif->li_next, i++) { - af = lif->li_addr.family; - if (af == AF_INET6) { - addr = &lif->li_addr.ip6.sin6_addr; - } else if (af == AF_INET) { - addr = &lif->li_addr.ip4.sin_addr; - } else { - rcm_log_message(RCM_DEBUG, - "IP: unknown addr family %d, assuming AF_INET\n", - af); - af = AF_INET; - addr = &lif->li_addr.ip4.sin_addr; - } - if (inet_ntop(af, addr, addrstr, INET6_ADDRSTRLEN) == NULL) { - rcm_log_message(RCM_ERROR, - _("IP: inet_ntop: %s\n"), strerror(errno)); + if (!ip_addrstr(lif, addrstr, sizeof (addrstr))) { ip_free_addrlist(addrlist); return (NULL); } - if ((addrlist[i] = malloc(strlen(addrstr) + RCM_SIZE_SUNW_IP)) - == NULL) { + addrlistsize = strlen(addrstr) + sizeof (RCM_STR_SUNW_IP); + if ((addrlist[i] = malloc(addrlistsize)) == NULL) { rcm_log_message(RCM_ERROR, _("IP: ip_get_addrlist(%s) malloc failure(%s)\n"), node->ip_resource, strerror(errno)); ip_free_addrlist(addrlist); return (NULL); } - (void) strcpy(addrlist[i], RCM_STR_SUNW_IP); /* SUNW_ip/ */ - (void) strcat(addrlist[i], addrstr); /* SUNW_ip/<address> */ + (void) snprintf(addrlist[i], addrlistsize, "%s%s", + RCM_STR_SUNW_IP, addrstr); rcm_log_message(RCM_DEBUG, "Anon Address: %s\n", addrlist[i]); } @@ -2611,16 +2252,13 @@ ip_consumer_notify(rcm_handle_t *hd, datalink_id_t linkid, char **errorp, return; } /* - * Inform anonymous consumers about IP addresses being - * onlined + * Inform anonymous consumers about IP addresses being onlined. 
*/ (void) ip_onlinelist(hd, node, errorp, flags, depend_info); (void) mutex_unlock(&cache_lock); rcm_log_message(RCM_TRACE2, "IP: ip_consumer_notify success\n"); - return; - } /* @@ -2632,20 +2270,18 @@ if_configure(datalink_id_t linkid) char ifinst[MAXLINKNAMELEN]; char cfgfile[MAXPATHLEN]; char cached_name[RCM_LINK_RESOURCE_MAX]; - struct stat statbuf; + FILE *hostfp, *host6fp; ip_cache_t *node; - int af = 0; - int ipmp = 0; + boolean_t ipmp = B_FALSE; assert(linkid != DATALINK_INVALID_LINKID); - rcm_log_message(RCM_TRACE1, _("IP: if_configure(%u)\n"), linkid); /* Check for the interface in the cache */ (void) snprintf(cached_name, sizeof (cached_name), "%s/%u", RCM_LINK_PREFIX, linkid); - /* Check if the interface is new or was previously offlined */ + /* Check if the interface is new or was not previously offlined */ (void) mutex_lock(&cache_lock); if (((node = cache_lookup(NULL, cached_name, CACHE_REFRESH)) != NULL) && (!(node->ip_cachestate & CACHE_IF_OFFLINED))) { @@ -2663,76 +2299,69 @@ if_configure(datalink_id_t linkid) return (-1); } - /* Scan IPv4 configuration first */ - (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV4, ifinst); - cfgfile[MAXPATHLEN - 1] = '\0'; - + /* + * Scan the IPv4 and IPv6 hostname files to see if (a) they exist + * and (b) if either one places the interface into an IPMP group. 
+ */ + (void) snprintf(cfgfile, MAXPATHLEN, CFGFILE_FMT_IPV4, ifinst); rcm_log_message(RCM_TRACE1, "IP: Scanning %s\n", cfgfile); - if (stat(cfgfile, &statbuf) == 0) { - af |= CONFIG_AF_INET; - if (isgrouped(cfgfile)) { - ipmp++; - } + if ((hostfp = fopen(cfgfile, "r")) != NULL) { + if (isgrouped(cfgfile)) + ipmp = B_TRUE; } - /* Scan IPv6 configuration details */ - (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV6, ifinst); - cfgfile[MAXPATHLEN - 1] = '\0'; + (void) snprintf(cfgfile, MAXPATHLEN, CFGFILE_FMT_IPV6, ifinst); rcm_log_message(RCM_TRACE1, "IP: Scanning %s\n", cfgfile); - if (stat(cfgfile, &statbuf) == 0) { - af |= CONFIG_AF_INET6; - if ((ipmp == 0) && isgrouped(cfgfile)) { - ipmp++; - } + if ((host6fp = fopen(cfgfile, "r")) != NULL) { + if (!ipmp && isgrouped(cfgfile)) + ipmp = B_TRUE; } - if (af & CONFIG_AF_INET) { - if (if_ipmp_config(ifinst, CONFIG_AF_INET, ipmp) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: IPv4 Post-attach failed (%s)\n"), ifinst); - return (-1); - } + /* + * Configure the interface according to its hostname files. 
+ */ + if (hostfp != NULL && + if_config_inst(ifinst, hostfp, AF_INET, ipmp) == -1) { + rcm_log_message(RCM_ERROR, + _("IP: IPv4 Post-attach failed (%s)\n"), ifinst); + goto fail; } - if (af & CONFIG_AF_INET6) { - if (if_ipmp_config(ifinst, CONFIG_AF_INET6, ipmp) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: IPv6 Post-attach failed(%s)\n"), ifinst); - return (-1); - } + if (host6fp != NULL && + if_config_inst(ifinst, host6fp, AF_INET6, ipmp) == -1) { + rcm_log_message(RCM_ERROR, + _("IP: IPv6 Post-attach failed (%s)\n"), ifinst); + goto fail; } + (void) fclose(hostfp); + (void) fclose(host6fp); rcm_log_message(RCM_TRACE1, "IP: if_configure(%s) success\n", ifinst); - return (0); - +fail: + (void) fclose(hostfp); + (void) fclose(host6fp); + return (-1); } /* - * isgrouped() - Scans the given config file to see if this is a grouped - * interface - * Returns non-zero if true; 0 if false + * isgrouped() - Scans the given config file to see if this interface is + * using IPMP. Returns B_TRUE or B_FALSE. 
*/ -static int -isgrouped(char *cfgfile) +static boolean_t +isgrouped(const char *cfgfile) { FILE *fp; struct stat statb; - char *buf = NULL; - char *tokens[MAXARGS]; /* token pointers */ - char tspace[MAXLINE]; /* token space */ - int ntok; - int group = 0; - - if (cfgfile == NULL) - return (0); + char *nlp, *line, *token, *lasts, *buf; + boolean_t grouped = B_FALSE; rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s)\n", cfgfile); if (stat(cfgfile, &statb) != 0) { rcm_log_message(RCM_TRACE1, _("IP: No config file(%s)\n"), cfgfile); - return (0); + return (B_FALSE); } /* @@ -2744,609 +2373,284 @@ isgrouped(char *cfgfile) if (statb.st_size <= 1) { rcm_log_message(RCM_TRACE1, _("IP: Empty config file(%s)\n"), cfgfile); - return (0); + return (B_FALSE); } if ((fp = fopen(cfgfile, "r")) == NULL) { rcm_log_message(RCM_ERROR, _("IP: Cannot open configuration file(%s): %s\n"), cfgfile, strerror(errno)); - return (0); + return (B_FALSE); } - if ((buf = calloc(1, statb.st_size)) == NULL) { + if ((buf = malloc(statb.st_size)) == NULL) { rcm_log_message(RCM_ERROR, - _("IP: calloc failure(%s): %s\n"), cfgfile, + _("IP: malloc failure(%s): %s\n"), cfgfile, strerror(errno)); - (void) fclose(fp); - return (0); + goto out; } while (fgets(buf, statb.st_size, fp) != NULL) { - if (*buf == '\0') - continue; - - tokenize(buf, tokens, tspace, &ntok); - while (ntok) { - if (STREQ("group", tokens[ntok - 1])) { - if (tokens[ntok] != NULL) { - group++; - } + if ((nlp = strrchr(buf, '\n')) != NULL) + *nlp = '\0'; + + line = buf; + while ((token = strtok_r(line, " \t", &lasts)) != NULL) { + line = NULL; + if (STREQ("group", token) && + strtok_r(NULL, " \t", &lasts) != NULL) { + grouped = B_TRUE; + goto out; } - ntok--; } } - +out: free(buf); - (void) fclose(fp); - if (group <= 0) { - rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s) non-grouped\n", - cfgfile); - return (0); - } else { - rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s) grouped\n", - cfgfile); - return (1); - } -} + 
rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s): %d\n", cfgfile, + grouped); + return (grouped); +} /* - * if_ipmp_config() - Configure an interface instance as specified by the + * if_config_inst() - Configure an interface instance as specified by the * address family af and if it is grouped (ipmp). */ static int -if_ipmp_config(char *ifinst, int af, int ipmp) +if_config_inst(const char *ifinst, FILE *hfp, int af, boolean_t ipmp) { - char cfgfile[MAXPATHLEN]; /* configuration file */ - FILE *fp; + FILE *ifparsefp; struct stat statb; - char *buf; - char *tokens[MAXARGS]; /* list of config attributes */ - char tspace[MAXLINE]; /* token space */ - char syscmd[MAX_RECONFIG_SIZE + MAXPATHLEN + 1]; - char grpcmd[MAX_RECONFIG_SIZE + MAXPATHLEN + 1]; - char fstr[8]; /* address family string inet or inet6 */ - int nofailover = 0; - int newattach = 0; - int cmdvalid = 0; - int ntok; - int n; - int stdif = 0; - - if (ifinst == NULL) - return (0); + char *buf = NULL; + char *ifparsebuf = NULL; + uint_t ifparsebufsize; + const char *fstr; /* address family string */ + boolean_t stdif = B_FALSE; - rcm_log_message(RCM_TRACE1, "IP: if_ipmp_config(%s) ipmp = %d\n", + rcm_log_message(RCM_TRACE1, "IP: if_config_inst(%s) ipmp = %d\n", ifinst, ipmp); - if (af & CONFIG_AF_INET) { - (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV4, - ifinst); - (void) strcpy(fstr, "inet"); - } else if (af & CONFIG_AF_INET6) { - (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV6, - ifinst); - (void) strcpy(fstr, "inet6"); - } else { - return (0); /* nothing to do */ - } - - cfgfile[MAXPATHLEN - 1] = '\0'; - grpcmd[0] = '\0'; - - if (stat(cfgfile, &statb) != 0) { - rcm_log_message(RCM_TRACE1, - "IP: No config file(%s)\n", ifinst); - return (0); + if (fstat(fileno(hfp), &statb) != 0) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot fstat file(%s)\n"), ifinst); + goto fail; } - /* Config file exists, plumb in the physical interface */ - if (af & CONFIG_AF_INET6) { - if 
(if_getcount(AF_INET6) == 0) { - /* - * Configure software loopback driver if this is the - * first IPv6 interface plumbed - */ - newattach++; - (void) snprintf(syscmd, sizeof (syscmd), - "%s lo0 %s plumb ::1 up", USR_SBIN_IFCONFIG, fstr); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot plumb (%s) %s\n"), - ifinst, strerror(errno)); - return (-1); - } - } - (void) snprintf(syscmd, sizeof (syscmd), "%s %s %s plumb up", - USR_SBIN_IFCONFIG, ifinst, fstr); - } else { - (void) snprintf(syscmd, sizeof (syscmd), "%s %s %s plumb ", - USR_SBIN_IFCONFIG, ifinst, fstr); - if (if_getcount(AF_INET) == 0) { - newattach++; - } + switch (af) { + case AF_INET: + fstr = "inet"; + break; + case AF_INET6: + fstr = "inet6"; + break; + default: + assert(0); } - rcm_log_message(RCM_TRACE1, "IP: Exec: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot plumb (%s) %s\n"), ifinst, strerror(errno)); - return (-1); - } + /* + * The hostname file exists; plumb the physical interface. + */ + if (!ifconfig(ifinst, fstr, "plumb", B_FALSE)) + goto fail; - /* Check if config file is empty, if so, nothing else to do */ - if (statb.st_size == 0) { + /* Skip static configuration if the hostname file is empty */ + if (statb.st_size <= 1) { rcm_log_message(RCM_TRACE1, - "IP: Zero size config file(%s)\n", ifinst); - return (0); + _("IP: Zero size hostname file(%s)\n"), ifinst); + goto configured; } - if ((fp = fopen(cfgfile, "r")) == NULL) { + if (fseek(hfp, 0, SEEK_SET) == -1) { rcm_log_message(RCM_ERROR, - _("IP: Open error(%s): %s\n"), cfgfile, strerror(errno)); - return (-1); + _("IP: Cannot rewind hostname file(%s): %s\n"), ifinst, + strerror(errno)); + goto fail; } + /* + * Allocate the worst-case single-line buffer sizes. A bit skanky, + * but since hostname files are small, this should suffice. 
+ */ if ((buf = calloc(1, statb.st_size)) == NULL) { rcm_log_message(RCM_ERROR, _("IP: calloc(%s): %s\n"), ifinst, strerror(errno)); - (void) fclose(fp); - return (-1); + goto fail; } - /* a single line with one token implies a classical if */ - if (fgets(buf, statb.st_size, fp) != NULL) { - tokenize(buf, tokens, tspace, &ntok); - if (ntok == 1) { - rcm_log_message(RCM_TRACE1, "IP: Standard interface\n"); - stdif++; - } - } - if (fseek(fp, 0L, SEEK_SET) == -1) { - rcm_log_message(RCM_ERROR, _("IP: fseek: %s\n"), - strerror(errno)); - return (-1); + ifparsebufsize = statb.st_size + sizeof (SBIN_IFPARSE " -s inet6 "); + if ((ifparsebuf = calloc(1, ifparsebufsize)) == NULL) { + rcm_log_message(RCM_ERROR, + _("IP: calloc(%s): %s\n"), ifinst, strerror(errno)); + goto fail; } /* - * Process the config command - * This loop also handles multiple logical interfaces that may - * be configured on a single line + * For IPv4, determine whether the hostname file consists of a single + * line. We need to handle these specially since they should + * automatically be suffixed with "netmask + broadcast + up". */ - while (fgets(buf, statb.st_size, fp) != NULL) { - nofailover = 0; - cmdvalid = 0; + if (af == AF_INET && + fgets(buf, statb.st_size, hfp) != NULL && + fgets(buf, statb.st_size, hfp) == NULL) { + rcm_log_message(RCM_TRACE1, "IP: one-line hostname file\n"); + stdif = B_TRUE; + } - if (*buf == '\0') - continue; + if (fseek(hfp, 0L, SEEK_SET) == -1) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot rewind hostname file(%s): %s\n"), ifinst, + strerror(errno)); + goto fail; + } - tokenize(buf, tokens, tspace, &ntok); - if (ntok <= 0) + /* + * Loop through the file one line at a time and feed it to ifconfig. + * If the interface is using IPMP, then we use /sbin/ifparse -s to + * weed out all of the data addresses, since those are already on the + * IPMP meta-interface. 
+ */ + while (fgets(buf, statb.st_size, hfp) != NULL) { + if (ntok(buf) == 0) continue; - /* Reset the config command */ - (void) snprintf(syscmd, sizeof (syscmd), "%s %s %s ", - USR_SBIN_IFCONFIG, ifinst, fstr); - - /* No parsing if this is first interface of its kind */ - if (newattach) { - (void) strcat(syscmd, buf); - /* Classic if */ - if ((af & CONFIG_AF_INET) && (stdif == 1)) { - (void) strcat(syscmd, CFG_CMDS_STD); - } - rcm_log_message(RCM_TRACE1, "IP: New: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Error: %s (%s): %s\n"), - syscmd, ifinst, strerror(errno)); - } + if (!ipmp) { + (void) ifconfig(ifinst, fstr, buf, stdif); continue; } - /* Parse the tokens to determine nature of the interface */ - for (n = 0; n < ntok; n++) { - /* Handle pathological failover cases */ - if (STREQ("-failover", tokens[n])) - nofailover++; - if (STREQ("failover", tokens[n])) - nofailover--; - - /* group attribute requires special processing */ - if (STREQ("group", tokens[n])) { - if (tokens[n + 1] != NULL) { - (void) snprintf(grpcmd, sizeof (grpcmd), - "%s %s %s %s %s", USR_SBIN_IFCONFIG, - ifinst, fstr, - tokens[n], tokens[n + 1]); - n++; /* skip next token */ - continue; - } - } - - /* Execute buffered command ? */ - if (STREQ("set", tokens[n]) || - STREQ("addif", tokens[n]) || - STREQ("removeif", tokens[n]) || - (n == (ntok -1))) { - - /* config command complete ? */ - if (n == (ntok -1)) { - ADDSPACE(syscmd); - (void) strcat(syscmd, tokens[n]); - cmdvalid++; - } - - if (!cmdvalid) { - ADDSPACE(syscmd); - (void) strcat(syscmd, tokens[n]); - cmdvalid++; - continue; - } - /* Classic if ? 
*/ - if ((af & CONFIG_AF_INET) && (stdif == 1)) { - (void) strcat(syscmd, CFG_CMDS_STD); - } - - if (nofailover > 0) { - rcm_log_message(RCM_TRACE1, - "IP: Interim exec: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: %s fail(%s): %s\n"), - syscmd, ifinst, - strerror(errno)); - } - } else { - /* Have mpathd configure the address */ - if (if_mpathd_configure(syscmd, ifinst, - af, ipmp) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: %s fail(%s): %s\n"), - syscmd, ifinst, - strerror(errno)); - } - } - - /* Reset config command */ - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s %s ", USR_SBIN_IFCONFIG, ifinst, - fstr); - nofailover = 0; - cmdvalid = 0; - } - /* - * Note: No explicit command validation is required - * since ifconfig to does it for us - */ - ADDSPACE(syscmd); - (void) strcat(syscmd, tokens[n]); - cmdvalid++; - } - } - - free(buf); - (void) fclose(fp); - - /* - * The group name needs to be set after all the test/nofailover - * addresses have been configured. Otherwise, if IPMP detects that the - * interface is failed, the addresses will be moved to a working - * interface before the '-failover' flag can be set. 
- */ - if (grpcmd[0] != '\0') { - rcm_log_message(RCM_TRACE1, "IP: set group name: %s\n", grpcmd); - if (rcm_exec_cmd(grpcmd) != 0) { - rcm_log_message(RCM_ERROR, _("IP: %s fail(%s): %s\n"), - grpcmd, ifinst, strerror(errno)); + (void) snprintf(ifparsebuf, ifparsebufsize, SBIN_IFPARSE + " -s %s %s", fstr, buf); + if ((ifparsefp = popen(ifparsebuf, "r")) == NULL) { + rcm_log_message(RCM_ERROR, + _("IP: cannot configure %s: popen \"%s\" " + "failed: %s\n"), ifinst, buf, strerror(errno)); + goto fail; } - } - rcm_log_message(RCM_TRACE1, "IP: if_ipmp_config(%s) success\n", ifinst); - - return (0); -} - -/* - * if_mpathd_configure() - Determine configuration disposition of the interface - */ -static int -if_mpathd_configure(char *syscmd, char *ifinst, int af, int ipmp) -{ - char *tokens[MAXARGS]; - char tspace[MAXLINE]; - int ntok; - char *addr; - char *from_lifname; - mpathd_cmd_t mpdcmd; - int n; - - rcm_log_message(RCM_TRACE1, "IP: if_mpathd_configure(%s): %s\n", - ifinst, syscmd); - - tokenize(syscmd, tokens, tspace, &ntok); - if (ntok <= 0) - return (0); - - addr = tokens[3]; /* by default, third token is valid address */ - for (n = 0; n < ntok; n++) { - if (STREQ("set", tokens[n]) || - STREQ("addif", tokens[n])) { - addr = tokens[n+1]; - if (addr == NULL) { /* invalid format */ - return (-1); - } else - break; + while (fgets(buf, statb.st_size, ifparsefp) != NULL) { + if (ntok(buf) > 0) + (void) ifconfig(ifinst, fstr, buf, stdif); } - } - /* Check std. 
commands or no failed over address */ - if (STREQ("removeif", addr) || STREQ("group", addr) || - ((from_lifname = get_mpathd_dest(addr, af)) == NULL)) { - rcm_log_message(RCM_TRACE1, - "IP: No failed-over host, exec %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { + if (pclose(ifparsefp) == -1) { rcm_log_message(RCM_ERROR, - _("IP: %s failed(%s): %s\n"), - syscmd, ifinst, strerror(errno)); - return (-1); + _("IP: cannot configure %s: pclose \"%s\" " + "failed: %s\n"), ifinst, buf, strerror(errno)); + goto fail; } - return (0); - } - - /* Check for non-IPMP failover scenarios */ - if ((ipmp <= 0) && (from_lifname != NULL)) { - /* Address already hosted on another NIC, return */ - rcm_log_message(RCM_TRACE1, - "IP: Non-IPMP failed-over host(%s): %s\n", - ifinst, addr); - return (0); } +configured: /* - * Valid failed-over host; have mpathd set the original index + * Bring up the interface (it may already be up) + * + * Technically, since the boot scripts only unconditionally bring up + * IPv6 interfaces, we should only unconditionally bring up IPv6 here. + * However, if we don't bring up IPv4, and a legacy IPMP configuration + * without test addresses is being used, we will never bring the + * interface up even though we would've at boot. One fix is to check + * if the IPv4 hostname file contains data addresses that we would've + * brought up, but there's no simple way to do that. Given that it's + * rare to have persistent IP configuration for an interface that + * leaves it down, we cheap out and always bring it up for IPMP. 
*/ - mpdcmd.cmd_command = MI_SETOINDEX; - (void) strcpy(mpdcmd.from_lifname, from_lifname); - (void) strcpy(mpdcmd.to_pifname, ifinst); - if (af & CONFIG_AF_INET6) { - mpdcmd.addr_family = AF_INET6; - } else { - mpdcmd.addr_family = AF_INET; - } - - /* Send command to in.mpathd(1M) */ - rcm_log_message(RCM_TRACE1, - "IP: Attempting setoindex from (%s) to (%s) ....\n", - from_lifname, ifinst); - - if (mpathd_send_cmd(&mpdcmd) < 0) { - rcm_log_message(RCM_TRACE1, - "IP: mpathd set original index unsuccessful: %s\n", - strerror(errno)); - return (-1); - } - - rcm_log_message(RCM_TRACE1, - "IP: setoindex success (%s) to (%s)\n", - from_lifname, ifinst); - - return (0); -} - -/* - * get_mpathd_dest() - Return current destination for lif; caller is - * responsible to free memory allocated for address - */ -static char * -get_mpathd_dest(char *addr, int family) -{ - int sock; - char *buf; - struct lifnum lifn; - struct lifconf lifc; - struct lifreq *lifrp; - sa_family_t af = AF_INET; /* IPv4 by default */ - int i; - struct lifreq lifreq; - struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; - struct hostent *hp; - char *ifname = NULL; - char *prefix = NULL; - char addrstr[INET6_ADDRSTRLEN]; - char ifaddr[INET6_ADDRSTRLEN]; - int err; - - if (addr == NULL) { - return (NULL); - } - - rcm_log_message(RCM_TRACE2, "IP: get_mpathd_dest(%s)\n", addr); - - if (family & CONFIG_AF_INET6) { - af = AF_INET6; - } else { - af = AF_INET; - } - - if ((sock = socket(af, SOCK_DGRAM, 0)) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: failure opening %s socket: %s\n"), - af == AF_INET6 ? 
"IPv6" : "IPv4", strerror(errno)); - return (NULL); - } - - lifn.lifn_family = af; - lifn.lifn_flags = 0; - if (ioctl(sock, SIOCGLIFNUM, (char *)&lifn) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCLGIFNUM failed: %s\n"), - strerror(errno)); - (void) close(sock); - return (NULL); - } - - if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) { - rcm_log_message(RCM_ERROR, _("IP: calloc: %s\n"), - strerror(errno)); - (void) close(sock); - return (NULL); - } - - lifc.lifc_family = af; - lifc.lifc_flags = 0; - lifc.lifc_len = sizeof (struct lifreq) * lifn.lifn_count; - lifc.lifc_buf = buf; - - if (ioctl(sock, SIOCGLIFCONF, (char *)&lifc) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFCONF failed: %s\n"), - strerror(errno)); - free(buf); - (void) close(sock); - return (NULL); - } + if ((af == AF_INET6 || ipmp) && !ifconfig(ifinst, fstr, "up", B_FALSE)) + goto fail; - /* Filter out prefix address from netmask */ - (void) strcpy(ifaddr, addr); - if ((prefix = strchr(ifaddr, '/')) != NULL) { - *prefix = '\0'; /* We care about the address part only */ - } + /* + * For IPv4, if a DHCP configuration file exists, have DHCP configure + * the interface. As with the boot scripts, this is done after the + * hostname files are processed so that configuration in those files + * (such as IPMP group names) will be applied first. 
+ */ + if (af == AF_INET) { + char dhcpfile[MAXPATHLEN]; + char *dhcpbuf; + off_t i, dhcpsize; - /* Check for aliases */ - hp = getipnodebyname(ifaddr, af, AI_DEFAULT, &err); - if (hp) { - if (inet_ntop(af, (void *)hp->h_addr_list[0], - ifaddr, sizeof (ifaddr)) == NULL) { - /* Restore original address and use it */ - (void) strcpy(ifaddr, addr); - if ((prefix = strchr(ifaddr, '/')) != NULL) { - *prefix = '\0'; - } - } - freehostent(hp); - } - rcm_log_message(RCM_TRACE2, "IP: ifaddr(%s) = %s\n", addr, ifaddr); + (void) snprintf(dhcpfile, MAXPATHLEN, DHCPFILE_FMT, ifinst); + if (stat(dhcpfile, &statb) == -1) + goto out; - /* now search the interfaces */ - lifrp = lifc.lifc_req; - for (i = 0; i < lifn.lifn_count; i++, lifrp++) { - (void) strcpy(lifreq.lifr_name, lifrp->lifr_name); - /* Get the interface address for this interface */ - if (ioctl(sock, SIOCGLIFADDR, (char *)&lifreq) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFADDR: %s\n"), strerror(errno)); - free(buf); - (void) close(sock); - return (NULL); - } - - if (af == AF_INET6) { - sin6 = (struct sockaddr_in6 *)&lifreq.lifr_addr; - if (inet_ntop(AF_INET6, (void *)&sin6->sin6_addr, - addrstr, sizeof (addrstr)) == NULL) { - continue; - } - } else { - sin = (struct sockaddr_in *)&lifreq.lifr_addr; - if (inet_ntop(AF_INET, (void *)&sin->sin_addr, - addrstr, sizeof (addrstr)) == NULL) { - continue; - } + if ((dhcpbuf = copylist(dhcpfile, &dhcpsize)) == NULL) { + rcm_log_message(RCM_ERROR, _("IP: cannot read " + "(%s): %s\n"), dhcpfile, strerror(errno)); + goto fail; } - if (STREQ(addrstr, ifaddr)) { - /* Allocate memory to hold interface name */ - if ((ifname = (char *)malloc(LIFNAMSIZ)) == NULL) { - rcm_log_message(RCM_ERROR, - _("IP: malloc: %s\n"), strerror(errno)); - free(buf); - (void) close(sock); - return (NULL); - } - - /* Copy the interface name */ - /* - * (void) memcpy(ifname, lifrp->lifr_name, - * sizeof (ifname)); - * ifname[sizeof (ifname) - 1] = '\0'; - */ - (void) strcpy(ifname, 
lifrp->lifr_name); - break; + /* + * The copylist() API converts \n's to \0's, but we want them + * to be spaces. + */ + if (dhcpsize > 0) { + for (i = 0; i < dhcpsize; i++) + if (dhcpbuf[i] == '\0') + dhcpbuf[i] = ' '; + dhcpbuf[dhcpsize - 1] = '\0'; } + (void) ifconfig(ifinst, CFG_DHCP_CMD, dhcpbuf, B_FALSE); + free(dhcpbuf); } - - (void) close(sock); +out: + free(ifparsebuf); free(buf); - - if (ifname == NULL) - rcm_log_message(RCM_TRACE2, "IP: get_mpathd_dest(%s): none\n", - addr); - else - rcm_log_message(RCM_TRACE2, "IP: get_mpathd_dest(%s): %s\n", - addr, ifname); - - return (ifname); -} - -static int -if_getcount(int af) -{ - int sock; - struct lifnum lifn; - - rcm_log_message(RCM_TRACE1, "IP: if_getcount\n"); - - if ((sock = socket(af, SOCK_DGRAM, 0)) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: failure opening %s socket: %s\n"), - af == AF_INET6 ? "IPv6" : "IPv4", strerror(errno)); - return (-1); - } - - lifn.lifn_family = af; - lifn.lifn_flags = 0; - if (ioctl(sock, SIOCGLIFNUM, (char *)&lifn) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCLGIFNUM failed: %s\n"), - strerror(errno)); - (void) close(sock); - return (-1); - } - (void) close(sock); - - rcm_log_message(RCM_TRACE1, "IP: if_getcount success: %d\n", - lifn.lifn_count); - - return (lifn.lifn_count); + rcm_log_message(RCM_TRACE1, "IP: if_config_inst(%s) success\n", ifinst); + return (0); +fail: + free(ifparsebuf); + free(buf); + rcm_log_message(RCM_ERROR, "IP: if_config_inst(%s) failure\n", ifinst); + return (-1); } /* - * tokenize() - turn a command line into tokens; caller is responsible to - * provide enough memory to hold all tokens + * ntok() - count the number of tokens in the provided buffer. 
*/ -static void -tokenize(char *line, char **tokens, char *tspace, int *ntok) +static uint_t +ntok(const char *cp) { - char *cp; - char *sp; + uint_t ntok = 0; - sp = tspace; - cp = line; - for (*ntok = 0; *ntok < MAXARGS; (*ntok)++) { - tokens[*ntok] = sp; + for (;;) { while (ISSPACE(*cp)) cp++; + if (ISEOL(*cp)) break; + do { - *sp++ = *cp++; + cp++; } while (!ISSPACE(*cp) && !ISEOL(*cp)); - *sp++ = '\0'; + ntok++; + } + return (ntok); +} + +static boolean_t +ifconfig(const char *ifinst, const char *fstr, const char *buf, boolean_t stdif) +{ + char syscmd[MAX_RECONFIG_SIZE + MAXPATHLEN + 1]; + int status; + + (void) snprintf(syscmd, sizeof (syscmd), SBIN_IFCONFIG " %s %s %s", + ifinst, fstr, buf); + + if (stdif) + (void) strlcat(syscmd, CFG_CMDS_STD, sizeof (syscmd)); + + rcm_log_message(RCM_TRACE1, "IP: Exec: %s\n", syscmd); + if ((status = rcm_exec_cmd(syscmd)) != 0) { + if (WIFEXITED(status)) { + rcm_log_message(RCM_ERROR, _("IP: \"%s\" failed with " + "exit status %d\n"), syscmd, WEXITSTATUS(status)); + } else { + rcm_log_message(RCM_ERROR, _("IP: Error: %s: %s\n"), + syscmd, strerror(errno)); + } + return (B_FALSE); } + return (B_TRUE); } diff --git a/usr/src/cmd/svc/milestone/net-init b/usr/src/cmd/svc/milestone/net-init index 26b295dce9..7f0804af67 100644 --- a/usr/src/cmd/svc/milestone/net-init +++ b/usr/src/cmd/svc/milestone/net-init @@ -20,11 +20,9 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # This is the second phase of TCP/IP configuration. The first part is # run by the svc:/network/physical service and includes configuring the # interfaces and setting the machine's hostname. 
The svc:/network/initial @@ -52,10 +50,11 @@ if [ -f /etc/inet/ipaddrsel.conf ]; then fi # -# Now that /usr is mounted, see if in.mpathd needs to be started by firing it -# up in "adopt" mode; if there are no interfaces it needs to manage, it will -# automatically exit. Note that it may already be running if we're not -# executing as part of system boot. +# If explicit IPMP groups are being used, in.mpathd will already be started. +# However, if TRACK_INTERFACES_ONLY_WITH_GROUPS=no and no explicit IPMP +# groups have been configured, then it still needs to be started. So, fire +# it up in "adopt" mode; if there are no interfaces it needs to manage, it +# will automatically exit. # /usr/bin/pgrep -x -u 0 -z `smf_zonename` in.mpathd >/dev/null 2>&1 || \ /usr/lib/inet/in.mpathd -a diff --git a/usr/src/cmd/svc/milestone/net-loopback b/usr/src/cmd/svc/milestone/net-loopback index 3bd5a0f525..d07afd4ada 100644 --- a/usr/src/cmd/svc/milestone/net-loopback +++ b/usr/src/cmd/svc/milestone/net-loopback @@ -20,10 +20,9 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" . /lib/svc/share/smf_include.sh @@ -36,14 +35,6 @@ smf_configure_ip || exit $SMF_EXIT_OK # -# Cause ifconfig to not automatically start in.mpathd when IPMP groups are -# configured. This is not strictly necessary but makes it so that in.mpathd -# will always be started explicitly from /lib/svc/method/net-init (the -# svc:/network/initial service), when we're sure that /usr is mounted. -# -SUNW_NO_MPATHD=; export SUNW_NO_MPATHD - -# # Before any interfaces are configured, we need to set the system # default IP forwarding behavior. 
This will be the setting for # interfaces that don't modify the per-interface setting with the diff --git a/usr/src/cmd/svc/milestone/net-physical b/usr/src/cmd/svc/milestone/net-physical index 8530806768..bc74c2a206 100644 --- a/usr/src/cmd/svc/milestone/net-physical +++ b/usr/src/cmd/svc/milestone/net-physical @@ -20,7 +20,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T. @@ -38,22 +38,9 @@ # smf_configure_ip || exit $SMF_EXIT_OK -# Print warnings to console -warn_failed_ifs() { - echo "Failed to $1 interface(s): $2" >/dev/msglog -} - # Make sure that the libraries essential to this stage of booting can be found. LD_LIBRARY_PATH=/lib; export LD_LIBRARY_PATH -# -# Cause ifconfig to not automatically start in.mpathd when IPMP groups are -# configured. This is not strictly necessary but makes it so that in.mpathd -# will always be started explicitly from /etc/init.d/inetinit, when we're -# sure that /usr is mounted. 
-# -SUNW_NO_MPATHD=; export SUNW_NO_MPATHD - smf_netstrategy if smf_is_globalzone; then @@ -127,13 +114,18 @@ if [ "$interface_names" != "/etc/hostname.*[0-9]" ]; then IFS="$ORIGIFS" while [ $# -ge 2 ]; do shift - if [ $# -gt 1 -a "$2" != "/etc/hostname" ]; then - while [ $# -gt 1 -a "$1" != "/etc/hostname" ]; do - shift - done - else - inet_list="$inet_list $1" + intf_name=$1 + while [ $# -gt 1 -a "$2" != "/etc/hostname" ]; do + intf_name="$intf_name.$2" shift + done + shift + + read one rest < /etc/hostname.$intf_name + if [ "$one" = ipmp ]; then + ipmp_list="$ipmp_list $intf_name" + else + inet_list="$inet_list $intf_name" fi done fi @@ -151,17 +143,38 @@ if [ "$interface_names" != "/etc/hostname6.*[0-9]" ]; then IFS="$ORIGIFS" while [ $# -ge 2 ]; do shift - if [ $# -gt 1 -a "$2" != "/etc/hostname6" ]; then - while [ $# -gt 1 -a "$1" != "/etc/hostname6" ]; do - shift - done - else - inet6_list="$inet6_list $1" + intf_name=$1 + while [ $# -gt 1 -a "$2" != "/etc/hostname6" ]; do + intf_name="$intf_name.$2" shift + done + shift + + read one rest < /etc/hostname6.$intf_name + if [ "$one" = ipmp ]; then + ipmp6_list="$ipmp6_list $intf_name" + else + inet6_list="$inet6_list $intf_name" fi done fi +# +# Create all of the IPv4 IPMP interfaces. +# +if [ -n "$ipmp_list" ]; then + set -- $ipmp_list + while [ $# -gt 0 ]; do + if /sbin/ifconfig $1 ipmp; then + ipmp_created="$ipmp_created $1" + else + ipmp_failed="$ipmp_failed $1" + fi + shift + done + [ -n "$ipmp_failed" ] && warn_failed_ifs "create IPv4 IPMP" \ + "$ipmp_failed" +fi # # Step through the IPv4 interface list and try to plumb every interface. 
@@ -178,7 +191,7 @@ if [ -n "$inet_list" ]; then fi shift done - [ -n "$inet_failed" ] && warn_failed_ifs "plumb IPv4" $inet_failed + [ -n "$inet_failed" ] && warn_failed_ifs "plumb IPv4" "$inet_failed" fi # Run autoconf to connect to a WLAN if the interface is a wireless one @@ -209,7 +222,24 @@ if [ -n "$inet6_list" ]; then fi shift done - [ -n "$inet6_failed" ] && warn_failed_ifs "plumb IPv6" $inet6_failed + [ -n "$inet6_failed" ] && warn_failed_ifs "plumb IPv6" "$inet6_failed" +fi + +# +# Create all of the IPv6 IPMP interfaces. +# +if [ -n "$ipmp6_list" ]; then + set -- $ipmp6_list + while [ $# -gt 0 ]; do + if /sbin/ifconfig $1 inet6 ipmp; then + ipmp6_created="$ipmp6_created $1" + else + ipmp6_failed="$ipmp6_failed $1" + fi + shift + done + [ -n "$ipmp6_failed" ] && warn_failed_ifs "create IPv6 IPMP" \ + "$ipmp6_failed" fi if smf_is_globalzone; then @@ -224,49 +254,24 @@ if smf_is_globalzone; then fi # -# Process the /etc/hostname.* files of plumbed IPv4 interfaces. If an -# /etc/hostname file is not present or is empty, the ifconfig auto-dhcp -# / auto-revarp command will attempt to set the address, later. +# Process the /etc/hostname[6].* files for IPMP interfaces. Processing these +# before non-IPMP interfaces avoids accidental implicit IPMP group creation. +# +[ -n "$ipmp_created" ] && if_configure inet "IPMP" $ipmp_created +[ -n "$ipmp6_created" ] && if_configure inet6 "IPMP" $ipmp6_created + # -# If /etc/hostname.lo0 exists the loop below will do additional -# configuration of lo0. +# Process the /etc/hostname[6].* files for non-IPMP interfaces. # -if [ -n "$inet_plumbed" ]; then - i4s_fail= - echo "configuring IPv4 interfaces:\c" - set -- $inet_plumbed - while [ $# -gt 0 ]; do - inet_process_hostname /sbin/ifconfig $1 inet \ - </etc/hostname.$1 >/dev/null - [ $? != 0 ] && i4s_fail="$i4s_fail $1" - echo " $1\c" - shift - done - echo "." 
- [ -n "$i4s_fail" ] && warn_failed_ifs "configure IPv4" $i4s_fail -fi +[ -n "$inet_plumbed" ] && if_configure inet "" $inet_plumbed +[ -n "$inet6_plumbed" ] && if_configure inet6 "" $inet6_plumbed # -# Process the /etc/hostname6.* files of plumbed IPv6 interfaces. After -# processing the hostname6 file, bring the interface up. If -# /etc/hostname6.lo0 exists the loop below will do additional -# configuration of lo0. +# For the IPv4 and IPv6 interfaces that failed to plumb, find (or create) +# IPMP meta-interfaces to host their data addresses. # -if [ -n "$inet6_plumbed" ]; then - i6_fail= - echo "configuring IPv6 interfaces:\c" - set -- $inet6_plumbed - while [ $# -gt 0 ]; do - inet6_process_hostname /sbin/ifconfig $1 inet6 \ - </etc/hostname6.$1 >/dev/null && - /sbin/ifconfig $1 inet6 up - [ $? != 0 ] && i6_fail="$i6_fail $1" - echo " $1\c" - shift - done - echo "." - [ -n "$i6_fail" ] && warn_failed_ifs "configure IPv6" $i6_fail -fi +[ -n "$inet_failed" ] && move_addresses inet +[ -n "$inet6_failed" ] && move_addresses inet6 # Run DHCP if requested. Skip boot-configured interface. interface_names="`echo /etc/dhcp.*[0-9] 2>/dev/null`" @@ -326,7 +331,7 @@ if [ "$interface_names" != '/etc/dhcp.*[0-9]' ]; then done IFS="$ORIGIFS" unset ORIGIFS - [ -n "$i4d_fail" ] && warn_failed_ifs "configure IPv4 DHCP" $i4d_fail + [ -n "$i4d_fail" ] && warn_failed_ifs "configure IPv4 DHCP" "$i4d_fail" fi # In order to avoid bringing up the interfaces that have @@ -338,14 +343,6 @@ if [ "$_INIT_NET_STRATEGY" = "rarp" -o -z "$hostname" ]; then fi # -# Process IPv4 and IPv6 interfaces that failed to plumb. Find an -# alternative interface to host the addresses. -# -[ -n "$inet_failed" ] && move_addresses inet - -[ -n "$inet6_failed" ] && move_addresses inet6 - -# # If the /etc/defaultrouter file exists, process it now so that the next # stage of booting will have access to NFS. 
# diff --git a/usr/src/cmd/svc/shell/net_include.sh b/usr/src/cmd/svc/shell/net_include.sh index 51c87a40a8..71dc6a8256 100644 --- a/usr/src/cmd/svc/shell/net_include.sh +++ b/usr/src/cmd/svc/shell/net_include.sh @@ -20,13 +20,18 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T. # All rights reserved. # +# Print warnings to console +warn_failed_ifs() { + echo "Failed to $1 interface(s):$2" >/dev/msglog +} + # # shcat file # Simulates cat in sh so it doesn't need to be on the root filesystem. @@ -41,20 +46,28 @@ shcat() { } # -# Inet_list, list of IPv4 interfaces. -# Inet_plumbed, list of plumbed IPv4 interfaces. -# Inet_failed, list of IPv4 interfaces that failed to plumb. -# Inet6_list, list of IPv6 interfaces. -# Inet6_plumbed, list of plumbed IPv6 interfaces. -# Inet6_failed, list of IPv6 interfaces that failed to plumb. +# inet_list list of IPv4 interfaces. +# inet6_list list of IPv6 interfaces. +# ipmp_list list of IPMP IPv4 interfaces. +# ipmp6_list list of IPMP IPv6 interfaces. +# inet_plumbed list of plumbed IPv4 interfaces. +# inet6_plumbed list of plumbed IPv6 interfaces. +# ipmp_created list of created IPMP IPv4 interfaces. +# ipmp6_created list of created IPMP IPv6 interfaces. +# inet_failed list of IPv4 interfaces that failed to plumb. +# inet6_failed list of IPv6 interfaces that failed to plumb. +# ipmp_failed list of IPMP IPv4 interfaces that failed to be created. +# ipmp6_failed list of IPMP IPv6 interfaces that failed to be created. # unset inet_list inet_plumbed inet_failed \ - inet6_list inet6_plumbed inet6_failed + inet6_list inet6_plumbed inet6_failed \ + ipmp_list ipmp_created ipmp_failed \ + ipmp6_list ipmp6_created ipmp6_failed + # # get_physical interface # -# Return physical interface corresponding to the given logical -# interface. 
+# Return physical interface corresponding to the given interface. # get_physical() { @@ -70,7 +83,7 @@ get_physical() # get_logical interface # # Return logical interface number. Zero will be returned -# if there is no explicit logical device number. +# if there is no explicit logical number. # get_logical() { @@ -89,19 +102,18 @@ get_logical() # # if_comp if1 if2 # -# Compare Interfaces. Do the physical interface names and logical interface +# Compare interfaces. Do the physical interface names and logical interface # numbers match? # if_comp() { - [ "`get_physical $1`" = "`get_physical $2`" ] && \ - [ `get_logical $1` -eq `get_logical $2` ] + physical_comp $1 $2 && [ `get_logical $1` -eq `get_logical $2` ] } - + # # physical_comp if1 if2 # -# Do the two devices share a physical interface? +# Do the two interfaces share a physical interface? # physical_comp() { @@ -129,19 +141,110 @@ in_list() } # -# get_group_from_hostname interface type +# get_inactive_ifname groupname +# +# Return the name of an inactive interface in `groupname', if one exists. +# +get_inactive_ifname() +{ + ORIGIFS="$IFS" + /sbin/ipmpstat -gP -o groupname,interfaces | + while IFS=: read groupname ifnames; do + # + # Skip other IPMP groups. + # + [ "$groupname" != "$1" ] && continue + + # + # Standby interfaces are always enclosed in ()'s, so look + # for the first interface name starting with a "(", and + # strip those off. + # + IFS=" " + for ifname in $ifnames; do + case "$ifname" in + '('*) IFS="()" + echo $ifname + IFS="$ORIGIFS" + return + ;; + *) ;; + esac + done + done + IFS="$ORIGIFS" +} + +# +# get_groupifname groupname +# +# Return the IPMP meta-interface name for the group, if it exists. 
+# +get_groupifname() +{ + /sbin/ipmpstat -gP -o groupname,group | while IFS=: read name ifname; do + if [ "$name" = "$1" ]; then + echo "$ifname" + return + fi + done +} + +# +# create_ipmp ifname groupname type +# +# Helper function for create_groupifname() that returns zero if it's able +# to create an IPMP interface of the specified type and place it in the +# specified group, or non-zero otherwise. +# +create_ipmp() +{ + /sbin/ifconfig $1 >/dev/null 2>&1 && return 1 + /sbin/ifconfig $1 inet6 >/dev/null 2>&1 && return 1 + /sbin/ifconfig $1 $3 ipmp group $2 2>/dev/null +} + +# +# create_groupifname groupname type +# +# Create an IPMP meta-interface name for the group. We only use this +# function if all of the interfaces in the group failed at boot and there +# were no /etc/hostname[6].<if> files for the IPMP meta-interface. +# +create_groupifname() +{ + # + # This is a horrible way to count from 0 to 999, but in sh and + # without necessarily having /usr mounted, what else can we do? + # + for a in "" 1 2 3 4 5 6 7 8 9; do + for b in 0 1 2 3 4 5 6 7 8 9; do + for c in 0 1 2 3 4 5 6 7 8 9; do + # strip leading zeroes + [ "$a" = "" ] && [ "$b" = 0 ] && b="" + if create_ipmp ipmp$a$b$c $1 $2; then + echo ipmp$a$b$c + return + fi + done + done + done +} + +# +# get_hostname_ipmpinfo interface type # -# Return all group settings from hostname file for a given interface. +# Return all requested IPMP keywords from hostname file for a given interface. # # Example: -# get_group_from_hostname hme0 inet +# get_hostname_ipmpinfo hme0 inet keyword [ keyword ... 
] # -get_group_from_hostname() +get_hostname_ipmpinfo() { case "$2" in - inet) file=/etc/hostname.$1 + inet) file=/etc/hostname.$1 ;; - inet6) file=/etc/hostname6.$1 + inet6) file=/etc/hostname6.$1 ;; *) return @@ -150,16 +253,21 @@ get_group_from_hostname() [ -r "$file" ] || return + type=$2 + shift 2 + # - # Read through the hostname file looking for group settings - # There may be several group settings in the file. It is up - # to the caller to pick the right one (i.e. the last one). + # Read through the hostname file looking for the specified + # keywords. Since there may be several keywords that cancel + # each other out, the caller must post-process as appropriate. # while read line; do [ -z "$line" ] && continue - /sbin/ifparse -s "$2" $line - done < "$file" | while read one two three; do - [ "$one" = "group" ] && echo "$two" + /sbin/ifparse -s "$type" $line + done < "$file" | while read one two; do + for keyword in "$@"; do + [ "$one" = "$keyword" ] && echo "$one $two" + done done } @@ -174,7 +282,6 @@ get_group_from_hostname() get_group_for_type() { physical=`get_physical $1` - type=$2 group="" @@ -183,184 +290,77 @@ get_group_for_type() # the reason for the second while loop. # shift 2 - while [ $# -gt 0 ]; do - if if_comp "$physical" $1; then - get_group_from_hostname $1 $type + for ifname in "$@"; do + if if_comp "$physical" $ifname; then + get_hostname_ipmpinfo $ifname $type group fi - shift done | while :; do - read next || { + read keyword grname || { echo "$group" break } - group="$next" + group="$grname" done } # -# get_group interface [ configured | failed ] -# -# If there is both an inet and inet6 version of an interface, the group -# could be set in either set of hostname files. -# -# Inet6 is configured after inet, so if the group is set in both -# sets of hostname files, the inet6 file wins. -# -# The "configured" argument should be used to get the group for -# an interface that has been plumbed into the stack and configured. 
Use -# the "failed" argument to get the group for an interface that failed to -# plumb. -# -get_group() -{ - group="" - - case "$2" in - configured) - group=`get_group_for_type $1 inet6 $inet6_plumbed` - ;; - failed) - group=`get_group_for_type $1 inet6 $inet6_list` - ;; - *) - return - ;; - esac - - if [ -z "$group" ]; then - if [ "$2" = configured ]; then - group=`get_group_for_type $1 inet $inet_plumbed` - else - group=`get_group_for_type $1 inet $inet_list` - fi - fi - - echo $group -} - -# -# get_standby_from_hostname interface type -# -# Return any "standby" or "-standby" flags in the hostname file. -# -# Example: -# get_standby_from_hostname hme0 inet6 -# -# -get_standby_from_hostname() -{ - case "$2" in - inet) file=/etc/hostname.$1 - ;; - inet6) file=/etc/hostname6.$1 - ;; - *) - return - ;; - esac - - [ -r "$file" ] || return - - # - # There may be several instances of the "standby" and - # "-standby" flags in the hostname file. It is up to - # the caller to pick the correct one. - # - while read line; do - [ -z "$line" ] && continue - /sbin/ifparse -s "$2" $line - done < "$file" | while read one two; do - [ "$one" = "standby" ] || [ "$one" = "-standby" ] \ - && echo "$one" - done -} - -# -# get_standby_for_type interface type plumbed_list +# get_standby_for_type interface type list # # Look through the set of hostname files associated with the same physical -# interface as "interface", and determine whether they would configure -# the interface as a standby interface. +# interface as "interface", and print the standby value ("standby", +# "-standby", or nothing). Only hostname files associated with the +# physical interface or logical interface zero can set this flag. # get_standby_for_type() { - physical=`get_physical $1` type=$2 - final="" - # - # The last "standby" or "-standby" flag is the one that counts, - # which is the reason for the second while loop. 
+ # The last setting of "standby" or "-standby" is the one that + # counts, which is the reason for the second while loop. # shift 2 - while [ $# -gt 0 ]; do - if [ "`get_physical $1`" = "$physical" ]; then - get_standby_from_hostname $1 $type + for ifname in "$@"; do + if if_comp "$physical" $ifname; then + get_hostname_ipmpinfo $ifname $type standby -standby fi - shift done | while :; do - read next || { - echo "$final" + read keyword || { + echo "$iftype" break } - final="$next" + iftype="$keyword" done } # -# is_standby interface +# get_group interface # -# Determine whether a configured interface is a standby interface. -# -# Both the inet and inet6 hostname file sets must be checked. -# If "standby" or "-standby" is set in the inet6 hostname file set, -# don't bother looking at the inet set. +# If there is both an inet and inet6 version of an interface, the group +# could be set in either set of hostname files. Since inet6 is configured +# after inet, if there's a setting in both files, inet6 wins. # -is_standby() +get_group() { - standby=`get_standby_for_type $1 inet6 $inet6_plumbed` - - if [ -z "$standby" ]; then - standby=`get_standby_for_type $1 inet $inet_plumbed` - fi - - # The return value is the value of the following test. - [ "$standby" = "standby" ] + group=`get_group_for_type $1 inet6 $inet6_list` + [ -z "$group" ] && group=`get_group_for_type $1 inet $inet_list` + echo $group } # -# get_alternate interface plumbed_list -# -# Look for a plumbed interface in the same group as "interface". -# A standby interface is preferred over a non-standby interface. +# is_standby interface # -# Example: -# get_alternate hme0 $inet_plumbed +# If there is both an inet and inet6 version of an interface, the +# "standby" or "-standby" flag could be set in either set of hostname +# files. Since inet6 is configured after inet, if there's a setting in +# both files, inet6 wins. 
# -get_alternate() +is_standby() { - mygroup=`get_group $1 failed` - [ -z "$mygroup" ] && return - - maybe="" - - shift - while [ $# -gt 0 ]; do - group=`get_group $1 configured` - if [ "$group" = "$mygroup" ]; then - if is_standby $1; then - get_physical $1 - return - else - [ -z "$maybe" ] && maybe=$1 - fi - fi - shift - done - - get_physical $maybe + standby=`get_standby_for_type $1 inet6 $inet6_list` + [ -z "$standby" ] && standby=`get_standby_for_type $1 inet $inet_list` + [ "$standby" = "standby" ] } # @@ -394,7 +394,7 @@ doDHCPhostname() # # If there is only line in an hostname file we assume it contains # the old style address which results in the interface being brought up -# and the netmask and broadcast address being set. +# and the netmask and broadcast address being set ($inet_oneline_epilogue). # # If there are multiple lines we assume the file contains a list of # commands to the processor with neither the implied bringing up of the @@ -403,6 +403,8 @@ doDHCPhostname() # Return non-zero if any command fails so that the caller may alert # users to errors in the configuration. # +inet_oneline_epilogue="netmask + broadcast + up" + inet_process_hostname() { if doDHCPhostname $2; then @@ -418,7 +420,7 @@ inet_process_hostname() ifcmds="" retval=0 - while read line; do + while read one rest; do if [ -n "$ifcmds" ]; then # # This handles the first N-1 @@ -427,7 +429,14 @@ inet_process_hostname() $* $ifcmds || retval=$? multiple_lines=true fi - ifcmds="$line" + + # + # Strip out the "ipmp" keyword if it's the + # first token, since it's used to control + # interface creation, not configuration. + # + [ "$one" = ipmp ] && one= + ifcmds="$one $rest" done # @@ -437,8 +446,8 @@ inet_process_hostname() # [ -z "$ifcmds" ] && return $retval if [ $multiple_lines = false ]; then - # The traditional single-line hostname file. - ifcmds="$ifcmds netmask + broadcast + up" + # The traditional one-line hostname file. 
+ ifcmds="$ifcmds $inet_oneline_epilogue" fi # @@ -470,7 +479,13 @@ inet_process_hostname() inet6_process_hostname() { retval=0 - while read ifcmds; do + while read one rest; do + # + # See comment in inet_process_hostname for details. + # + [ "$one" = ipmp ] && one= + ifcmds="$one $rest" + if [ -n "$ifcmds" ]; then $* $ifcmds || retval=$? fi @@ -479,10 +494,9 @@ inet6_process_hostname() } # -# Process interfaces that failed to plumb. Find an alternative -# interface to host the addresses. For IPv6, only static addresses -# defined in hostname6 files are moved, autoconfigured addresses are -# not moved. +# Process interfaces that failed to plumb. Find the IPMP meta-interface +# that should host the addresses. For IPv6, only static addresses defined +# in hostname6 files are moved, autoconfigured addresses are not moved. # # Example: # move_addresses inet6 @@ -491,35 +505,43 @@ move_addresses() { type="$1" eval "failed=\"\$${type}_failed\"" - eval "plumbed=\"\$${type}_plumbed\"" eval "list=\"\$${type}_list\"" - process_hostname="${type}_process_hostname" + process_func="${type}_process_hostname" processed="" if [ "$type" = inet ]; then - echo "moving addresses from failed IPv4 interfaces:\c" + typedesc="IPv4" zaddr="0.0.0.0" hostpfx="/etc/hostname" else - echo "moving addresses from failed IPv6 interfaces:\c" + typedesc="IPv6" zaddr="::" hostpfx="/etc/hostname6" fi - set -- $failed - while [ $# -gt 0 ]; do - in_list if_comp $1 $processed && { shift; continue; } - - alternate="`get_alternate $1 $plumbed`" - if [ -z "$alternate" ]; then - in_list physical_comp $1 $processed || { - echo " $1 (couldn't move, no" \ - "alternative interface)\c" - processed="$processed $1" + echo "Moving addresses from missing ${typedesc} interface(s):\c" \ + >/dev/msglog + + for ifname in $failed; do + in_list if_comp $ifname $processed && continue + + group=`get_group $ifname` + if [ -z "$group" ]; then + in_list physical_comp $ifname $processed || { + echo " $ifname (not moved -- not" \ 
+ "in an IPMP group)\c" >/dev/msglog + processed="$processed $ifname" } - shift continue fi + + # + # Lookup the IPMP meta-interface name. If one doesn't exist, + # create it. + # + grifname=`get_groupifname $group` + [ -z "$grifname" ] && grifname=`create_groupifname $group $type` + # # The hostname files are processed twice. In the first # pass, we are looking for all commands that apply @@ -528,7 +550,7 @@ move_addresses() # whether the address represents a failover address # or not until we've read all the files associated with the # interface. - + # # In the first pass through the hostname files, all # additional logical interface commands are removed. # The remaining commands are concatenated together and @@ -541,19 +563,18 @@ move_addresses() # the embedded "set" command set the address later. # /sbin/ifparse -f $type ` - for item in $list; do - if_comp $1 $item && \ - $process_hostname /sbin/ifparse \ - $type < $hostpfx.$item - done | while read three four; do - [ "$three" != addif ] && \ - echo "$three $four \c" - done` | while read one two; do - [ -z "$one" ] && continue - line="addif $zaddr $one $two" - /sbin/ifconfig $alternate $type \ - -standby $line >/dev/null - done + for item in $list; do + if_comp $ifname $item && $process_func \ + /sbin/ifparse $type < $hostpfx.$item + done | while read three four; do + [ "$three" != addif ] && echo "$three $four \c" + done` | while read one two; do + [ -z "$one" ] && continue + [ "$one $two" = "$inet_oneline_epilogue" ] && \ + continue + line="addif $zaddr $one $two" + /sbin/ifconfig $grifname $type $line >/dev/null + done # # In the second pass, look for the the "addif" commands @@ -561,22 +582,75 @@ move_addresses() # commands are not valid in logical interface hostname # files. 
# - if [ "$1" = "`get_physical $1`" ]; then - $process_hostname /sbin/ifparse -f $type \ - <$hostpfx.$1 | while read one two; do - [ "$one" = addif ] && \ - /sbin/ifconfig $alternate $type -standby \ - addif $two >/dev/null + if [ "$ifname" = "`get_physical $ifname`" ]; then + $process_func /sbin/ifparse -f $type < $hostpfx.$ifname \ + | while read one two; do + [ "$one" = addif ] && \ + /sbin/ifconfig $grifname $type \ + addif $two >/dev/null done fi - in_list physical_comp $1 $processed || { - echo " $1 (moved to $alternate)\c" - processed="$processed $1" + # + # Check if this was an active interface in the group. If so, + # activate another IP interface (if possible) + # + is_standby $ifname || inactive=`get_inactive_ifname $group` + [ -n "$inactive" ] && /sbin/ifconfig $inactive $type -standby + + in_list physical_comp $ifname $processed || { + processed="$processed $ifname" + echo " $ifname (moved to $grifname\c" > /dev/msglog + if [ -n "$inactive" ]; then + echo " and cleared 'standby' on\c" > /dev/msglog + echo " $inactive to compensate\c" > /dev/msglog + fi + echo ")\c" > /dev/msglog } + inactive="" + done + echo "." >/dev/msglog +} + +# +# if_configure type class interface_list +# +# Configure all of the interfaces of type `type' (e.g., "inet6") in +# `interface_list' according to their /etc/hostname[6].* files. `class' +# describes the class of interface (e.g., "IPMP"), as a diagnostic aid. +# For inet6 interfaces, the interface is also brought up. +# +if_configure() +{ + fail= + type=$1 + class=$2 + process_func=${type}_process_hostname + shift 2 + + if [ "$type" = inet ]; then + desc="IPv4" + hostpfx="/etc/hostname" + else + desc="IPv6" + hostpfx="/etc/hostname6" + fi + [ -n "$class" ] && desc="$class $desc" + + echo "configuring $desc interfaces:\c" + while [ $# -gt 0 ]; do + $process_func /sbin/ifconfig $1 $type < $hostpfx.$1 >/dev/null + if [ $? 
!= 0 ]; then + fail="$fail $1" + elif [ "$type" = inet6 ]; then + /sbin/ifconfig $1 inet6 up || fail="$fail $1" + fi + echo " $1\c" shift done echo "." + + [ -n "$fail" ] && warn_failed_ifs "configure $desc" "$fail" } # diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c index 46b2b5a958..dc90957dfa 100644 --- a/usr/src/cmd/truss/codes.c +++ b/usr/src/cmd/truss/codes.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -615,13 +615,10 @@ const struct ioc { { (uint_t)SIOCSIPSECONFIG, "SIOCSIPSECONFIG", NULL }, { (uint_t)SIOCDIPSECONFIG, "SIOCDIPSECONFIG", NULL }, { (uint_t)SIOCLIPSECONFIG, "SIOCLIPSECONFIG", NULL }, - { (uint_t)SIOCLIFFAILOVER, "SIOCLIFFAILOVER", "lifreq" }, - { (uint_t)SIOCLIFFAILBACK, "SIOCLIFFAILBACK", "lifreq" }, - { (uint_t)SIOCSIPMPFAILBACK, "SIOCSIPMPFAILBACK", NULL }, + { (uint_t)SIOCGLIFBINDING, "SIOCGLIFBINDING", "lifreq" }, { (uint_t)SIOCSLIFGROUPNAME, "SIOCSLIFGROUPNAME", "lifreq" }, { (uint_t)SIOCGLIFGROUPNAME, "SIOCGLIFGROUPNAME", "lifreq" }, - { (uint_t)SIOCGLIFOINDEX, "SIOCGLIFOINDEX", "lifreq" }, - { (uint_t)SIOCSLIFOINDEX, "SIOCSLIFOINDEX", "lifreq" }, + { (uint_t)SIOCGLIFGROUPINFO, "SIOCGLIFGROUPINFO", "lifgroupinfo" }, { (uint_t)SIOCGDSTINFO, "SIOCGDSTINFO", NULL }, { (uint_t)SIOCGIP6ADDRPOLICY, "SIOCGIP6ADDRPOLICY", NULL }, { (uint_t)SIOCSIP6ADDRPOLICY, "SIOCSIP6ADDRPOLICY", NULL }, diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c index edc610559d..8165f64f99 100644 --- a/usr/src/cmd/truss/print.c +++ b/usr/src/cmd/truss/print.c @@ -19,16 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ - -#pragma ident "%Z%%M% %I% %E% SMI" - #define _SYSCALL32 /* make 32-bit compat headers visible */ #include <stdio.h> @@ -73,6 +70,7 @@ #include <netinet/tcp.h> #include <netinet/udp.h> #include <netinet/sctp.h> +#include <net/route.h> #include <sys/utrap.h> #include <sys/lgrp_user.h> #include <sys/door.h> @@ -1749,6 +1747,8 @@ prt_sol(private_t *pri, int raw, long val) { if (val == SOL_SOCKET) { outstring(pri, "SOL_SOCKET"); + } else if (val == SOL_ROUTE) { + outstring(pri, "SOL_ROUTE"); } else { const struct protoent *p; struct protoent res; @@ -1826,6 +1826,18 @@ sol_optname(private_t *pri, long val) #undef CBSIZE } +const char * +route_optname(private_t *pri, long val) +{ + switch (val) { + case RT_AWARE: + return ("RT_AWARE"); + default: + (void) snprintf(pri->code_buf, sizeof (pri->code_buf), + "0x%lx", val); + return (pri->code_buf); + } +} const char * tcp_optname(private_t *pri, long val) @@ -1918,6 +1930,8 @@ prt_son(private_t *pri, int raw, long val) switch (pri->sys_args[1]) { case SOL_SOCKET: outstring(pri, sol_optname(pri, val)); break; + case SOL_ROUTE: outstring(pri, route_optname(pri, val)); + break; case IPPROTO_TCP: outstring(pri, tcp_optname(pri, val)); break; case IPPROTO_UDP: outstring(pri, udp_optname(pri, val)); diff --git a/usr/src/cmd/zoneadmd/vplat.c b/usr/src/cmd/zoneadmd/vplat.c index 72b6ce5c76..fb8f540cb5 100644 --- a/usr/src/cmd/zoneadmd/vplat.c +++ b/usr/src/cmd/zoneadmd/vplat.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -2397,6 +2397,7 @@ configure_one_interface(zlog_t *zlogp, zoneid_t zone_id, */ char buffer[INET6_ADDRSTRLEN]; void *addr; + const char *nomatch = "no matching subnet found in netmasks(4)"; if (af == AF_INET) addr = &((struct sockaddr_in *) @@ -2405,14 +2406,23 @@ configure_one_interface(zlog_t *zlogp, zoneid_t zone_id, addr = &((struct sockaddr_in6 *) (&lifr.lifr_addr))->sin6_addr; - /* Find out what netmask interface is going to be using */ + /* + * Find out what netmask the interface is going to be using. + * If we just brought up an IPMP data address on an underlying + * interface above, the address will have already migrated, so + * the SIOCGLIFNETMASK won't be able to find it (but we need + * to bring the address up to get the actual netmask). Just + * omit printing the actual netmask in this corner-case. + */ if (ioctl(s, SIOCGLIFNETMASK, (caddr_t)&lifr) < 0 || - inet_ntop(af, addr, buffer, sizeof (buffer)) == NULL) - goto bad; - zerror(zlogp, B_FALSE, - "WARNING: %s: no matching subnet found in netmasks(4) for " - "%s; using default of %s.", - lifr.lifr_name, addrstr4, buffer); + inet_ntop(af, addr, buffer, sizeof (buffer)) == NULL) { + zerror(zlogp, B_FALSE, "WARNING: %s; using default.", + nomatch); + } else { + zerror(zlogp, B_FALSE, + "WARNING: %s: %s: %s; using default of %s.", + lifr.lifr_name, nomatch, addrstr4, buffer); + } } /* diff --git a/usr/src/lib/brand/native/zone/platform.xml b/usr/src/lib/brand/native/zone/platform.xml index f7030ba0a1..69e86cefd2 100644 --- a/usr/src/lib/brand/native/zone/platform.xml +++ b/usr/src/lib/brand/native/zone/platform.xml @@ -20,7 +20,7 @@ CDDL HEADER END - Copyright 2008 Sun Microsystems, Inc. All rights reserved. + Copyright 2009 Sun Microsystems, Inc. All rights reserved. Use is subject to license terms. DO NOT EDIT THIS FILE. 
@@ -97,6 +97,7 @@ <device match="ipf" ip-type="exclusive" /> <device match="ipl" ip-type="exclusive" /> <device match="iplookup" ip-type="exclusive" /> + <device match="ipmpstub" ip-type="exclusive" /> <device match="ipnat" ip-type="exclusive" /> <device match="ipscan" ip-type="exclusive" /> <device match="ipsecah" ip-type="exclusive" /> diff --git a/usr/src/lib/brand/sn1/zone/platform.xml b/usr/src/lib/brand/sn1/zone/platform.xml index 1659d8851c..b3bb0d7962 100644 --- a/usr/src/lib/brand/sn1/zone/platform.xml +++ b/usr/src/lib/brand/sn1/zone/platform.xml @@ -20,7 +20,7 @@ CDDL HEADER END - Copyright 2008 Sun Microsystems, Inc. All rights reserved. + Copyright 2009 Sun Microsystems, Inc. All rights reserved. Use is subject to license terms. DO NOT EDIT THIS FILE. @@ -101,6 +101,7 @@ <device match="ipf" ip-type="exclusive" /> <device match="ipl" ip-type="exclusive" /> <device match="iplookup" ip-type="exclusive" /> + <device match="ipmpstub" ip-type="exclusive" /> <device match="ipnat" ip-type="exclusive" /> <device match="ipscan" ip-type="exclusive" /> <device match="ipsecah" ip-type="exclusive" /> diff --git a/usr/src/lib/libbsm/common/adt.c b/usr/src/lib/libbsm/common/adt.c index 23f78b6247..d9947622d4 100644 --- a/usr/src/lib/libbsm/common/adt.c +++ b/usr/src/lib/libbsm/common/adt.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -2137,7 +2137,7 @@ adt_get_local_address(int family, struct ifaddrlist *al) int ifal_count; int i; - if ((ifal_count = ifaddrlist(&ifal, family, errbuf)) <= 0) { + if ((ifal_count = ifaddrlist(&ifal, family, 0, errbuf)) <= 0) { int serrno = errno; (void) snprintf(msg, sizeof (msg), "adt_get_local_address " diff --git a/usr/src/lib/libdlpi/common/libdlpi.c b/usr/src/lib/libdlpi/common/libdlpi.c index 14c4451081..d546807342 100644 --- a/usr/src/lib/libdlpi/common/libdlpi.c +++ b/usr/src/lib/libdlpi/common/libdlpi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1109,7 +1109,7 @@ i_dlpi_open(const char *provider, int *fd, uint_t flags, boolean_t style1) /* open libdladm handle rather than taking it as input */ if (dladm_open(&handle) != DLADM_STATUS_OK) - return (DLPI_FAILURE); + goto fallback; if (dladm_dev2linkid(handle, device, &linkid) == DLADM_STATUS_OK) { @@ -1400,7 +1400,7 @@ i_dlpi_strgetmsg(dlpi_impl_t *dip, int msec, dlpi_msg_t *dlreplyp, void *databuf, size_t *datalenp, size_t *totdatalenp) { int retval; - int flags = 0; + int flags; int fd = dip->dli_fd; struct strbuf ctl, data; struct pollfd pfd; @@ -1437,16 +1437,17 @@ i_dlpi_strgetmsg(dlpi_impl_t *dip, int msec, dlpi_msg_t *dlreplyp, start = gethrtime() / (NANOSEC / MILLISEC); switch (poll(&pfd, 1, msec)) { - default: - if (pfd.revents & POLLHUP) - return (DL_SYSERR); - break; - case 0: - return (DLPI_ETIMEDOUT); - case -1: + default: + if (pfd.revents & POLLHUP) return (DL_SYSERR); + break; + case 0: + return (DLPI_ETIMEDOUT); + case -1: + return (DL_SYSERR); } + flags = 0; if ((retval = getmsg(fd, &ctl, &data, &flags)) < 0) return (DL_SYSERR); diff --git a/usr/src/lib/libinetcfg/common/inetcfg.c b/usr/src/lib/libinetcfg/common/inetcfg.c index 38beca5574..e1f09a881a 100644 --- a/usr/src/lib/libinetcfg/common/inetcfg.c +++ 
b/usr/src/lib/libinetcfg/common/inetcfg.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -743,7 +741,8 @@ icfg_set_flags(icfg_handle_t handle, uint64_t flags) struct lifreq lifr; uint64_t oflags; int ret; - int rtsock; + int rtsock = -1; + int aware = RTAW_UNDER_IPMP; (void) strlcpy(lifr.lifr_name, handle->ifh_interface.if_name, sizeof (lifr.lifr_name)); @@ -757,10 +756,16 @@ icfg_set_flags(icfg_handle_t handle, uint64_t flags) /* * Any time flags are changed on an interface that has IFF_UP set, * you'll get a routing socket message. We care about the status, - * though, only when the new flags are marked "up." + * though, only when the new flags are marked "up." Since we may be + * changing an IPMP test address, we enable RTAW_UNDER_IPMP. */ - rtsock = (flags & IFF_UP) ? - socket(PF_ROUTE, SOCK_RAW, ICFG_FAMILY(handle)) : -1; + if (flags & IFF_UP) { + rtsock = socket(PF_ROUTE, SOCK_RAW, ICFG_FAMILY(handle)); + if (rtsock != -1) { + (void) setsockopt(rtsock, SOL_ROUTE, RT_AWARE, &aware, + sizeof (aware)); + } + } lifr.lifr_flags = flags; if (ioctl(handle->ifh_sock, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) { @@ -993,7 +998,8 @@ icfg_set_addr(icfg_handle_t handle, const struct sockaddr *addr, struct lifreq lifr; uint64_t flags; int ret; - int rtsock; + int rtsock = -1; + int aware = RTAW_UNDER_IPMP; (void) memset(&lifr.lifr_addr, 0, sizeof (lifr.lifr_addr)); if ((ret = to_sockaddr_storage(ICFG_FAMILY(handle), addr, addrlen, @@ -1002,15 +1008,19 @@ icfg_set_addr(icfg_handle_t handle, const struct sockaddr *addr, } /* - * Need to do check on duplicate address detection results if the - * interface is up. + * Need to check duplicate address detection results if the address is + * up. 
Since this may be an IPMP test address, enable RTAW_UNDER_IPMP. */ - if ((ret = icfg_get_flags(handle, &flags)) != ICFG_SUCCESS) { + if ((ret = icfg_get_flags(handle, &flags)) != ICFG_SUCCESS) return (ret); - } - rtsock = (flags & IFF_UP) ? - socket(PF_ROUTE, SOCK_RAW, ICFG_FAMILY(handle)) : -1; + if (flags & IFF_UP) { + rtsock = socket(PF_ROUTE, SOCK_RAW, ICFG_FAMILY(handle)); + if (rtsock != -1) { + (void) setsockopt(rtsock, SOL_ROUTE, RT_AWARE, &aware, + sizeof (aware)); + } + } (void) strlcpy(lifr.lifr_name, handle->ifh_interface.if_name, sizeof (lifr.lifr_name)); diff --git a/usr/src/lib/libinetutil/Makefile.com b/usr/src/lib/libinetutil/Makefile.com index 810f24bd71..cd3a0d6e33 100644 --- a/usr/src/lib/libinetutil/Makefile.com +++ b/usr/src/lib/libinetutil/Makefile.com @@ -19,15 +19,13 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# -LIBRARY = libinetutil.a -VERS = .1 -OBJECTS = octet.o inetutil4.o ifspec.o ifaddrlist.o eh.o tq.o +LIBRARY = libinetutil.a +VERS = .1 +OBJECTS = octet.o inetutil.o ifspec.o ifaddrlist.o ifaddrlistx.o eh.o tq.o include ../../Makefile.lib @@ -38,9 +36,9 @@ LIBS = $(DYNLIB) $(LINTLIB) SRCDIR = ../common COMDIR = $(SRC)/common/net/dhcp -SRCS = $(COMDIR)/octet.c $(SRCDIR)/inetutil4.c \ +SRCS = $(COMDIR)/octet.c $(SRCDIR)/inetutil.c \ $(SRCDIR)/ifspec.c $(SRCDIR)/eh.c $(SRCDIR)/tq.c \ - $(SRCDIR)/ifaddrlist.c + $(SRCDIR)/ifaddrlist.c $(SRCDIR)/ifaddrlistx.c $(LINTLIB):= SRCS = $(SRCDIR)/$(LINTSRC) LDLIBS += -lsocket -lc diff --git a/usr/src/lib/libinetutil/common/ifaddrlist.c b/usr/src/lib/libinetutil/common/ifaddrlist.c index 383dc2afb0..fa67a0fc37 100644 --- a/usr/src/lib/libinetutil/common/ifaddrlist.c +++ b/usr/src/lib/libinetutil/common/ifaddrlist.c @@ -1,5 +1,5 @@ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,9 +38,6 @@ * @(#) $Header: ifaddrlist.c,v 1.2 97/04/22 13:31:05 leres Exp $ (LBL) */ -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <alloca.h> #include <errno.h> #include <libinetutil.h> #include <stdio.h> @@ -54,9 +51,9 @@ * See <libinetutil.h> for a description of the programming interface. */ int -ifaddrlist(struct ifaddrlist **ipaddrp, int family, char *errbuf) +ifaddrlist(struct ifaddrlist **ipaddrp, int family, uint_t flags, char *errbuf) { - struct ifaddrlist *ifaddrlist, *al; + struct ifaddrlist *ifaddrlist = NULL, *al = NULL; struct sockaddr_in *sin; struct sockaddr_in6 *sin6; struct lifconf lifc; @@ -64,31 +61,28 @@ ifaddrlist(struct ifaddrlist **ipaddrp, int family, char *errbuf) struct lifreq *lifrp; int i, count, nlifr; int fd; - const char *iocstr; + const char *opstr; + (void) memset(&lifc, 0, sizeof (lifc)); if (family != AF_INET && family != AF_INET6) { (void) strlcpy(errbuf, "invalid address family", ERRBUFSIZE); return (-1); } - fd = socket(family, SOCK_DGRAM, 0); - if (fd == -1) { - (void) snprintf(errbuf, ERRBUFSIZE, "socket: %s", - strerror(errno)); - return (-1); + if ((fd = socket(family, SOCK_DGRAM, 0)) == -1) { + opstr = "socket"; + goto fail; } /* * Get the number of network interfaces of type `family'. 
*/ lifn.lifn_family = family; - lifn.lifn_flags = 0; + lifn.lifn_flags = flags; again: if (ioctl(fd, SIOCGLIFNUM, &lifn) == -1) { - (void) snprintf(errbuf, ERRBUFSIZE, "SIOCGLIFNUM: %s", - strerror(errno)); - (void) close(fd); - return (-1); + opstr = "SIOCGLIFNUM"; + goto fail; } /* @@ -97,16 +91,17 @@ again: */ lifn.lifn_count += 4; + lifc.lifc_flags = flags; lifc.lifc_family = family; lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq); - lifc.lifc_buf = alloca(lifc.lifc_len); - lifc.lifc_flags = 0; + if ((lifc.lifc_buf = realloc(lifc.lifc_buf, lifc.lifc_len)) == NULL) { + opstr = "realloc"; + goto fail; + } if (ioctl(fd, SIOCGLIFCONF, &lifc) == -1) { - (void) snprintf(errbuf, ERRBUFSIZE, "SIOCGLIFCONF: %s", - strerror(errno)); - (void) close(fd); - return (-1); + opstr = "SIOCGLIFCONF"; + goto fail; } /* @@ -121,12 +116,9 @@ again: /* * Allocate the address list to return. */ - ifaddrlist = calloc(nlifr, sizeof (struct ifaddrlist)); - if (ifaddrlist == NULL) { - (void) snprintf(errbuf, ERRBUFSIZE, "calloc: %s", - strerror(errno)); - (void) close(fd); - return (-1); + if ((ifaddrlist = calloc(nlifr, sizeof (struct ifaddrlist))) == NULL) { + opstr = "calloc"; + goto fail; } /* @@ -142,7 +134,7 @@ again: if (ioctl(fd, SIOCGLIFFLAGS, lifrp) == -1) { if (errno == ENXIO) continue; - iocstr = "SIOCGLIFFLAGS"; + opstr = "SIOCGLIFFLAGS"; goto fail; } al->flags = lifrp->lifr_flags; @@ -150,7 +142,7 @@ again: if (ioctl(fd, SIOCGLIFINDEX, lifrp) == -1) { if (errno == ENXIO) continue; - iocstr = "SIOCGLIFINDEX"; + opstr = "SIOCGLIFINDEX"; goto fail; } al->index = lifrp->lifr_index; @@ -158,7 +150,7 @@ again: if (ioctl(fd, SIOCGLIFADDR, lifrp) == -1) { if (errno == ENXIO) continue; - iocstr = "SIOCGLIFADDR"; + opstr = "SIOCGLIFADDR"; goto fail; } @@ -174,6 +166,7 @@ again: } (void) close(fd); + free(lifc.lifc_buf); if (count == 0) { free(ifaddrlist); *ipaddrp = NULL; @@ -183,9 +176,14 @@ again: *ipaddrp = ifaddrlist; return (count); fail: - (void) snprintf(errbuf, 
ERRBUFSIZE, "%s: %s: %s", iocstr, al->device, - strerror(errno)); - + if (al == NULL) { + (void) snprintf(errbuf, ERRBUFSIZE, "%s: %s", opstr, + strerror(errno)); + } else { + (void) snprintf(errbuf, ERRBUFSIZE, "%s: %s: %s", opstr, + al->device, strerror(errno)); + } + free(lifc.lifc_buf); free(ifaddrlist); (void) close(fd); return (-1); diff --git a/usr/src/lib/libinetutil/common/ifaddrlistx.c b/usr/src/lib/libinetutil/common/ifaddrlistx.c new file mode 100644 index 0000000000..ce85c5521f --- /dev/null +++ b/usr/src/lib/libinetutil/common/ifaddrlistx.c @@ -0,0 +1,168 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <errno.h> +#include <libinetutil.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/socket.h> +#include <sys/sockio.h> + +/* + * Create a list of the addresses on physical interface `ifname' with at least + * one of the flags in `set' set and all of the flags in `clear' clear. + * Return the number of items in the list, or -1 on failure. 
+ */ +int +ifaddrlistx(const char *ifname, uint64_t set, uint64_t clear, + ifaddrlistx_t **ifaddrsp) +{ + struct lifconf lifc; + struct lifnum lifn; + struct lifreq *lifrp; + ifaddrlistx_t *ifaddrp, *ifaddrs = NULL; + int i, nlifr, naddr = 0; + char *cp; + uint_t flags; + int s4, s6 = -1; + boolean_t isv6; + int save_errno; + struct sockaddr_storage addr; + + (void) memset(&lifc, 0, sizeof (lifc)); + flags = LIFC_NOXMIT | LIFC_ALLZONES | LIFC_TEMPORARY | LIFC_UNDER_IPMP; + + /* + * We need both IPv4 and IPv6 sockets to query both IPv4 and IPv6 + * interfaces below. + */ + if ((s4 = socket(AF_INET, SOCK_DGRAM, 0)) == -1 || + (s6 = socket(AF_INET6, SOCK_DGRAM, 0)) == -1) { + goto fail; + } + + /* + * Get the number of network interfaces of type `family'. + */ + lifn.lifn_family = AF_UNSPEC; + lifn.lifn_flags = flags; +again: + if (ioctl(s4, SIOCGLIFNUM, &lifn) == -1) + goto fail; + + /* + * Pad the interface count to detect when additional interfaces have + * been configured between SIOCGLIFNUM and SIOCGLIFCONF. + */ + lifn.lifn_count += 4; + + lifc.lifc_flags = flags; + lifc.lifc_family = AF_UNSPEC; + lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq); + if ((lifc.lifc_buf = realloc(lifc.lifc_buf, lifc.lifc_len)) == NULL) + goto fail; + + if (ioctl(s4, SIOCGLIFCONF, &lifc) == -1) + goto fail; + + /* + * If every lifr_req slot is taken, then additional interfaces must + * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF. + * Recalculate to make sure we didn't miss any interfaces. + */ + nlifr = lifc.lifc_len / sizeof (struct lifreq); + if (nlifr >= lifn.lifn_count) + goto again; + + /* + * Populate the ifaddrlistx by querying each matching interface. If a + * query ioctl returns ENXIO, then the interface must have been + * removed after the SIOCGLIFCONF completed -- so we just ignore it. 
+ */ + for (lifrp = lifc.lifc_req, i = 0; i < nlifr; i++, lifrp++) { + if ((cp = strchr(lifrp->lifr_name, ':')) != NULL) + *cp = '\0'; + + if (strcmp(lifrp->lifr_name, ifname) != 0) + continue; + + if (cp != NULL) + *cp = ':'; + + addr = lifrp->lifr_addr; + isv6 = addr.ss_family == AF_INET6; + if (ioctl(isv6 ? s6 : s4, SIOCGLIFFLAGS, lifrp) == -1) { + if (errno == ENXIO) + continue; + goto fail; + } + + if (set != 0 && ((lifrp->lifr_flags & set) == 0) || + (lifrp->lifr_flags & clear) != 0) + continue; + + /* + * We've got a match; allocate a new record. + */ + if ((ifaddrp = malloc(sizeof (ifaddrlistx_t))) == NULL) + goto fail; + + (void) strlcpy(ifaddrp->ia_name, lifrp->lifr_name, LIFNAMSIZ); + ifaddrp->ia_flags = lifrp->lifr_flags; + ifaddrp->ia_addr = addr; + ifaddrp->ia_next = ifaddrs; + ifaddrs = ifaddrp; + naddr++; + } + + (void) close(s4); + (void) close(s6); + free(lifc.lifc_buf); + *ifaddrsp = ifaddrs; + return (naddr); +fail: + save_errno = errno; + (void) close(s4); + (void) close(s6); + free(lifc.lifc_buf); + ifaddrlistx_free(ifaddrs); + errno = save_errno; + return (-1); +} + +/* + * Free the provided ifaddrlistx_t. + */ +void +ifaddrlistx_free(ifaddrlistx_t *ifaddrp) +{ + ifaddrlistx_t *next_ifaddrp; + + for (; ifaddrp != NULL; ifaddrp = next_ifaddrp) { + next_ifaddrp = ifaddrp->ia_next; + free(ifaddrp); + } +} diff --git a/usr/src/lib/libinetutil/common/inetutil4.c b/usr/src/lib/libinetutil/common/inetutil.c index ff5607e192..195d080b79 100644 --- a/usr/src/lib/libinetutil/common/inetutil4.c +++ b/usr/src/lib/libinetutil/common/inetutil.c @@ -18,13 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <unistd.h> #include <netinet/in.h> #include <libinetutil.h> @@ -32,7 +31,7 @@ extern int getnetmaskbyaddr(const struct in_addr, struct in_addr *); /* - * Generic internet (v4) functions. + * Internet utility functions. */ /* @@ -67,3 +66,32 @@ get_netmask4(const struct in_addr *n_addrp, struct in_addr *s_addrp) else s_addrp->s_addr = IN_CLASSE_NET; } + +/* + * Checks if the IP addresses `ssp1' and `ssp2' are equal. + */ +boolean_t +sockaddrcmp(const struct sockaddr_storage *ssp1, + const struct sockaddr_storage *ssp2) +{ + struct in_addr addr1, addr2; + const struct in6_addr *addr6p1, *addr6p2; + + if (ssp1->ss_family != ssp2->ss_family) + return (B_FALSE); + + if (ssp1 == ssp2) + return (B_TRUE); + + switch (ssp1->ss_family) { + case AF_INET: + addr1 = ((const struct sockaddr_in *)ssp1)->sin_addr; + addr2 = ((const struct sockaddr_in *)ssp2)->sin_addr; + return (addr1.s_addr == addr2.s_addr); + case AF_INET6: + addr6p1 = &((const struct sockaddr_in6 *)ssp1)->sin6_addr; + addr6p2 = &((const struct sockaddr_in6 *)ssp2)->sin6_addr; + return (IN6_ARE_ADDR_EQUAL(addr6p1, addr6p2)); + } + return (B_FALSE); +} diff --git a/usr/src/lib/libinetutil/common/libinetutil.h b/usr/src/lib/libinetutil/common/libinetutil.h index b21d54f56c..0bece07e07 100644 --- a/usr/src/lib/libinetutil/common/libinetutil.h +++ b/usr/src/lib/libinetutil/common/libinetutil.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -21,15 +20,13 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _LIBINETUTIL_H #define _LIBINETUTIL_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Contains SMI-private API for general Internet functionality */ @@ -59,11 +56,14 @@ typedef struct { extern boolean_t ifparse_ifspec(const char *, ifspec_t *); extern void get_netmask4(const struct in_addr *, struct in_addr *); +extern boolean_t sockaddrcmp(const struct sockaddr_storage *, + const struct sockaddr_storage *); /* * Extended version of the classic BSD ifaddrlist() interface: * - * int ifaddrlist(struct ifaddrlist **addrlistp, int af, char *errbuf); + * int ifaddrlist(struct ifaddrlist **addrlistp, int af, uint_t flags, + * char *errbuf); * * * addrlistp: Upon success, ifaddrlist() sets *addrlistp to a * dynamically-allocated array of addresses. @@ -71,6 +71,9 @@ extern void get_netmask4(const struct in_addr *, struct in_addr *); * * af: Either AF_INET to obtain IPv4 addresses, or AF_INET6 to * obtain IPv6 addresses. * + * * flags: LIFC_* flags that control the classes of interfaces that + * will be visible. + * * * errbuf: A caller-supplied buffer of ERRBUFSIZE. Upon failure, * provides the reason for the failure. * @@ -89,9 +92,43 @@ struct ifaddrlist { uint64_t flags; /* interface flags */ }; -#define ERRBUFSIZE 128 /* expected size of third argument */ +#define ERRBUFSIZE 128 /* expected size of fourth argument */ + +extern int ifaddrlist(struct ifaddrlist **, int, uint_t, char *); -extern int ifaddrlist(struct ifaddrlist **, int, char *); +/* + * Similar to ifaddrlist(), but returns a linked-list of addresses for a + * *specific* interface name, and allows specific address flags to be matched + * against. A linked list is used rather than an array so that information + * can grow over time without affecting binary compatibility. Also, leaves + * error-handling up to the caller. Returns the number of ifaddrlistx's + * chained through ifaddrp. 
+ * + * int ifaddrlistx(const char *ifname, uint64_t set, uint64_t clear, + * ifaddrlistx_t **ifaddrp); + * + * * ifname: Interface name to match against. + * + * * set: One or more flags that must be set on the address for + * it to be returned. + * + * * clear: Flags that must be clear on the address for it to be + * returned. + * + * * ifaddrp: Upon success, ifaddrlistx() sets *ifaddrp to the head + * of a dynamically-allocated array of ifaddrlistx structures. + * + * Once done, the caller must free `ifaddrp' by calling ifaddrlistx_free(). + */ +typedef struct ifaddrlistx { + struct ifaddrlistx *ia_next; + char ia_name[LIFNAMSIZ]; + uint64_t ia_flags; + struct sockaddr_storage ia_addr; +} ifaddrlistx_t; + +extern int ifaddrlistx(const char *, uint64_t, uint64_t, ifaddrlistx_t **); +extern void ifaddrlistx_free(ifaddrlistx_t *); /* * Timer queues diff --git a/usr/src/lib/libinetutil/common/mapfile-vers b/usr/src/lib/libinetutil/common/mapfile-vers index 51c168fcc4..c9a7829fdb 100644 --- a/usr/src/lib/libinetutil/common/mapfile-vers +++ b/usr/src/lib/libinetutil/common/mapfile-vers @@ -19,17 +19,17 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# SUNWprivate_1.1 { global: get_netmask4; hexascii_to_octet; ifaddrlist; + ifaddrlistx; + ifaddrlistx_free; ifparse_ifspec; iu_adjust_timer; iu_cancel_timer; @@ -48,6 +48,7 @@ SUNWprivate_1.1 { iu_tq_destroy; iu_unregister_event; octet_to_hexascii; + sockaddrcmp; local: *; }; diff --git a/usr/src/lib/libipmp/Makefile b/usr/src/lib/libipmp/Makefile index 188c49c073..5d52f304dc 100644 --- a/usr/src/lib/libipmp/Makefile +++ b/usr/src/lib/libipmp/Makefile @@ -19,15 +19,13 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "%Z%%M% %I% %E% SMI" -# include $(SRC)/lib/Makefile.lib -HDRS = ipmp.h ipmp_mpathd.h ipmp_query.h ipmp_query_impl.h +HDRS = ipmp.h ipmp_admin.h ipmp_mpathd.h ipmp_query.h ipmp_query_impl.h HDRDIR = common SUBDIRS = $(MACH) diff --git a/usr/src/lib/libipmp/Makefile.com b/usr/src/lib/libipmp/Makefile.com index bea02659a8..d3065ae37c 100644 --- a/usr/src/lib/libipmp/Makefile.com +++ b/usr/src/lib/libipmp/Makefile.com @@ -19,20 +19,19 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# LIBRARY = libipmp.a VERS = .1 -OBJECTS = ipmp_query.o ipmp_mpathd.o ipmp.o +OBJECTS = ipmp_admin.o ipmp_query.o ipmp_mpathd.o ipmp.o include ../../Makefile.lib +include ../../Makefile.rootfs LIBS = $(DYNLIB) $(LINTLIB) -LDLIBS += -lsocket -lc +LDLIBS += -linetutil -lsocket -lc SRCDIR = ../common $(LINTLIB):= SRCS = $(SRCDIR)/$(LINTSRC) diff --git a/usr/src/lib/libipmp/common/ipmp.c b/usr/src/lib/libipmp/common/ipmp.c index b9a7984889..cf9c3c7c3c 100644 --- a/usr/src/lib/libipmp/common/ipmp.c +++ b/usr/src/lib/libipmp/common/ipmp.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * IPMP general interfaces (PSARC/2002/615). */ @@ -34,6 +31,8 @@ #include <stdlib.h> #include <locale.h> #include <unistd.h> +#include <string.h> +#include <errno.h> #include "ipmp_impl.h" @@ -92,13 +91,15 @@ static char *errmsgs[IPMP_NERR] = { "operation failed", /* 1 IPMP_FAILURE */ "minimum failover redundancy not met", /* 2 IPMP_EMINRED */ "failback disabled", /* 3 IPMP_EFBDISABLED */ - "unable to completely fail back", /* 4 IPMP_EFBPARTIAL */ + "unknown IPMP data address", /* 4 IPMP_EUNKADDR */ "invalid argument", /* 5 IPMP_EINVAL */ "out of memory", /* 6 IPMP_ENOMEM */ "cannot contact in.mpathd", /* 7 IPMP_ENOMPATHD */ "unknown IPMP group", /* 8 IPMP_EUNKGROUP */ "interface is not using IPMP", /* 9 IPMP_EUNKIF */ - "unable to communicate with in.mpathd" /* 10 IPMP_EPROTO */ + "unable to communicate with in.mpathd", /* 10 IPMP_EPROTO */ + "interface has duplicate hardware address" + /* 11 IPMP_EHWADDRDUP */ }; /* @@ -110,5 +111,8 @@ ipmp_errmsg(int error) if (error >= IPMP_NERR || error < 0) return (dgettext(TEXT_DOMAIN, "<unknown error>")); + if (error == IPMP_FAILURE) + return (strerror(errno)); + return (dgettext(TEXT_DOMAIN, errmsgs[error])); } diff --git a/usr/src/lib/libipmp/common/ipmp.h b/usr/src/lib/libipmp/common/ipmp.h index 0112615a84..2ca0a9b2b9 100644 --- a/usr/src/lib/libipmp/common/ipmp.h +++ b/usr/src/lib/libipmp/common/ipmp.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
@@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IPMP_H #define _IPMP_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * General IPMP-related definitions and functions. * @@ -50,13 +47,14 @@ enum { IPMP_FAILURE, /* operation failed (check errno) */ IPMP_EMINRED, /* minimum failover redundancy not met */ IPMP_EFBDISABLED, /* failback disabled */ - IPMP_EFBPARTIAL, /* unable to completely fail back */ + IPMP_EUNKADDR, /* unknown IPMP data address */ IPMP_EINVAL, /* invalid argument */ IPMP_ENOMEM, /* out of memory */ IPMP_ENOMPATHD, /* cannot contact in.mpathd */ IPMP_EUNKGROUP, /* unknown IPMP group */ IPMP_EUNKIF, /* interface is not using IPMP */ IPMP_EPROTO, /* unable to communicate with in.mpathd */ + IPMP_EHWADDRDUP, /* interface has duplicate hardware address */ IPMP_NERR /* number of error codes */ }; diff --git a/usr/src/lib/libipmp/common/ipmp_admin.c b/usr/src/lib/libipmp/common/ipmp_admin.c new file mode 100644 index 0000000000..8a282f5286 --- /dev/null +++ b/usr/src/lib/libipmp/common/ipmp_admin.c @@ -0,0 +1,104 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * IPMP administrative interfaces (see PSARC/2007/272). + */ + +#include <assert.h> +#include <errno.h> +#include <string.h> +#include <unistd.h> +#include <sys/time.h> +#include <sys/types.h> + +#include "ipmp_impl.h" +#include "ipmp_mpathd.h" +#include "ipmp_admin.h" + +static int +ipmp_command(ipmp_handle_t handle, const void *req, uint_t reqsize) +{ + ipmp_state_t *statep = (ipmp_state_t *)handle; + mi_result_t result; + struct timeval end; + int save_errno; + int retval; + + if (gettimeofday(&end, NULL) == -1) + return (IPMP_FAILURE); + end.tv_sec += IPMP_REQTIMEOUT; + + assert(statep->st_fd == -1); + retval = ipmp_connect(&statep->st_fd); + if (retval != IPMP_SUCCESS) + return (retval); + + retval = ipmp_write(statep->st_fd, req, reqsize); + if (retval != IPMP_SUCCESS) + goto out; + + retval = ipmp_read(statep->st_fd, &result, sizeof (result), &end); + if (retval != IPMP_SUCCESS) + goto out; + + errno = result.me_sys_error; + retval = result.me_mpathd_error; +out: + save_errno = errno; + (void) close(statep->st_fd); + statep->st_fd = -1; + errno = save_errno; + return (retval); +} + +int +ipmp_offline(ipmp_handle_t handle, const char *ifname, uint_t minred) +{ + mi_offline_t mio; + + mio.mio_command = MI_OFFLINE; + mio.mio_min_redundancy = minred; + (void) strlcpy(mio.mio_ifname, ifname, LIFNAMSIZ); + return (ipmp_command(handle, &mio, sizeof (mio))); +} + +int +ipmp_undo_offline(ipmp_handle_t handle, const char *ifname) +{ + mi_undo_offline_t miu; + + miu.miu_command = MI_UNDO_OFFLINE; + (void) strlcpy(miu.miu_ifname, ifname, LIFNAMSIZ); + return (ipmp_command(handle, &miu, sizeof (miu))); +} + +int 
+ipmp_ping_daemon(ipmp_handle_t handle) +{ + mi_ping_t mip; + + mip.mip_command = MI_PING; + return (ipmp_command(handle, &mip, sizeof (mip))); +} diff --git a/usr/src/lib/libipmp/common/ipmp_admin.h b/usr/src/lib/libipmp/common/ipmp_admin.h new file mode 100644 index 0000000000..fa0986f7fa --- /dev/null +++ b/usr/src/lib/libipmp/common/ipmp_admin.h @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _IPMP_ADMIN_H +#define _IPMP_ADMIN_H + +#include <ipmp.h> +#include <sys/types.h> + +/* + * IPMP administrative interfaces. + * + * These interfaces may only be used within ON or after signing a contract + * with ON. For documentation, refer to PSARC/2007/272. 
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +extern int ipmp_offline(ipmp_handle_t, const char *, uint_t); +extern int ipmp_undo_offline(ipmp_handle_t, const char *); +extern int ipmp_ping_daemon(ipmp_handle_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _IPMP_ADMIN_H */ diff --git a/usr/src/lib/libipmp/common/ipmp_mpathd.c b/usr/src/lib/libipmp/common/ipmp_mpathd.c index ee1d35de33..e24de71017 100644 --- a/usr/src/lib/libipmp/common/ipmp_mpathd.c +++ b/usr/src/lib/libipmp/common/ipmp_mpathd.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -18,14 +17,11 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Low-level interfaces for communicating with in.mpathd(1M). * @@ -66,16 +62,16 @@ ipmp_connect(int *fdp) return (IPMP_FAILURE); /* - * Enable TCP_ANONPRIVBIND so the kernel will choose our source port. - * Since we're using loopback sockets, requiring use of privileged - * source ports is sufficient for security. + * If we have sufficient privilege, enable TCP_ANONPRIVBIND so the + * kernel will choose a privileged source port (since in.mpathd only + * accepts requests on loopback, this is sufficient for security). + * If not, drive on since MI_QUERY and MI_PING commands are allowed + * from non-privileged ports. 
*/ - if (setsockopt(fd, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) == -1) - goto fail; + (void) setsockopt(fd, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, sizeof (on)); /* - * Bind to a privileged port chosen by the kernel. + * Bind to a port chosen by the kernel. */ (void) memset(&sin, 0, sizeof (struct sockaddr_in)); sin.sin_port = htons(0); diff --git a/usr/src/lib/libipmp/common/ipmp_mpathd.h b/usr/src/lib/libipmp/common/ipmp_mpathd.h index 61ae71b78f..7df3b4fd92 100644 --- a/usr/src/lib/libipmp/common/ipmp_mpathd.h +++ b/usr/src/lib/libipmp/common/ipmp_mpathd.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -18,26 +17,17 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 1999-2002 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IPMP_MPATHD_H #define _IPMP_MPATHD_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Definitions for the messaging protocol between in.mpathd and libipmp. - * This interface is loosely documented in PSARC/2000/306. - * - * PLEASE NOTE: Although this interface is officially consolidation-private, - * we will be reclassifying it as project-private in the future, and - * transitioning any existing consumers to use higher-level libipmp routines. - * - * Put another way: treat this as if it was project-private! + * This interface is project-private to the IPMP subsystem. 
*/ #include <sys/types.h> @@ -49,33 +39,41 @@ extern "C" { #endif #define MPATHD_PORT 5999 -#define MPATHD_PATH "/usr/lib/inet/in.mpathd" +#define MPATHD_PATH "/lib/inet/in.mpathd" /* * Supported commands. */ enum { - MI_PING = 0, /* sanity test */ + MI_PING = 0, /* ping in.mpathd */ MI_OFFLINE = 1, /* offline the interface */ MI_UNDO_OFFLINE = 2, /* undo the offline */ - MI_SETOINDEX = 3, /* set original interface index */ - MI_QUERY = 4, /* query ipmp-related information */ + MI_QUERY = 3, /* query ipmp-related information */ MI_NCMD /* total number of commands */ }; /* * Types of information which can be requested and received (except for - * IPMP_IFLIST, which can only be received). + * IPMP_IFLIST and IPMP_ADDRLIST, which can only be received). */ typedef enum { IPMP_GROUPLIST = 1, IPMP_GROUPINFO = 2, IPMP_IFINFO = 3, IPMP_IFLIST = 4, - IPMP_SNAP = 5 + IPMP_SNAP = 5, + IPMP_ADDRLIST = 6, + IPMP_ADDRINFO = 7 } ipmp_infotype_t; /* + * Daemon ping request. + */ +typedef struct mi_ping { + uint32_t mip_command; +} mi_ping_t; + +/* * Interface offline request; `mio_ifname' is the interface to offline; * `mio_min_redundancy' is the minimum amount of usable interfaces after * offline that must exist for the operation to succeed. @@ -83,7 +81,6 @@ typedef enum { typedef struct mi_offline { uint32_t mio_command; char mio_ifname[LIFNAMSIZ]; - char mio_move_to_if[LIFNAMSIZ]; /* currently unused */ uint32_t mio_min_redundancy; } mi_offline_t; @@ -97,24 +94,12 @@ typedef struct mi_undo_offline { } mi_undo_offline_t; /* - * Set original interface index request: `mis_lifname' is the name of the - * logical interface that is having its index reset; `mis_new_pifname' is the - * name of the interface whose index will be associated with `mis_lifname'; - * `mis_iftype' is the interface type. 
- */ -typedef struct mi_setoindex { - uint32_t mis_command; - char mis_lifname[LIFNAMSIZ]; - char mis_new_pifname[LIFNAMSIZ]; - uint32_t mis_iftype; -} mi_setoindex_t; - -/* * Retrieve IPMP-related information: `miq_inforeq' is the type of information - * being request (see above for the list of types). If the request is for - * either IPMP_GROUPINFO or IPMP_IFINFO, then either `miq_grname' or - * `miq_ifname' should be set (respectively) to indicate the name of the - * group or interface to retrieve the information for. + * being request (see above for the list of types). If the request type is + * IPMP_GROUPINFO, then `miq_grname' indicates the group. If the request type + * is IPMP_IFINFO, then `miq_ifname' indicates the interface. If the request + * type is IPMP_ADDRINFO then `miq_grname' indicates the group and `miq_addr' + * indicates the address. */ typedef struct mi_query { uint32_t miq_command; @@ -123,6 +108,7 @@ typedef struct mi_query { char miqu_ifname[LIFNAMSIZ]; char miqu_grname[LIFGRNAMSIZ]; } miq_infodata; + struct sockaddr_storage miq_addr; } mi_query_t; #define miq_ifname miq_infodata.miqu_ifname #define miq_grname miq_infodata.miqu_grname @@ -132,10 +118,10 @@ typedef struct mi_query { * requirement for receiving any command. */ union mi_commands { - uint32_t mi_command; + uint32_t mi_command; + mi_ping_t mi_pcmd; mi_offline_t mi_ocmd; mi_undo_offline_t mi_ucmd; - mi_setoindex_t mi_scmd; mi_query_t mi_qcmd; }; @@ -147,18 +133,7 @@ typedef struct mi_result { uint32_t me_mpathd_error; /* Mpathd error */ } mi_result_t; -/* - * Legacy values for me_mpathd_error; the daemon now returns the IPMP - * error codes defined in <ipmp.h>, which are compatible with these error - * codes. These will be removed in the future. 
- */ -enum { - MPATHD_SUCCESS = 0, /* operation succeeded */ - MPATHD_SYS_ERROR = 1, /* check me_sys_error for the errno */ - MPATHD_MIN_RED_ERROR = 2, /* minimum redundancy not met */ - MPATHD_FAILBACK_DISABLED = 3, /* failback administratively disabled */ - MPATHD_FAILBACK_PARTIAL = 4 /* unable to completely failback */ -}; +#define IPMP_REQTIMEOUT 5 /* seconds */ extern int ipmp_connect(int *); extern int ipmp_read(int, void *, size_t, const struct timeval *); diff --git a/usr/src/lib/libipmp/common/ipmp_query.c b/usr/src/lib/libipmp/common/ipmp_query.c index 8a7dc7ee69..a0af2da578 100644 --- a/usr/src/lib/libipmp/common/ipmp_query.c +++ b/usr/src/lib/libipmp/common/ipmp_query.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -18,20 +17,18 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* - * IPMP query interfaces (PSARC/2002/615). + * IPMP query interfaces (see PSARC/2002/615 and PSARC/2007/272). 
*/ #include <assert.h> #include <errno.h> +#include <libinetutil.h> #include <string.h> #include <stdlib.h> #include <unistd.h> @@ -41,13 +38,19 @@ #include "ipmp_mpathd.h" #include "ipmp_query_impl.h" -#define IPMP_REQTIMEOUT 5 /* seconds */ - static ipmp_ifinfo_t *ipmp_ifinfo_clone(ipmp_ifinfo_t *); +static ipmp_addrinfo_t *ipmp_addrinfo_clone(ipmp_addrinfo_t *); +static ipmp_addrlist_t *ipmp_addrlist_clone(ipmp_addrlist_t *); static ipmp_grouplist_t *ipmp_grouplist_clone(ipmp_grouplist_t *); static ipmp_groupinfo_t *ipmp_groupinfo_clone(ipmp_groupinfo_t *); +static ipmp_iflist_t *ipmp_iflist_create(uint_t, char (*)[LIFNAMSIZ]); +static void ipmp_freeiflist(ipmp_iflist_t *); +static ipmp_addrlist_t *ipmp_addrlist_create(uint_t, struct sockaddr_storage *); +static void ipmp_freeaddrlist(ipmp_addrlist_t *); static ipmp_groupinfo_t *ipmp_snap_getgroupinfo(ipmp_snap_t *, const char *); static ipmp_ifinfo_t *ipmp_snap_getifinfo(ipmp_snap_t *, const char *); +static ipmp_addrinfo_t *ipmp_snap_getaddrinfo(ipmp_snap_t *, const char *, + struct sockaddr_storage *); static int ipmp_snap_take(ipmp_state_t *, ipmp_snap_t **); static boolean_t ipmp_checktlv(ipmp_infotype_t, size_t, void *); static int ipmp_querydone(ipmp_state_t *, int); @@ -62,7 +65,7 @@ static int ipmp_querydone(ipmp_state_t *, int); */ static int ipmp_sendquery(ipmp_state_t *statep, ipmp_infotype_t type, const char *name, - struct timeval *endtp) + const void *addr, struct timeval *endtp) { mi_query_t query; mi_result_t result; @@ -72,6 +75,11 @@ ipmp_sendquery(ipmp_state_t *statep, ipmp_infotype_t type, const char *name, query.miq_inforeq = type; switch (type) { + case IPMP_ADDRINFO: + (void) strlcpy(query.miq_grname, name, LIFGRNAMSIZ); + query.miq_addr = *(struct sockaddr_storage *)addr; + break; + case IPMP_GROUPINFO: (void) strlcpy(query.miq_grname, name, LIFGRNAMSIZ); break; @@ -138,6 +146,61 @@ ipmp_readinfo(ipmp_state_t *statep, ipmp_infotype_t infotype, void **infop, } /* + * Using `statep', read 
in the remaining IPMP group information TLVs from + * in.mpathd into `grinfop' before the current time becomes `endtp'. Returns + * an IPMP error code. On failure, `grinfop' will have its original contents. + */ +static int +ipmp_readgroupinfo_lists(ipmp_state_t *statep, ipmp_groupinfo_t *grinfop, + const struct timeval *endtp) +{ + int retval; + ipmp_iflist_t *iflistp; + ipmp_addrlist_t *adlistp; + + retval = ipmp_readinfo(statep, IPMP_IFLIST, (void **)&iflistp, endtp); + if (retval != IPMP_SUCCESS) + return (retval); + + retval = ipmp_readinfo(statep, IPMP_ADDRLIST, (void **)&adlistp, endtp); + if (retval != IPMP_SUCCESS) { + ipmp_freeiflist(iflistp); + return (retval); + } + + grinfop->gr_iflistp = iflistp; + grinfop->gr_adlistp = adlistp; + return (IPMP_SUCCESS); +} + +/* + * Using `statep', read in the remaining IPMP interface information TLVs from + * in.mpathd into `ifinfop' before the current time becomes `endtp'. Returns + * an IPMP error code. On failure, `ifinfop' will have its original contents. + */ +static int +ipmp_readifinfo_lists(ipmp_state_t *statep, ipmp_ifinfo_t *ifinfop, + const struct timeval *endtp) +{ + int retval; + ipmp_addrlist_t *tlist4p, *tlist6p; + + retval = ipmp_readinfo(statep, IPMP_ADDRLIST, (void **)&tlist4p, endtp); + if (retval != IPMP_SUCCESS) + return (retval); + + retval = ipmp_readinfo(statep, IPMP_ADDRLIST, (void **)&tlist6p, endtp); + if (retval != IPMP_SUCCESS) { + ipmp_freeaddrlist(tlist4p); + return (retval); + } + + ifinfop->if_targinfo4.it_targlistp = tlist4p; + ifinfop->if_targinfo6.it_targlistp = tlist6p; + return (IPMP_SUCCESS); +} + +/* * Complete the query operation started in ipmp_sendquery(). The interface is * designed to be easy to use in the `return' statement of a function, and * thus returns the passed in `retval' and preserves `errno'. @@ -169,7 +232,7 @@ ipmp_getgrouplist(ipmp_handle_t handle, ipmp_grouplist_t **grlistpp) return (*grlistpp != NULL ? 
IPMP_SUCCESS : IPMP_ENOMEM); } - retval = ipmp_sendquery(statep, IPMP_GROUPLIST, NULL, &end); + retval = ipmp_sendquery(statep, IPMP_GROUPLIST, NULL, NULL, &end); if (retval != IPMP_SUCCESS) return (retval); @@ -196,7 +259,6 @@ ipmp_getgroupinfo(ipmp_handle_t handle, const char *name, ipmp_groupinfo_t **grinfopp) { ipmp_state_t *statep = handle; - ipmp_iflist_t *iflistp; int retval; struct timeval end; ipmp_groupinfo_t *grinfop; @@ -210,7 +272,7 @@ ipmp_getgroupinfo(ipmp_handle_t handle, const char *name, return (*grinfopp != NULL ? IPMP_SUCCESS : IPMP_ENOMEM); } - retval = ipmp_sendquery(statep, IPMP_GROUPINFO, name, &end); + retval = ipmp_sendquery(statep, IPMP_GROUPINFO, name, NULL, &end); if (retval != IPMP_SUCCESS) return (retval); @@ -218,11 +280,9 @@ ipmp_getgroupinfo(ipmp_handle_t handle, const char *name, if (retval != IPMP_SUCCESS) return (ipmp_querydone(statep, retval)); - retval = ipmp_readinfo(statep, IPMP_IFLIST, (void **)&iflistp, &end); + retval = ipmp_readgroupinfo_lists(statep, *grinfopp, &end); if (retval != IPMP_SUCCESS) free(*grinfopp); - else - (*grinfopp)->gr_iflistp = iflistp; return (ipmp_querydone(statep, retval)); } @@ -233,7 +293,8 @@ ipmp_getgroupinfo(ipmp_handle_t handle, const char *name, void ipmp_freegroupinfo(ipmp_groupinfo_t *grinfop) { - free(grinfop->gr_iflistp); + ipmp_freeaddrlist(grinfop->gr_adlistp); + ipmp_freeiflist(grinfop->gr_iflistp); free(grinfop); } @@ -259,11 +320,18 @@ ipmp_getifinfo(ipmp_handle_t handle, const char *name, ipmp_ifinfo_t **ifinfopp) return (*ifinfopp != NULL ? 
IPMP_SUCCESS : IPMP_ENOMEM); } - retval = ipmp_sendquery(statep, IPMP_IFINFO, name, &end); + retval = ipmp_sendquery(statep, IPMP_IFINFO, name, NULL, &end); if (retval != IPMP_SUCCESS) return (retval); retval = ipmp_readinfo(statep, IPMP_IFINFO, (void **)ifinfopp, &end); + if (retval != IPMP_SUCCESS) + return (ipmp_querydone(statep, retval)); + + retval = ipmp_readifinfo_lists(statep, *ifinfopp, &end); + if (retval != IPMP_SUCCESS) + free(*ifinfopp); + return (ipmp_querydone(statep, retval)); } @@ -273,10 +341,52 @@ ipmp_getifinfo(ipmp_handle_t handle, const char *name, ipmp_ifinfo_t **ifinfopp) void ipmp_freeifinfo(ipmp_ifinfo_t *ifinfop) { + ipmp_freeaddrlist(ifinfop->if_targinfo4.it_targlistp); + ipmp_freeaddrlist(ifinfop->if_targinfo6.it_targlistp); free(ifinfop); } /* + * Using `handle', get the address information associated with address `addrp' + * on group `grname' and store the results in a dynamically allocated buffer + * pointed to by `*adinfopp'. Returns an IPMP error code. + */ +int +ipmp_getaddrinfo(ipmp_handle_t handle, const char *grname, + struct sockaddr_storage *addrp, ipmp_addrinfo_t **adinfopp) +{ + ipmp_state_t *statep = handle; + ipmp_addrinfo_t *adinfop; + int retval; + struct timeval end; + + if (statep->st_snap != NULL) { + adinfop = ipmp_snap_getaddrinfo(statep->st_snap, grname, addrp); + if (adinfop == NULL) + return (IPMP_EUNKADDR); + + *adinfopp = ipmp_addrinfo_clone(adinfop); + return (*adinfopp != NULL ? IPMP_SUCCESS : IPMP_ENOMEM); + } + + retval = ipmp_sendquery(statep, IPMP_ADDRINFO, grname, addrp, &end); + if (retval != IPMP_SUCCESS) + return (retval); + + retval = ipmp_readinfo(statep, IPMP_ADDRINFO, (void **)adinfopp, &end); + return (ipmp_querydone(statep, retval)); +} + +/* + * Free the address information pointed to by `adinfop'. + */ +void +ipmp_freeaddrinfo(ipmp_addrinfo_t *adinfop) +{ + free(adinfop); +} + +/* * Check if `buf' has a NUL byte in its first `bufsize' bytes. 
*/ static boolean_t @@ -300,12 +410,25 @@ ipmp_checktlv(ipmp_infotype_t type, size_t len, void *value) ipmp_ifinfo_t *ifinfop; ipmp_grouplist_t *grlistp; ipmp_groupinfo_t *grinfop; + ipmp_addrlist_t *adlistp; unsigned int i; switch (type) { + case IPMP_ADDRINFO: + if (len != sizeof (ipmp_addrinfo_t)) + return (B_FALSE); + break; + + case IPMP_ADDRLIST: + adlistp = (ipmp_addrlist_t *)value; + if (len < IPMP_ADDRLIST_SIZE(0) || + len < IPMP_ADDRLIST_SIZE(adlistp->al_naddr)) + return (B_FALSE); + break; + case IPMP_IFLIST: iflistp = (ipmp_iflist_t *)value; - if (len < IPMP_IFLIST_MINSIZE || + if (len < IPMP_IFLIST_SIZE(0) || len < IPMP_IFLIST_SIZE(iflistp->il_nif)) return (B_FALSE); @@ -326,7 +449,7 @@ ipmp_checktlv(ipmp_infotype_t type, size_t len, void *value) case IPMP_GROUPLIST: grlistp = (ipmp_grouplist_t *)value; - if (len < IPMP_GROUPLIST_MINSIZE || + if (len < IPMP_GROUPLIST_SIZE(0) || len < IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup)) return (B_FALSE); @@ -357,9 +480,8 @@ ipmp_checktlv(ipmp_infotype_t type, size_t len, void *value) } /* - * Create a group list with signature `sig' containing `ngroup' groups named - * by `groups'. Returns a pointer to the new group list on success, or NULL - * on failure. + * Create a group list; arguments match ipmp_grouplist_t fields. Returns a + * pointer to the new group list on success, or NULL on failure. */ ipmp_grouplist_t * ipmp_grouplist_create(uint64_t sig, unsigned int ngroup, @@ -392,13 +514,80 @@ ipmp_grouplist_clone(ipmp_grouplist_t *grlistp) } /* - * Create an interface information structure for interface `name' and - * associate `group', `state' and `type' with it. Returns a pointer to the - * interface information on success, or NULL on failure. + * Create target information; arguments match ipmp_targinfo_t fields. Returns + * a pointer to the new target info on success, or NULL on failure. 
+ */ +ipmp_targinfo_t * +ipmp_targinfo_create(const char *name, struct sockaddr_storage *testaddrp, + ipmp_if_targmode_t targmode, uint_t ntarg, struct sockaddr_storage *targs) +{ + ipmp_targinfo_t *targinfop; + + targinfop = malloc(sizeof (ipmp_targinfo_t)); + if (targinfop == NULL) + return (NULL); + + targinfop->it_testaddr = *testaddrp; + targinfop->it_targmode = targmode; + targinfop->it_targlistp = ipmp_addrlist_create(ntarg, targs); + if (targinfop->it_targlistp == NULL) { + ipmp_freetarginfo(targinfop); + return (NULL); + } + (void) strlcpy(targinfop->it_name, name, LIFNAMSIZ); + + return (targinfop); +} + +/* + * Free the target information pointed to by `targinfop'. + */ +void +ipmp_freetarginfo(ipmp_targinfo_t *targinfop) +{ + free(targinfop->it_targlistp); + free(targinfop); +} + +/* + * Create an interface list; arguments match ipmp_iflist_t fields. Returns a + * pointer to the new interface list on success, or NULL on failure. + */ +static ipmp_iflist_t * +ipmp_iflist_create(uint_t nif, char (*ifs)[LIFNAMSIZ]) +{ + unsigned int i; + ipmp_iflist_t *iflistp; + + iflistp = malloc(IPMP_IFLIST_SIZE(nif)); + if (iflistp == NULL) + return (NULL); + + iflistp->il_nif = nif; + for (i = 0; i < nif; i++) + (void) strlcpy(iflistp->il_ifs[i], ifs[i], LIFNAMSIZ); + + return (iflistp); +} + +/* + * Free the interface list pointed to by `iflistp'. + */ +static void +ipmp_freeiflist(ipmp_iflist_t *iflistp) +{ + free(iflistp); +} + +/* + * Create an interface; arguments match ipmp_ifinfo_t fields. Returns a + * pointer to the new interface on success, or NULL on failure. 
*/ ipmp_ifinfo_t * ipmp_ifinfo_create(const char *name, const char *group, ipmp_if_state_t state, - ipmp_if_type_t type) + ipmp_if_type_t type, ipmp_if_linkstate_t linkstate, + ipmp_if_probestate_t probestate, ipmp_if_flags_t flags, + ipmp_targinfo_t *targinfo4p, ipmp_targinfo_t *targinfo6p) { ipmp_ifinfo_t *ifinfop; @@ -408,8 +597,25 @@ ipmp_ifinfo_create(const char *name, const char *group, ipmp_if_state_t state, (void) strlcpy(ifinfop->if_name, name, LIFNAMSIZ); (void) strlcpy(ifinfop->if_group, group, LIFGRNAMSIZ); - ifinfop->if_state = state; - ifinfop->if_type = type; + + ifinfop->if_state = state; + ifinfop->if_type = type; + ifinfop->if_linkstate = linkstate; + ifinfop->if_probestate = probestate; + ifinfop->if_flags = flags; + ifinfop->if_targinfo4 = *targinfo4p; + ifinfop->if_targinfo6 = *targinfo6p; + + ifinfop->if_targinfo4.it_targlistp = + ipmp_addrlist_clone(targinfo4p->it_targlistp); + ifinfop->if_targinfo6.it_targlistp = + ipmp_addrlist_clone(targinfo6p->it_targlistp); + + if (ifinfop->if_targinfo4.it_targlistp == NULL || + ifinfop->if_targinfo6.it_targlistp == NULL) { + ipmp_freeifinfo(ifinfop); + return (NULL); + } return (ifinfop); } @@ -422,40 +628,41 @@ ipmp_ifinfo_t * ipmp_ifinfo_clone(ipmp_ifinfo_t *ifinfop) { return (ipmp_ifinfo_create(ifinfop->if_name, ifinfop->if_group, - ifinfop->if_state, ifinfop->if_type)); + ifinfop->if_state, ifinfop->if_type, ifinfop->if_linkstate, + ifinfop->if_probestate, ifinfop->if_flags, &ifinfop->if_targinfo4, + &ifinfop->if_targinfo6)); } /* - * Create a group named `name' with signature `sig', in state `state', and - * with the `nif' interfaces named by `ifs' as members. Returns a pointer + * Create a group; arguments match ipmp_groupinfo_t fields. Returns a pointer * to the new group on success, or NULL on failure. 
*/ ipmp_groupinfo_t * -ipmp_groupinfo_create(const char *name, uint64_t sig, ipmp_group_state_t state, - unsigned int nif, char (*ifs)[LIFNAMSIZ]) +ipmp_groupinfo_create(const char *name, uint64_t sig, uint_t fdt, + ipmp_group_state_t state, uint_t nif, char (*ifs)[LIFNAMSIZ], + const char *grifname, const char *m4ifname, const char *m6ifname, + const char *bcifname, uint_t naddr, struct sockaddr_storage *addrs) { ipmp_groupinfo_t *grinfop; - ipmp_iflist_t *iflistp; - unsigned int i; grinfop = malloc(sizeof (ipmp_groupinfo_t)); if (grinfop == NULL) return (NULL); - iflistp = malloc(IPMP_IFLIST_SIZE(nif)); - if (iflistp == NULL) { - free(grinfop); + grinfop->gr_sig = sig; + grinfop->gr_fdt = fdt; + grinfop->gr_state = state; + grinfop->gr_iflistp = ipmp_iflist_create(nif, ifs); + grinfop->gr_adlistp = ipmp_addrlist_create(naddr, addrs); + if (grinfop->gr_iflistp == NULL || grinfop->gr_adlistp == NULL) { + ipmp_freegroupinfo(grinfop); return (NULL); } - - grinfop->gr_sig = sig; - grinfop->gr_state = state; - grinfop->gr_iflistp = iflistp; (void) strlcpy(grinfop->gr_name, name, LIFGRNAMSIZ); - - iflistp->il_nif = nif; - for (i = 0; i < nif; i++) - (void) strlcpy(iflistp->il_ifs[i], ifs[i], LIFNAMSIZ); + (void) strlcpy(grinfop->gr_ifname, grifname, LIFNAMSIZ); + (void) strlcpy(grinfop->gr_m4ifname, m4ifname, LIFNAMSIZ); + (void) strlcpy(grinfop->gr_m6ifname, m6ifname, LIFNAMSIZ); + (void) strlcpy(grinfop->gr_bcifname, bcifname, LIFNAMSIZ); return (grinfop); } @@ -467,9 +674,86 @@ ipmp_groupinfo_create(const char *name, uint64_t sig, ipmp_group_state_t state, ipmp_groupinfo_t * ipmp_groupinfo_clone(ipmp_groupinfo_t *grinfop) { + ipmp_addrlist_t *adlistp = grinfop->gr_adlistp; + return (ipmp_groupinfo_create(grinfop->gr_name, grinfop->gr_sig, - grinfop->gr_state, grinfop->gr_iflistp->il_nif, - grinfop->gr_iflistp->il_ifs)); + grinfop->gr_fdt, grinfop->gr_state, grinfop->gr_iflistp->il_nif, + grinfop->gr_iflistp->il_ifs, grinfop->gr_ifname, + grinfop->gr_m4ifname, 
grinfop->gr_m6ifname, grinfop->gr_bcifname, + adlistp->al_naddr, adlistp->al_addrs)); +} + +/* + * Create an address list; arguments match ipmp_addrlist_t fields. Returns + * a pointer to the new address list on success, or NULL on failure. + */ +static ipmp_addrlist_t * +ipmp_addrlist_create(uint_t naddr, struct sockaddr_storage *addrs) +{ + unsigned int i; + ipmp_addrlist_t *adlistp; + + adlistp = malloc(IPMP_ADDRLIST_SIZE(naddr)); + if (adlistp == NULL) + return (NULL); + + adlistp->al_naddr = naddr; + for (i = 0; i < naddr; i++) + adlistp->al_addrs[i] = addrs[i]; + + return (adlistp); +} + +/* + * Clone the address list named by `adlistp'. Returns a pointer to the clone + * on success, or NULL on failure. + */ +static ipmp_addrlist_t * +ipmp_addrlist_clone(ipmp_addrlist_t *adlistp) +{ + return (ipmp_addrlist_create(adlistp->al_naddr, adlistp->al_addrs)); +} + +/* + * Free the address list pointed to by `adlistp'. + */ +static void +ipmp_freeaddrlist(ipmp_addrlist_t *adlistp) +{ + free(adlistp); +} + +/* + * Create an address; arguments match ipmp_addrinfo_t fields. Returns a + * pointer to the new address on success, or NULL on failure. + */ +ipmp_addrinfo_t * +ipmp_addrinfo_create(struct sockaddr_storage *addrp, ipmp_addr_state_t state, + const char *group, const char *binding) +{ + ipmp_addrinfo_t *adinfop; + + adinfop = malloc(sizeof (ipmp_addrinfo_t)); + if (adinfop == NULL) + return (NULL); + + adinfop->ad_addr = *addrp; + adinfop->ad_state = state; + (void) strlcpy(adinfop->ad_group, group, LIFGRNAMSIZ); + (void) strlcpy(adinfop->ad_binding, binding, LIFNAMSIZ); + + return (adinfop); +} + +/* + * Clone the address information named by `adinfop'. Returns a pointer to + * the clone on success, or NULL on failure. 
+ */ +ipmp_addrinfo_t * +ipmp_addrinfo_clone(ipmp_addrinfo_t *adinfop) +{ + return (ipmp_addrinfo_create(&adinfop->ad_addr, adinfop->ad_state, + adinfop->ad_group, adinfop->ad_binding)); } /* @@ -523,8 +807,10 @@ ipmp_snap_create(void) snap->sn_grlistp = NULL; snap->sn_grinfolistp = NULL; snap->sn_ifinfolistp = NULL; + snap->sn_adinfolistp = NULL; snap->sn_ngroup = 0; snap->sn_nif = 0; + snap->sn_naddr = 0; return (snap); } @@ -536,6 +822,7 @@ void ipmp_snap_free(ipmp_snap_t *snap) { ipmp_ifinfolist_t *iflp, *ifnext; + ipmp_addrinfolist_t *adlp, *adnext; ipmp_groupinfolist_t *grlp, *grnext; ipmp_freegrouplist(snap->sn_grlistp); @@ -552,6 +839,12 @@ ipmp_snap_free(ipmp_snap_t *snap) free(iflp); } + for (adlp = snap->sn_adinfolistp; adlp != NULL; adlp = adnext) { + adnext = adlp->adl_next; + ipmp_freeaddrinfo(adlp->adl_adinfop); + free(adlp); + } + free(snap); } @@ -612,6 +905,34 @@ ipmp_snap_addifinfo(ipmp_snap_t *snap, ipmp_ifinfo_t *ifinfop) } /* + * Add the address information in `adinfop' to the snapshot named by `snap'. + * Returns an IPMP error code. + */ +int +ipmp_snap_addaddrinfo(ipmp_snap_t *snap, ipmp_addrinfo_t *adinfop) +{ + ipmp_addrinfolist_t *adlp; + + /* + * Any duplicate addresses should've already been weeded by in.mpathd. + */ + if (ipmp_snap_getaddrinfo(snap, adinfop->ad_group, + &adinfop->ad_addr) != NULL) + return (IPMP_EPROTO); + + adlp = malloc(sizeof (ipmp_addrinfolist_t)); + if (adlp == NULL) + return (IPMP_ENOMEM); + + adlp->adl_adinfop = adinfop; + adlp->adl_next = snap->sn_adinfolistp; + snap->sn_adinfolistp = adlp; + snap->sn_naddr++; + + return (IPMP_SUCCESS); +} + +/* * Retrieve the information for the group `name' in snapshot `snap'. * Returns a pointer to the group information on success, or NULL on failure. */ @@ -647,6 +968,26 @@ ipmp_snap_getifinfo(ipmp_snap_t *snap, const char *name) } /* + * Retrieve the information for the address `addrp' on group `grname' in + * snapshot `snap'. 
Returns a pointer to the address information on success, + * or NULL on failure. + */ +static ipmp_addrinfo_t * +ipmp_snap_getaddrinfo(ipmp_snap_t *snap, const char *grname, + struct sockaddr_storage *addrp) +{ + ipmp_addrinfolist_t *adlp; + + for (adlp = snap->sn_adinfolistp; adlp != NULL; adlp = adlp->adl_next) { + if (strcmp(grname, adlp->adl_adinfop->ad_group) == 0 && + sockaddrcmp(addrp, &adlp->adl_adinfop->ad_addr)) + break; + } + + return (adlp != NULL ? adlp->adl_adinfop : NULL); +} + +/* * Using `statep', take a snapshot of the IPMP subsystem and if successful * return it in a dynamically allocated snapshot pointed to by `*snapp'. * Returns an IPMP error code. @@ -656,7 +997,6 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp) { ipmp_snap_t *snap, *osnap; ipmp_infotype_t type; - ipmp_iflist_t *iflistp; int retval; size_t len; void *infop; @@ -666,7 +1006,7 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp) if (snap == NULL) return (IPMP_ENOMEM); - retval = ipmp_sendquery(statep, IPMP_SNAP, NULL, &end); + retval = ipmp_sendquery(statep, IPMP_SNAP, NULL, NULL, &end); if (retval != IPMP_SUCCESS) { ipmp_snap_free(snap); return (retval); @@ -679,12 +1019,11 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp) } /* - * Using the information in the passed `osnap' snapshot, build up our - * own snapshot. If we receive more than one grouplist, or more than - * the expected number of interfaces or groups, then bail out. Note - * that there's only so much we can do to check that the information - * sent by in.mpathd makes sense. We know there will always be at - * least one TLV (IPMP_GROUPLIST). + * Using the information in the `osnap' snapshot, build up our own + * snapshot. We know there will always be at least one TLV (for + * IPMP_GROUPLIST). If we receive anything illogical (e.g., more than + * the expected number of interfaces), then bail out. However, to a + * large extent we have to trust the information sent by in.mpathd. 
*/ do { infop = NULL; @@ -711,7 +1050,32 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp) retval = IPMP_EPROTO; break; } + + /* + * Read in V4 and V6 targlist TLVs that follow. + */ + retval = ipmp_readifinfo_lists(statep, infop, &end); + if (retval != IPMP_SUCCESS) + break; + retval = ipmp_snap_addifinfo(snap, infop); + if (retval != IPMP_SUCCESS) { + ipmp_freeifinfo(infop); + infop = NULL; + } + break; + + case IPMP_ADDRINFO: + if (snap->sn_naddr == osnap->sn_naddr) { + retval = IPMP_EPROTO; + break; + } + + retval = ipmp_snap_addaddrinfo(snap, infop); + /* + * NOTE: since we didn't call ipmp_read*info_lists(), + * no need to use ipmp_freeaddrinfo() on failure. + */ break; case IPMP_GROUPINFO: @@ -721,18 +1085,17 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp) } /* - * An IPMP_IFLIST TLV always follows the - * IPMP_GROUPINFO TLV; read it in. + * Read in IPMP groupinfo list TLVs that follow. */ - retval = ipmp_readinfo(statep, IPMP_IFLIST, - (void **)&iflistp, &end); + retval = ipmp_readgroupinfo_lists(statep, infop, &end); if (retval != IPMP_SUCCESS) break; - ((ipmp_groupinfo_t *)infop)->gr_iflistp = iflistp; retval = ipmp_snap_addgroupinfo(snap, infop); - if (retval != IPMP_SUCCESS) - free(iflistp); + if (retval != IPMP_SUCCESS) { + ipmp_freegroupinfo(infop); + infop = NULL; + } break; default: @@ -747,7 +1110,8 @@ fail: return (ipmp_querydone(statep, retval)); } } while (snap->sn_grlistp == NULL || snap->sn_nif < osnap->sn_nif || - snap->sn_ngroup < osnap->sn_ngroup); + snap->sn_ngroup < osnap->sn_ngroup || + snap->sn_naddr < osnap->sn_naddr); free(osnap); *snapp = snap; diff --git a/usr/src/lib/libipmp/common/ipmp_query.h b/usr/src/lib/libipmp/common/ipmp_query.h index d92554887a..160f561dd2 100644 --- a/usr/src/lib/libipmp/common/ipmp_query.h +++ b/usr/src/lib/libipmp/common/ipmp_query.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, 
Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -18,17 +17,14 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IPMP_QUERY_H #define _IPMP_QUERY_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/socket.h> /* needed by <net/if.h> */ #include <net/if.h> /* for LIF*NAMSIZ */ @@ -38,7 +34,7 @@ * IPMP query interfaces. * * These interfaces may only be used within ON or after signing a contract - * with ON. For documentation, refer to PSARC/2002/615. + * with ON. For documentation, refer to PSARC/2002/615 and PSARC/2007/272. */ #ifdef __cplusplus @@ -46,6 +42,43 @@ extern "C" { #endif /* + * Assorted enumerations used in the data types described below. 
+ */ +typedef enum ipmp_if_probestate { + IPMP_PROBE_OK, /* probes detect no problems */ + IPMP_PROBE_FAILED, /* probes detect failure */ + IPMP_PROBE_UNKNOWN, /* probe detection unavailable */ + IPMP_PROBE_DISABLED /* probe detection disabled */ +} ipmp_if_probestate_t; + +typedef enum ipmp_if_linkstate { + IPMP_LINK_UP, /* link detects up */ + IPMP_LINK_DOWN, /* link detects down */ + IPMP_LINK_UNKNOWN /* link detection unavailable */ +} ipmp_if_linkstate_t; + +typedef enum ipmp_if_flags { + IPMP_IFFLAG_INACTIVE = 0x1, + IPMP_IFFLAG_HWADDRDUP = 0x2, + IPMP_IFFLAG_ACTIVE = 0x4, + IPMP_IFFLAG_DOWN = 0x8 +} ipmp_if_flags_t; + +typedef enum ipmp_addr_state { + IPMP_ADDR_UP, /* address is up */ + IPMP_ADDR_DOWN /* address is down */ +} ipmp_addr_state_t; + +typedef enum ipmp_if_targmode { + IPMP_TARG_DISABLED, /* use of targets is disabled */ + IPMP_TARG_ROUTES, /* route-learned targets */ + IPMP_TARG_MULTICAST /* multicast-learned targets */ +} ipmp_if_targmode_t; + +#define IPMP_LIST_SIZE(listtype, elsize, nel) \ + ((sizeof (ipmp_ ## listtype ## _t) - (elsize)) + ((nel) * (elsize))) + +/* * Data type describing a list of IPMP groups. */ typedef struct ipmp_grouplist { @@ -54,8 +87,8 @@ typedef struct ipmp_grouplist { char gl_groups[1][LIFGRNAMSIZ]; } ipmp_grouplist_t; -#define IPMP_GROUPLIST_MINSIZE (sizeof (ipmp_grouplist_t) - LIFGRNAMSIZ) -#define IPMP_GROUPLIST_SIZE(ngr) (IPMP_GROUPLIST_MINSIZE + (ngr) * LIFGRNAMSIZ) +#define IPMP_GROUPLIST_SIZE(ngr) \ + IPMP_LIST_SIZE(grouplist, LIFGRNAMSIZ, ngr) /* * Data type describing a list of interfaces. @@ -65,8 +98,19 @@ typedef struct ipmp_iflist { char il_ifs[1][LIFNAMSIZ]; } ipmp_iflist_t; -#define IPMP_IFLIST_MINSIZE (sizeof (ipmp_iflist_t) - LIFNAMSIZ) -#define IPMP_IFLIST_SIZE(nif) (IPMP_IFLIST_MINSIZE + (nif) * LIFNAMSIZ) +#define IPMP_IFLIST_SIZE(nif) \ + IPMP_LIST_SIZE(iflist, LIFNAMSIZ, nif) + +/* + * Data type describing a list of addresses. 
+ */ +typedef struct ipmp_addrlist { + unsigned int al_naddr; + struct sockaddr_storage al_addrs[1]; +} ipmp_addrlist_t; + +#define IPMP_ADDRLIST_SIZE(naddr) \ + IPMP_LIST_SIZE(addrlist, sizeof (struct sockaddr_storage), naddr) /* * Data type describing the state of an IPMP group. @@ -76,18 +120,49 @@ typedef struct ipmp_groupinfo { uint64_t gr_sig; ipmp_group_state_t gr_state; ipmp_iflist_t *gr_iflistp; + ipmp_addrlist_t *gr_adlistp; + char gr_ifname[LIFNAMSIZ]; + char gr_m4ifname[LIFNAMSIZ]; + char gr_m6ifname[LIFNAMSIZ]; + char gr_bcifname[LIFNAMSIZ]; + unsigned int gr_fdt; } ipmp_groupinfo_t; /* + * Data type describing IPMP target information for a particular interface. + */ +typedef struct ipmp_targinfo { + char it_name[LIFNAMSIZ]; + struct sockaddr_storage it_testaddr; + ipmp_if_targmode_t it_targmode; + ipmp_addrlist_t *it_targlistp; +} ipmp_targinfo_t; + +/* * Data type describing the IPMP-related state of an interface. */ typedef struct ipmp_ifinfo { - char if_name[LIFNAMSIZ]; - char if_group[LIFGRNAMSIZ]; - ipmp_if_state_t if_state; - ipmp_if_type_t if_type; + char if_name[LIFNAMSIZ]; + char if_group[LIFGRNAMSIZ]; + ipmp_if_state_t if_state; + ipmp_if_type_t if_type; + ipmp_if_linkstate_t if_linkstate; + ipmp_if_probestate_t if_probestate; + ipmp_if_flags_t if_flags; + ipmp_targinfo_t if_targinfo4; + ipmp_targinfo_t if_targinfo6; } ipmp_ifinfo_t; +/* + * Data type describing an IPMP data address. 
+ */ +typedef struct ipmp_addrinfo { + struct sockaddr_storage ad_addr; + ipmp_addr_state_t ad_state; + char ad_group[LIFGRNAMSIZ]; + char ad_binding[LIFNAMSIZ]; +} ipmp_addrinfo_t; + typedef enum { IPMP_QCONTEXT_LIVE, IPMP_QCONTEXT_SNAP @@ -100,6 +175,9 @@ extern int ipmp_getgroupinfo(ipmp_handle_t, const char *, ipmp_groupinfo_t **); extern void ipmp_freegroupinfo(ipmp_groupinfo_t *); extern int ipmp_getifinfo(ipmp_handle_t, const char *, ipmp_ifinfo_t **); extern void ipmp_freeifinfo(ipmp_ifinfo_t *); +extern int ipmp_getaddrinfo(ipmp_handle_t, const char *, + struct sockaddr_storage *, ipmp_addrinfo_t **); +extern void ipmp_freeaddrinfo(ipmp_addrinfo_t *); #ifdef __cplusplus } diff --git a/usr/src/lib/libipmp/common/ipmp_query_impl.h b/usr/src/lib/libipmp/common/ipmp_query_impl.h index 03ecb5cd84..6ac5c3ca27 100644 --- a/usr/src/lib/libipmp/common/ipmp_query_impl.h +++ b/usr/src/lib/libipmp/common/ipmp_query_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -18,17 +17,14 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IPMP_QUERY_IMPL_H #define _IPMP_QUERY_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <ipmp_query.h> /* @@ -58,14 +54,24 @@ typedef struct ipmp_ifinfolist { } ipmp_ifinfolist_t; /* + * List of ipmp_addrinfo_t structures. 
+ */ +typedef struct ipmp_addrinfolist { + struct ipmp_addrinfolist *adl_next; + ipmp_addrinfo_t *adl_adinfop; +} ipmp_addrinfolist_t; + +/* * Snapshot of IPMP state. */ typedef struct ipmp_snap { ipmp_grouplist_t *sn_grlistp; ipmp_groupinfolist_t *sn_grinfolistp; ipmp_ifinfolist_t *sn_ifinfolistp; + ipmp_addrinfolist_t *sn_adinfolistp; unsigned int sn_ngroup; unsigned int sn_nif; + unsigned int sn_naddr; } ipmp_snap_t; /* @@ -74,17 +80,28 @@ typedef struct ipmp_snap { extern ipmp_snap_t *ipmp_snap_create(void); extern void ipmp_snap_free(ipmp_snap_t *); extern int ipmp_snap_addifinfo(ipmp_snap_t *, ipmp_ifinfo_t *); +extern int ipmp_snap_addaddrinfo(ipmp_snap_t *, ipmp_addrinfo_t *); extern int ipmp_snap_addgroupinfo(ipmp_snap_t *, ipmp_groupinfo_t *); /* - * IPMP structure creation routines. + * IPMP structure creation/destruction routines. */ extern ipmp_ifinfo_t *ipmp_ifinfo_create(const char *, const char *, - ipmp_if_state_t, ipmp_if_type_t); -extern ipmp_groupinfo_t *ipmp_groupinfo_create(const char *, uint64_t, - ipmp_group_state_t, unsigned int, char (*)[LIFNAMSIZ]); + ipmp_if_state_t, ipmp_if_type_t, ipmp_if_linkstate_t, ipmp_if_probestate_t, + ipmp_if_flags_t, ipmp_targinfo_t *, ipmp_targinfo_t *); +extern ipmp_groupinfo_t *ipmp_groupinfo_create(const char *, uint64_t, uint_t, + ipmp_group_state_t, uint_t, char (*)[LIFNAMSIZ], const char *, + const char *, const char *, const char *, uint_t, + struct sockaddr_storage *); extern ipmp_grouplist_t *ipmp_grouplist_create(uint64_t, unsigned int, char (*)[LIFGRNAMSIZ]); +extern ipmp_addrinfo_t *ipmp_addrinfo_create(struct sockaddr_storage *, + ipmp_addr_state_t, const char *, const char *); +extern ipmp_targinfo_t *ipmp_targinfo_create(const char *, + struct sockaddr_storage *, ipmp_if_targmode_t, uint_t, + struct sockaddr_storage *); +extern void ipmp_freetarginfo(ipmp_targinfo_t *); + #ifdef __cplusplus } diff --git a/usr/src/lib/libipmp/common/llib-lipmp b/usr/src/lib/libipmp/common/llib-lipmp index 
a16011745a..a22eec5d66 100644 --- a/usr/src/lib/libipmp/common/llib-lipmp +++ b/usr/src/lib/libipmp/common/llib-lipmp @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* LINTLIBRARY */ /* PROTOLIB1 */ #include <ipmp.h> +#include <ipmp_admin.h> #include <ipmp_mpathd.h> #include <ipmp_query_impl.h> diff --git a/usr/src/lib/libipmp/common/mapfile-vers b/usr/src/lib/libipmp/common/mapfile-vers index a4052bfcd3..8c93248338 100644 --- a/usr/src/lib/libipmp/common/mapfile-vers +++ b/usr/src/lib/libipmp/common/mapfile-vers @@ -19,32 +19,39 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "%Z%%M% %I% %E% SMI" -# SUNWprivate_1.1 { global: + ipmp_addrinfo_create; ipmp_close; ipmp_errmsg; + ipmp_freeaddrinfo; ipmp_freegroupinfo; ipmp_freegrouplist; ipmp_freeifinfo; + ipmp_freetarginfo; + ipmp_getaddrinfo; ipmp_getgroupinfo; ipmp_getgrouplist; ipmp_getifinfo; ipmp_groupinfo_create; ipmp_grouplist_create; ipmp_ifinfo_create; + ipmp_offline; ipmp_open; + ipmp_ping_daemon; ipmp_read; ipmp_setqcontext; + ipmp_snap_addaddrinfo; ipmp_snap_addgroupinfo; ipmp_snap_addifinfo; ipmp_snap_create; ipmp_snap_free; + ipmp_targinfo_create; + ipmp_undo_offline; ipmp_write; ipmp_writetlv; local: diff --git a/usr/src/lib/libnsl/nss/netdir_inet_sundry.c b/usr/src/lib/libnsl/nss/netdir_inet_sundry.c index 742e7408b2..4e9473a8cf 100644 --- a/usr/src/lib/libnsl/nss/netdir_inet_sundry.c +++ b/usr/src/lib/libnsl/nss/netdir_inet_sundry.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -21,7 +20,7 @@ */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * lib/libnsl/nss/netdir_inet_sundry.c @@ -39,8 +38,6 @@ * Copied mostly from erstwhile lib/nametoaddr/tcpip/tcpip.c. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mt.h" #include <stdlib.h> #include <stdio.h> @@ -69,9 +66,6 @@ #include <syslog.h> #include <values.h> #include <limits.h> -#ifdef DEBUG -#include <stdio.h> -#endif #include <nss_dbdefs.h> #include "nss.h" @@ -151,8 +145,8 @@ __inet_taddr2uaddr(struct netconfig *tp, struct netbuf *addr) /* LINTED pointer cast */ sa6 = (struct sockaddr_in6 *)(addr->buf); myport = ntohs(sa6->sin6_port); - if (inet_ntop(AF_INET6, (void *)sa6->sin6_addr.s6_addr, - tmp, sizeof (tmp)) == 0) { + if (inet_ntop(AF_INET6, sa6->sin6_addr.s6_addr, tmp, + sizeof (tmp)) == NULL) { _nderror = ND_BADARG; return (NULL); } @@ -400,7 +394,7 @@ getifnum: continue; if_info[n_ifs].if_address = - ((struct sockaddr_in *)&lifr->lifr_addr)->sin_addr; + ((struct sockaddr_in *)&lifr->lifr_addr)->sin_addr; if (nss_ioctl(AF_INET, SIOCGLIFFLAGS, lifr) < 0) continue; @@ -413,7 +407,7 @@ getifnum: continue; if_info[n_ifs].if_netmask = - ((struct sockaddr_in *)&lifr->lifr_addr)->sin_addr; + ((struct sockaddr_in *)&lifr->lifr_addr)->sin_addr; n_ifs++; } free(buf); @@ -528,21 +522,12 @@ get_best_match(struct in_addr addr) if_addr = ntohl(ifn->if_address.s_addr); /* host order */ /* - * Checking if the interface selected is FAILED or DEPRECATED. - * In case IFF_FAILED or IFF_DEPRECATED flag for the interface - * is set, we move on to the next interface in the list. - * Refer IPMP(IP Multi Pathing) for more details. - */ - - if ((ifn->if_flags & (IFF_FAILED | IFF_DEPRECATED)) != 0) - continue; - - /* * set initial count to first bit set in netmask, with * zero being the number of the least significant bit. 
*/ - for (count = 0, mask = netmask; mask && ((mask & 1) == 0); - count++, mask >>= 1); + count = 0; + for (mask = netmask; mask && ((mask & 1) == 0); mask >>= 1) + count++; /* * Set limit so that we don't try to match prefixes shorter @@ -570,12 +555,6 @@ get_best_match(struct in_addr addr) * (2) the best partial subnet match * (3) the first non-loopback && non-PPP interface * (4) the first non-loopback interface (PPP is OK) - * - * While checking for condition (3) and (4), we also look - * if the interface we are returning is neither FAILED - * nor DEPRECATED. In case there are no interface - * available, which are neither FAILED nor DEPRECRATED, - * we return 0. */ found = FALSE; while (netmask && count < subnet_count) { @@ -607,8 +586,7 @@ get_best_match(struct in_addr addr) */ if (bestmatch == NULL) { for (ifn = if_info; ifn < (if_info + n_ifs); ifn++) { - if ((ifn->if_flags & (IFF_LOOPBACK | - IFF_FAILED | IFF_DEPRECATED)) == 0) { + if ((ifn->if_flags & IFF_LOOPBACK) == 0) { bestmatch = ifn; /* @@ -619,10 +597,6 @@ get_best_match(struct in_addr addr) * list... */ if ((ifn->if_flags & IFF_POINTOPOINT) == 0) { -#ifdef DEBUG - (void) printf("found !loopback && !non-PPP interface: %s\n", - inet_ntoa(ifn->if_address)); -#endif break; } } @@ -701,9 +675,9 @@ select_server_addr(union any_in_addr *dst_addr, int family, } /* open a UDP socket */ - if ((tmp_fd = _so_socket(family, SOCK_DGRAM, 0, - NULL, SOV_SOCKBSD)) < 0) { - syslog(LOG_ERR, "selsect_server_addr:connect failed\n"); + tmp_fd = _so_socket(family, SOCK_DGRAM, 0, NULL, SOV_SOCKBSD); + if (tmp_fd < 0) { + syslog(LOG_ERR, "select_server_addr: connect failed\n"); return (FALSE); } @@ -716,15 +690,16 @@ select_server_addr(union any_in_addr *dst_addr, int family, * message, as it'll try to send the probe packet out and will * receive ICMP unreachable. 
*/ - if (family == AF_INET) + if (family == AF_INET) { src_addr->addr.s_addr = INADDR_ANY; - else + } else { /* * Since in6addr_any is not in the scope * use the following hack */ (void) memset(src_addr->addr6.s6_addr, - 0, sizeof (struct in6_addr)); + 0, sizeof (struct in6_addr)); + } (void) close(tmp_fd); free(sock); return (FALSE); @@ -732,7 +707,7 @@ select_server_addr(union any_in_addr *dst_addr, int family, /* get the local sock info */ if (_so_getsockname(tmp_fd, sock, &sock_len, SOV_DEFAULT) < 0) { - syslog(LOG_ERR, "selsect_server_addr:getsockname failed\n"); + syslog(LOG_ERR, "select_server_addr: getsockname failed\n"); (void) close(tmp_fd); free(sock); return (FALSE); @@ -799,11 +774,6 @@ inet_netdir_mergeaddr(struct netconfig *tp, char *ruaddr, char *uaddr) clientaddr.s_addr = inet_addr(ruaddr); -#ifdef DEBUG - (void) printf("client's address is %s and %s\n", - ruaddr, inet_ntoa(clientaddr)); -#endif - /* We know cp is not NULL due to the check above */ *cp = '.'; /* Put the dot back in the IP addr */ @@ -895,28 +865,22 @@ inet_netdir_mergeaddr(struct netconfig *tp, char *ruaddr, char *uaddr) FALSE) return (NULL); server_addr.sin6_addr = out_addr.addr6; + } else { + (void) memcpy(&server_addr, &sa, sizeof (server_addr)); } - else - (void) memcpy(&server_addr, &sa, - sizeof (struct sockaddr_in6)); -#ifdef DEBUG - printf("%s\n", inet_ntop(af, out_addr.addr6.s6_addr, - tmp, sizeof (tmp))); -#endif - - if (inet_ntop(af, server_addr.sin6_addr.s6_addr, - tmp, sizeof (tmp)) == NULL) { + + if (inet_ntop(af, server_addr.sin6_addr.s6_addr, tmp, + sizeof (tmp)) == NULL) { _nderror = ND_NOHOST; return (NULL); } /* now extract the port info */ if ((dot = strrchr(uaddr, '.')) != 0) { + char *p = --dot; - char *p; - - p = --dot; - while (*p-- != '.'); + while (*p-- != '.') + ; p++; (void) strcat(tmp + strlen(tmp), p); _nderror = ND_OK; @@ -1051,7 +1015,7 @@ bindresvport(struct netconfig *nconf, int fd, struct netbuf *addr) * this, if the caller has set this option 
before calling * bindresvport(), it will be unset. Better be safe... */ - *optval = 0; + *optval = 0; resp.flags = 0; resp.opt.buf = (char *)reqbuf; resp.opt.maxlen = sizeof (reqbuf); diff --git a/usr/src/lib/libsocket/inet/interface_id.c b/usr/src/lib/libsocket/inet/interface_id.c index 2a512b025f..88854fe9da 100644 --- a/usr/src/lib/libsocket/inet/interface_id.c +++ b/usr/src/lib/libsocket/inet/interface_id.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdio.h> #include <ctype.h> #include <string.h> @@ -120,6 +117,9 @@ if_indextoname(uint32_t ifindex, char *ifname) int numifs; size_t bufsize; boolean_t found; + uint_t flags; + + flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES | LIFC_UNDER_IPMP; /* A interface index of 0 is invalid */ if (ifindex == 0) { @@ -137,14 +137,19 @@ if_indextoname(uint32_t ifindex, char *ifname) /* Prepare to send a SIOCGLIFNUM request message */ lifn.lifn_family = AF_UNSPEC; - lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; + lifn.lifn_flags = flags; if (ioctl(s, SIOCGLIFNUM, (char *)&lifn) < 0) { int save_err = errno; (void) close(s); errno = save_err; return (NULL); } - numifs = lifn.lifn_count; + + /* + * NOTE: "+ 10" sleaze mitigates new IP interfaces showing up between + * the SIOCGLIFNUM and the SIOCGLIFCONF. + */ + numifs = lifn.lifn_count + 10; /* * Provide enough buffer to obtain the interface @@ -161,7 +166,7 @@ if_indextoname(uint32_t ifindex, char *ifname) return (NULL); } lifc.lifc_family = AF_UNSPEC; - lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; + lifc.lifc_flags = flags; lifc.lifc_len = bufsize; lifc.lifc_buf = buf; if (ioctl(s, SIOCGLIFCONF, (char *)&lifc) < 0) { diff --git a/usr/src/lib/smbsrv/libsmbns/common/smbns_dyndns.c b/usr/src/lib/smbsrv/libsmbns/common/smbns_dyndns.c index dabc2e0929..62ebedf522 100644 --- a/usr/src/lib/smbsrv/libsmbns/common/smbns_dyndns.c +++ b/usr/src/lib/smbsrv/libsmbns/common/smbns_dyndns.c @@ -1936,7 +1936,7 @@ dyndns_update_core(char *fqdn) return (-1); do { - if (ni.ni_nic.nic_sysflags & (IFF_STANDBY | IFF_PRIVATE)) + if (ni.ni_nic.nic_sysflags & IFF_PRIVATE) continue; addr.s_addr = ni.ni_nic.nic_ip; @@ -2003,7 +2003,7 @@ dyndns_clear_rev_zone(char *fqdn) return (-1); do { - if (ni.ni_nic.nic_sysflags & (IFF_STANDBY | IFF_PRIVATE)) + if (ni.ni_nic.nic_sysflags & IFF_PRIVATE) continue; addr.s_addr = ni.ni_nic.nic_ip; diff --git 
a/usr/src/pkgdefs/SUNWarc/prototype_com b/usr/src/pkgdefs/SUNWarc/prototype_com index e9d6270d88..7e04f8b580 100644 --- a/usr/src/pkgdefs/SUNWarc/prototype_com +++ b/usr/src/pkgdefs/SUNWarc/prototype_com @@ -18,7 +18,7 @@ # # CDDL HEADER END # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -122,8 +122,6 @@ s none usr/lib/llib-lintl=../../lib/llib-lintl s none usr/lib/llib-lintl.ln=../../lib/llib-lintl.ln f none usr/lib/llib-lipmi 644 root bin f none usr/lib/llib-lipmi.ln 644 root bin -f none usr/lib/llib-lipmp 644 root bin -f none usr/lib/llib-lipmp.ln 644 root bin f none usr/lib/llib-lipp 644 root bin f none usr/lib/llib-lipp.ln 644 root bin s none usr/lib/llib-lkstat=../../lib/llib-lkstat diff --git a/usr/src/pkgdefs/SUNWarcr/prototype_com b/usr/src/pkgdefs/SUNWarcr/prototype_com index 6095ff7fe5..852330d742 100644 --- a/usr/src/pkgdefs/SUNWarcr/prototype_com +++ b/usr/src/pkgdefs/SUNWarcr/prototype_com @@ -18,7 +18,7 @@ # # CDDL HEADER END # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -80,6 +80,8 @@ f none lib/llib-lgen 644 root bin f none lib/llib-lgen.ln 644 root bin f none lib/llib-lintl 644 root bin f none lib/llib-lintl.ln 644 root bin +f none lib/llib-lipmp 644 root bin +f none lib/llib-lipmp.ln 644 root bin f none lib/llib-lkmf.ln 644 root bin f none lib/llib-lkmfberder.ln 644 root bin f none lib/llib-lkstat 644 root bin diff --git a/usr/src/pkgdefs/SUNWckr/prototype_com b/usr/src/pkgdefs/SUNWckr/prototype_com index ead3a7e5e8..989847d09d 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_com +++ b/usr/src/pkgdefs/SUNWckr/prototype_com @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# @@ -81,6 +81,7 @@ f none kernel/drv/crypto.conf 644 root sys f none kernel/drv/cryptoadm.conf 644 root sys f none kernel/drv/devinfo.conf 644 root sys f none kernel/drv/dld.conf 644 root sys +f none kernel/drv/dlpistub.conf 644 root sys f none kernel/drv/icmp.conf 644 root sys f none kernel/drv/icmp6.conf 644 root sys f none kernel/drv/ip.conf 644 root sys @@ -123,7 +124,6 @@ f none kernel/drv/tcp6.conf 644 root sys f none kernel/drv/tl.conf 644 root sys f none kernel/drv/udp.conf 644 root sys f none kernel/drv/udp6.conf 644 root sys -f none kernel/drv/vni.conf 644 root sys f none kernel/drv/vnic.conf 644 root sys f none kernel/drv/wc.conf 644 root sys d none kernel/exec 755 root sys diff --git a/usr/src/pkgdefs/SUNWckr/prototype_i386 b/usr/src/pkgdefs/SUNWckr/prototype_i386 index 421d760621..e2972713c6 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_i386 +++ b/usr/src/pkgdefs/SUNWckr/prototype_i386 @@ -86,6 +86,7 @@ f none kernel/drv/crypto 755 root sys f none kernel/drv/cryptoadm 755 root sys f none kernel/drv/devinfo 755 root sys f none kernel/drv/dld 755 root sys +f none kernel/drv/dlpistub 755 root sys f none kernel/drv/i8042 755 root sys f none kernel/drv/icmp 755 root sys f none kernel/drv/icmp6 755 root sys @@ -152,7 +153,6 @@ f none kernel/drv/ucode.conf 644 root sys f none kernel/drv/udp 755 root sys f none kernel/drv/udp6 755 root sys f none kernel/drv/vgatext 755 root sys -f none kernel/drv/vni 755 root sys f none kernel/drv/vnic 755 root sys f none kernel/drv/wc 755 root sys f none kernel/exec/elfexec 755 root sys @@ -308,6 +308,7 @@ f none kernel/drv/amd64/crypto 755 root sys f none kernel/drv/amd64/cryptoadm 755 root sys f none kernel/drv/amd64/devinfo 755 root sys f none kernel/drv/amd64/dld 755 root sys +f none kernel/drv/amd64/dlpistub 755 root sys f none kernel/drv/amd64/i8042 755 root sys f none kernel/drv/amd64/icmp 755 root sys f none kernel/drv/amd64/icmp6 755 root sys @@ -366,7 +367,6 @@ f none kernel/drv/amd64/ucode 755 root sys f none 
kernel/drv/amd64/udp 755 root sys f none kernel/drv/amd64/udp6 755 root sys f none kernel/drv/amd64/vgatext 755 root sys -f none kernel/drv/amd64/vni 755 root sys f none kernel/drv/amd64/vnic 755 root sys f none kernel/drv/amd64/wc 755 root sys d none kernel/exec/amd64 755 root sys diff --git a/usr/src/pkgdefs/SUNWckr/prototype_sparc b/usr/src/pkgdefs/SUNWckr/prototype_sparc index e81a86168e..a8f0b93be0 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_sparc +++ b/usr/src/pkgdefs/SUNWckr/prototype_sparc @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This required package information file contains a list of package contents. @@ -82,6 +82,7 @@ f none kernel/drv/sparcv9/cryptoadm 755 root sys f none kernel/drv/sparcv9/dad 755 root sys f none kernel/drv/sparcv9/devinfo 755 root sys f none kernel/drv/sparcv9/dld 755 root sys +f none kernel/drv/sparcv9/dlpistub 755 root sys f none kernel/drv/sparcv9/esp 755 root sys f none kernel/drv/sparcv9/i8042 755 root sys f none kernel/drv/sparcv9/icmp 755 root sys @@ -137,7 +138,6 @@ f none kernel/drv/sparcv9/ttymux 755 root sys f none kernel/drv/sparcv9/uata 755 root sys f none kernel/drv/sparcv9/udp 755 root sys f none kernel/drv/sparcv9/udp6 755 root sys -f none kernel/drv/sparcv9/vni 755 root sys f none kernel/drv/sparcv9/vnic 755 root sys f none kernel/drv/sparcv9/wc 755 root sys d none kernel/exec/sparcv9 755 root sys diff --git a/usr/src/pkgdefs/SUNWcsd/postinstall b/usr/src/pkgdefs/SUNWcsd/postinstall index b481a763ca..caa9bb3402 100644 --- a/usr/src/pkgdefs/SUNWcsd/postinstall +++ b/usr/src/pkgdefs/SUNWcsd/postinstall @@ -20,7 +20,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# @@ -48,6 +48,7 @@ prototype_com=' devices/pseudo/arp@0:arp dev/arp devices/pseudo/clone@0:ibd dev/ibd devices/pseudo/dld@0:ctl dev/dld +devices/pseudo/dlpistub@0:ipmpstub dev/ipmpstub devices/pseudo/icmp@0:icmp dev/icmp devices/pseudo/icmp@0:icmp dev/rawip devices/pseudo/icmp6@0:icmp6 dev/icmp6 diff --git a/usr/src/pkgdefs/SUNWcsl/prototype_com b/usr/src/pkgdefs/SUNWcsl/prototype_com index a856560c5e..d5918f5883 100644 --- a/usr/src/pkgdefs/SUNWcsl/prototype_com +++ b/usr/src/pkgdefs/SUNWcsl/prototype_com @@ -18,7 +18,7 @@ # # CDDL HEADER END # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This required package information file contains a list of package contents. @@ -144,8 +144,6 @@ s none usr/lib/libintl.so=../../lib/libintl.so.1 s none usr/lib/libintl.so.1=../../lib/libintl.so.1 f none usr/lib/libipmi.so.1 755 root bin s none usr/lib/libipmi.so=./libipmi.so.1 -s none usr/lib/libipmp.so=./libipmp.so.1 -f none usr/lib/libipmp.so.1 755 root bin s none usr/lib/libipp.so=./libipp.so.1 f none usr/lib/libipp.so.1 755 root bin f none usr/lib/libipsecutil.so.1 755 root bin diff --git a/usr/src/pkgdefs/SUNWcslr/prototype_com b/usr/src/pkgdefs/SUNWcslr/prototype_com index ed7059250a..71ebaff013 100644 --- a/usr/src/pkgdefs/SUNWcslr/prototype_com +++ b/usr/src/pkgdefs/SUNWcslr/prototype_com @@ -18,7 +18,7 @@ # # CDDL HEADER END # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This required package information file contains a list of package contents. 
@@ -95,6 +95,8 @@ f none lib/libinetcfg.so.1 755 root bin f none lib/libinetutil.so.1 755 root bin s none lib/libintl.so=libintl.so.1 f none lib/libintl.so.1 755 root bin +s none lib/libipmp.so=./libipmp.so.1 +f none lib/libipmp.so.1 755 root bin s none lib/libkmf.so=libkmf.so.1 f none lib/libkmf.so.1 755 root bin s none lib/libkmfberder.so=libkmfberder.so.1 diff --git a/usr/src/pkgdefs/SUNWcsr/prototype_com b/usr/src/pkgdefs/SUNWcsr/prototype_com index 02051a08ae..b60abe0f00 100644 --- a/usr/src/pkgdefs/SUNWcsr/prototype_com +++ b/usr/src/pkgdefs/SUNWcsr/prototype_com @@ -18,7 +18,7 @@ # # CDDL HEADER END # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This required package information file contains a list of package contents. @@ -332,6 +332,7 @@ d none lib 755 root bin d none lib/crypto 755 root bin f none lib/crypto/kcfd 555 root bin d none lib/inet 755 root bin +f none lib/inet/in.mpathd 555 root bin f none lib/inet/nwamd 555 root bin d none lib/svc 0755 root bin d none lib/svc/bin 0755 root bin @@ -404,7 +405,8 @@ f none sbin/fiocompress 555 root bin f none sbin/hostconfig 555 root bin f none sbin/ifconfig 555 root bin f none sbin/ifparse 555 root bin -s none sbin/in.mpathd=../usr/lib/inet/in.mpathd +s none sbin/in.mpathd=../lib/inet/in.mpathd +f none sbin/ipmpstat 555 root bin f none sbin/soconfig 555 root bin f none sbin/init 555 root sys s none sbin/jsh=sh diff --git a/usr/src/pkgdefs/SUNWcsu/prototype_com b/usr/src/pkgdefs/SUNWcsu/prototype_com index 6bb2772f1a..464da8254a 100644 --- a/usr/src/pkgdefs/SUNWcsu/prototype_com +++ b/usr/src/pkgdefs/SUNWcsu/prototype_com @@ -18,7 +18,7 @@ # # CDDL HEADER END # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This required package information file contains a list of package contents. 
@@ -649,7 +649,7 @@ d none usr/lib/inet/dhcp 755 root bin d none usr/lib/inet/dhcp/nsu 755 root bin d none usr/lib/inet/dhcp/svc 755 root bin f none usr/lib/inet/in.iked 555 root bin -f none usr/lib/inet/in.mpathd 555 root bin +s none usr/lib/inet/in.mpathd=../../../lib/inet/in.mpathd f none usr/lib/inet/inetd 555 root bin f none usr/lib/intrd 555 root bin f none usr/lib/isaexec 555 root bin @@ -865,6 +865,7 @@ s none usr/sbin/init=../../sbin/init f none usr/sbin/install 555 root bin f none usr/sbin/installboot 555 root sys f none usr/sbin/ipaddrsel 555 root bin +s none usr/sbin/ipmpstat=../../sbin/ipmpstat f none usr/sbin/ipsecalgs 555 root bin f none usr/sbin/ipsecconf 555 root bin f none usr/sbin/ipseckey 555 root bin diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com index 45536bf13e..555f28921c 100644 --- a/usr/src/pkgdefs/SUNWhea/prototype_com +++ b/usr/src/pkgdefs/SUNWhea/prototype_com @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This required package information file contains a list of package contents. @@ -270,6 +270,7 @@ f none usr/include/inet/tcp_stack.h 644 root bin f none usr/include/inet/wifi_ioctl.h 644 root bin f none usr/include/inttypes.h 644 root bin f none usr/include/ipmp.h 644 root bin +f none usr/include/ipmp_admin.h 644 root bin f none usr/include/ipmp_mpathd.h 644 root bin f none usr/include/ipmp_query.h 644 root bin d none usr/include/ipp 755 root bin diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh index 3ac332b45c..7fd4a7186b 100644 --- a/usr/src/tools/scripts/bfu.sh +++ b/usr/src/tools/scripts/bfu.sh @@ -21,7 +21,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# # Upgrade a machine from a cpio archive area in about 5 minutes. @@ -8060,10 +8060,14 @@ mondo_loop() { # The global zone needs to have its /dev/dld symlink created # during install so that processes can access it early in boot - # before devfsadm is run. + # before devfsadm is run. Likewise for /dev/ipmpstub. if [ ! -L $rootprefix/dev/dld ]; then ln -s ../devices/pseudo/dld@0:ctl $rootprefix/dev/dld fi + if [ ! -L $rootprefix/dev/ipmpstub ]; then + ln -s ../devices/pseudo/dlpistub@0:ipmpstub \ + $rootprefix/dev/ipmpstub + fi fi # Fix up audit permissions diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 5fcd81b433..448a0d712d 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -485,7 +485,7 @@ IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \ sctp_ioc.o sctp_bind.o sctp_notify.o sctp_asconf.o \ sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o -IP_OBJS += igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \ +IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \ ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \ ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \ @@ -1605,9 +1605,9 @@ IPF_OBJS += ip_fil_solaris.o fil.o solaris.o ip_state.o ip_frag.o ip_nat.o \ IBD_OBJS += ibd.o -SDP_OBJS += sdpddi.o +DLPISTUB_OBJS += dlpistub.o -VNI_OBJS += vni.o +SDP_OBJS += sdpddi.o CTF_OBJS += ctf_create.o ctf_decl.o ctf_error.o ctf_hash.o ctf_labels.o \ ctf_lookup.o ctf_open.o ctf_types.o ctf_util.o ctf_subr.o ctf_mod.o diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index 1cd82570c1..db550667da 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# # uts/common/Makefile.rules @@ -447,7 +447,7 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ip/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) -$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ipnet/%.c +$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ipnet/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -489,7 +489,7 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/sockmods/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) -$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/vni/%.c +$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/dlpistub/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1572,7 +1572,7 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/arp/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ip/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) -$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipnet/%.c +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipnet/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipf/%.c @@ -1599,10 +1599,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/tcp/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/nca/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) -$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/vni/%.c +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/dlpistub/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) - $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/common/inet/arp.h b/usr/src/uts/common/inet/arp.h index 0bca52e9ae..4351c91666 100644 --- a/usr/src/uts/common/inet/arp.h +++ b/usr/src/uts/common/inet/arp.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -28,6 +28,7 @@ #define _INET_ARP_H #include <sys/types.h> +#include <net/if.h> #ifdef __cplusplus extern "C" { @@ -64,6 +65,8 @@ extern "C" { */ #define AR_ARP_CLOSING (AR_IOCTL + 16) #define AR_ARP_EXTEND (AR_IOCTL + 17) +#define AR_IPMP_ACTIVATE (AR_IOCTL + 18) +#define AR_IPMP_DEACTIVATE (AR_IOCTL + 19) /* Both ace_flags and area_flags; must also modify arp.c in mdb */ #define ACE_F_PERMANENT 0x0001 @@ -182,6 +185,14 @@ typedef struct ar_mapping_add_s { /* the mask&proto_addr */ } arma_t; +/* Structure used to notify ARP of changes to IPMP group topology */ +typedef struct ar_ipmp_event_s { + uint32_t arie_cmd; + uint32_t arie_name_offset; + uint32_t arie_name_length; + char arie_grifname[LIFNAMSIZ]; +} arie_t; + /* Structure used to notify clients of interesting conditions. */ typedef struct ar_client_notify_s { uint32_t arcn_cmd; diff --git a/usr/src/uts/common/inet/arp/arp.c b/usr/src/uts/common/inet/arp/arp.c index 815dfd19d3..06c499ced9 100644 --- a/usr/src/uts/common/inet/arp/arp.c +++ b/usr/src/uts/common/inet/arp/arp.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -85,6 +85,30 @@ * talking to a given peer, then it doesn't matter if we have the right mapping * for that peer. It would be possible to send queries on aging entries that * are active, but this isn't done. + * + * IPMP Notes + * ---------- + * + * ARP is aware of IPMP. In particular, IP notifies ARP about all "active" + * (able to transmit data packets) interfaces in a given group via + * AR_IPMP_ACTIVATE and AR_IPMP_DEACTIVATE messages. 
These messages, combined + * with the "IPMP arl_t" that ARP creates over the IPMP DLPI stub driver, + * enable ARP to track all the arl_t's that are in the same group and thus + * ensure that ACEs are shared across each group and the arl_t that ARP + * chooses to transmit on for a given ACE is optimal. + * + * ARP relies on IP for hardware address updates. In particular, if the + * hardware address of an interface changes (DL_NOTE_PHYS_ADDR), then IP will + * bring the interface down and back up -- and as part of bringing it back + * up, will send messages to ARP that allow it to update the affected arl's + * with new hardware addresses. + * + * N.B.: One side-effect of this approach is that when an interface fails and + * then starts to repair, it will temporarily populate the ARP cache with + * addresses that are owned by it rather than the group's arl_t. To address + * this, we could add more messages (e.g., AR_IPMP_JOIN and AR_IPMP_LEAVE), + * but as the issue appears to be only cosmetic (redundant entries in the ARP + * cache during interface repair), we've kept things simple for now. */ /* @@ -134,6 +158,12 @@ typedef struct { #define ARH_FIXED_LEN 8 /* + * Macro used when creating ACEs to determine the arl that should own it. + */ +#define OWNING_ARL(arl) \ + ((arl)->arl_ipmp_arl != NULL ? (arl)->arl_ipmp_arl : arl) + +/* * MAC-specific intelligence. Shouldn't be needed, but the DL_INFO_ACK * doesn't quite do it for us. 
*/ @@ -154,7 +184,7 @@ static int ar_ce_create(arl_t *arl, uint32_t proto, uchar_t *hw_addr, uint32_t hw_addr_len, uchar_t *proto_addr, uint32_t proto_addr_len, uchar_t *proto_mask, uchar_t *proto_extract_mask, uint32_t hw_extract_start, - uint32_t flags); + uchar_t *sender_addr, uint32_t flags); static void ar_ce_delete(ace_t *ace); static void ar_ce_delete_per_arl(ace_t *ace, void *arg); static ace_t **ar_ce_hash(arp_stack_t *as, uint32_t proto, @@ -167,6 +197,8 @@ static ace_t *ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp, ace_t *matchfn()); static ace_t *ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, uint32_t proto_addr_length); +static ace_t *ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, + uchar_t *proto_addr, uint32_t proto_addr_length); static boolean_t ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length); static void ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *), @@ -187,6 +219,8 @@ static int ar_interface_up(queue_t *q, mblk_t *mp); static int ar_interface_down(queue_t *q, mblk_t *mp); static int ar_interface_on(queue_t *q, mblk_t *mp); static int ar_interface_off(queue_t *q, mblk_t *mp); +static int ar_ipmp_activate(queue_t *q, mblk_t *mp); +static int ar_ipmp_deactivate(queue_t *q, mblk_t *mp); static void ar_ll_cleanup_arl_queue(queue_t *q); static void ar_ll_down(arl_t *arl); static arl_t *ar_ll_lookup_by_name(arp_stack_t *as, const char *name); @@ -208,7 +242,7 @@ static int ar_param_set(queue_t *q, mblk_t *mp, char *value, static void ar_query_delete(ace_t *ace, void *ar); static void ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, uint32_t proto_addr_len); -static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace); +static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace); static void ar_rput(queue_t *q, mblk_t *mp_orig); static void ar_rput_dlpi(queue_t *q, mblk_t *mp); static void ar_set_address(ace_t *ace, uchar_t *addrpos, @@ -344,6 
+378,10 @@ static arct_t ar_cmd_tbl[] = { ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_ON" }, { ar_interface_off, AR_INTERFACE_OFF, sizeof (arc_t), ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_OFF" }, + { ar_ipmp_activate, AR_IPMP_ACTIVATE, sizeof (arie_t), + ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_ACTIVATE" }, + { ar_ipmp_deactivate, AR_IPMP_DEACTIVATE, sizeof (arie_t), + ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_DEACTIVATE" }, { ar_set_ppa, (uint32_t)IF_UNITSEL, sizeof (int), ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "IF_UNITSEL" }, { ar_nd_ioctl, ND_GET, 1, @@ -358,6 +396,65 @@ static arct_t ar_cmd_tbl[] = { }; /* + * Lookup and return an arl appropriate for sending packets with either source + * hardware address `hw_addr' or source protocol address `ip_addr', in that + * order. If neither was specified or neither match, return any arl in the + * same group as `arl'. + */ +static arl_t * +ar_ipmp_lookup_xmit_arl(arl_t *arl, uchar_t *hw_addr, uint_t hw_addrlen, + uchar_t *ip_addr) +{ + arlphy_t *ap; + ace_t *src_ace; + arl_t *xmit_arl = NULL; + arp_stack_t *as = ARL_TO_ARPSTACK(arl); + + ASSERT(arl->arl_flags & ARL_F_IPMP); + + if (hw_addr != NULL && hw_addrlen != 0) { + xmit_arl = as->as_arl_head; + for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next) { + /* + * There may be arls with the same HW address that are + * not in our IPMP group; we don't want those. 
+ */ + if (xmit_arl->arl_ipmp_arl != arl) + continue; + + ap = xmit_arl->arl_phy; + if (ap != NULL && ap->ap_hw_addrlen == hw_addrlen && + bcmp(ap->ap_hw_addr, hw_addr, hw_addrlen) == 0) + break; + } + + DTRACE_PROBE4(xmit_arl_hwsrc, arl_t *, arl, arl_t *, + xmit_arl, uchar_t *, hw_addr, uint_t, hw_addrlen); + } + + if (xmit_arl == NULL && ip_addr != NULL) { + src_ace = ar_ce_lookup_permanent(as, IP_ARP_PROTO_TYPE, ip_addr, + IP_ADDR_LEN); + if (src_ace != NULL) + xmit_arl = src_ace->ace_xmit_arl; + + DTRACE_PROBE4(xmit_arl_ipsrc, arl_t *, arl, arl_t *, + xmit_arl, uchar_t *, ip_addr, uint_t, IP_ADDR_LEN); + } + + if (xmit_arl == NULL) { + xmit_arl = as->as_arl_head; + for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next) + if (xmit_arl->arl_ipmp_arl == arl && xmit_arl != arl) + break; + + DTRACE_PROBE2(xmit_arl_any, arl_t *, arl, arl_t *, xmit_arl); + } + + return (xmit_arl); +} + +/* * ARP Cache Entry creation routine. * Cache entries are allocated within timer messages and inserted into * the global hash list based on protocol and protocol address. 
@@ -365,7 +462,8 @@ static arct_t ar_cmd_tbl[] = { static int ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, uchar_t *proto_addr, uint_t proto_addr_len, uchar_t *proto_mask, - uchar_t *proto_extract_mask, uint_t hw_extract_start, uint_t flags) + uchar_t *proto_extract_mask, uint_t hw_extract_start, uchar_t *sender_addr, + uint_t flags) { static ace_t ace_null; ace_t *ace; @@ -373,17 +471,35 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, uchar_t *dst; mblk_t *mp; arp_stack_t *as = ARL_TO_ARPSTACK(arl); + arl_t *xmit_arl; arlphy_t *ap; if ((flags & ~ACE_EXTERNAL_FLAGS_MASK) || arl == NULL) return (EINVAL); - if ((ap = arl->arl_phy) == NULL) + if (proto_addr == NULL || proto_addr_len == 0 || + (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN)) return (EINVAL); if (flags & ACE_F_MYADDR) flags |= ACE_F_PUBLISH | ACE_F_AUTHORITY; + /* + * Latch a transmit arl for this ace. + */ + if (arl->arl_flags & ARL_F_IPMP) { + ASSERT(proto == IP_ARP_PROTO_TYPE); + xmit_arl = ar_ipmp_lookup_xmit_arl(arl, hw_addr, hw_addr_len, + sender_addr); + } else { + xmit_arl = arl; + } + + if (xmit_arl == NULL || xmit_arl->arl_phy == NULL) + return (EINVAL); + + ap = xmit_arl->arl_phy; + if (!hw_addr && hw_addr_len == 0) { if (flags == ACE_F_PERMANENT) { /* Not publish */ /* 224.0.0.0 to zero length address */ @@ -398,9 +514,6 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, flags |= ACE_F_RESOLVED; } - if (proto_addr == NULL || proto_addr_len == 0 || - (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN)) - return (EINVAL); /* Handle hw_addr_len == 0 for DL_ENABMULTI_REQ etc. 
*/ if (hw_addr_len != 0 && hw_addr == NULL) return (EINVAL); @@ -432,6 +545,7 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, ace->ace_proto = proto; ace->ace_mp = mp; ace->ace_arl = arl; + ace->ace_xmit_arl = xmit_arl; dst = (uchar_t *)&ace[1]; @@ -510,12 +624,73 @@ ar_ce_delete(ace_t *ace) static void ar_ce_delete_per_arl(ace_t *ace, void *arl) { - if (ace->ace_arl == arl) { + if (ace->ace_arl == arl || ace->ace_xmit_arl == arl) { ace->ace_flags &= ~ACE_F_PERMANENT; ar_ce_delete(ace); } } +/* + * ar_ce_walk routine used when deactivating an `arl' in a group. Deletes + * `ace' if it was using `arl_arg' as its output interface. + */ +static void +ar_ce_ipmp_deactivate(ace_t *ace, void *arl_arg) +{ + arl_t *arl = arl_arg; + + ASSERT(!(arl->arl_flags & ARL_F_IPMP)); + + if (ace->ace_arl == arl) { + ASSERT(ace->ace_xmit_arl == arl); + /* + * This ACE is tied to the arl leaving the group (e.g., an + * ACE_F_PERMANENT for a test address) and is not used by the + * group, so we can leave it be. + */ + return; + } + + if (ace->ace_xmit_arl != arl) + return; + + ASSERT(ace->ace_arl == arl->arl_ipmp_arl); + + /* + * IP should've already sent us messages asking us to move any + * ACE_F_MYADDR entries to another arl, but there are two exceptions: + * + * 1. The group was misconfigured with interfaces that have duplicate + * hardware addresses, but in.mpathd was unable to offline those + * duplicate interfaces. + * + * 2. The messages from IP were lost or never created (e.g. due to + * memory pressure). + * + * We handle the first case by just quietly deleting the ACE. Since + * the second case cannot be distinguished from a more serious bug in + * the IPMP framework, we ASSERT() that this can't happen on DEBUG + * systems, but quietly delete the ACE on production systems (the + * deleted ACE will render the IP address unreachable). 
+ */ + if (ace->ace_flags & ACE_F_MYADDR) { + arlphy_t *ap = arl->arl_phy; + uint_t hw_addrlen = ap->ap_hw_addrlen; + + ASSERT(hw_addrlen == ace->ace_hw_addr_length && + bcmp(ap->ap_hw_addr, ace->ace_hw_addr, hw_addrlen) == 0); + } + + /* + * NOTE: it's possible this arl got selected as the ace_xmit_arl when + * creating an ACE_F_PERMANENT ACE on behalf of an SIOCS*ARP ioctl for + * an IPMP IP interface. But it's still OK for us to delete such an + * ACE since ipmp_illgrp_refresh_arpent() will ask us to recreate it + * and we'll pick another arl then. + */ + ar_ce_delete(ace); +} + /* Cache entry hash routine, based on protocol and protocol address. */ static ace_t ** ar_ce_hash(arp_stack_t *as, uint32_t proto, const uchar_t *proto_addr, @@ -559,7 +734,8 @@ ar_ce_lookup_entry(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, return (NULL); ace = *ar_ce_hash(as, proto, proto_addr, proto_addr_length); for (; ace; ace = ace->ace_next) { - if (ace->ace_arl == arl && + if ((ace->ace_arl == arl || + ace->ace_arl == arl->arl_ipmp_arl) && ace->ace_proto_addr_length == proto_addr_length && ace->ace_proto == proto) { int i1 = proto_addr_length; @@ -632,13 +808,6 @@ ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, /* * Look for a permanent entry for proto_addr across all interfaces. - * This is used for sending ARP requests out. Requests may come from - * IP on le0 with the source address of le1 and we need to send out - * the request on le1 so that ARP does not think that somebody else - * is using its PERMANENT address. If le0 and le1 are sitting on - * the same wire, the same IP -> ethernet mapping might exist on - * both the interfaces. But we should look for the permanent - * mapping to avoid arp interpreting it as a duplicate. 
*/ static ace_t * ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr, @@ -653,8 +822,8 @@ ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr, if (ace->ace_proto_addr_length == proto_addr_length && ace->ace_proto == proto) { int i1 = proto_addr_length; - uchar_t *ace_addr = ace->ace_proto_addr; - uchar_t *mask = ace->ace_proto_mask; + uchar_t *ace_addr = ace->ace_proto_addr; + uchar_t *mask = ace->ace_proto_mask; /* * Note that the ace_proto_mask is applied to the @@ -703,12 +872,8 @@ ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length) * 1. Resolution of unresolved entries and update of resolved entries. * 2. Detection of nodes with our own IP address (duplicates). * - * This is complicated by ill groups. We don't currently have knowledge of ill - * groups, so we can't distinguish between a packet that comes in on one of the - * arls that's part of the group versus one that's on an unrelated arl. Thus, - * we take a conservative approach. If the arls match, then we update resolved - * and unresolved entries alike. If they don't match, then we update only - * unresolved entries. + * If the resolving ARL is in the same group as a matching ACE's ARL, then + * update the ACE. Otherwise, make no updates. * * For all entries, we first check to see if this is a duplicate (probable * loopback) message. If so, then just ignore it. 
@@ -741,7 +906,7 @@ ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length) static int ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr, - uint32_t hlen, const uchar_t *src_paddr, uint32_t plen) + uint32_t hlen, const uchar_t *src_paddr, uint32_t plen, arl_t **ace_arlp) { ace_t *ace; ace_t *ace_next; @@ -778,31 +943,35 @@ ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr, if (i1 >= 0) continue; + *ace_arlp = ace->ace_arl; + /* - * If both IP addr and hardware address match what we already - * have, then this is a broadcast packet emitted by one of our - * interfaces, reflected by the switch and received on another - * interface. We return AR_LOOPBACK. + * If the IP address is ours, and the hardware address matches + * one of our own arls, then this is a broadcast packet + * emitted by one of our interfaces, reflected by the switch + * and received on another interface. We return AR_LOOPBACK. */ - if ((ace->ace_flags & ACE_F_MYADDR) && - hlen == ace->ace_hw_addr_length && - bcmp(ace->ace_hw_addr, src_haddr, - ace->ace_hw_addr_length) == 0) { - return (AR_LOOPBACK); + if (ace->ace_flags & ACE_F_MYADDR) { + arl_t *hw_arl = as->as_arl_head; + arlphy_t *ap; + + for (; hw_arl != NULL; hw_arl = hw_arl->arl_next) { + ap = hw_arl->arl_phy; + if (ap != NULL && ap->ap_hw_addrlen == hlen && + bcmp(ap->ap_hw_addr, src_haddr, hlen) == 0) + return (AR_LOOPBACK); + } } /* * If the entry is unverified, then we've just verified that * someone else already owns this address, because this is a * message with the same protocol address but different - * hardware address. Conflicts received via an interface which - * doesn't own the conflict address are not actioned. Multiple - * interfaces on the same segment imply any conflict will also - * be seen via the correct interface, so we can ignore anything - * not matching the arl from the ace. + * hardware address. 
NOTE: the ace_xmit_arl check ensures we + * don't send duplicate AR_FAILEDs if arl is in an IPMP group. */ if ((ace->ace_flags & ACE_F_UNVERIFIED) && - arl == ace->ace_arl) { + arl == ace->ace_xmit_arl) { ar_ce_delete(ace); return (AR_FAILED); } @@ -814,30 +983,29 @@ ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr, * that, if we're currently in initial announcement mode, we * switch back to the lazier defense mode. Knowing that * there's at least one duplicate out there, we ought not - * blindly announce. Conflicts received via an interface which - * doesn't own the conflict address are not actioned. Multiple - * interfaces on the same segment imply the conflict will also - * be seen via the correct interface, so we can ignore anything - * not matching the arl from the ace. + * blindly announce. NOTE: the ace_xmit_arl check ensures we + * don't send duplicate AR_BOGONs if arl is in an IPMP group. */ if ((ace->ace_flags & ACE_F_AUTHORITY) && - arl == ace->ace_arl) { + arl == ace->ace_xmit_arl) { ace->ace_xmit_count = 0; return (AR_BOGON); } /* - * Limit updating across other ills to unresolved - * entries only. We don't want to inadvertently update - * published entries. + * Only update this ACE if it's on the same network -- i.e., + * it's for our ARL or another ARL in the same IPMP group. 
*/ - if (ace->ace_arl == arl || !ACE_RESOLVED(ace)) { + if (ace->ace_arl == arl || ace->ace_arl == arl->arl_ipmp_arl) { if (ar_ce_resolve(ace, src_haddr, hlen)) retv = AR_CHANGED; else if (retv == AR_NOTFOUND) retv = AR_MERGED; } } + + if (retv == AR_NOTFOUND) + *ace_arlp = NULL; return (retv); } @@ -917,7 +1085,7 @@ static void ar_delete_notify(const ace_t *ace) { const arl_t *arl = ace->ace_arl; - const arlphy_t *ap = arl->arl_phy; + const arlphy_t *ap = ace->ace_xmit_arl->arl_phy; mblk_t *mp; size_t len; arh_t *arh; @@ -945,7 +1113,7 @@ ar_close(queue_t *q) { ar_t *ar = (ar_t *)q->q_ptr; char name[LIFNAMSIZ]; - arl_t *arl; + arl_t *arl, *xarl; arl_t **arlp; cred_t *cr; arc_t *arc; @@ -999,6 +1167,21 @@ ar_close(queue_t *q) while (arl->arl_state != ARL_S_DOWN) qwait(arl->arl_rq); + if (arl->arl_flags & ARL_F_IPMP) { + /* + * Though rude, someone could force the IPMP arl + * closed without removing the underlying interfaces. + * In that case, force the ARLs out of the group. + */ + xarl = as->as_arl_head; + for (; xarl != NULL; xarl = xarl->arl_next) { + if (xarl->arl_ipmp_arl != arl || xarl == arl) + continue; + ar_ce_walk(as, ar_ce_ipmp_deactivate, xarl); + xarl->arl_ipmp_arl = NULL; + } + } + ar_ll_clear_defaults(arl); /* * If this is the control stream for an arl, delete anything @@ -1417,9 +1600,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) area_t *area; ace_t *ace; uchar_t *hw_addr; - uint32_t hw_addr_len; + uint32_t hw_addr_len; uchar_t *proto_addr; - uint32_t proto_addr_len; + uint32_t proto_addr_len; uchar_t *proto_mask; arl_t *arl; mblk_t *mp = mp_orig; @@ -1494,6 +1677,7 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) proto_mask, NULL, (uint32_t)0, + NULL, aflags & ~ACE_F_MAPPING & ~ACE_F_UNVERIFIED & ~ACE_F_DEFEND); if (err != 0) { DTRACE_PROBE3(eadd_create_failed, arl_t *, arl, area_t *, area, @@ -1502,7 +1686,13 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) } if (aflags & ACE_F_PUBLISH) { - arlphy_t *ap = arl->arl_phy; + arlphy_t *ap; + + ace = 
ar_ce_lookup(arl, area->area_proto, proto_addr, + proto_addr_len); + ASSERT(ace != NULL); + + ap = ace->ace_xmit_arl->arl_phy; if (hw_addr == NULL || hw_addr_len == 0) { hw_addr = ap->ap_hw_addr; @@ -1519,10 +1709,6 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) ap->ap_hw_addrlen = hw_addr_len; } - ace = ar_ce_lookup(arl, area->area_proto, proto_addr, - proto_addr_len); - ASSERT(ace != NULL); - if (ace->ace_flags & ACE_F_FAST) { ace->ace_xmit_count = as->as_fastprobe_count; ace->ace_xmit_interval = as->as_fastprobe_delay; @@ -1555,9 +1741,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) */ DTRACE_PROBE2(eadd_probe, ace_t *, ace, area_t *, area); - ar_xmit(arl, ARP_REQUEST, area->area_proto, - proto_addr_len, hw_addr, NULL, NULL, - proto_addr, NULL, as); + ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, + area->area_proto, proto_addr_len, + hw_addr, NULL, NULL, proto_addr, NULL, as); ace->ace_xmit_count--; ace->ace_xmit_interval = (ace->ace_flags & ACE_F_FAST) ? @@ -1573,9 +1759,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) } else { DTRACE_PROBE2(eadd_announce, ace_t *, ace, area_t *, area); - ar_xmit(arl, ARP_REQUEST, area->area_proto, - proto_addr_len, hw_addr, proto_addr, - ap->ap_arp_addr, proto_addr, NULL, as); + ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, + area->area_proto, proto_addr_len, hw_addr, + proto_addr, ap->ap_arp_addr, proto_addr, NULL, as); ace->ace_last_bcast = ddi_get_lbolt(); /* @@ -1583,9 +1769,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) * entry; we believe we're the authority for this * entry. In that case, and if we're not just doing * one-off defense of the address, we send more than - * one copy, so that if this is an IPMP failover, we'll - * still have a good chance of updating everyone even - * when there's a packet loss or two. + * one copy, so we'll still have a good chance of + * updating everyone even when there's a packet loss + * or two. 
*/ if ((aflags & ACE_F_AUTHORITY) && !(aflags & ACE_F_DEFEND) && @@ -1667,7 +1853,6 @@ static int ar_entry_query(queue_t *q, mblk_t *mp_orig) { ace_t *ace; - ace_t *src_ace = NULL; areq_t *areq; arl_t *arl; int err; @@ -1782,20 +1967,12 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) err = ENXIO; goto err_ret; } - if (arl->arl_phy == NULL) { - /* Can't get help if we don't know how. */ - DTRACE_PROBE2(query_no_phy, ace_t *, ace, - areq_t *, areq); - mpp[0] = NULL; - mp->b_prev = NULL; - err = ENXIO; - goto err_ret; - } DTRACE_PROBE2(query_unresolved, ace_t, ace, areq_t *, areq); } else { /* No ace yet. Make one now. (This is the common case.) */ - if (areq->areq_xmit_count == 0 || arl->arl_phy == NULL) { - DTRACE_PROBE2(query_phy, arl_t *, arl, areq_t *, areq); + if (areq->areq_xmit_count == 0) { + DTRACE_PROBE2(query_template, arl_t *, arl, + areq_t *, areq); mp->b_prev = NULL; err = ENXIO; goto err_ret; @@ -1814,9 +1991,9 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) err = EINVAL; goto err_ret; } - err = ar_ce_create(arl, areq->areq_proto, NULL, 0, + err = ar_ce_create(OWNING_ARL(arl), areq->areq_proto, NULL, 0, proto_addr, proto_addr_len, NULL, - NULL, (uint32_t)0, + NULL, (uint32_t)0, sender_addr, areq->areq_flags); if (err != 0) { DTRACE_PROBE3(query_create_failed, arl_t *, arl, @@ -1835,49 +2012,13 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) goto err_ret; } ace->ace_query_mp = mp; - /* - * We don't have group information here. But if the sender - * address belongs to a different arl, we might as well - * search the other arl for a resolved ACE. If we find one, - * we resolve it rather than sending out a ARP request. - */ - src_ace = ar_ce_lookup_permanent(as, areq->areq_proto, - sender_addr, areq->areq_sender_addr_length); - if (src_ace == NULL) { - DTRACE_PROBE3(query_source_missing, arl_t *, arl, - areq_t *, areq, ace_t *, ace); - ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); - /* - * ar_query_reply has already freed the mp. 
- * Return EINPROGRESS, so that caller won't attempt - * to free the 'mp' again. - */ - return (EINPROGRESS); - } - if (src_ace->ace_arl != ace->ace_arl) { - ace_t *dst_ace; - - /* - * Check for a resolved entry in the src_ace->ace_arl. - */ - dst_ace = ar_ce_lookup_entry(src_ace->ace_arl, - areq->areq_proto, proto_addr, proto_addr_len); - - if (dst_ace != NULL && ACE_RESOLVED(dst_ace)) { - DTRACE_PROBE3(query_other_arl, arl_t *, arl, - areq_t *, areq, ace_t *, dst_ace); - (void) ar_ce_resolve(ace, dst_ace->ace_hw_addr, - dst_ace->ace_hw_addr_length); - return (EINPROGRESS); - } - } } - ms = ar_query_xmit(as, ace, src_ace); + ms = ar_query_xmit(as, ace); if (ms == 0) { /* Immediate reply requested. */ ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); } else { - mi_timer(arl->arl_wq, ace->ace_mp, ms); + mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, ms); } return (EINPROGRESS); err_ret: @@ -2073,6 +2214,80 @@ done: } /* + * Given an arie_t `mp', find the arl_t's that it names and return them + * in `*arlp' and `*ipmp_arlp'. If they cannot be found, return B_FALSE. + */ +static boolean_t +ar_ipmp_lookup(arp_stack_t *as, mblk_t *mp, arl_t **arlp, arl_t **ipmp_arlp) +{ + arie_t *arie = (arie_t *)mp->b_rptr; + + *arlp = ar_ll_lookup_from_mp(as, mp); + if (*arlp == NULL) { + DTRACE_PROBE1(ipmp_lookup_no_arl, mblk_t *, mp); + return (B_FALSE); + } + + arie->arie_grifname[LIFNAMSIZ - 1] = '\0'; + *ipmp_arlp = ar_ll_lookup_by_name(as, arie->arie_grifname); + if (*ipmp_arlp == NULL) { + DTRACE_PROBE1(ipmp_lookup_no_ipmp_arl, mblk_t *, mp); + return (B_FALSE); + } + + DTRACE_PROBE2(ipmp_lookup, arl_t *, *arlp, arl_t *, *ipmp_arlp); + return (B_TRUE); +} + +/* + * Bind an arl_t to an IPMP group arl_t. 
+ */ +static int +ar_ipmp_activate(queue_t *q, mblk_t *mp) +{ + arl_t *arl, *ipmp_arl; + arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; + + if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl)) + return (EINVAL); + + if (arl->arl_ipmp_arl != NULL) { + DTRACE_PROBE1(ipmp_activated_already, arl_t *, arl); + return (EALREADY); + } + + DTRACE_PROBE2(ipmp_activate, arl_t *, arl, arl_t *, ipmp_arl); + arl->arl_ipmp_arl = ipmp_arl; + return (0); +} + +/* + * Unbind an arl_t from an IPMP group arl_t and update the ace_t's so + * that it is no longer part of the group. + */ +static int +ar_ipmp_deactivate(queue_t *q, mblk_t *mp) +{ + arl_t *arl, *ipmp_arl; + arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; + + if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl)) + return (EINVAL); + + if (ipmp_arl != arl->arl_ipmp_arl) { + DTRACE_PROBE2(ipmp_deactivate_notactive, arl_t *, arl, arl_t *, + ipmp_arl); + return (EINVAL); + } + + DTRACE_PROBE2(ipmp_deactivate, arl_t *, arl, arl_t *, + arl->arl_ipmp_arl); + ar_ce_walk(as, ar_ce_ipmp_deactivate, arl); + arl->arl_ipmp_arl = NULL; + return (0); +} + +/* * Enable an interface to process ARP_REQUEST and ARP_RESPONSE messages. */ /* ARGSUSED */ @@ -2199,6 +2414,11 @@ ar_ll_init(arp_stack_t *as, ar_t *ar, mblk_t *mp) if ((arl = (arl_t *)mi_zalloc(sizeof (arl_t))) == NULL) return; + if (dlia->dl_mac_type == SUNW_DL_IPMP) { + arl->arl_flags |= ARL_F_IPMP; + arl->arl_ipmp_arl = arl; + } + arl->arl_provider_style = dlia->dl_provider_style; arl->arl_rq = ar->ar_rq; arl->arl_wq = ar->ar_wq; @@ -2261,7 +2481,7 @@ ar_ll_set_defaults(arl_t *arl, mblk_t *mp) dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr; dl_unitdata_req_t *dlur; uchar_t *up; - arlphy_t *ap; + arlphy_t *ap; ASSERT(arl != NULL); @@ -2270,6 +2490,14 @@ ar_ll_set_defaults(arl_t *arl, mblk_t *mp) */ ar_ll_clear_defaults(arl); + if (arl->arl_flags & ARL_F_IPMP) { + /* + * If this is an IPMP arl_t, we have nothing to do, + * since we will never transmit or receive. 
+ */ + return; + } + ap = kmem_zalloc(sizeof (arlphy_t), KM_NOSLEEP); if (ap == NULL) goto bad; @@ -2470,12 +2698,12 @@ ar_mapping_add(queue_t *q, mblk_t *mp_orig) mblk_t *mp = mp_orig; ace_t *ace; uchar_t *hw_addr; - uint32_t hw_addr_len; + uint32_t hw_addr_len; uchar_t *proto_addr; - uint32_t proto_addr_len; + uint32_t proto_addr_len; uchar_t *proto_mask; uchar_t *proto_extract_mask; - uint32_t hw_extract_start; + uint32_t hw_extract_start; arl_t *arl; arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; @@ -2524,6 +2752,7 @@ ar_mapping_add(queue_t *q, mblk_t *mp_orig) proto_mask, proto_extract_mask, hw_extract_start, + NULL, arma->arma_flags | ACE_F_MAPPING)); } @@ -2857,12 +3086,12 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, uint32_t proto_addr_len) { mblk_t *areq_mp; - arl_t *arl = ace->ace_arl; mblk_t *mp; mblk_t *xmit_mp; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); + queue_t *arl_wq = ace->ace_arl->arl_wq; + arp_stack_t *as = ARL_TO_ARPSTACK(ace->ace_arl); ip_stack_t *ipst = as->as_netstack->netstack_ip; - arlphy_t *ap = arl->arl_phy; + arlphy_t *ap = ace->ace_xmit_arl->arl_phy; /* * On error or completion for a query, we need to shut down the timer. @@ -2870,7 +3099,8 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, * Duplicate Address Detection, or it will never finish that phase. */ if (!(ace->ace_flags & (ACE_F_UNVERIFIED | ACE_F_AUTHORITY))) - mi_timer(arl->arl_wq, ace->ace_mp, -1L); + mi_timer(arl_wq, ace->ace_mp, -1L); + /* Establish the return value appropriate. */ if (ret_val == 0) { if (!ACE_RESOLVED(ace) || ap == NULL) @@ -2973,25 +3203,24 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, */ ar_ce_delete(ace); } else { - mi_timer(arl->arl_wq, ace->ace_mp, - as->as_cleanup_interval); + mi_timer(arl_wq, ace->ace_mp, as->as_cleanup_interval); } } } /* * Returns number of milliseconds after which we should either rexmit or abort. - * Return of zero means we should abort. 
src_ace is the ace corresponding - * to the source address in the areq sent by IP. + * Return of zero means we should abort. */ static clock_t -ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace) +ar_query_xmit(arp_stack_t *as, ace_t *ace) { areq_t *areq; mblk_t *mp; uchar_t *proto_addr; uchar_t *sender_addr; - arl_t *src_arl; + ace_t *src_ace; + arl_t *xmit_arl = ace->ace_xmit_arl; mp = ace->ace_query_mp; /* @@ -3016,18 +3245,15 @@ ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace) areq->areq_sender_addr_length); /* - * Get the source h/w address for the sender addr. With interface - * groups, IP sends us source address belonging to a different - * interface. + * Get the ace for the sender address, so that we can verify that + * we have one and that DAD has completed. */ + src_ace = ar_ce_lookup(xmit_arl, areq->areq_proto, sender_addr, + areq->areq_sender_addr_length); if (src_ace == NULL) { - src_ace = ar_ce_lookup_permanent(as, areq->areq_proto, - sender_addr, areq->areq_sender_addr_length); - if (src_ace == NULL) { - DTRACE_PROBE3(xmit_no_source, ace_t *, ace, - areq_t *, areq, uchar_t *, sender_addr); - return (0); - } + DTRACE_PROBE3(xmit_no_source, ace_t *, ace, areq_t *, areq, + uchar_t *, sender_addr); + return (0); } /* @@ -3044,18 +3270,12 @@ ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace) return (areq->areq_xmit_interval); } - /* - * Transmit on src_arl. We should transmit on src_arl. Otherwise - * the switch will send back a copy on other interfaces of the - * same group and as we could be using somebody else's source - * address + hardware address, ARP will treat this as a bogon. 
- */ - src_arl = src_ace->ace_arl; DTRACE_PROBE3(xmit_send, ace_t *, ace, ace_t *, src_ace, areq_t *, areq); - ar_xmit(src_arl, ARP_REQUEST, areq->areq_proto, - areq->areq_sender_addr_length, src_arl->arl_phy->ap_hw_addr, - sender_addr, src_arl->arl_phy->ap_arp_addr, proto_addr, NULL, as); + + ar_xmit(xmit_arl, ARP_REQUEST, areq->areq_proto, + areq->areq_sender_addr_length, xmit_arl->arl_phy->ap_hw_addr, + sender_addr, xmit_arl->arl_phy->ap_arp_addr, proto_addr, NULL, as); src_ace->ace_last_bcast = ddi_get_lbolt(); return (areq->areq_xmit_interval); } @@ -3066,6 +3286,7 @@ ar_rput(queue_t *q, mblk_t *mp) { arh_t *arh; arl_t *arl; + arl_t *client_arl; ace_t *dst_ace; uchar_t *dst_paddr; int err; @@ -3079,6 +3300,8 @@ ar_rput(queue_t *q, mblk_t *mp) uchar_t *src_paddr; uchar_t *dst_haddr; boolean_t is_probe; + boolean_t is_unicast = B_FALSE; + dl_unitdata_ind_t *dlindp; int i; arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; @@ -3135,9 +3358,10 @@ ar_rput(queue_t *q, mblk_t *mp) return; case M_PCPROTO: case M_PROTO: + dlindp = (dl_unitdata_ind_t *)mp->b_rptr; if (MBLKL(mp) >= sizeof (dl_unitdata_ind_t) && - ((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive == - DL_UNITDATA_IND) { + dlindp->dl_primitive == DL_UNITDATA_IND) { + is_unicast = (dlindp->dl_group_address == 0); arl = ((ar_t *)q->q_ptr)->ar_arl; if (arl != NULL && arl->arl_phy != NULL) { /* Real messages from the wire! */ @@ -3261,19 +3485,24 @@ ar_rput(queue_t *q, mblk_t *mp) * RFC 826: first check if the <protocol, sender protocol address> is * in the cache, if there is a sender protocol address. Note that this * step also handles resolutions based on source. + * + * Note that IP expects that each notification it receives will be + * tied to the ill it received it on. Thus, we must talk to it over + * the arl tied to the resolved IP address (if any), hence client_arl. 
*/ if (is_probe) err = AR_NOTFOUND; else err = ar_ce_resolve_all(arl, proto, src_haddr, hlen, src_paddr, - plen); + plen, &client_arl); + switch (err) { case AR_BOGON: - ar_client_notify(arl, mp1, AR_CN_BOGON); + ar_client_notify(client_arl, mp1, AR_CN_BOGON); mp1 = NULL; break; case AR_FAILED: - ar_client_notify(arl, mp1, AR_CN_FAILED); + ar_client_notify(client_arl, mp1, AR_CN_FAILED); mp1 = NULL; break; case AR_LOOPBACK: @@ -3293,7 +3522,9 @@ ar_rput(queue_t *q, mblk_t *mp) * Now look up the destination address. By RFC 826, we ignore the * packet at this step if the target isn't one of our addresses. This * is true even if the target is something we're trying to resolve and - * the packet is a response. + * the packet is a response. To avoid duplicate responses, we also + * ignore the packet if it was multicast/broadcast to an arl that's in + * an IPMP group but was not the designated xmit_arl for the ACE. * * Note that in order to do this correctly, we need to know when to * notify IP of a change implied by the source address of the ARP @@ -3304,6 +3535,7 @@ ar_rput(queue_t *q, mblk_t *mp) */ dst_ace = ar_ce_lookup_entry(arl, proto, dst_paddr, plen); if (dst_ace == NULL || !ACE_RESOLVED(dst_ace) || + (dst_ace->ace_xmit_arl != arl && !is_unicast) || !(dst_ace->ace_flags & ACE_F_PUBLISH)) { /* * Let the client know if the source mapping has changed, even @@ -3311,7 +3543,7 @@ ar_rput(queue_t *q, mblk_t *mp) * client. */ if (err == AR_CHANGED) - ar_client_notify(arl, mp1, AR_CN_ANNOUNCE); + ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); else freemsg(mp1); freeb(mp); @@ -3341,6 +3573,7 @@ ar_rput(queue_t *q, mblk_t *mp) "arp_rput_end: q %p (%S)", q, "reflection"); return; } + /* * Conflicts seen via the wrong interface may be bogus. * Multiple interfaces on the same segment imply any conflict @@ -3378,12 +3611,21 @@ ar_rput(queue_t *q, mblk_t *mp) * the src_paddr field before sending it to IP. 
The same is * required for probes, where src_paddr will be INADDR_ANY. */ - if (is_probe || op == ARP_RESPONSE) { + if (is_probe) { + /* + * In this case, client_arl will be invalid (e.g., + * since probes don't have a valid sender address). + * But dst_ace has the appropriate arl. + */ bcopy(dst_paddr, src_paddr, plen); - ar_client_notify(arl, mp1, AR_CN_FAILED); + ar_client_notify(dst_ace->ace_arl, mp1, AR_CN_FAILED); + ar_ce_delete(dst_ace); + } else if (op == ARP_RESPONSE) { + bcopy(dst_paddr, src_paddr, plen); + ar_client_notify(client_arl, mp1, AR_CN_FAILED); ar_ce_delete(dst_ace); } else if (err == AR_CHANGED) { - ar_client_notify(arl, mp1, AR_CN_ANNOUNCE); + ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); } else { DTRACE_PROBE3(rput_request_unverified, arl_t *, arl, arh_t *, arh, ace_t *, dst_ace); @@ -3431,19 +3673,19 @@ ar_rput(queue_t *q, mblk_t *mp) dst_ace->ace_hw_addr, dst_ace->ace_proto_addr, src_haddr, src_paddr, dstaddr, as); if (!is_probe && err == AR_NOTFOUND && - ar_ce_create(arl, proto, src_haddr, hlen, src_paddr, plen, - NULL, NULL, 0, 0) == 0) { + ar_ce_create(OWNING_ARL(arl), proto, src_haddr, hlen, + src_paddr, plen, NULL, NULL, 0, NULL, 0) == 0) { ace_t *ace; ace = ar_ce_lookup(arl, proto, src_paddr, plen); ASSERT(ace != NULL); - mi_timer(arl->arl_wq, ace->ace_mp, + mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, as->as_cleanup_interval); } } if (err == AR_CHANGED) { freeb(mp); - ar_client_notify(arl, mp1, AR_CN_ANNOUNCE); + ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, "arp_rput_end: q %p (%S)", q, "reqchange"); } else { @@ -3459,7 +3701,7 @@ ar_ce_restart_dad(ace_t *ace, void *arl_arg) arl_t *arl = arl_arg; arp_stack_t *as = ARL_TO_ARPSTACK(arl); - if ((ace->ace_arl == arl) && + if ((ace->ace_xmit_arl == arl) && (ace->ace_flags & (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) == (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) { /* @@ -4060,9 +4302,9 @@ ar_wput(queue_t *q, mblk_t *mp) static boolean_t 
arp_say_ready(ace_t *ace) { - mblk_t *mp; + mblk_t *mp; arl_t *arl = ace->ace_arl; - arlphy_t *ap = arl->arl_phy; + arlphy_t *ap = ace->ace_xmit_arl->arl_phy; arh_t *arh; uchar_t *cp; @@ -4107,7 +4349,7 @@ ace_reschedule(ace_t *ace, void *arg) ace_t **acemax; ace_t *atemp; - if (ace->ace_arl != art->art_arl) + if (ace->ace_xmit_arl != art->art_arl) return; /* * Only published entries that are ready for announcement are eligible. @@ -4179,7 +4421,6 @@ static void ar_wsrv(queue_t *q) { ace_t *ace; - arl_t *arl; arlphy_t *ap; mblk_t *mp; clock_t ms; @@ -4196,8 +4437,7 @@ ar_wsrv(queue_t *q) ace = (ace_t *)mp->b_rptr; if (ace->ace_flags & ACE_F_DYING) continue; - arl = ace->ace_arl; - ap = arl->arl_phy; + ap = ace->ace_xmit_arl->arl_phy; if (ace->ace_flags & ACE_F_UNVERIFIED) { ASSERT(ace->ace_flags & ACE_F_PUBLISH); ASSERT(ace->ace_query_mp == NULL); @@ -4216,7 +4456,7 @@ ar_wsrv(queue_t *q) DTRACE_PROBE1(timer_probe, ace_t *, ace); ace->ace_xmit_count--; - ar_xmit(arl, ARP_REQUEST, + ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, ace->ace_proto, ace->ace_proto_addr_length, ace->ace_hw_addr, NULL, NULL, @@ -4247,7 +4487,7 @@ ar_wsrv(queue_t *q) now - ap->ap_defend_start > SEC_TO_TICK(as->as_defend_period)) { ap->ap_defend_start = now; - arl_reschedule(arl); + arl_reschedule(ace->ace_xmit_arl); } /* * Finish the job that we started in @@ -4288,12 +4528,12 @@ ar_wsrv(queue_t *q) DTRACE_PROBE1(timer_defend, ace_t *, ace); } - ar_xmit(arl, ARP_REQUEST, + ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, ace->ace_proto, ace->ace_proto_addr_length, ace->ace_hw_addr, ace->ace_proto_addr, - ap->ap_arp_addr, + ace->ace_xmit_arl->arl_phy->ap_arp_addr, ace->ace_proto_addr, NULL, as); ace->ace_last_bcast = now; if (ace->ace_xmit_count == 0) @@ -4316,7 +4556,8 @@ ar_wsrv(queue_t *q) ndp_lookup_ipaddr(*(ipaddr_t *) ace->ace_proto_addr, as->as_netstack)) { ace->ace_flags |= ACE_F_OLD; - mi_timer(arl->arl_wq, ace->ace_mp, + mi_timer(ace->ace_arl->arl_wq, + ace->ace_mp, as->as_cleanup_interval); } 
else { ar_delete_notify(ace); @@ -4333,7 +4574,7 @@ ar_wsrv(queue_t *q) * we complete the operation with a failure indication. * Otherwise, we restart the timer. */ - ms = ar_query_xmit(as, ace, NULL); + ms = ar_query_xmit(as, ace); if (ms == 0) ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); else @@ -4360,6 +4601,8 @@ ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, uint32_t plen, mblk_t *mp; arlphy_t *ap = arl->arl_phy; + ASSERT(!(arl->arl_flags & ARL_F_IPMP)); + if (ap == NULL) { DTRACE_PROBE1(xmit_no_arl_phy, arl_t *, arl); return; diff --git a/usr/src/uts/common/inet/arp_impl.h b/usr/src/uts/common/inet/arp_impl.h index a2564d5602..f16fdc97a0 100644 --- a/usr/src/uts/common/inet/arp_impl.h +++ b/usr/src/uts/common/inet/arp_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,6 +67,7 @@ typedef struct arl_s { uint_t arl_closing : 1; /* stream is closing */ uint32_t arl_index; /* instance number */ struct arlphy_s *arl_phy; /* physical info, if any */ + struct arl_s *arl_ipmp_arl; /* pointer to group arl_t */ } arl_t; /* @@ -75,7 +76,7 @@ typedef struct arl_s { */ #define ARL_TO_ARPSTACK(_arl) (((ar_t *)(_arl)->arl_rq->q_ptr)->ar_as) -/* ARL physical info structure for a link level device */ +/* ARL physical info structure, one per physical link level device */ typedef struct arlphy_s { uint32_t ap_arp_hw_type; /* hardware type */ uchar_t *ap_arp_addr; /* multicast address to use */ @@ -110,6 +111,7 @@ typedef struct ace_s { clock_t ace_last_bcast; /* last broadcast Response */ clock_t ace_xmit_interval; int ace_xmit_count; + arl_t *ace_xmit_arl; /* xmit on this arl */ } ace_t; #define ARPHOOK_INTERESTED_PHYSICAL_IN(as) \ @@ -216,6 +218,7 @@ struct arp_stack { typedef struct arp_stack arp_stack_t; #define ARL_F_NOARP 0x01 +#define ARL_F_IPMP 0x02 #define ARL_S_DOWN 0x00 #define ARL_S_PENDING 
0x01 diff --git a/usr/src/uts/common/inet/dlpistub/dlpistub.c b/usr/src/uts/common/inet/dlpistub/dlpistub.c new file mode 100644 index 0000000000..961876ac47 --- /dev/null +++ b/usr/src/uts/common/inet/dlpistub/dlpistub.c @@ -0,0 +1,370 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * DLPI stub driver; currently supports VNI and IPMP stub devices. + */ + +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/dlpi.h> +#include <sys/stat.h> +#include <sys/strsun.h> +#include <sys/stropts.h> +#include <sys/types.h> +#include <sys/id_space.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/modctl.h> +#include <sys/mkdev.h> +#include <sys/sdt.h> + +#include "dlpistub_impl.h" + +static id_space_t *ds_minors; +static dev_info_t *ds_dip; + +/* + * DL_INFO_ACK template. 
+ */ +static dl_info_ack_t ds_infoack = { + DL_INFO_ACK, /* dl_primitive */ + 0, /* dl_max_sdu */ + 0, /* dl_min_sdu */ + 0, /* dl_addr_length */ + 0, /* dl_mac_type */ + 0, /* dl_reserved */ + 0, /* dl_current_state */ + 0, /* dl_sap_length */ + DL_CLDLS, /* dl_service_mode */ + 0, /* dl_qos_length */ + 0, /* dl_qos_offset */ + 0, /* dl_qos_range_length */ + 0, /* dl_qos_range_offset */ + DL_STYLE2, /* dl_provider_style */ + 0, /* dl_addr_offset */ + DL_VERSION_2, /* dl_version */ + 0, /* dl_brdcst_addr_length */ + 0, /* dl_brdcst_addr_offset */ + 0 /* dl_growth */ +}; + +static int +ds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, "vni", S_IFCHR, DS_MINOR_VNI, + DDI_PSEUDO, 0) == DDI_FAILURE || + ddi_create_minor_node(dip, "ipmpstub", S_IFCHR, DS_MINOR_IPMP, + DDI_PSEUDO, 0) == DDI_FAILURE) { + ddi_remove_minor_node(dip, NULL); + cmn_err(CE_NOTE, "ds_attach: cannot create minor nodes"); + return (DDI_FAILURE); + } + + ds_dip = dip; + ds_minors = id_space_create("ds_minors", DS_MINOR_START, MAXMIN32); + return (DDI_SUCCESS); +} + +static int +ds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + id_space_destroy(ds_minors); + ds_minors = NULL; + ASSERT(dip == ds_dip); + ddi_remove_minor_node(dip, NULL); + ds_dip = NULL; + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +static int +ds_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + int error = DDI_FAILURE; + + switch (infocmd) { + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2DEVINFO: + if (ds_dip != NULL) { + *result = ds_dip; + error = DDI_SUCCESS; + } + break; + } + return (error); +} + +/* ARGSUSED */ +static int +ds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) +{ + int type; + dlpistub_t *dsp; + + if (sflag == CLONEOPEN || sflag == MODOPEN) + return (EINVAL); + + if 
(q->q_ptr != NULL) + return (0); + + switch (getminor(*devp)) { + case DS_MINOR_VNI: + type = SUNW_DL_VNI; + break; + case DS_MINOR_IPMP: + type = SUNW_DL_IPMP; + break; + default: + return (ENXIO); + } + + dsp = kmem_zalloc(sizeof (dlpistub_t), KM_SLEEP); + dsp->ds_type = type; + dsp->ds_minor = (minor_t)id_alloc(ds_minors); + dsp->ds_state = DL_UNATTACHED; + *devp = makedevice(getmajor(*devp), dsp->ds_minor); + q->q_ptr = WR(q)->q_ptr = dsp; + qprocson(q); + + return (0); +} + +/* ARGSUSED */ +static int +ds_close(queue_t *q, int flag, cred_t *credp) +{ + dlpistub_t *dsp = q->q_ptr; + + qprocsoff(q); + q->q_ptr = WR(q)->q_ptr = NULL; + + id_free(ds_minors, dsp->ds_minor); + kmem_free(dsp, sizeof (dlpistub_t)); + + return (0); +} + +static int +ds_badprim(queue_t *q, mblk_t *mp, t_scalar_t prim) +{ + dlerrorack(q, mp, prim, DL_BADPRIM, 0); + return (0); +} + +static int +ds_outstate(queue_t *q, mblk_t *mp, t_scalar_t prim) +{ + dlerrorack(q, mp, prim, DL_OUTSTATE, 0); + return (0); +} + +static int +ds_wput(queue_t *q, mblk_t *mp) +{ + union DL_primitives *dlp; + dl_info_ack_t *dlip; + dlpistub_t *dsp = q->q_ptr; + t_scalar_t prim; + + switch (DB_TYPE(mp)) { + case M_PROTO: + case M_PCPROTO: + if (MBLKL(mp) < sizeof (t_scalar_t)) { + dlerrorack(q, mp, DL_PRIM_INVAL, DL_UNSUPPORTED, 0); + return (0); + } + + dlp = (void *)mp->b_rptr; + prim = dlp->dl_primitive; + switch (prim) { + case DL_ATTACH_REQ: + if (MBLKL(mp) < DL_ATTACH_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + if (dsp->ds_state != DL_UNATTACHED) + return (ds_outstate(q, mp, prim)); + + dsp->ds_state = DL_UNBOUND; + dlokack(q, mp, DL_ATTACH_REQ); + break; + + case DL_BIND_REQ: + if (MBLKL(mp) < DL_BIND_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + if (dsp->ds_state != DL_UNBOUND) + return (ds_outstate(q, mp, prim)); + + dsp->ds_state = DL_IDLE; + dlbindack(q, mp, dlp->bind_req.dl_sap, NULL, 0, 0, 0); + break; + + case DL_INFO_REQ: + if (MBLKL(mp) < DL_INFO_REQ_SIZE) + return (ds_badprim(q, mp, 
prim)); + + mp = mexchange(q, mp, sizeof (dl_info_ack_t), + M_PCPROTO, DL_INFO_ACK); + if (mp != NULL) { + dlip = (void *)mp->b_rptr; + *dlip = ds_infoack; + dlip->dl_mac_type = dsp->ds_type; + dlip->dl_current_state = dsp->ds_state; + qreply(q, mp); + } + break; + + case DL_PHYS_ADDR_REQ: + if (MBLKL(mp) < DL_PHYS_ADDR_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + dlphysaddrack(q, mp, NULL, 0); + break; + + case DL_UNBIND_REQ: + if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + if (dsp->ds_state != DL_IDLE) + return (ds_outstate(q, mp, prim)); + + dsp->ds_state = DL_UNBOUND; + dlokack(q, mp, DL_UNBIND_REQ); + break; + + case DL_DETACH_REQ: + if (MBLKL(mp) < DL_DETACH_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + if (dsp->ds_state != DL_UNBOUND) + return (ds_outstate(q, mp, prim)); + + dsp->ds_state = DL_UNATTACHED; + dlokack(q, mp, DL_DETACH_REQ); + break; + + case DL_UNITDATA_REQ: + DTRACE_PROBE2(dlpistub__data, dlpistub_t *, dsp, + mblk_t *, mp); + freemsg(mp); + break; + + default: + dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0); + } + break; + + case M_IOCTL: + miocnak(q, mp, 0, EINVAL); + break; + + case M_FLUSH: + *mp->b_rptr &= ~FLUSHW; + if (*mp->b_rptr & FLUSHR) + qreply(q, mp); + else + freemsg(mp); + break; + default: + freemsg(mp); + break; + } + + return (0); +} + +static struct module_info ds_minfo = { + DS_IDNUM, /* mi_idnum */ + "dlpistub", /* mi_idname */ + 0, /* mi_minpsz */ + INFPSZ, /* mi_maxpsz */ + 0, /* mi_hiwat */ + 0, /* mi_lowat */ +}; + +static struct qinit ds_rinit = { + NULL, /* qi_putp */ + NULL, /* qi_srvp */ + ds_open, /* qi_qopen */ + ds_close, /* qi_qclose */ + NULL, /* qi_qadmin */ + &ds_minfo, /* qi_minfo */ +}; + +static struct qinit ds_winit = { + ds_wput, /* qi_putp */ + NULL, /* qi_srvp */ + NULL, /* qi_qopen */ + NULL, /* qi_qclose */ + NULL, /* qi_qadmin */ + &ds_minfo, /* qi_minfo */ +}; + +static struct streamtab ds_info = { + &ds_rinit, /* st_rdinit */ + &ds_winit /* st_wrinit */ +}; + 
+DDI_DEFINE_STREAM_OPS(ds_ops, nulldev, nulldev, ds_attach, ds_detach, + nodev, ds_devinfo, D_MP|D_MTPERMOD, &ds_info, ddi_quiesce_not_supported); + +static struct modldrv modldrv = { + &mod_driverops, + "DLPI stub driver", + &ds_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/inet/vni/vni.conf b/usr/src/uts/common/inet/dlpistub/dlpistub.conf index d79915e01c..72264ca466 100644 --- a/usr/src/uts/common/inet/vni/vni.conf +++ b/usr/src/uts/common/inet/dlpistub/dlpistub.conf @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,10 +19,7 @@ # CDDL HEADER END # # -# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # - -#ident "%Z%%M% %I% %E% SMI" -# -name="vni" parent="pseudo" instance=0; +name="dlpistub" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h b/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h new file mode 100644 index 0000000000..ece15320ee --- /dev/null +++ b/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _INET_DLPISTUB_IMPL_H +#define _INET_DLPISTUB_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> + +typedef struct dlpistub { + int ds_type; /* DLPI MAC type */ + t_uscalar_t ds_state; /* DLPI state */ + minor_t ds_minor; /* corresponding minor */ +} dlpistub_t; + +#define DS_IDNUM 0x2a84 + +enum { DS_MINOR_VNI = 1, DS_MINOR_IPMP, DS_MINOR_START }; + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_DLPISTUB_IMPL_H */ diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index 323c8fd0de..41595280cb 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -56,6 +56,7 @@ extern "C" { #include <net/route.h> #include <sys/systm.h> #include <sys/multidata.h> +#include <sys/list.h> #include <net/radix.h> #include <sys/modhash.h> @@ -565,15 +566,21 @@ typedef struct ipha_s { #define IPH_ECN_ECT0 0x2 /* ECN-Capable Transport, ECT(0) */ #define IPH_ECN_CE 0x3 /* ECN-Congestion Experienced (CE) */ +struct ill_s; + +typedef boolean_t ip_v6intfid_func_t(struct ill_s *, in6_addr_t *); +typedef boolean_t ip_v6mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *, + in6_addr_t *); +typedef boolean_t ip_v4mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *, + ipaddr_t *); + /* IP Mac info structure */ typedef struct ip_m_s { - t_uscalar_t ip_m_mac_type; /* From <sys/dlpi.h> */ - int ip_m_type; /* From <net/if_types.h> */ - boolean_t (*ip_m_v4mapinfo)(uint_t, uint8_t *, uint8_t *, - uint32_t *, ipaddr_t *); - boolean_t (*ip_m_v6mapinfo)(uint_t, uint8_t *, uint8_t *, - uint32_t *, in6_addr_t *); - boolean_t (*ip_m_v6intfid)(uint_t, uint8_t *, in6_addr_t *); + t_uscalar_t ip_m_mac_type; /* From <sys/dlpi.h> */ + int ip_m_type; /* From <net/if_types.h> */ + ip_v4mapinfo_func_t *ip_m_v4mapinfo; + ip_v6mapinfo_func_t *ip_m_v6mapinfo; + ip_v6intfid_func_t *ip_m_v6intfid; } ip_m_t; /* @@ -583,18 +590,22 @@ typedef struct ip_m_s { * layer multicast address range. * b. map from IPv6 multicast address range (ff00::/8) to the link * layer multicast address range. - * c. derive the default IPv6 interface identifier from the link layer - * address. + * c. derive the default IPv6 interface identifier from the interface. + * d. derive the default IPv6 destination interface identifier from + * the interface (point-to-point only). 
*/ #define MEDIA_V4MINFO(ip_m, plen, bphys, maddr, hwxp, v4ptr) \ (((ip_m)->ip_m_v4mapinfo != NULL) && \ (*(ip_m)->ip_m_v4mapinfo)(plen, bphys, maddr, hwxp, v4ptr)) -#define MEDIA_V6INTFID(ip_m, plen, phys, v6ptr) \ - (((ip_m)->ip_m_v6intfid != NULL) && \ - (*(ip_m)->ip_m_v6intfid)(plen, phys, v6ptr)) #define MEDIA_V6MINFO(ip_m, plen, bphys, maddr, hwxp, v6ptr) \ (((ip_m)->ip_m_v6mapinfo != NULL) && \ (*(ip_m)->ip_m_v6mapinfo)(plen, bphys, maddr, hwxp, v6ptr)) +#define MEDIA_V6INTFID(ip_m, ill, v6ptr) \ + (((ip_m)->ip_m_v6intfid != NULL) && \ + (*(ip_m)->ip_m_v6intfid)(ill, v6ptr)) +#define MEDIA_V6DESTINTFID(ip_m, ill, v6ptr) \ + (((ip_m)->ip_m_v6destintfid != NULL) && \ + (*(ip_m)->ip_m_v6destintfid)(ill, v6ptr)) /* Router entry types */ #define IRE_BROADCAST 0x0001 /* Route entry for broadcast address */ @@ -621,18 +632,12 @@ typedef struct ip_m_s { * the bucket should delete this IRE from this bucket. */ #define IRE_MARK_CONDEMNED 0x0001 + /* - * If a broadcast IRE is marked with IRE_MARK_NORECV, ip_rput will drop the - * broadcast packets received on that interface. This is marked only - * on broadcast ires. Employed by IPMP, where we have multiple NICs on the - * same subnet receiving the same broadcast packet. - */ -#define IRE_MARK_NORECV 0x0002 -/* - * IRE_CACHE marked this way won't be returned by ire_cache_lookup. Need - * to look specifically using MATCH_IRE_MARK_HIDDEN. Used by IPMP. + * An IRE with IRE_MARK_TESTHIDDEN is used by in.mpathd for test traffic. It + * can only be looked up by requesting MATCH_IRE_MARK_TESTHIDDEN. */ -#define IRE_MARK_HIDDEN 0x0004 /* Typically Used by in.mpathd */ +#define IRE_MARK_TESTHIDDEN 0x0004 /* * An IRE with IRE_MARK_NOADD is created in ip_newroute_ipif when the outgoing @@ -788,45 +793,18 @@ typedef struct mrec_s { * ilm records the state of multicast memberships with the driver and is * maintained per interface. * - * Notes : - * - * 1) There is no direct link between a given ilg and ilm. 
If the - * application has joined a group G with ifindex I, we will have - * an ilg with ilg_v6group and ilg_ill. There will be a corresponding - * ilm with ilm_ill/ilm_v6addr recording the multicast membership. - * To delete the membership, - * - * a) Search for ilg matching on G and I with ilg_v6group - * and ilg_ill. Delete ilg_ill. - * b) Search the corresponding ilm matching on G and I with - * ilm_v6addr and ilm_ill. Delete ilm. - * - * In IPv4, the only difference is, we look using ipifs instead of - * ills. - * - * 2) With IP multipathing, we want to keep receiving even after the - * interface has failed. We do this by moving multicast memberships - * to a new_ill within the group. This is achieved by sending - * DL_DISABMULTI_REQS on ilg_ill/ilm_ill and sending DL_ENABMULTIREQS - * on the new_ill and changing ilg_ill/ilm_ill to new_ill. But, we - * need to be able to delete memberships which will still come down - * with the ifindex of the old ill which is what the application - * knows of. Thus we store the ilm_/ilg_orig_ifindex to keep track - * of where we joined initially so that we can lookup even after we - * moved the membership. It is also used for moving back the membership - * when the old ill has been repaired. This is done by looking up for - * ilms with ilm_orig_ifindex matching on the old ill's ifindex. Only - * ilms actually move from old ill to new ill. ilgs don't move (just - * the ilg_ill is changed when it moves) as it just records the state - * of the application that has joined a group G where as ilm records - * the state joined with the driver. Thus when we send DL_XXXMULTI_REQs - * we also need to keep the ilm in the right ill. - * - * In IPv4, as ipifs move from old ill to new_ill, ilgs and ilms move - * implicitly as we use only ipifs in IPv4. Thus, one can always lookup - * a given ilm/ilg even after it fails without the support of - * orig_ifindex. We move ilms still to record the driver state as - * mentioned above. 
+ * There is no direct link between a given ilg and ilm. If the + * application has joined a group G with ifindex I, we will have + * an ilg with ilg_v6group and ilg_ill. There will be a corresponding + * ilm with ilm_ill/ilm_v6addr recording the multicast membership. + * To delete the membership: + * + * a) Search for ilg matching on G and I with ilg_v6group + * and ilg_ill. Delete ilg_ill. + * b) Search the corresponding ilm matching on G and I with + * ilm_v6addr and ilm_ill. Delete ilm. + * + * For IPv4 the only difference is that we look using ipifs, not ills. */ /* @@ -839,7 +817,6 @@ typedef struct ilg_s { in6_addr_t ilg_v6group; struct ipif_s *ilg_ipif; /* Logical interface we are member on */ struct ill_s *ilg_ill; /* Used by IPv6 */ - int ilg_orig_ifindex; /* Interface originally joined on */ uint_t ilg_flags; mcast_record_t ilg_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */ slist_t *ilg_filter; @@ -866,9 +843,7 @@ typedef struct ilm_s { struct ilm_s *ilm_next; /* Linked list for each ill */ uint_t ilm_state; /* state of the membership */ struct ill_s *ilm_ill; /* Back pointer to ill for IPv6 */ - int ilm_orig_ifindex; /* V6_MULTICAST_IF/ilm_ipif index */ uint_t ilm_flags; - boolean_t ilm_is_new; /* new ilm */ boolean_t ilm_notify_driver; /* Need to notify the driver */ zoneid_t ilm_zoneid; int ilm_no_ilg_cnt; /* number of joins w/ no ilg */ @@ -881,28 +856,11 @@ typedef struct ilm_s { #define ilm_addr V4_PART_OF_V6(ilm_v6addr) -/* - * ilm_walker_cleanup needs to execute when the ilm_walker_cnt goes down to - * zero. In addition it needs to block new walkers while it is unlinking ilm's - * from the list. Thus simple atomics for the ill_ilm_walker_cnt don't suffice. 
- */ -#define ILM_WALKER_HOLD(ill) { \ - mutex_enter(&(ill)->ill_lock); \ - ill->ill_ilm_walker_cnt++; \ - mutex_exit(&(ill)->ill_lock); \ -} - -/* - * ilm_walker_cleanup releases ill_lock - */ -#define ILM_WALKER_RELE(ill) { \ - mutex_enter(&(ill)->ill_lock); \ - (ill)->ill_ilm_walker_cnt--; \ - if ((ill)->ill_ilm_walker_cnt == 0 && (ill)->ill_ilm_cleanup_reqd) \ - ilm_walker_cleanup(ill); \ - else \ - mutex_exit(&(ill)->ill_lock); \ -} +typedef struct ilm_walker { + struct ill_s *ilw_ill; /* associated ill */ + struct ill_s *ilw_ipmp_ill; /* associated ipmp ill (if any) */ + struct ill_s *ilw_walk_ill; /* current ill being walked */ +} ilm_walker_t; /* * Soft reference to an IPsec SA. @@ -1047,11 +1005,8 @@ typedef struct conn_s conn_t; * ipc_acking_unbind conn_acking_unbind * ipc_pad_to_bit_31 conn_pad_to_bit_31 * - * ipc_nofailover_ill conn_nofailover_ill - * * ipc_proto conn_proto * ipc_incoming_ill conn_incoming_ill - * ipc_outgoing_pill conn_outgoing_pill * ipc_pending_ill conn_pending_ill * ipc_unbind_mp conn_unbind_mp * ipc_ilg conn_ilg @@ -1061,8 +1016,6 @@ typedef struct conn_s conn_t; * ipc_refcv conn_refcv * ipc_multicast_ipif conn_multicast_ipif * ipc_multicast_ill conn_multicast_ill - * ipc_orig_bound_ifindex conn_orig_bound_ifindex - * ipc_orig_multicast_ifindex conn_orig_multicast_ifindex * ipc_drain_next conn_drain_next * ipc_drain_prev conn_drain_prev * ipc_idl conn_idl @@ -1263,7 +1216,6 @@ typedef struct th_hash_s { /* The following are ipif_state_flags */ #define IPIF_CONDEMNED 0x1 /* The ipif is being removed */ #define IPIF_CHANGING 0x2 /* A critcal ipif field is changing */ -#define IPIF_MOVING 0x8 /* The ipif is being moved */ #define IPIF_SET_LINKLOCAL 0x10 /* transient flag during bringup */ #define IPIF_ZERO_SOURCE 0x20 /* transient flag during bringup */ @@ -1273,7 +1225,6 @@ typedef struct ipif_s { struct ill_s *ipif_ill; /* Back pointer to our ill */ int ipif_id; /* Logical unit number */ uint_t ipif_mtu; /* Starts at 
ipif_ill->ill_max_frag */ - uint_t ipif_saved_mtu; /* Save of mtu during ipif_move() */ in6_addr_t ipif_v6lcl_addr; /* Local IP address for this if. */ in6_addr_t ipif_v6src_addr; /* Source IP address for this if. */ in6_addr_t ipif_v6subnet; /* Subnet prefix for this if. */ @@ -1306,17 +1257,15 @@ typedef struct ipif_s { uint_t ipif_ob_pkt_count; /* Outbound packets to our dead IREs */ /* Exclusive bit fields, protected by ipsq_t */ unsigned int - ipif_multicast_up : 1, /* We have joined the allhosts group */ - ipif_replace_zero : 1, /* Replacement for zero */ + ipif_multicast_up : 1, /* ipif_multicast_up() successful */ ipif_was_up : 1, /* ipif was up before */ ipif_addr_ready : 1, /* DAD is done */ - ipif_was_dup : 1, /* DAD had failed */ + + ipif_joined_allhosts : 1, /* allhosts joined */ ipif_pad_to_31 : 27; - int ipif_orig_ifindex; /* ifindex before SLIFFAILOVER */ uint_t ipif_seqid; /* unique index across all ills */ - uint_t ipif_orig_ipifid; /* ipif_id before SLIFFAILOVER */ uint_t ipif_state_flags; /* See IPIF_* flag defs above */ uint_t ipif_refcnt; /* active consistent reader cnt */ @@ -1328,6 +1277,16 @@ typedef struct ipif_s { zoneid_t ipif_zoneid; /* zone ID number */ timeout_id_t ipif_recovery_id; /* Timer for DAD recovery */ boolean_t ipif_trace_disable; /* True when alloc fails */ + /* + * For an IPMP interface, ipif_bound_ill tracks the ill whose hardware + * information this ipif is associated with via ARP/NDP. We can use + * an ill pointer (rather than an index) because only ills that are + * part of a group will be pointed to, and an ill cannot disappear + * while it's in a group. 
+ */ + struct ill_s *ipif_bound_ill; + struct ipif_s *ipif_bound_next; /* bound ipif chain */ + boolean_t ipif_bound; /* B_TRUE if we successfully bound */ } ipif_t; /* @@ -1405,8 +1364,6 @@ typedef struct ipif_s { * * bit fields ill_lock ill_lock * - * ipif_orig_ifindex ipsq None - * ipif_orig_ipifid ipsq None * ipif_seqid ipsq Write once * * ipif_state_flags ill_lock ill_lock @@ -1414,6 +1371,10 @@ typedef struct ipif_s { * ipif_ire_cnt ill_lock ill_lock * ipif_ilm_cnt ill_lock ill_lock * ipif_saved_ire_cnt + * + * ipif_bound_ill ipsq + ipmp_lock ipsq OR ipmp_lock + * ipif_bound_next ipsq ipsq + * ipif_bound ipsq ipsq */ #define IP_TR_HASH(tid) ((((uintptr_t)tid) >> 6) & (IP_TR_HASH_MAX - 1)) @@ -1457,103 +1418,154 @@ typedef struct ipif_s { #define IPI2MODE(ipi) ((ipi)->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT) /* - * The IP-MT design revolves around the serialization object ipsq_t. - * It is associated with an IPMP group. If IPMP is not enabled, there is - * 1 ipsq_t per phyint. Eg. an ipsq_t would cover both hme0's IPv4 stream - * - * ipsq_lock protects - * ipsq_reentry_cnt, ipsq_writer, ipsq_xopq_mphead, ipsq_xopq_mptail, - * ipsq_mphead, ipsq_mptail, ipsq_split - * - * ipsq_pending_ipif, ipsq_current_ipif, ipsq_pending_mp, ipsq_flags, - * ipsq_waitfor - * - * The fields in the last line above below are set mostly by a writer thread - * But there is an exception in the last call to ipif_ill_refrele_tail which - * could also race with a conn close which could be cleaning up the - * fields. So we choose to protect using ipsq_lock instead of depending on - * the property of the writer. 
- * ill_g_lock protects - * ipsq_refs, ipsq_phyint_list - */ -typedef struct ipsq_s { - kmutex_t ipsq_lock; - int ipsq_reentry_cnt; - kthread_t *ipsq_writer; /* current owner (thread id) */ - int ipsq_flags; - mblk_t *ipsq_xopq_mphead; /* list of excl ops mostly ioctls */ - mblk_t *ipsq_xopq_mptail; - mblk_t *ipsq_mphead; /* msgs on ipsq linked thru b_next */ - mblk_t *ipsq_mptail; /* msgs on ipsq linked thru b_next */ - int ipsq_current_ioctl; /* current ioctl, or 0 if no ioctl */ - boolean_t ipsq_current_done; /* is the current op done? */ - ipif_t *ipsq_current_ipif; /* ipif associated with current op */ - ipif_t *ipsq_pending_ipif; /* ipif associated w. ipsq_pending_mp */ - mblk_t *ipsq_pending_mp; /* current ioctl mp while waiting for */ - /* response from another module */ - struct ipsq_s *ipsq_next; /* list of all syncq's (ipsq_g_list) */ - uint_t ipsq_refs; /* Number of phyints on this ipsq */ - struct phyint *ipsq_phyint_list; /* List of phyints on this ipsq */ - boolean_t ipsq_split; /* ipsq may need to be split */ - int ipsq_waitfor; /* Values encoded below */ - char ipsq_name[LIFNAMSIZ+1]; /* same as phyint_groupname */ - ip_stack_t *ipsq_ipst; /* Does not have a netstack_hold */ - + * The IP-MT design revolves around the serialization objects ipsq_t (IPSQ) + * and ipxop_t (exclusive operation or "xop"). Becoming "writer" on an IPSQ + * ensures that no other threads can become "writer" on any IPSQs sharing that + * IPSQ's xop until the writer thread is done. + * + * Each phyint points to one IPSQ that remains fixed over the phyint's life. + * Each IPSQ points to one xop that can change over the IPSQ's life. If a + * phyint is *not* in an IPMP group, then its IPSQ will refer to the IPSQ's + * "own" xop (ipsq_ownxop). If a phyint *is* part of an IPMP group, then its + * IPSQ will refer to the "group" xop, which is shorthand for the xop of the + * IPSQ of the IPMP meta-interface's phyint. 
Thus, all phyints that are part + * of the same IPMP group will have their IPSQ's point to the group xop, and + * thus becoming "writer" on any phyint in the group will prevent any other + * writer on any other phyint in the group. All IPSQs sharing the same xop + * are chained together through ipsq_next (in the degenerate common case, + * ipsq_next simply refers to itself). Note that the group xop is guaranteed + * to exist at least as long as there are members in the group, since the IPMP + * meta-interface can only be destroyed if the group is empty. + * + * Incoming exclusive operation requests are enqueued on the IPSQ they arrived + * on rather than the xop. This makes switching xop's (as would happen when a + * phyint leaves an IPMP group) simple, because after the phyint leaves the + * group, any operations enqueued on its IPSQ can be safely processed with + * respect to its new xop, and any operations enqueued on the IPSQs of its + * former group can be processed with respect to their existing group xop. + * Even so, switching xops is a subtle dance; see ipsq_dq() for details. + * + * An IPSQ's "own" xop is embedded within the IPSQ itself since they have + * identical lifetimes, and because doing so simplifies pointer management. + * While each phyint and IPSQ point to each other, it is not possible to free + * the IPSQ when the phyint is freed, since we may still be *inside* the IPSQ + * when the phyint is being freed. Thus, ipsq_phyint is set to NULL when the + * phyint is freed, and the IPSQ free is later done in ipsq_exit(). 
+ * + * ipsq_t synchronization: read write + * + * ipsq_xopq_mphead ipx_lock ipx_lock + * ipsq_xopq_mptail ipx_lock ipx_lock + * ipsq_switch_mp ipsq_lock ipsq_lock + * ipsq_phyint write once write once + * ipsq_next RW_READER ill_g_lock RW_WRITER ill_g_lock + * ipsq_xop ipsq_lock or ipsq ipsq_lock + ipsq + * ipsq_swxop ipsq ipsq + * ipsq_ownxop see ipxop_t see ipxop_t + * ipsq_ipst write once write once + * + * ipxop_t synchronization: read write + * + * ipx_writer ipx_lock ipx_lock + * ipx_xop_queued ipx_lock ipx_lock + * ipx_mphead ipx_lock ipx_lock + * ipx_mptail ipx_lock ipx_lock + * ipx_ipsq write once write once + * ipx_ipsq_queued ipx_lock ipx_lock + * ipx_waitfor ipsq or ipx_lock ipsq + ipx_lock + * ipx_reentry_cnt ipsq or ipx_lock ipsq + ipx_lock + * ipx_current_done ipsq ipsq + * ipx_current_ioctl ipsq ipsq + * ipx_current_ipif ipsq or ipx_lock ipsq + ipx_lock + * ipx_pending_ipif ipsq or ipx_lock ipsq + ipx_lock + * ipx_pending_mp ipsq or ipx_lock ipsq + ipx_lock + * ipx_forced ipsq ipsq + * ipx_depth ipsq ipsq + * ipx_stack ipsq ipsq + */ +typedef struct ipxop_s { + kmutex_t ipx_lock; /* see above */ + kthread_t *ipx_writer; /* current owner */ + mblk_t *ipx_mphead; /* messages tied to this op */ + mblk_t *ipx_mptail; + struct ipsq_s *ipx_ipsq; /* associated ipsq */ + boolean_t ipx_ipsq_queued; /* ipsq using xop has queued op */ + int ipx_waitfor; /* waiting; values encoded below */ + int ipx_reentry_cnt; + boolean_t ipx_current_done; /* is the current operation done? 
*/ + int ipx_current_ioctl; /* current ioctl, or 0 if no ioctl */ + ipif_t *ipx_current_ipif; /* ipif for current op */ + ipif_t *ipx_pending_ipif; /* ipif for ipsq_pending_mp */ + mblk_t *ipx_pending_mp; /* current ioctl mp while waiting */ + boolean_t ipx_forced; /* debugging aid */ #ifdef DEBUG - int ipsq_depth; /* debugging aid */ -#define IPSQ_STACK_DEPTH 15 - pc_t ipsq_stack[IPSQ_STACK_DEPTH]; /* debugging aid */ + int ipx_depth; /* debugging aid */ +#define IPX_STACK_DEPTH 15 + pc_t ipx_stack[IPX_STACK_DEPTH]; /* debugging aid */ #endif -} ipsq_t; +} ipxop_t; -/* ipsq_flags */ -#define IPSQ_GROUP 0x1 /* This ipsq belongs to an IPMP group */ +typedef struct ipsq_s { + kmutex_t ipsq_lock; /* see above */ + mblk_t *ipsq_switch_mp; /* op to handle right after switch */ + mblk_t *ipsq_xopq_mphead; /* list of excl ops (mostly ioctls) */ + mblk_t *ipsq_xopq_mptail; + struct phyint *ipsq_phyint; /* associated phyint */ + struct ipsq_s *ipsq_next; /* next ipsq sharing ipsq_xop */ + struct ipxop_s *ipsq_xop; /* current xop synchronization info */ + struct ipxop_s *ipsq_swxop; /* switch xop to on ipsq_exit() */ + struct ipxop_s ipsq_ownxop; /* our own xop (may not be in-use) */ + ip_stack_t *ipsq_ipst; /* does not have a netstack_hold */ +} ipsq_t; /* - * ipsq_waitfor: - * - * IPIF_DOWN 1 ipif_down waiting for refcnts to drop - * ILL_DOWN 2 ill_down waiting for refcnts to drop - * IPIF_FREE 3 ipif_free waiting for refcnts to drop - * ILL_FREE 4 ill unplumb waiting for refcnts to drop - * ILL_MOVE_OK 5 failover waiting for refcnts to drop + * ipx_waitfor values: */ +enum { + IPIF_DOWN = 1, /* ipif_down() waiting for refcnts to drop */ + ILL_DOWN, /* ill_down() waiting for refcnts to drop */ + IPIF_FREE, /* ipif_free() waiting for refcnts to drop */ + ILL_FREE /* ill unplumb waiting for refcnts to drop */ +}; -enum { IPIF_DOWN = 1, ILL_DOWN, IPIF_FREE, ILL_FREE, ILL_MOVE_OK }; +/* Operation types for ipsq_try_enter() */ +#define CUR_OP 0 /* request writer within current 
operation */ +#define NEW_OP 1 /* request writer for a new operation */ +#define SWITCH_OP 2 /* request writer once IPSQ XOP switches */ -/* Flags passed to ipsq_try_enter */ -#define CUR_OP 0 /* Current ioctl continuing again */ -#define NEW_OP 1 /* New ioctl starting afresh */ +/* + * Kstats tracked on each IPMP meta-interface. Order here must match + * ipmp_kstats[] in ip/ipmp.c. + */ +enum { + IPMP_KSTAT_OBYTES, IPMP_KSTAT_OBYTES64, IPMP_KSTAT_RBYTES, + IPMP_KSTAT_RBYTES64, IPMP_KSTAT_OPACKETS, IPMP_KSTAT_OPACKETS64, + IPMP_KSTAT_OERRORS, IPMP_KSTAT_IPACKETS, IPMP_KSTAT_IPACKETS64, + IPMP_KSTAT_IERRORS, IPMP_KSTAT_MULTIRCV, IPMP_KSTAT_MULTIXMT, + IPMP_KSTAT_BRDCSTRCV, IPMP_KSTAT_BRDCSTXMT, IPMP_KSTAT_LINK_UP, + IPMP_KSTAT_MAX /* keep last */ +}; /* * phyint represents state that is common to both IPv4 and IPv6 interfaces. * There is a separate ill_t representing IPv4 and IPv6 which has a * backpointer to the phyint structure for accessing common state. - * - * NOTE : It just stores the group name as there is only one name for - * IPv4 and IPv6 i.e it is a underlying link property. Actually - * IPv4 and IPv6 ill are grouped together when their phyints have - * the same name. 
*/ typedef struct phyint { struct ill_s *phyint_illv4; struct ill_s *phyint_illv6; - uint_t phyint_ifindex; /* SIOCLSLIFINDEX */ - char *phyint_groupname; /* SIOCSLIFGROUPNAME */ - uint_t phyint_groupname_len; + uint_t phyint_ifindex; /* SIOCSLIFINDEX */ uint64_t phyint_flags; avl_node_t phyint_avl_by_index; /* avl tree by index */ avl_node_t phyint_avl_by_name; /* avl tree by name */ kmutex_t phyint_lock; struct ipsq_s *phyint_ipsq; /* back pointer to ipsq */ - struct phyint *phyint_ipsq_next; /* phyint list on this ipsq */ - /* Once Clearview IPMP is added the follow two fields can be removed */ - uint_t phyint_group_ifindex; /* index assigned to group */ - uint_t phyint_hook_ifindex; /* index used with neti/hook */ + struct ipmp_grp_s *phyint_grp; /* associated IPMP group */ + char phyint_name[LIFNAMSIZ]; /* physical interface name */ + uint64_t phyint_kstats0[IPMP_KSTAT_MAX]; /* baseline kstats */ } phyint_t; #define CACHE_ALIGN_SIZE 64 - #define CACHE_ALIGN(align_struct) P2ROUNDUP(sizeof (struct align_struct),\ CACHE_ALIGN_SIZE) struct _phyint_list_s_ { @@ -1568,34 +1580,6 @@ typedef union phyint_list_u { #define phyint_list_avl_by_index phyint_list_s.phyint_list_avl_by_index #define phyint_list_avl_by_name phyint_list_s.phyint_list_avl_by_name -/* - * ILL groups. We group ills, - * - * - if the ills have the same group name. (New way) - * - * ill_group locking notes: - * - * illgrp_lock protects ill_grp_ill_schednext. - * - * ill_g_lock protects ill_grp_next, illgrp_ill, illgrp_ill_count. - * Holding ill_g_lock freezes the memberships of ills in IPMP groups. - * It also freezes the global list of ills and all ipifs in all ills. - * - * To remove an ipif from the linked list of ipifs of that ill ipif_free_tail - * holds both ill_g_lock, and ill_lock. Similarly to remove an ill from the - * global list of ills, ill_glist_delete() holds ill_g_lock as writer. - * This simplifies things for ipif_select_source, illgrp_scheduler etc. 
- * that need to walk the members of an illgrp. They just hold ill_g_lock - * as reader to do the walk. - * - */ -typedef struct ill_group { - kmutex_t illgrp_lock; - struct ill_group *illgrp_next; /* Next ill_group */ - struct ill_s *illgrp_ill_schednext; /* Next ill to be scheduled */ - struct ill_s *illgrp_ill; /* First ill in the group */ - int illgrp_ill_count; -} ill_group_t; /* * Fragmentation hash bucket @@ -1792,6 +1776,108 @@ typedef struct ill_lso_capab_s ill_lso_capab_t; #define IS_LOOPBACK(ill) \ ((ill)->ill_phyint->phyint_flags & PHYI_LOOPBACK) +/* Is this an IPMP meta-interface ILL? */ +#define IS_IPMP(ill) \ + ((ill)->ill_phyint->phyint_flags & PHYI_IPMP) + +/* Is this ILL under an IPMP meta-interface? (aka "in a group?") */ +#define IS_UNDER_IPMP(ill) \ + ((ill)->ill_grp != NULL && !IS_IPMP(ill)) + +/* Is ill1 in the same illgrp as ill2? */ +#define IS_IN_SAME_ILLGRP(ill1, ill2) \ + ((ill1)->ill_grp != NULL && ((ill1)->ill_grp == (ill2)->ill_grp)) + +/* Is ill1 on the same LAN as ill2? */ +#define IS_ON_SAME_LAN(ill1, ill2) \ + ((ill1) == (ill2) || IS_IN_SAME_ILLGRP(ill1, ill2)) + +#define ILL_OTHER(ill) \ + ((ill)->ill_isv6 ? (ill)->ill_phyint->phyint_illv4 : \ + (ill)->ill_phyint->phyint_illv6) + +/* + * IPMP group ILL state structure -- up to two per IPMP group (V4 and V6). + * Created when the V4 and/or V6 IPMP meta-interface is I_PLINK'd. It is + * guaranteed to persist while there are interfaces of that type in the group. + * In general, most fields are accessed outside of the IPSQ (e.g., in the + * datapath), and thus use locks in addition to the IPSQ for protection. 
+ * + * synchronization: read write + * + * ig_if ipsq or ill_g_lock ipsq and ill_g_lock + * ig_actif ipsq or ipmp_lock ipsq and ipmp_lock + * ig_nactif ipsq or ipmp_lock ipsq and ipmp_lock + * ig_next_ill ipsq or ipmp_lock ipsq and ipmp_lock + * ig_ipmp_ill write once write once + * ig_cast_ill ipsq or ipmp_lock ipsq and ipmp_lock + * ig_arpent ipsq ipsq + * ig_mtu ipsq ipsq + */ +typedef struct ipmp_illgrp_s { + list_t ig_if; /* list of all interfaces */ + list_t ig_actif; /* list of active interfaces */ + uint_t ig_nactif; /* number of active interfaces */ + struct ill_s *ig_next_ill; /* next active interface to use */ + struct ill_s *ig_ipmp_ill; /* backpointer to IPMP meta-interface */ + struct ill_s *ig_cast_ill; /* nominated ill for multi/broadcast */ + list_t ig_arpent; /* list of ARP entries */ + uint_t ig_mtu; /* ig_ipmp_ill->ill_max_mtu */ +} ipmp_illgrp_t; + +/* + * IPMP group state structure -- one per IPMP group. Created when the + * IPMP meta-interface is plumbed; it is guaranteed to persist while there + * are interfaces in it. 
+ * + * ipmp_grp_t synchronization: read write + * + * gr_name ipmp_lock ipmp_lock + * gr_ifname write once write once + * gr_mactype ipmp_lock ipmp_lock + * gr_phyint write once write once + * gr_nif ipmp_lock ipmp_lock + * gr_nactif ipsq ipsq + * gr_v4 ipmp_lock ipmp_lock + * gr_v6 ipmp_lock ipmp_lock + * gr_nv4 ipmp_lock ipmp_lock + * gr_nv6 ipmp_lock ipmp_lock + * gr_pendv4 ipmp_lock ipmp_lock + * gr_pendv6 ipmp_lock ipmp_lock + * gr_linkdownmp ipsq ipsq + * gr_ksp ipmp_lock ipmp_lock + * gr_kstats0 atomic atomic + */ +typedef struct ipmp_grp_s { + char gr_name[LIFGRNAMSIZ]; /* group name */ + char gr_ifname[LIFNAMSIZ]; /* interface name */ + t_uscalar_t gr_mactype; /* DLPI mactype of group */ + phyint_t *gr_phyint; /* IPMP group phyint */ + uint_t gr_nif; /* number of interfaces in group */ + uint_t gr_nactif; /* number of active interfaces */ + ipmp_illgrp_t *gr_v4; /* V4 group information */ + ipmp_illgrp_t *gr_v6; /* V6 group information */ + uint_t gr_nv4; /* number of ills in V4 group */ + uint_t gr_nv6; /* number of ills in V6 group */ + uint_t gr_pendv4; /* number of pending ills in V4 group */ + uint_t gr_pendv6; /* number of pending ills in V6 group */ + mblk_t *gr_linkdownmp; /* message used to bring link down */ + kstat_t *gr_ksp; /* group kstat pointer */ + uint64_t gr_kstats0[IPMP_KSTAT_MAX]; /* baseline group kstats */ +} ipmp_grp_t; + +/* + * IPMP ARP entry -- one per SIOCS*ARP entry tied to the group. Used to keep + * ARP up-to-date as the active set of interfaces in the group changes. + */ +typedef struct ipmp_arpent_s { + mblk_t *ia_area_mp; /* AR_ENTRY_ADD pointer */ + ipaddr_t ia_ipaddr; /* IP address for this entry */ + boolean_t ia_proxyarp; /* proxy ARP entry? */ + boolean_t ia_notified; /* ARP notified about this entry? */ + list_node_t ia_node; /* next ARP entry in list */ +} ipmp_arpent_t; + /* * IP Lower level Structure. * Instance data structure in ip_open when there is a device below us. 
@@ -1851,6 +1937,7 @@ typedef struct ill_s { mblk_t *ill_unbind_mp; /* unbind mp from ill_dl_up() */ mblk_t *ill_promiscoff_mp; /* for ill_leave_allmulti() */ mblk_t *ill_dlpi_deferred; /* b_next chain of control messages */ + mblk_t *ill_ardeact_mp; /* deact mp from ipmp_ill_activate() */ mblk_t *ill_phys_addr_mp; /* mblk which holds ill_phys_addr */ #define ill_last_mp_to_free ill_phys_addr_mp @@ -1867,21 +1954,19 @@ typedef struct ill_s { ill_dlpi_style_set : 1, ill_ifname_pending : 1, - ill_move_in_progress : 1, /* FAILOVER/FAILBACK in progress */ ill_join_allmulti : 1, ill_logical_down : 1, - ill_is_6to4tun : 1, /* Interface is a 6to4 tunnel */ + ill_promisc_on_phys : 1, /* phys interface in promisc mode */ ill_dl_up : 1, ill_up_ipifs : 1, - ill_note_link : 1, /* supports link-up notification */ + ill_capab_reneg : 1, /* capability renegotiation to be done */ ill_dld_capab_inprog : 1, /* direct dld capab call in prog */ ill_need_recover_multicast : 1, - - ill_pad_to_bit_31 : 16; + ill_pad_to_bit_31 : 17; /* Following bit fields protected by ill_lock */ uint_t @@ -1891,10 +1976,8 @@ typedef struct ill_s { ill_arp_closing : 1, ill_arp_bringup_pending : 1, - ill_mtu_userspecified : 1, /* SIOCSLIFLNKINFO has set the mtu */ ill_arp_extend : 1, /* ARP has DAD extensions */ - - ill_pad_bit_31 : 25; + ill_pad_bit_31 : 26; /* * Used in SIOCSIFMUXID and SIOCGIFMUXID for 'ifconfig unplumb'. @@ -1931,6 +2014,7 @@ typedef struct ill_s { */ uint8_t ill_max_hops; /* Maximum hops for any logical interface */ uint_t ill_max_mtu; /* Maximum MTU for any logical interface */ + uint_t ill_user_mtu; /* User-specified MTU via SIOCSLIFLNKINFO */ uint32_t ill_reachable_time; /* Value for ND algorithm in msec */ uint32_t ill_reachable_retrans_time; /* Value for ND algorithm msec */ uint_t ill_max_buf; /* Max # of req to buffer for ND */ @@ -1953,13 +2037,9 @@ typedef struct ill_s { * of the ipif. 
*/ mblk_t *ill_arp_on_mp; - /* Peer ill of an IPMP move operation */ - struct ill_s *ill_move_peer; phyint_t *ill_phyint; uint64_t ill_flags; - ill_group_t *ill_group; - struct ill_s *ill_group_next; kmutex_t ill_lock; /* Please see table below */ /* @@ -2005,6 +2085,18 @@ typedef struct ill_s { void *ill_flownotify_mh; /* Tx flow ctl, mac cb handle */ uint_t ill_ilm_cnt; /* ilms referencing this ill */ uint_t ill_ipallmulti_cnt; /* ip_join_allmulti() calls */ + /* + * IPMP fields. + */ + ipmp_illgrp_t *ill_grp; /* IPMP group information */ + list_node_t ill_actnode; /* next active ill in group */ + list_node_t ill_grpnode; /* next ill in group */ + ipif_t *ill_src_ipif; /* source address selection rotor */ + ipif_t *ill_move_ipif; /* ipif awaiting move to new ill */ + boolean_t ill_nom_cast; /* nominated for mcast/bcast */ + uint_t ill_bound_cnt; /* # of data addresses bound to ill */ + ipif_t *ill_bound_ipif; /* ipif chain bound to ill */ + timeout_id_t ill_refresh_tid; /* ill refresh retry timeout id */ } ill_t; /* @@ -2088,6 +2180,7 @@ typedef struct ill_s { * * ill_max_mtu * + * ill_user_mtu ipsq + ill_lock ill_lock * ill_reachable_time ipsq + ill_lock ill_lock * ill_reachable_retrans_time ipsq + ill_lock ill_lock * ill_max_buf ipsq + ill_lock ill_lock @@ -2102,12 +2195,9 @@ typedef struct ill_s { * ill_arp_down_mp ipsq ipsq * ill_arp_del_mapping_mp ipsq ipsq * ill_arp_on_mp ipsq ipsq - * ill_move_peer ipsq ipsq * * ill_phyint ipsq, ill_g_lock, ill_lock Any of them * ill_flags ill_lock ill_lock - * ill_group ipsq, ill_g_lock, ill_lock Any of them - * ill_group_next ipsq, ill_g_lock, ill_lock Any of them * ill_nd_lla_mp ipsq + down ill only when ill is up * ill_nd_lla ipsq + down ill only when ill is up * ill_nd_lla_len ipsq + down ill only when ill is up @@ -2122,11 +2212,26 @@ typedef struct ill_s { * ill_ilm_walker_cnt ill_lock ill_lock * ill_nce_cnt ill_lock ill_lock * ill_ilm_cnt ill_lock ill_lock + * ill_src_ipif ill_g_lock ill_g_lock * ill_trace ill_lock 
ill_lock * ill_usesrc_grp_next ill_g_usesrc_lock ill_g_usesrc_lock * ill_dhcpinit atomics atomics * ill_flownotify_mh write once write once * ill_capab_pending_cnt ipsq ipsq + * + * ill_bound_cnt ipsq ipsq + * ill_bound_ipif ipsq ipsq + * ill_actnode ipsq + ipmp_lock ipsq OR ipmp_lock + * ill_grpnode ipsq + ill_g_lock ipsq OR ill_g_lock + * ill_src_ipif ill_g_lock ill_g_lock + * ill_move_ipif ipsq ipsq + * ill_nom_cast ipsq ipsq OR advisory + * ill_refresh_tid ill_lock ill_lock + * ill_grp (for IPMP ill) write once write once + * ill_grp (for underlying ill) ipsq + ill_g_lock ipsq OR ill_g_lock + * + * NOTE: It's OK to make heuristic decisions on an underlying interface + * by using IS_UNDER_IPMP() or comparing ill_grp's raw pointer value. */ /* @@ -2167,7 +2272,7 @@ enum { IF_CMD = 1, LIF_CMD, TUN_CMD, ARP_CMD, XARP_CMD, MSFILT_CMD, MISC_CMD }; #define IPI_MODOK 0x2 /* Permitted on mod instance of IP */ #define IPI_WR 0x4 /* Need to grab writer access */ #define IPI_GET_CMD 0x8 /* branch to mi_copyout on success */ -#define IPI_REPL 0x10 /* valid for replacement ipif created in MOVE */ +/* unused 0x10 */ #define IPI_NULL_BCONT 0x20 /* ioctl has not data and hence no b_cont */ #define IPI_PASS_DOWN 0x40 /* pass this ioctl down when a module only */ @@ -2176,17 +2281,6 @@ extern ip_ioctl_cmd_t ip_misc_ioctl_table[]; extern int ip_ndx_ioctl_count; extern int ip_misc_ioctl_count; -#define ILL_CLEAR_MOVE(ill) { \ - ill_t *peer_ill; \ - \ - peer_ill = (ill)->ill_move_peer; \ - ASSERT(peer_ill != NULL); \ - (ill)->ill_move_in_progress = B_FALSE; \ - peer_ill->ill_move_in_progress = B_FALSE; \ - (ill)->ill_move_peer = NULL; \ - peer_ill->ill_move_peer = NULL; \ -} - /* Passed down by ARP to IP during I_PLINK/I_PUNLINK */ typedef struct ipmx_s { char ipmx_name[LIFNAMSIZ]; /* if name */ @@ -2799,19 +2893,11 @@ typedef struct ip_pktinfo { (!((ipif)->ipif_state_flags & (IPIF_CONDEMNED)) || \ IAM_WRITER_IPIF(ipif)) -/* - * These macros are used by critical set ioctls and 
failover ioctls to - * mark the ipif appropriately before starting the operation and to clear the - * marks after completing the operation. - */ -#define IPIF_UNMARK_MOVING(ipif) \ - (ipif)->ipif_state_flags &= ~IPIF_MOVING & ~IPIF_CHANGING; - #define ILL_UNMARK_CHANGING(ill) \ (ill)->ill_state_flags &= ~ILL_CHANGING; /* Macros used to assert that this thread is a writer */ -#define IAM_WRITER_IPSQ(ipsq) ((ipsq)->ipsq_writer == curthread) +#define IAM_WRITER_IPSQ(ipsq) ((ipsq)->ipsq_xop->ipx_writer == curthread) #define IAM_WRITER_ILL(ill) IAM_WRITER_IPSQ((ill)->ill_phyint->phyint_ipsq) #define IAM_WRITER_IPIF(ipif) IAM_WRITER_ILL((ipif)->ipif_ill) @@ -2837,9 +2923,9 @@ typedef struct ip_pktinfo { #define RELEASE_ILL_LOCKS(ill_1, ill_2) \ { \ if (ill_1 != NULL) \ - mutex_exit(&(ill_1)->ill_lock); \ + mutex_exit(&(ill_1)->ill_lock); \ if (ill_2 != NULL && ill_2 != ill_1) \ - mutex_exit(&(ill_2)->ill_lock); \ + mutex_exit(&(ill_2)->ill_lock); \ } /* Get the other protocol instance ill */ @@ -2847,14 +2933,9 @@ typedef struct ip_pktinfo { ((ill)->ill_isv6 ? 
(ill)->ill_phyint->phyint_illv4 : \ (ill)->ill_phyint->phyint_illv6) -#define MATCH_V4_ONLY 0x1 -#define MATCH_V6_ONLY 0x2 -#define MATCH_ILL_ONLY 0x4 - /* ioctl command info: Ioctl properties extracted and stored in here */ typedef struct cmd_info_s { - char ci_groupname[LIFNAMSIZ + 1]; /* SIOCSLIFGROUPNAME */ ipif_t *ci_ipif; /* ipif associated with [l]ifreq ioctl's */ sin_t *ci_sin; /* the sin struct passed down */ sin6_t *ci_sin6; /* the sin6_t struct passed down */ @@ -2990,10 +3071,8 @@ extern struct module_info ip_mod_info; ((ipst)->ips_ip6_loopback_out_event.he_interested) /* - * Hooks marcos used inside of ip + * Hooks macros used inside of ip */ -#define IPHA_VHL ipha_version_and_hdr_length - #define FW_HOOKS(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst) \ \ if ((_hook).he_interested) { \ @@ -3002,21 +3081,8 @@ extern struct module_info ip_mod_info; _NOTE(CONSTCOND) \ ASSERT((_ilp != NULL) || (_olp != NULL)); \ \ - _NOTE(CONSTCOND) \ - if ((_ilp != NULL) && \ - (((ill_t *)(_ilp))->ill_phyint != NULL)) \ - info.hpe_ifp = (phy_if_t)((ill_t *) \ - (_ilp))->ill_phyint->phyint_hook_ifindex; \ - else \ - info.hpe_ifp = 0; \ - \ - _NOTE(CONSTCOND) \ - if ((_olp != NULL) && \ - (((ill_t *)(_olp))->ill_phyint != NULL)) \ - info.hpe_ofp = (phy_if_t)((ill_t *) \ - (_olp))->ill_phyint->phyint_hook_ifindex; \ - else \ - info.hpe_ofp = 0; \ + FW_SET_ILL_INDEX(info.hpe_ifp, (ill_t *)_ilp); \ + FW_SET_ILL_INDEX(info.hpe_ofp, (ill_t *)_olp); \ info.hpe_protocol = ipst->ips_ipv4_net_data; \ info.hpe_hdr = _iph; \ info.hpe_mp = &(_fm); \ @@ -3026,10 +3092,8 @@ extern struct module_info ip_mod_info; _event, (hook_data_t)&info) != 0) { \ ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\ (_hook).he_name, (void *)_fm, (void *)_m)); \ - if (_fm != NULL) { \ - freemsg(_fm); \ - _fm = NULL; \ - } \ + freemsg(_fm); \ + _fm = NULL; \ _iph = NULL; \ _m = NULL; \ } else { \ @@ -3046,21 +3110,8 @@ extern struct module_info ip_mod_info; _NOTE(CONSTCOND) \ ASSERT((_ilp != NULL) 
|| (_olp != NULL)); \ \ - _NOTE(CONSTCOND) \ - if ((_ilp != NULL) && \ - (((ill_t *)(_ilp))->ill_phyint != NULL)) \ - info.hpe_ifp = (phy_if_t)((ill_t *) \ - (_ilp))->ill_phyint->phyint_hook_ifindex; \ - else \ - info.hpe_ifp = 0; \ - \ - _NOTE(CONSTCOND) \ - if ((_olp != NULL) && \ - (((ill_t *)(_olp))->ill_phyint != NULL)) \ - info.hpe_ofp = (phy_if_t)((ill_t *) \ - (_olp))->ill_phyint->phyint_hook_ifindex; \ - else \ - info.hpe_ofp = 0; \ + FW_SET_ILL_INDEX(info.hpe_ifp, (ill_t *)_ilp); \ + FW_SET_ILL_INDEX(info.hpe_ofp, (ill_t *)_olp); \ info.hpe_protocol = ipst->ips_ipv6_net_data; \ info.hpe_hdr = _iph; \ info.hpe_mp = &(_fm); \ @@ -3070,10 +3121,8 @@ extern struct module_info ip_mod_info; _event, (hook_data_t)&info) != 0) { \ ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\ (_hook).he_name, (void *)_fm, (void *)_m)); \ - if (_fm != NULL) { \ - freemsg(_fm); \ - _fm = NULL; \ - } \ + freemsg(_fm); \ + _fm = NULL; \ _iph = NULL; \ _m = NULL; \ } else { \ @@ -3082,6 +3131,17 @@ extern struct module_info ip_mod_info; } \ } +#define FW_SET_ILL_INDEX(fp, ill) \ + _NOTE(CONSTCOND) \ + if ((ill) == NULL || (ill)->ill_phyint == NULL) { \ + (fp) = 0; \ + _NOTE(CONSTCOND) \ + } else if (IS_UNDER_IPMP(ill)) { \ + (fp) = ipmp_ill_get_ipmp_ifindex(ill); \ + } else { \ + (fp) = (ill)->ill_phyint->phyint_ifindex; \ + } + /* * Network byte order macros */ @@ -3146,16 +3206,15 @@ struct ipsec_out_s; struct mac_header_info_s; -extern boolean_t ip_assign_ifindex(uint_t *, ip_stack_t *); extern void ill_frag_timer(void *); extern ill_t *ill_first(int, int, ill_walk_context_t *, ip_stack_t *); extern ill_t *ill_next(ill_walk_context_t *, ill_t *); extern void ill_frag_timer_start(ill_t *); extern void ill_nic_event_dispatch(ill_t *, lif_if_t, nic_event_t, nic_event_data_t, size_t); -extern void ill_nic_event_plumb(ill_t *, boolean_t); extern mblk_t *ip_carve_mp(mblk_t **, ssize_t); extern mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t); +extern mblk_t *ip_dlnotify_alloc(uint_t, 
uint_t); extern char *ip_dot_addr(ipaddr_t, char *); extern const char *mac_colon_addr(const uint8_t *, size_t, char *, size_t); extern void ip_lwput(queue_t *, mblk_t *); @@ -3239,8 +3298,49 @@ extern int ip_hdr_complete(ipha_t *, zoneid_t, ip_stack_t *); extern struct qinit iprinitv6; extern struct qinit ipwinitv6; -extern void conn_drain_insert(conn_t *connp); -extern int conn_ipsec_length(conn_t *connp); +extern void ipmp_init(ip_stack_t *); +extern void ipmp_destroy(ip_stack_t *); +extern ipmp_grp_t *ipmp_grp_create(const char *, phyint_t *); +extern void ipmp_grp_destroy(ipmp_grp_t *); +extern void ipmp_grp_info(const ipmp_grp_t *, lifgroupinfo_t *); +extern int ipmp_grp_rename(ipmp_grp_t *, const char *); +extern ipmp_grp_t *ipmp_grp_lookup(const char *, ip_stack_t *); +extern int ipmp_grp_vet_phyint(ipmp_grp_t *, phyint_t *); +extern ipmp_illgrp_t *ipmp_illgrp_create(ill_t *); +extern void ipmp_illgrp_destroy(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_add_ipif(ipmp_illgrp_t *, ipif_t *); +extern void ipmp_illgrp_del_ipif(ipmp_illgrp_t *, ipif_t *); +extern ill_t *ipmp_illgrp_next_ill(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_cast_ill(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *); +extern void ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *); +extern ipmp_arpent_t *ipmp_illgrp_create_arpent(ipmp_illgrp_t *, mblk_t *, + boolean_t); +extern void ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *, ipmp_arpent_t *); +extern ipmp_arpent_t *ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *, ipaddr_t *); +extern void ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *); +extern void ipmp_illgrp_mark_arpent(ipmp_illgrp_t *, ipmp_arpent_t *); +extern ill_t *ipmp_illgrp_find_ill(ipmp_illgrp_t *, uchar_t *, uint_t); +extern void ipmp_illgrp_link_grp(ipmp_illgrp_t *, ipmp_grp_t *); +extern int ipmp_illgrp_unlink_grp(ipmp_illgrp_t *); +extern uint_t 
ipmp_ill_get_ipmp_ifindex(const ill_t *); +extern void ipmp_ill_join_illgrp(ill_t *, ipmp_illgrp_t *); +extern void ipmp_ill_leave_illgrp(ill_t *); +extern ill_t *ipmp_ill_hold_ipmp_ill(ill_t *); +extern boolean_t ipmp_ill_is_active(ill_t *); +extern void ipmp_ill_refresh_active(ill_t *); +extern void ipmp_phyint_join_grp(phyint_t *, ipmp_grp_t *); +extern void ipmp_phyint_leave_grp(phyint_t *); +extern void ipmp_phyint_refresh_active(phyint_t *); +extern ill_t *ipmp_ipif_bound_ill(const ipif_t *); +extern ill_t *ipmp_ipif_hold_bound_ill(const ipif_t *); +extern boolean_t ipmp_ipif_is_dataaddr(const ipif_t *); +extern boolean_t ipmp_ipif_is_stubaddr(const ipif_t *); + +extern void conn_drain_insert(conn_t *connp); +extern int conn_ipsec_length(conn_t *connp); extern void ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *, ire_t *); extern ipaddr_t ip_get_dst(ipha_t *); @@ -3274,9 +3374,6 @@ extern int ip_srcid_report(queue_t *, mblk_t *, caddr_t, cred_t *); extern uint8_t ipoptp_next(ipoptp_t *); extern uint8_t ipoptp_first(ipoptp_t *, ipha_t *); extern int ip_opt_get_user(const ipha_t *, uchar_t *); -extern ill_t *ip_grab_attach_ill(ill_t *, mblk_t *, int, boolean_t, - ip_stack_t *); -extern ire_t *conn_set_outgoing_ill(conn_t *, ire_t *, ill_t **); extern int ipsec_req_from_conn(conn_t *, ipsec_req_t *, int); extern int ip_snmp_get(queue_t *q, mblk_t *mctl, int level); extern int ip_snmp_set(queue_t *q, int, int, uchar_t *, int); @@ -3295,7 +3392,6 @@ extern void ip_savebuf(void **, uint_t *, boolean_t, const void *, uint_t); extern boolean_t ipsq_pending_mp_cleanup(ill_t *, conn_t *); extern void conn_ioctl_cleanup(conn_t *); extern ill_t *conn_get_held_ill(conn_t *, ill_t **, int *); -extern ill_t *ip_newroute_get_dst_ill(ill_t *); struct multidata_s; struct pdesc_s; @@ -3314,9 +3410,6 @@ extern boolean_t ip_md_zcopy_attr(struct multidata_s *, struct pdesc_s *, uint_t); extern void ip_unbind(conn_t *connp); -extern phyint_t *phyint_lookup_group(char *, 
boolean_t, ip_stack_t *); -extern phyint_t *phyint_lookup_group_ifindex(uint_t, ip_stack_t *); - extern void tnet_init(void); extern void tnet_fini(void); @@ -3434,6 +3527,8 @@ typedef struct ipobs_cb { * ihd_ifindex Interface index that the packet was received/sent over. * For local packets, this is the index of the interface * associated with the local destination address. + * ihd_grifindex IPMP group interface index (zero unless ihd_ifindex + * is an IPMP underlying interface). * ihd_stack Netstack the packet is from. */ typedef struct ipobs_hook_data { @@ -3443,6 +3538,7 @@ typedef struct ipobs_hook_data { ipobs_hook_type_t ihd_htype; uint16_t ihd_ipver; uint64_t ihd_ifindex; + uint64_t ihd_grifindex; netstack_t *ihd_stack; } ipobs_hook_data_t; diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 3f967ea183..d484831a3c 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -1892,7 +1892,6 @@ icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) * case MRT_VERSION: * case MRT_ASSERT: * case IP_SEC_OPT: - * case IP_DONTFAILOVER_IF: * case IP_NEXTHOP: */ default: @@ -2481,7 +2480,6 @@ icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, case MRT_VERSION: case MRT_ASSERT: case IP_SEC_OPT: - case IP_DONTFAILOVER_IF: case IP_NEXTHOP: /* * "soft" error (negative) @@ -3014,9 +3012,7 @@ icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, case IPV6_PATHMTU: return (EINVAL); - case IPV6_BOUND_PIF: case IPV6_SEC_OPT: - case IPV6_DONTFAILOVER_IF: case IPV6_SRC_PREFERENCES: case IPV6_V6ONLY: /* Handled at IP level */ diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c index 4f15801dfb..24ba9d689c 100644 --- a/usr/src/uts/common/inet/ip/icmp_opt_data.c +++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -138,9 +138,6 @@ opdes_t icmp_opt_arr[] = { { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 /* no ifindex */ }, -{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (struct in_addr), 0 /* not initialized */ }, - { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, sizeof (int), 0 }, @@ -222,12 +219,6 @@ opdes_t icmp_opt_arr[] = { { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 /* no ifindex */ }, -{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - -{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, sizeof (int), 0 }, diff --git a/usr/src/uts/common/inet/ip/igmp.c b/usr/src/uts/common/inet/ip/igmp.c index 091509c71e..681f198aa7 100644 --- a/usr/src/uts/common/inet/ip/igmp.c +++ b/usr/src/uts/common/inet/ip/igmp.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -46,7 +46,7 @@ #include <sys/cmn_err.h> #include <sys/atomic.h> #include <sys/zone.h> - +#include <sys/callb.h> #include <sys/param.h> #include <sys/socket.h> #include <inet/ipclassifier.h> @@ -83,7 +83,7 @@ static mrec_t *mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, static void mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype, slist_t *flist); static mrec_t *mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist); - +static void mcast_signal_restart_thread(ip_stack_t *ipst); /* * Macros used to do timer len conversions. 
Timer values are always @@ -122,7 +122,7 @@ static mrec_t *mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist); * The first multicast join will trigger the igmp timers / mld timers * The unit for next is milliseconds. */ -void +static void igmp_start_timers(unsigned next, ip_stack_t *ipst) { int time_left; @@ -207,7 +207,7 @@ igmp_start_timers(unsigned next, ip_stack_t *ipst) * mld_start_timers: * The unit for next is milliseconds. */ -void +static void mld_start_timers(unsigned next, ip_stack_t *ipst) { int time_left; @@ -306,7 +306,8 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) uint32_t group; uint_t next; ipif_t *ipif; - ip_stack_t *ipst; + ip_stack_t *ipst; + ilm_walker_t ilw; ASSERT(ill != NULL); ASSERT(!ill->ill_isv6); @@ -401,8 +402,7 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) "igmp_input: we are only " "member src 0x%x ipif_local 0x%x", (int)ntohl(src), - (int) - ntohl(ipif->ipif_lcl_addr)); + (int)ntohl(ipif->ipif_lcl_addr)); } mutex_exit(&ill->ill_lock); return (mp); @@ -440,23 +440,20 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) } /* - * If we belong to the group being reported, and - * we are a 'Delaying member' in the RFC terminology, - * stop our timer for that group and 'clear flag' i.e. - * mark as IGMP_OTHERMEMBER. Do this for all logical - * interfaces on the given physical interface. + * If our ill has ILMs that belong to the group being + * reported, and we are a 'Delaying Member' in the RFC + * terminology, stop our timer for that group and 'clear + * flag' i.e. mark as IGMP_OTHERMEMBER. 
*/ - mutex_enter(&ill->ill_lock); - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - ilm = ilm_lookup_ipif(ipif, group); - if (ilm != NULL) { + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (ilm->ilm_addr == group) { ++ipst->ips_igmpstat.igps_rcv_ourreports; ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; } - } /* for */ - mutex_exit(&ill->ill_lock); + } + ilm_walker_finish(&ilw); break; case IGMP_V3_MEMBERSHIP_REPORT: @@ -485,6 +482,7 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) int timer; uint_t next, current; ip_stack_t *ipst; + ilm_walker_t ilw; ipst = ill->ill_ipst; ++ipst->ips_igmpstat.igps_rcv_queries; @@ -583,11 +581,12 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) * the maximum timeout. */ next = (unsigned)INFINITY; - mutex_enter(&ill->ill_lock); + ilm = ilm_walker_start(&ilw, ill); + mutex_enter(&ill->ill_lock); current = CURRENT_MSTIME; - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { /* * A multicast router joins INADDR_ANY address * to enable promiscuous reception of all @@ -610,6 +609,7 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) } } mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); return (next); } @@ -623,6 +623,7 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) ipaddr_t *src_array; uint8_t qrv; ip_stack_t *ipst; + ilm_walker_t ilw; ipst = ill->ill_ipst; /* make sure numsrc matches packet size */ @@ -693,8 +694,9 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) } else { /* group or group/source specific query */ + ilm = ilm_walker_start(&ilw, ill); mutex_enter(&ill->ill_lock); - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) || (ilm->ilm_addr == htonl(INADDR_ANY)) || (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) || 
@@ -749,6 +751,7 @@ group_query: ilm->ilm_timer += current; } mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); } return (next); @@ -819,13 +822,22 @@ igmp_joingroup(ilm_t *ilm) mutex_exit(&ill->ill_lock); /* - * To avoid deadlock, we defer igmp_start_timers() to - * ipsq_exit(). See the comment in ipsq_exit() for details. + * We need to restart the IGMP timers, but we can't do it here + * since we're inside the IPSQ and thus igmp_start_timers() -> + * untimeout() (inside the IPSQ, waiting for a running timeout + * to finish) could deadlock with igmp_timeout_handler() -> + * ipsq_enter() (running the timeout, waiting to get inside + * the IPSQ). We also can't just delay it until after we + * ipsq_exit() since we could be inside more than one IPSQ and + * thus still have the other IPSQs pinned after we exit -- and + * igmp_start_timers() may be trying to enter one of those. + * Instead, signal a dedicated thread that will do it for us. */ mutex_enter(&ipst->ips_igmp_timer_lock); ipst->ips_igmp_deferred_next = MIN(timer, ipst->ips_igmp_deferred_next); mutex_exit(&ipst->ips_igmp_timer_lock); + mcast_signal_restart_thread(ipst); } if (ip_debug > 1) { @@ -897,13 +909,14 @@ mld_joingroup(ilm_t *ilm) mutex_exit(&ill->ill_lock); /* - * To avoid deadlock, we defer mld_start_timers() to - * ipsq_exit(). See the comment in ipsq_exit() for details. + * Signal another thread to restart the timers. See the + * comment in igmp_joingroup() for details. */ mutex_enter(&ipst->ips_mld_timer_lock); ipst->ips_mld_deferred_next = MIN(timer, ipst->ips_mld_deferred_next); mutex_exit(&ipst->ips_mld_timer_lock); + mcast_signal_restart_thread(ipst); } if (ip_debug > 1) { @@ -1073,8 +1086,8 @@ send_to_in: /* * Need to set up retransmission state; merge the new info with the * current state (which may be null). If the timer is not currently - * running, start it (need to do a delayed start of the timer as - * we're currently in the sq). 
+ * running, signal a thread to restart it -- see the comment in + * igmp_joingroup() for details. */ rp = mcast_merge_rtx(ilm, rp, flist); if (ilm->ilm_rtx.rtx_timer == INFINITY) { @@ -1085,6 +1098,7 @@ send_to_in: ilm->ilm_rtx.rtx_timer); ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; mutex_exit(&ipst->ips_igmp_timer_lock); + mcast_signal_restart_thread(ipst); } mutex_exit(&ill->ill_lock); @@ -1161,8 +1175,8 @@ send_to_in: /* * Need to set up retransmission state; merge the new info with the * current state (which may be null). If the timer is not currently - * running, start it (need to do a deferred start of the timer as - * we're currently in the sq). + * running, signal a thread to restart it -- see the comment in + * igmp_joingroup() for details. */ rp = mcast_merge_rtx(ilm, rp, flist); ASSERT(ilm->ilm_rtx.rtx_cnt > 0); @@ -1174,6 +1188,7 @@ send_to_in: MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer); ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; mutex_exit(&ipst->ips_mld_timer_lock); + mcast_signal_restart_thread(ipst); } mutex_exit(&ill->ill_lock); @@ -1397,12 +1412,10 @@ per_ilm_rtxtimer: * * igmp_input() receives igmp queries and responds to the queries * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers(). - * Later the igmp_timer fires, the timeout handler igmp_timerout_handler() + * Later the igmp_timer fires, the timeout handler igmp_timeout_handler() * performs the action exclusively after entering each ill's ipsq as writer. - * The actual igmp timeout handler needs to run in the ipsq since it has to - * access the ilm's and we don't want another exclusive operation like - * say an IPMP failover to be simultaneously moving the ilms from one ill to - * another. + * (The need to enter the IPSQ is largely historical but there are still some + * fields like ilm_filter that rely on it.) * * The igmp_slowtimeo() function is called thru another timer. 
* igmp_slowtimeout_lock protects the igmp_slowtimeout_id @@ -1420,7 +1433,6 @@ igmp_timeout_handler(void *arg) ASSERT(arg != NULL); mutex_enter(&ipst->ips_igmp_timer_lock); ASSERT(ipst->ips_igmp_timeout_id != 0); - ipst->ips_igmp_timer_thread = curthread; ipst->ips_igmp_timer_scheduled_last = 0; ipst->ips_igmp_time_to_next = 0; mutex_exit(&ipst->ips_igmp_timer_lock); @@ -1452,7 +1464,6 @@ igmp_timeout_handler(void *arg) mutex_enter(&ipst->ips_igmp_timer_lock); ASSERT(ipst->ips_igmp_timeout_id != 0); ipst->ips_igmp_timeout_id = 0; - ipst->ips_igmp_timer_thread = NULL; mutex_exit(&ipst->ips_igmp_timer_lock); if (global_next != INFINITY) @@ -1663,7 +1674,6 @@ mld_timeout_handler(void *arg) ASSERT(arg != NULL); mutex_enter(&ipst->ips_mld_timer_lock); ASSERT(ipst->ips_mld_timeout_id != 0); - ipst->ips_mld_timer_thread = curthread; ipst->ips_mld_timer_scheduled_last = 0; ipst->ips_mld_time_to_next = 0; mutex_exit(&ipst->ips_mld_timer_lock); @@ -1695,7 +1705,6 @@ mld_timeout_handler(void *arg) mutex_enter(&ipst->ips_mld_timer_lock); ASSERT(ipst->ips_mld_timeout_id != 0); ipst->ips_mld_timeout_id = 0; - ipst->ips_mld_timer_thread = NULL; mutex_exit(&ipst->ips_mld_timer_lock); if (global_next != INFINITY) @@ -1871,7 +1880,7 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) int hdrlen = sizeof (ipha_t) + RTRALERT_LEN; size_t size = hdrlen + sizeof (igmpa_t); ipif_t *ipif = ilm->ilm_ipif; - ill_t *ill = ipif->ipif_ill; /* Will be the "lower" ill */ + ill_t *ill = ipif->ipif_ill; mblk_t *first_mp; ipsec_out_t *io; zoneid_t zoneid; @@ -1887,14 +1896,6 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) * not get forwarded on other interfaces or looped back, we * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop * to B_FALSE. - * - * We also need to make sure that this does not get load balanced - * if it hits ip_newroute_ipif. So, we initialize ipsec_out_attach_if - * here. 
If it gets load balanced, switches supporting igmp snooping - * will send the packet that it receives for this multicast group - * to the interface that we are sending on. As we have joined the - * multicast group on this ill, by sending the packet out on this - * ill, we receive all the packets back on this ill. */ first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI); if (first_mp == NULL) @@ -1909,7 +1910,6 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) io->ipsec_out_len = sizeof (ipsec_out_t); io->ipsec_out_use_global_policy = B_TRUE; io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_attach_if = B_TRUE; io->ipsec_out_multicast_loop = B_FALSE; io->ipsec_out_dontroute = B_TRUE; if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES) @@ -1995,6 +1995,8 @@ igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist) zoneid_t zoneid; ip_stack_t *ipst = ill->ill_ipst; + ASSERT(IAM_WRITER_IPIF(ipif)); + /* if there aren't any records, there's nothing to send */ if (reclist == NULL) return; @@ -2022,6 +2024,14 @@ nextpkt: int srcspace, srcsperpkt; srcspace = ill->ill_max_frag - (size + sizeof (grphdra_t)); + + /* + * Skip if there's not even enough room in + * a single packet to send something useful. 
+ */ + if (srcspace <= sizeof (ipaddr_t)) + continue; + srcsperpkt = srcspace / sizeof (ipaddr_t); /* * Increment size and numrec, because we will @@ -2082,7 +2092,6 @@ nextpkt: io->ipsec_out_len = sizeof (ipsec_out_t); io->ipsec_out_use_global_policy = B_TRUE; io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_attach_if = B_TRUE; io->ipsec_out_multicast_loop = B_FALSE; io->ipsec_out_dontroute = B_TRUE; if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES) @@ -2188,6 +2197,7 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) uint_t next; int mldlen; ip_stack_t *ipst = ill->ill_ipst; + ilm_walker_t ilw; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal); @@ -2294,7 +2304,6 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) return; } - /* * If we belong to the group being reported, and we are a * 'Delaying member' per the RFC terminology, stop our timer @@ -2303,8 +2312,8 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) * membership entries for the same group address (one per zone) * so we need to walk the ill_ilm list. */ - mutex_enter(&ill->ill_lock); - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr)) continue; BUMP_MIB(ill->ill_icmp6_mib, @@ -2313,7 +2322,7 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; } - mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); break; } case MLD_LISTENER_REDUCTION: @@ -2343,6 +2352,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) int timer; uint_t next, current; in6_addr_t *v6group; + ilm_walker_t ilw; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries); @@ -2397,10 +2407,12 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) * maximum timeout. 
*/ next = INFINITY; - mutex_enter(&ill->ill_lock); + ilm = ilm_walker_start(&ilw, ill); + mutex_enter(&ill->ill_lock); current = CURRENT_MSTIME; - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { + + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr)); if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) || @@ -2430,6 +2442,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) } } mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); return (next); } @@ -2446,6 +2459,7 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen) in6_addr_t *v6group, *src_array; uint_t next, numsrc, i, mrd, delay, qqi, current; uint8_t qrv; + ilm_walker_t ilw; v6group = &mld2q->mld2q_addr; numsrc = ntohs(mld2q->mld2q_numsrc); @@ -2518,8 +2532,9 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen) } else { /* group or group/source specific query */ + ilm = ilm_walker_start(&ilw, ill); mutex_enter(&ill->ill_lock); - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) || IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) || IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) || @@ -2574,6 +2589,7 @@ group_query: break; } mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); } return (next); @@ -2591,9 +2607,8 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr) ip6_hbh_t *ip6hbh; struct ip6_opt_router *ip6router; size_t size = IPV6_HDR_LEN + sizeof (mld_hdr_t); - ill_t *ill = ilm->ilm_ill; /* Will be the "lower" ill */ + ill_t *ill = ilm->ilm_ill; ipif_t *ipif; - ip6i_t *ip6i; /* * We need to place a router alert option in this packet. The length @@ -2605,30 +2620,14 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr) ASSERT(ill->ill_isv6); - /* - * We need to make sure that this packet does not get load balanced. - * So, we allocate an ip6i_t and set ATTACH_IF. 
ip_wput_v6 and - * ip_newroute_ipif_v6 knows how to handle such packets. - * If it gets load balanced, switches supporting MLD snooping - * (in the future) will send the packet that it receives for this - * multicast group to the interface that we are sending on. As we have - * joined the multicast group on this ill, by sending the packet out - * on this ill, we receive all the packets back on this ill. - */ - size += sizeof (ip6i_t) + router_alert_length; + size += router_alert_length; mp = allocb(size, BPRI_HI); if (mp == NULL) return; bzero(mp->b_rptr, size); mp->b_wptr = mp->b_rptr + size; - ip6i = (ip6i_t *)mp->b_rptr; - ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - ip6i->ip6i_nxt = IPPROTO_RAW; - ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; - - ip6h = (ip6_t *)&ip6i[1]; + ip6h = (ip6_t *)mp->b_rptr; ip6hbh = (struct ip6_hbh *)&ip6h[1]; ip6router = (struct ip6_opt_router *)&ip6hbh[1]; /* @@ -2698,7 +2697,6 @@ mldv2_sendrpt(ill_t *ill, mrec_t *reclist) in6_addr_t *srcarray; ip6_t *ip6h; ip6_hbh_t *ip6hbh; - ip6i_t *ip6i; struct ip6_opt_router *ip6router; size_t size, optlen, padlen, icmpsize, rsize; ipif_t *ipif; @@ -2707,6 +2705,8 @@ mldv2_sendrpt(ill_t *ill, mrec_t *reclist) mrec_t *next_reclist = reclist; boolean_t morepkts; + ASSERT(IAM_WRITER_ILL(ill)); + /* If there aren't any records, there's nothing to send */ if (reclist == NULL) return; @@ -2743,6 +2743,14 @@ nextpkt: int srcspace, srcsperpkt; srcspace = ill->ill_max_frag - (size + sizeof (mld2mar_t)); + + /* + * Skip if there's not even enough room in + * a single packet to send something useful. + */ + if (srcspace <= sizeof (in6_addr_t)) + continue; + srcsperpkt = srcspace / sizeof (in6_addr_t); /* * Increment icmpsize and size, because we will @@ -2787,30 +2795,13 @@ nextpkt: size += rsize; } - /* - * We need to make sure that this packet does not get load balanced. - * So, we allocate an ip6i_t and set ATTACH_IF. 
ip_wput_v6 and - * ip_newroute_ipif_v6 know how to handle such packets. - * If it gets load balanced, switches supporting MLD snooping - * (in the future) will send the packet that it receives for this - * multicast group to the interface that we are sending on. As we have - * joined the multicast group on this ill, by sending the packet out - * on this ill, we receive all the packets back on this ill. - */ - size += sizeof (ip6i_t); mp = allocb(size, BPRI_HI); if (mp == NULL) goto free_reclist; bzero(mp->b_rptr, size); mp->b_wptr = mp->b_rptr + size; - ip6i = (ip6i_t *)mp->b_rptr; - ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - ip6i->ip6i_nxt = IPPROTO_RAW; - ip6i->ip6i_flags = IP6I_ATTACH_IF; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; - - ip6h = (ip6_t *)&(ip6i[1]); + ip6h = (ip6_t *)mp->b_rptr; ip6hbh = (ip6_hbh_t *)&(ip6h[1]); ip6router = (struct ip6_opt_router *)&(ip6hbh[1]); mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen); @@ -3102,3 +3093,64 @@ mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist) return (rtnmrec); } + +/* + * Convenience routine to signal the restart-timer thread. + */ +static void +mcast_signal_restart_thread(ip_stack_t *ipst) +{ + mutex_enter(&ipst->ips_mrt_lock); + ipst->ips_mrt_flags |= IP_MRT_RUN; + cv_signal(&ipst->ips_mrt_cv); + mutex_exit(&ipst->ips_mrt_lock); +} + +/* + * Thread to restart IGMP/MLD timers. See the comment in igmp_joingroup() for + * the story behind this unfortunate thread. 
+ */ +void +mcast_restart_timers_thread(ip_stack_t *ipst) +{ + int next; + char name[64]; + callb_cpr_t cprinfo; + + (void) snprintf(name, sizeof (name), "mcast_restart_timers_thread_%d", + ipst->ips_netstack->netstack_stackid); + CALLB_CPR_INIT(&cprinfo, &ipst->ips_mrt_lock, callb_generic_cpr, name); + + for (;;) { + mutex_enter(&ipst->ips_mrt_lock); + while (!(ipst->ips_mrt_flags & (IP_MRT_STOP|IP_MRT_RUN))) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&ipst->ips_mrt_cv, &ipst->ips_mrt_lock); + CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_mrt_lock); + } + if (ipst->ips_mrt_flags & IP_MRT_STOP) + break; + ipst->ips_mrt_flags &= ~IP_MRT_RUN; + mutex_exit(&ipst->ips_mrt_lock); + + mutex_enter(&ipst->ips_igmp_timer_lock); + next = ipst->ips_igmp_deferred_next; + ipst->ips_igmp_deferred_next = INFINITY; + mutex_exit(&ipst->ips_igmp_timer_lock); + + if (next != INFINITY) + igmp_start_timers(next, ipst); + + mutex_enter(&ipst->ips_mld_timer_lock); + next = ipst->ips_mld_deferred_next; + ipst->ips_mld_deferred_next = INFINITY; + mutex_exit(&ipst->ips_mld_timer_lock); + if (next != INFINITY) + mld_start_timers(next, ipst); + } + + ipst->ips_mrt_flags |= IP_MRT_DONE; + cv_signal(&ipst->ips_mrt_done_cv); + CALLB_CPR_EXIT(&cprinfo); /* drops ips_mrt_lock */ + thread_exit(); +} diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 1d0bcf37de..dd87a09974 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -170,11 +170,14 @@ typedef struct listptr_s listptr_t; */ typedef struct iproutedata_s { uint_t ird_idx; + uint_t ird_flags; /* see below */ listptr_t ird_route; /* ipRouteEntryTable */ listptr_t ird_netmedia; /* ipNetToMediaEntryTable */ listptr_t ird_attrs; /* ipRouteAttributeTable */ } iproutedata_t; +#define IRD_REPORT_TESTHIDDEN 0x01 /* include IRE_MARK_TESTHIDDEN routes */ + /* * Cluster specific hooks. These should be NULL when booted as a non-cluster */ @@ -228,31 +231,27 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any * MT level protection given by STREAMS. IP uses a combination of its own * internal serialization mechanism and standard Solaris locking techniques. - * The internal serialization is per phyint (no IPMP) or per IPMP group. - * This is used to serialize plumbing operations, IPMP operations, certain - * multicast operations, most set ioctls, igmp/mld timers etc. + * The internal serialization is per phyint. This is used to serialize + * plumbing operations, certain multicast operations, most set ioctls, + * igmp/mld timers etc. * * Plumbing is a long sequence of operations involving message * exchanges between IP, ARP and device drivers. Many set ioctls are typically * involved in plumbing operations. A natural model is to serialize these * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in * parallel without any interference. But various set ioctls on hme0 are best - * serialized. However if the system uses IPMP, the operations are easier if - * they are serialized on a per IPMP group basis since IPMP operations - * happen across ill's of a group. Thus the lowest common denominator is to - * serialize most set ioctls, multicast join/leave operations, IPMP operations - * igmp/mld timer operations, and processing of DLPI control messages received - * from drivers on a per IPMP group basis. 
If the system does not employ - * IPMP the serialization is on a per phyint basis. This serialization is - * provided by the ipsq_t and primitives operating on this. Details can - * be found in ip_if.c above the core primitives operating on ipsq_t. + * serialized, along with multicast join/leave operations, igmp/mld timer + * operations, and processing of DLPI control messages received from drivers + * on a per phyint basis. This serialization is provided by the ipsq_t and + * primitives operating on this. Details can be found in ip_if.c above the + * core primitives operating on ipsq_t. * * Lookups of an ipif or ill by a thread return a refheld ipif / ill. * Simiarly lookup of an ire by a thread also returns a refheld ire. * In addition ipif's and ill's referenced by the ire are also indirectly * refheld. Thus no ipif or ill can vanish nor can critical parameters like * the ipif's address or netmask change as long as an ipif is refheld - * directly or indirectly. For example an SIOCLIFADDR ioctl that changes the + * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the * address of an ipif has to go through the ipsq_t. This ensures that only * 1 such exclusive operation proceeds at any time on the ipif. It then * deletes all ires associated with this ipif, and waits for all refcnts @@ -281,33 +280,24 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * - ill_g_lock: This is a global reader/writer lock. Protects the following * * The AVL tree based global multi list of all ills. * * The linked list of all ipifs of an ill - * * The <ill-ipsq> mapping - * * The ipsq->ipsq_phyint_list threaded by phyint_ipsq_next - * * The illgroup list threaded by ill_group_next. 
+ * * The <ipsq-xop> mapping * * <ill-phyint> association * Insertion/deletion of an ill in the system, insertion/deletion of an ipif - * into an ill, changing the <ill-ipsq> mapping of an ill, insertion/deletion - * of an ill into the illgrp list, changing the <ill-phyint> assoc of an ill - * will all have to hold the ill_g_lock as writer for the actual duration - * of the insertion/deletion/change. More details about the <ill-ipsq> mapping - * may be found in the IPMP section. + * into an ill, changing the <ipsq-xop> mapping of an ill, changing the + * <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as + * writer for the actual duration of the insertion/deletion/change. * * - ill_lock: This is a per ill mutex. - * It protects some members of the ill and is documented below. - * It also protects the <ill-ipsq> mapping - * It also protects the illgroup list threaded by ill_group_next. + * It protects some members of the ill_t struct; see ip.h for details. * It also protects the <ill-phyint> assoc. * It also protects the list of ipifs hanging off the ill. * * - ipsq_lock: This is a per ipsq_t mutex lock. - * This protects all the other members of the ipsq struct except - * ipsq_refs and ipsq_phyint_list which are protected by ill_g_lock + * This protects some members of the ipsq_t struct; see ip.h for details. + * It also protects the <ipsq-ipxop> mapping * - * - illgrp_lock: This is a per ill_group mutex lock. - * The only thing it protects is the illgrp_ill_schednext member of ill_group - * which dictates which is the next ill in an ill_group that is to be chosen - * for sending outgoing packets, through creation of an IRE_CACHE that - * references this ill. + * - ipx_lock: This is a per ipxop_t mutex lock. + * This protects some members of the ipxop_t struct; see ip.h for details. * * - phyint_lock: This is a per phyint mutex lock. 
Protects just the * phyint_flags @@ -335,27 +325,24 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * Note, it is only necessary to take this lock if the ill_usesrc_grp_next * field is changing state i.e from NULL to non-NULL or vice-versa. For * example, it is not necessary to take this lock in the initial portion - * of ip_sioctl_slifusesrc or at all in ip_sioctl_groupname and - * ip_sioctl_flags since the these operations are executed exclusively and - * that ensures that the "usesrc group state" cannot change. The "usesrc - * group state" change can happen only in the latter part of - * ip_sioctl_slifusesrc and in ill_delete. + * of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these + * operations are executed exclusively and that ensures that the "usesrc + * group state" cannot change. The "usesrc group state" change can happen + * only in the latter part of ip_sioctl_slifusesrc and in ill_delete. * - * Changing <ill-phyint>, <ill-ipsq>, <ill-illgroup> assocications. + * Changing <ill-phyint>, <ipsq-xop> assocications: * * To change the <ill-phyint> association, the ill_g_lock must be held * as writer, and the ill_locks of both the v4 and v6 instance of the ill * must be held. * - * To change the <ill-ipsq> association the ill_g_lock must be held as writer - * and the ill_lock of the ill in question must be held. - * - * To change the <ill-illgroup> association the ill_g_lock must be held as - * writer and the ill_lock of the ill in question must be held. + * To change the <ipsq-xop> association, the ill_g_lock must be held as + * writer, the ipsq_lock must be held, and one must be writer on the ipsq. + * This is only done when ills are added or removed from IPMP groups. * * To add or delete an ipif from the list of ipifs hanging off the ill, * ill_g_lock (writer) and ill_lock must be held and the thread must be - * a writer on the associated ipsq,. + * a writer on the associated ipsq. 
* * To add or delete an ill to the system, the ill_g_lock must be held as * writer and the thread must be a writer on the associated ipsq. @@ -367,8 +354,7 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * * Some lock hierarchy scenarios are listed below. * - * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock - * ill_g_lock -> illgrp_lock -> ill_lock + * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock * ill_g_lock -> ill_lock(s) -> phyint_lock * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock * ill_g_lock -> ip_addr_avail_lock @@ -587,8 +573,7 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * back, i.e. the loopback which is required since neither Ethernet drivers * nor Ethernet hardware loops them back. This is the case when the normal * routes (ignoring IREs with different zoneids) would send out the packet on - * the same ill (or ill group) as the ill with which is IRE_LOCAL is - * associated. + * the same ill as the ill with which is IRE_LOCAL is associated. * * Multiple zones can share a common broadcast address; typically all zones * share the 255.255.255.255 address. 
Incoming as well as locally originated @@ -695,8 +680,8 @@ static boolean_t ip_rput_multimblk_ipoptions(queue_t *, ill_t *, mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *); static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *, ip_stack_t *); -static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *, - uint16_t *); +static boolean_t ip_rput_fragment(ill_t *, ill_t *, mblk_t **, ipha_t *, + uint32_t *, uint16_t *); int ip_snmp_get(queue_t *, mblk_t *, int); static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *, mib2_ipIfStatsEntry_t *, ip_stack_t *); @@ -723,9 +708,9 @@ static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *, ip_stack_t *ipst); -static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, +static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int, ip_stack_t *ipst); -static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, +static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int, ip_stack_t *ipst); static void ip_snmp_get2_v4(ire_t *, iproutedata_t *); static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *); @@ -775,8 +760,6 @@ static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); -static int ipmp_hook_emulation_set(queue_t *, mblk_t *, char *, caddr_t, - cred_t *); static int ip_squeue_switch(int); static void *ip_kstat_init(netstackid_t, ip_stack_t *); @@ -946,8 +929,6 @@ static ipndp_t lcl_ndp_arr[] = { { ip_cgtp_filter_get, ip_cgtp_filter_set, NULL, "ip_cgtp_filter" }, #define IPNDP_IPMP_HOOK_OFFSET 10 - { ip_param_generic_get, ipmp_hook_emulation_set, NULL, - "ipmp_hook_emulation" }, { ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug, "ip_debug" }, }; @@ -984,20 +965,19 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 012 */ { SIOCSIFADDR, sizeof (struct 
ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, - /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, + /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD, IF_CMD, ip_sioctl_get_addr, NULL }, /* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, /* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq), - IPI_GET_CMD | IPI_REPL, - IF_CMD, ip_sioctl_get_dstaddr, NULL }, + IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL }, /* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq), - IPI_PRIV | IPI_WR | IPI_REPL, + IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, /* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq), - IPI_MODOK | IPI_GET_CMD | IPI_REPL, + IPI_MODOK | IPI_GET_CMD, IF_CMD, ip_sioctl_get_flags, NULL }, /* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, @@ -1009,31 +989,28 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_mtu, NULL }, - /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, + /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD, IF_CMD, ip_sioctl_get_mtu, NULL }, /* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq), - IPI_GET_CMD | IPI_REPL, - IF_CMD, ip_sioctl_get_brdaddr, NULL }, + IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL }, /* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_brdaddr, NULL }, /* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq), - IPI_GET_CMD | IPI_REPL, - IF_CMD, ip_sioctl_get_netmask, NULL }, + IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL }, /* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, /* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq), - IPI_GET_CMD | IPI_REPL, - IF_CMD, ip_sioctl_get_metric, NULL }, + IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL }, /* 028 */ { 
SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV, IF_CMD, ip_sioctl_metric, NULL }, /* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* See 166-168 below for extended SIOC*XARP ioctls */ - /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV, + /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR, ARP_CMD, ip_sioctl_arp, NULL }, - /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD | IPI_REPL, + /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD, ARP_CMD, ip_sioctl_arp, NULL }, - /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV, + /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR, ARP_CMD, ip_sioctl_arp, NULL }, /* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, @@ -1098,21 +1075,19 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, - /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD | IPI_REPL, + /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD, MISC_CMD, ip_sioctl_get_ifnum, NULL }, - /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, + /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD, IF_CMD, ip_sioctl_get_muxid, NULL }, /* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - IF_CMD, ip_sioctl_muxid, NULL }, + IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL }, /* Both if and lif variants share same func */ - /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, + /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD, IF_CMD, ip_sioctl_get_lifindex, NULL }, /* Both if and lif variants share same func */ /* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - IF_CMD, ip_sioctl_slifindex, NULL }, + IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL }, /* copyin size cannot be coded for SIOCGIFCONF */ /* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD, @@ -1136,28 +1111,25 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 109 */ { 
IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_removeif, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif, ip_sioctl_removeif_restart }, /* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq), - IPI_GET_CMD | IPI_PRIV | IPI_WR | IPI_REPL, + IPI_GET_CMD | IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_addif, NULL }, #define SIOCLIFADDR_NDX 112 /* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, /* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_addr, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL }, /* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, /* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_dstaddr, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL }, /* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, /* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq), - IPI_GET_CMD | IPI_MODOK | IPI_REPL, + IPI_GET_CMD | IPI_MODOK, LIF_CMD, ip_sioctl_get_flags, NULL }, /* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, @@ -1167,58 +1139,48 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { ip_sioctl_get_lifconf, NULL }, /* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_mtu, NULL }, - /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD | IPI_REPL, + /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD, LIF_CMD, ip_sioctl_get_mtu, NULL }, /* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_brdaddr, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL }, /* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 
LIF_CMD, ip_sioctl_brdaddr, NULL }, /* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_netmask, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL }, /* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, /* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_metric, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL }, /* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_metric, NULL }, /* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_MODOK | IPI_REPL, + IPI_PRIV | IPI_WR | IPI_MODOK, LIF_CMD, ip_sioctl_slifname, ip_sioctl_slifname_restart }, - /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD | IPI_REPL, + /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD, MISC_CMD, ip_sioctl_get_lifnum, NULL }, /* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_muxid, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL }, /* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_muxid, NULL }, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL }, /* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_lifindex, 0 }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 }, /* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_slifindex, 0 }, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 }, /* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_token, NULL }, /* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_token, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL }, /* 137 */ { SIOCSLIFSUBNET, sizeof (struct 
lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart }, /* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_subnet, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL }, /* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_lnkinfo, NULL }, /* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_lnkinfo, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL }, /* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV, LIF_CMD, ip_siocdelndp_v6, NULL }, /* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD, @@ -1231,8 +1193,8 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { MISC_CMD, ip_sioctl_tonlink, NULL }, /* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0, MISC_CMD, ip_sioctl_tmysite, NULL }, - /* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), IPI_REPL, - TUN_CMD, ip_sioctl_tunparam, NULL }, + /* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), 0, + TUN_CMD, ip_sioctl_tunparam, NULL }, /* 148 */ { SIOCSTUNPARAM, sizeof (struct iftun_req), IPI_PRIV | IPI_WR, TUN_CMD, ip_sioctl_tunparam, NULL }, @@ -1243,29 +1205,24 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, /* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, - /* 153 */ { SIOCLIFFAILOVER, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_move, ip_sioctl_move }, - /* 154 */ { SIOCLIFFAILBACK, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_move, ip_sioctl_move }, + /* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, + + /* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD | + IPI_WR, LIF_CMD, ip_sioctl_get_binding, NULL }, /* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname }, 
/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_groupname, NULL }, - /* 157 */ { SIOCGLIFOINDEX, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_oindex, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL }, + /* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t), + IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL }, /* Leave 158-160 unused; used to be SIOC*IFARP ioctls */ /* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, - /* 161 */ { SIOCSLIFOINDEX, sizeof (struct lifreq), IPI_PRIV | IPI_WR, - LIF_CMD, ip_sioctl_slifoindex, NULL }, + /* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* These are handled in ip_sioctl_copyin_setup itself */ /* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT, @@ -1277,22 +1234,20 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD, ip_sioctl_get_lifconf, NULL }, - /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV, + /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR, XARP_CMD, ip_sioctl_arp, NULL }, - /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD | IPI_REPL, + /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD, XARP_CMD, ip_sioctl_arp, NULL }, - /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV, + /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR, XARP_CMD, ip_sioctl_arp, NULL }, /* SIOCPOPSOCKFS is not handled by IP */ /* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL }, /* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_lifzone, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL }, /* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_slifzone, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone, 
ip_sioctl_slifzone_restart }, /* 172-174 are SCTP ioctls and not handled by IP */ /* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, @@ -1315,8 +1270,7 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { MSFILT_CMD, ip_sioctl_msfilter, NULL }, /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR, MSFILT_CMD, ip_sioctl_msfilter, NULL }, - /* 182 */ { SIOCSIPMPFAILBACK, sizeof (int), IPI_PRIV, MISC_CMD, - ip_sioctl_set_ipmpfailback, NULL }, + /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* SIOCSENABLESDP is handled by SDP */ /* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL }, /* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL }, @@ -1326,7 +1280,7 @@ int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t); ip_ioctl_cmd_t ip_misc_ioctl_table[] = { { OSIOCGTUNPARAM, sizeof (struct old_iftun_req), - IPI_GET_CMD | IPI_REPL, TUN_CMD, ip_sioctl_tunparam, NULL }, + IPI_GET_CMD, TUN_CMD, ip_sioctl_tunparam, NULL }, { OSIOCSTUNPARAM, sizeof (struct old_iftun_req), IPI_PRIV | IPI_WR, TUN_CMD, ip_sioctl_tunparam, NULL }, { I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, @@ -1336,11 +1290,11 @@ ip_ioctl_cmd_t ip_misc_ioctl_table[] = { { ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL }, { ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, { IP_IOCTL, 0, 0, 0, NULL, NULL }, - { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_REPL | IPI_GET_CMD, + { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD, MISC_CMD, mrt_ioctl}, - { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_REPL | IPI_GET_CMD, + { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_GET_CMD, MISC_CMD, mrt_ioctl}, - { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_REPL | IPI_GET_CMD, + { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD, MISC_CMD, mrt_ioctl} }; @@ -1629,8 +1583,6 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, ipif_t *ipif; mblk_t *first_mp; ipsec_in_t *ii; - ire_t *src_ire; - boolean_t 
onlink; timestruc_t now; uint32_t ill_index; ip_stack_t *ipst; @@ -2014,59 +1966,6 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, if (!IS_SIMPLE_IPH(ipha)) icmp_options_update(ipha); - /* - * ICMP echo replies should go out on the same interface - * the request came on as probes used by in.mpathd for detecting - * NIC failures are ECHO packets. We turn-off load spreading - * by setting ipsec_in_attach_if to B_TRUE, which is copied - * to ipsec_out_attach_if by ipsec_in_to_out called later in this - * function. This is in turn handled by ip_wput and ip_newroute - * to make sure that the packet goes out on the interface it came - * in on. If we don't turnoff load spreading, the packets might get - * dropped if there are no non-FAILED/INACTIVE interfaces for it - * to go out and in.mpathd would wrongly detect a failure or - * mis-detect a NIC failure for link failure. As load spreading - * can happen only if ill_group is not NULL, we do only for - * that case and this does not affect the normal case. - * - * We turn off load spreading only on echo packets that came from - * on-link hosts. If the interface route has been deleted, this will - * not be enforced as we can't do much. For off-link hosts, as the - * default routes in IPv4 does not typically have an ire_ipif - * pointer, we can't force MATCH_IRE_ILL in ip_wput/ip_newroute. - * Moreover, expecting a default route through this interface may - * not be correct. We use ipha_dst because of the swap above. - */ - onlink = B_FALSE; - if (icmph->icmph_type == ICMP_ECHO_REPLY && ill->ill_group != NULL) { - /* - * First, we need to make sure that it is not one of our - * local addresses. If we set onlink when it is one of - * our local addresses, we will end up creating IRE_CACHES - * for one of our local addresses. Then, we will never - * accept packets for them afterwards. 
- */ - src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (src_ire == NULL) { - ipif = ipif_get_next_ipif(NULL, ill); - if (ipif == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(mp); - return; - } - src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, - NULL, MATCH_IRE_ILL | MATCH_IRE_TYPE, ipst); - ipif_refrele(ipif); - if (src_ire != NULL) { - onlink = B_TRUE; - ire_refrele(src_ire); - } - } else { - ire_refrele(src_ire); - } - } if (!mctl_present) { /* * This packet should go out the same way as it @@ -2085,20 +1984,7 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, /* This is not a secure packet */ ii->ipsec_in_secure = B_FALSE; - if (onlink) { - ii->ipsec_in_attach_if = B_TRUE; - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - recv_ill->ill_phyint->phyint_ifindex; - } first_mp->b_cont = mp; - } else if (onlink) { - ii = (ipsec_in_t *)first_mp->b_rptr; - ii->ipsec_in_attach_if = B_TRUE; - ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = recv_ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ } else { ii = (ipsec_in_t *)first_mp->b_rptr; ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ @@ -3733,7 +3619,6 @@ ipif_dup_recovery(void *arg) ill_t *ill = ipif->ipif_ill; mblk_t *arp_add_mp; mblk_t *arp_del_mp; - area_t *area; ip_stack_t *ipst = ill->ill_ipst; ipif->ipif_recovery_id = 0; @@ -3744,12 +3629,13 @@ ipif_dup_recovery(void *arg) */ if (ill->ill_arp_closing || !(ipif->ipif_flags & IPIF_DUPLICATE) || (ipif->ipif_flags & IPIF_POINTOPOINT) || - (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))) { + (ipif->ipif_state_flags & (IPIF_CONDEMNED))) { /* No reason to try to bring this address back. 
*/ return; } - if ((arp_add_mp = ipif_area_alloc(ipif)) == NULL) + /* ACE_F_UNVERIFIED restarts DAD */ + if ((arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) goto alloc_fail; if (ipif->ipif_arp_del_mp == NULL) { @@ -3758,10 +3644,6 @@ ipif_dup_recovery(void *arg) ipif->ipif_arp_del_mp = arp_del_mp; } - /* Setting the 'unverified' flag restarts DAD */ - area = (area_t *)arp_add_mp->b_rptr; - area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | - ACE_F_UNVERIFIED; putnext(ill->ill_rq, arp_add_mp); return; @@ -3873,6 +3755,7 @@ ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) EINPROGRESS) { ipif->ipif_addr_ready = 1; (void) ipif_up_done(ipif); + ASSERT(ill->ill_move_ipif == NULL); } continue; } @@ -3893,6 +3776,7 @@ ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) ill->ill_net_type == IRE_IF_RESOLVER && !(ipif->ipif_state_flags & IPIF_CONDEMNED) && ipst->ips_ip_dup_recovery > 0) { + ASSERT(ipif->ipif_recovery_id == 0); ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); } @@ -4196,8 +4080,9 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, { mblk_t *mp; ip_pktinfo_t *pinfo; - ipha_t *ipha; + ipha_t *ipha; struct ether_header *pether; + boolean_t ipmp_ill_held = B_FALSE; mp = allocb(sizeof (ip_pktinfo_t), BPRI_MED); if (mp == NULL) { @@ -4205,12 +4090,53 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, return (data_mp); } - ipha = (ipha_t *)data_mp->b_rptr; + ipha = (ipha_t *)data_mp->b_rptr; pinfo = (ip_pktinfo_t *)mp->b_rptr; bzero(pinfo, sizeof (ip_pktinfo_t)); pinfo->ip_pkt_flags = (uchar_t)flags; pinfo->ip_pkt_ulp_type = IN_PKTINFO; /* Tell ULP what type of info */ + pether = (struct ether_header *)((char *)ipha + - sizeof (struct ether_header)); + + /* + * Make sure the interface is an ethernet type, since this option + * is currently supported only on this type of interface. 
Also make + * sure we are pointing correctly above db_base. + */ + if ((flags & IPF_RECVSLLA) && + ((uchar_t *)pether >= data_mp->b_datap->db_base) && + (ill->ill_type == IFT_ETHER) && + (ill->ill_net_type == IRE_IF_RESOLVER)) { + pinfo->ip_pkt_slla.sdl_type = IFT_ETHER; + bcopy(pether->ether_shost.ether_addr_octet, + pinfo->ip_pkt_slla.sdl_data, ETHERADDRL); + } else { + /* + * Clear the bit. Indicate to upper layer that IP is not + * sending this ancillary info. + */ + pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA; + } + + /* + * If `ill' is in an IPMP group, use the IPMP ill to determine + * IPF_RECVIF and IPF_RECVADDR. (This currently assumes that + * IPF_RECVADDR support on test addresses is not needed.) + * + * Note that `ill' may already be an IPMP ill if e.g. we're + * processing a packet looped back to an IPMP data address + * (since those IRE_LOCALs are tied to IPMP ills). + */ + if (IS_UNDER_IPMP(ill)) { + if ((ill = ipmp_ill_hold_ipmp_ill(ill)) == NULL) { + ip1dbg(("ip_add_info: cannot hold IPMP ill.\n")); + freemsg(mp); + return (data_mp); + } + ipmp_ill_held = B_TRUE; + } + if (flags & (IPF_RECVIF | IPF_RECVADDR)) pinfo->ip_pkt_ifindex = ill->ill_phyint->phyint_ifindex; if (flags & IPF_RECVADDR) { @@ -4239,7 +4165,7 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL | IRE_LOOPBACK, ipif, zoneid, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst); + MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); if (ire == NULL) { /* * packet must have come on a different @@ -4276,29 +4202,8 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, } } - pether = (struct ether_header *)((char *)ipha - - sizeof (struct ether_header)); - /* - * Make sure the interface is an ethernet type, since this option - * is currently supported only on this type of interface. Also make - * sure we are pointing correctly above db_base. 
- */ - - if ((flags & IPF_RECVSLLA) && - ((uchar_t *)pether >= data_mp->b_datap->db_base) && - (ill->ill_type == IFT_ETHER) && - (ill->ill_net_type == IRE_IF_RESOLVER)) { - - pinfo->ip_pkt_slla.sdl_type = IFT_ETHER; - bcopy((uchar_t *)pether->ether_shost.ether_addr_octet, - (uchar_t *)pinfo->ip_pkt_slla.sdl_data, ETHERADDRL); - } else { - /* - * Clear the bit. Indicate to upper layer that IP is not - * sending this ancillary info. - */ - pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA; - } + if (ipmp_ill_held) + ill_refrele(ill); mp->b_datap->db_type = M_CTL; mp->b_wptr += sizeof (ip_pktinfo_t); @@ -4946,8 +4851,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, } } - if (dst_ire != NULL && - dst_ire->ire_type == IRE_LOCAL && + if (dst_ire != NULL && dst_ire->ire_type == IRE_LOCAL && dst_ire->ire_zoneid != zoneid && dst_ire->ire_zoneid != ALL_ZONES) { /* * If the IRE belongs to a different zone, look for a matching @@ -4983,7 +4887,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, * Pick a source address so that a proper inbound * load spreading would happen. */ - ill_t *dst_ill = dst_ire->ire_ipif->ipif_ill; + ill_t *ire_ill = dst_ire->ire_ipif->ipif_ill; ipif_t *src_ipif = NULL; ire_t *ipif_ire; @@ -4998,10 +4902,10 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, * found above so that upper layers know that the * destination address is a broadcast address. * - * 2) If this is part of a group, select a better - * source address so that better inbound load - * balancing happens. Do the same if the ipif - * is DEPRECATED. + * 2) If the ipif is DEPRECATED, select a better + * source address. Similarly, if the ipif is on + * the IPMP meta-interface, pick a source address + * at random to improve inbound load spreading. 
* * 3) If the outgoing interface is part of a usesrc * group, then try selecting a source address from @@ -5011,9 +4915,9 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, dst_ire->ire_zoneid != ALL_ZONES) || (!(dst_ire->ire_flags & RTF_SETSRC)) && (!(dst_ire->ire_type & IRE_BROADCAST) && - ((dst_ill->ill_group != NULL) || + (IS_IPMP(ire_ill) || (dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || - (dst_ill->ill_usesrc_ifindex != 0)))) { + (ire_ill->ill_usesrc_ifindex != 0)))) { /* * If the destination is reachable via a * given gateway, the selected source address @@ -5035,7 +4939,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, */ ipaddr_t saddr = dst_ire->ire_ipif->ipif_src_addr; - src_ipif = ipif_select_source(dst_ill, + src_ipif = ipif_select_source(ire_ill, saddr, zoneid); if (src_ipif != NULL) { if (IS_VNI(src_ipif->ipif_ill)) { @@ -5478,14 +5382,6 @@ ip_modclose(ill_t *ill) (void) ill_frag_timeout(ill, 0); /* - * If MOVE was in progress, clear the - * move_in_progress fields also. - */ - if (ill->ill_move_in_progress) { - ILL_CLEAR_MOVE(ill); - } - - /* * Call ill_delete to bring down the ipifs, ilms and ill on * this ill. Then wait for the refcnts to drop to zero. * ill_is_freeable checks whether the ill is really quiescent. 
@@ -5510,7 +5406,7 @@ ip_modclose(ill_t *ill) */ netstack_hold(ipst->ips_netstack); - /* qprocsoff is called in ill_delete_tail */ + /* qprocsoff is done via ill_delete_tail */ ill_delete_tail(ill); ASSERT(ill->ill_ipst == NULL); @@ -5755,6 +5651,11 @@ ip_stack_shutdown(netstackid_t stackid, void *arg) ipst->ips_capab_taskq_quit = B_TRUE; cv_signal(&ipst->ips_capab_taskq_cv); mutex_exit(&ipst->ips_capab_taskq_lock); + + mutex_enter(&ipst->ips_mrt_lock); + ipst->ips_mrt_flags |= IP_MRT_STOP; + cv_signal(&ipst->ips_mrt_cv); + mutex_exit(&ipst->ips_mrt_lock); } /* @@ -5766,6 +5667,9 @@ ip_stack_fini(netstackid_t stackid, void *arg) ip_stack_t *ipst = (ip_stack_t *)arg; int ret; +#ifdef NS_DEBUG + printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid); +#endif /* * At this point, all of the notifications that the events and * protocols are going away have been run, meaning that we can @@ -5779,9 +5683,14 @@ ip_stack_fini(netstackid_t stackid, void *arg) cv_destroy(&ipst->ips_capab_taskq_cv); list_destroy(&ipst->ips_capab_taskq_list); -#ifdef NS_DEBUG - printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid); -#endif + mutex_enter(&ipst->ips_mrt_lock); + while (!(ipst->ips_mrt_flags & IP_MRT_DONE)) + cv_wait(&ipst->ips_mrt_done_cv, &ipst->ips_mrt_lock); + mutex_destroy(&ipst->ips_mrt_lock); + cv_destroy(&ipst->ips_mrt_cv); + cv_destroy(&ipst->ips_mrt_done_cv); + + ipmp_destroy(ipst); rw_destroy(&ipst->ips_srcid_lock); ip_kstat_fini(stackid, ipst->ips_ip_mibkp); @@ -6038,10 +5947,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) "ip_cgtp_filter") == 0); ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_data = (caddr_t)&ipst->ips_ip_cgtp_filter; - ASSERT(strcmp(ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_name, - "ipmp_hook_emulation") == 0); - ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_data = - (caddr_t)&ipst->ips_ipmp_hook_emulation; (void) ip_param_register(&ipst->ips_ip_g_nd, ipst->ips_param_arr, A_CNT(lcl_param_arr), @@ -6053,8 
+5958,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) ipst->ips_ip6_kstat = ip6_kstat_init(stackid, &ipst->ips_ip6_statistics); - ipst->ips_ipmp_enable_failback = B_TRUE; - ipst->ips_ip_src_id = 1; rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL); @@ -6062,6 +5965,7 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) ip_net_init(ipst, ns); ipv4_hook_init(ipst); ipv6_hook_init(ipst); + ipmp_init(ipst); /* * Create the taskq dispatcher thread and initialize related stuff. @@ -6073,6 +5977,15 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) list_create(&ipst->ips_capab_taskq_list, sizeof (mblk_t), offsetof(mblk_t, b_next)); + /* + * Create the mcast_restart_timers_thread() worker thread. + */ + mutex_init(&ipst->ips_mrt_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&ipst->ips_mrt_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ipst->ips_mrt_done_cv, NULL, CV_DEFAULT, NULL); + ipst->ips_mrt_thread = thread_create(NULL, 0, + mcast_restart_timers_thread, ipst, 0, &p0, TS_RUN, minclsyspri); + major = mod_name_to_major(INET_NAME); (void) ldi_ident_from_major(major, &ipst->ips_ldi_ident); return (ipst); @@ -6109,6 +6022,24 @@ ip_dlpi_alloc(size_t len, t_uscalar_t prim) } /* + * Allocate and initialize a DLPI notification. (May be called as writer.) + */ +mblk_t * +ip_dlnotify_alloc(uint_t notification, uint_t data) +{ + dl_notify_ind_t *notifyp; + mblk_t *mp; + + if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL) + return (NULL); + + notifyp = (dl_notify_ind_t *)mp->b_rptr; + notifyp->dl_notification = notification; + notifyp->dl_data = data; + return (mp); +} + +/* * Debug formatting routine. Returns a character string representation of the * addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address * in the form of a ipaddr_t and calls ip_dot_saddr with a pointer. 
@@ -7753,71 +7684,30 @@ ip_net_mask(ipaddr_t addr) } /* - * Select an ill for the packet by considering load spreading across - * a different ill in the group if dst_ill is part of some group. - */ -ill_t * -ip_newroute_get_dst_ill(ill_t *dst_ill) -{ - ill_t *ill; - - /* - * We schedule irrespective of whether the source address is - * INADDR_ANY or not. illgrp_scheduler returns a held ill. - */ - ill = illgrp_scheduler(dst_ill); - if (ill == NULL) - return (NULL); - - /* - * For groups with names ip_sioctl_groupname ensures that all - * ills are of same type. For groups without names, ifgrp_insert - * ensures this. - */ - ASSERT(dst_ill->ill_type == ill->ill_type); - - return (ill); -} - -/* - * Helper function for the IPIF_NOFAILOVER/ATTACH_IF interface attachment case. + * Helper ill lookup function used by IPsec. */ ill_t * -ip_grab_attach_ill(ill_t *ill, mblk_t *first_mp, int ifindex, boolean_t isv6, - ip_stack_t *ipst) +ip_grab_ill(mblk_t *first_mp, int ifindex, boolean_t isv6, ip_stack_t *ipst) { ill_t *ret_ill; ASSERT(ifindex != 0); + ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, ipst); - if (ret_ill == NULL || - (ret_ill->ill_phyint->phyint_flags & PHYI_OFFLINE)) { + if (ret_ill == NULL) { if (isv6) { - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip6_mib, - ipIfStatsOutDiscards); - } - ip1dbg(("ip_grab_attach_ill (IPv6): " - "bad ifindex %d.\n", ifindex)); + BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); + ip1dbg(("ip_grab_ill (IPv6): bad ifindex %d.\n", + ifindex)); } else { - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - } - ip1dbg(("ip_grab_attach_ill (IPv4): " - "bad ifindex %d.\n", ifindex)); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip1dbg(("ip_grab_ill (IPv4): bad ifindex %d.\n", + ifindex)); } - if (ret_ill != NULL) - ill_refrele(ret_ill); freemsg(first_mp); 
return (NULL); } - return (ret_ill); } @@ -7859,7 +7749,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, ire_t *sire = NULL; mblk_t *first_mp; ire_t *save_ire; - ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER address */ ushort_t ire_marks = 0; boolean_t mctl_present; ipsec_out_t *io; @@ -7873,7 +7762,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, boolean_t multirt_is_resolvable; boolean_t multirt_resolve_next; boolean_t unspec_src; - boolean_t do_attach_ill = B_FALSE; boolean_t ip_nexthop = B_FALSE; tsol_ire_gw_secattr_t *attrp = NULL; tsol_gcgrp_t *gcgrp = NULL; @@ -7902,22 +7790,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, return; } - if (mctl_present && io->ipsec_out_attach_if) { - /* ip_grab_attach_ill returns a held ill */ - attach_ill = ip_grab_attach_ill(NULL, first_mp, - io->ipsec_out_ill_index, B_FALSE, ipst); - - /* Failure case frees things for us. */ - if (attach_ill == NULL) - return; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(attach_ill)) - ire_marks = IRE_MARK_HIDDEN; - } if (mctl_present && io->ipsec_out_ip_nexthop) { ip_nexthop = B_TRUE; nexthop_addr = io->ipsec_out_nexthop_addr; @@ -7997,31 +7869,15 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, dst = nexthop_addr; } } - } else if (attach_ill == NULL) { + } else { ire = ire_ftable_lookup(dst, 0, 0, 0, NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp), MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT | MATCH_IRE_SECATTR | MATCH_IRE_COMPLETE, ipst); - } else { - /* - * attach_ill is set only for communicating with - * on-link hosts. So, don't look for DEFAULT. 
- */ - ipif_t *attach_ipif; - - attach_ipif = ipif_get_next_ipif(NULL, attach_ill); - if (attach_ipif == NULL) { - ill_refrele(attach_ill); - goto icmp_err_ret; - } - ire = ire_ftable_lookup(dst, 0, 0, 0, attach_ipif, - &sire, zoneid, 0, MBLK_GETLABEL(mp), - MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL | - MATCH_IRE_SECATTR, ipst); - ipif_refrele(attach_ipif); } + ip3dbg(("ip_newroute: ire_ftable_lookup() " "returned ire %p, sire %p\n", (void *)ire, (void *)sire)); @@ -8122,8 +7978,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, } ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst); - if (attach_ill != NULL) - ill_refrele(attach_ill); goto icmp_err_ret; } @@ -8134,8 +7988,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, */ if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) { - if (attach_ill != NULL) - ill_refrele(attach_ill); goto icmp_err_ret; } /* @@ -8157,119 +8009,51 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, sire->ire_last_used_time = lbolt; } /* - * We have a route to reach the destination. - * - * 1) If the interface is part of ill group, try to get a new - * ill taking load spreading into account. - * - * 2) After selecting the ill, get a source address that - * might create good inbound load spreading. - * ipif_select_source does this for us. + * We have a route to reach the destination. Find the + * appropriate ill, then get a source address using + * ipif_select_source(). * - * If the application specified the ill (ifindex), we still - * load spread. Only if the packets needs to go out - * specifically on a given ill e.g. binding to - * IPIF_NOFAILOVER address, then we don't try to use a - * different ill for load spreading. + * If we are here trying to create an IRE_CACHE for an offlink + * destination and have an IRE_CACHE entry for VNI, then use + * ire_stq instead since VNI's queue is a black hole. 
*/ - if (attach_ill == NULL) { - /* - * Don't perform outbound load spreading in the - * case of an RTF_MULTIRT route, as we actually - * typically want to replicate outgoing packets - * through particular interfaces. - */ - if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { - dst_ill = ire->ire_ipif->ipif_ill; - /* for uniformity */ - ill_refhold(dst_ill); - } else { - /* - * If we are here trying to create an IRE_CACHE - * for an offlink destination and have the - * IRE_CACHE for the next hop and the latter is - * using virtual IP source address selection i.e - * it's ire->ire_ipif is pointing to a virtual - * network interface (vni) then - * ip_newroute_get_dst_ll() will return the vni - * interface as the dst_ill. Since the vni is - * virtual i.e not associated with any physical - * interface, it cannot be the dst_ill, hence - * in such a case call ip_newroute_get_dst_ll() - * with the stq_ill instead of the ire_ipif ILL. - * The function returns a refheld ill. - */ - if ((ire->ire_type == IRE_CACHE) && - IS_VNI(ire->ire_ipif->ipif_ill)) - dst_ill = ip_newroute_get_dst_ill( - ire->ire_stq->q_ptr); - else - dst_ill = ip_newroute_get_dst_ill( - ire->ire_ipif->ipif_ill); - } - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute: " - "no dst ill for dst" - " %s\n", AF_INET, &dst); - } - goto icmp_err_ret; - } - } else { - dst_ill = ire->ire_ipif->ipif_ill; - /* for uniformity */ + if ((ire->ire_type == IRE_CACHE) && + IS_VNI(ire->ire_ipif->ipif_ill)) { + dst_ill = ire->ire_stq->q_ptr; ill_refhold(dst_ill); - /* - * We should have found a route matching ill as we - * called ire_ftable_lookup with MATCH_IRE_ILL. - * Rather than asserting, when there is a mismatch, - * we just drop the packet. 
- */ - if (dst_ill != attach_ill) { - ip0dbg(("ip_newroute: Packet dropped as " - "IPIF_NOFAILOVER ill is %s, " - "ire->ire_ipif->ipif_ill is %s\n", - attach_ill->ill_name, - dst_ill->ill_name)); - ill_refrele(attach_ill); - goto icmp_err_ret; + } else { + ill_t *ill = ire->ire_ipif->ipif_ill; + + if (IS_IPMP(ill)) { + dst_ill = + ipmp_illgrp_hold_next_ill(ill->ill_grp); + } else { + dst_ill = ill; + ill_refhold(dst_ill); } } - /* attach_ill can't go in loop. IPMP and CGTP are disjoint */ - if (attach_ill != NULL) { - ill_refrele(attach_ill); - attach_ill = NULL; - do_attach_ill = B_TRUE; + + if (dst_ill == NULL) { + if (ip_debug > 2) { + pr_addr_dbg("ip_newroute: no dst " + "ill for dst %s\n", AF_INET, &dst); + } + goto icmp_err_ret; } - ASSERT(dst_ill != NULL); ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name)); /* * Pick the best source address from dst_ill. * - * 1) If it is part of a multipathing group, we would - * like to spread the inbound packets across different - * interfaces. ipif_select_source picks a random source - * across the different ills in the group. - * - * 2) If it is not part of a multipathing group, we try - * to pick the source address from the destination + * 1) Try to pick the source address from the destination * route. Clustering assumes that when we have multiple * prefixes hosted on an interface, the prefix of the * source address matches the prefix of the destination * route. We do this only if the address is not * DEPRECATED. * - * 3) If the conn is in a different zone than the ire, we + * 2) If the conn is in a different zone than the ire, we * need to pick a source address from the right zone. - * - * NOTE : If we hit case (1) above, the prefix of the source - * address picked may not match the prefix of the - * destination routes prefix as ipif_select_source - * does not look at "dst" while picking a source - * address. - * If we want the same behavior as (2), we will need - * to change the behavior of ipif_select_source. 
*/ ASSERT(src_ipif == NULL); if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { @@ -8287,7 +8071,8 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, if (src_ipif == NULL && (!unspec_src || ipha->ipha_src != INADDR_ANY)) { ire_marks |= IRE_MARK_USESRC_CHECK; - if ((dst_ill->ill_group != NULL) || + if (!IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && + IS_IPMP(ire->ire_ipif->ipif_ill) || (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || (connp != NULL && ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES) || @@ -8312,6 +8097,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, * as dst_ire source address. */ ipaddr_t saddr = ire->ire_ipif->ipif_src_addr; + src_ipif = ipif_select_source(dst_ill, saddr, zoneid); if (src_ipif == NULL) { @@ -8319,7 +8105,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, pr_addr_dbg("ip_newroute: " "no src for dst %s ", AF_INET, &dst); - printf("through interface %s\n", + printf("on interface %s\n", dst_ill->ill_name); } goto icmp_err_ret; @@ -8558,6 +8344,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, MULTIRT_DEBUG_TAG(first_mp); } } + ire_add_then_send(q, ire, xmit_mp); ire_refrele(save_ire); @@ -8766,7 +8553,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, "ip_newroute: no " "src for gw %s ", AF_INET, &gw); - printf("through " + printf("on " "interface %s\n", dst_ill->ill_name); } @@ -8867,16 +8654,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, areq = (areq_t *)mp->b_rptr; addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset); - if (do_attach_ill) { - /* - * This is bind to no failover case. - * arp packet also must go out on attach_ill. 
- */ - ASSERT(ipha->ipha_src != NULL); - *addrp = ipha->ipha_src; - } else { - *addrp = save_ire->ire_src_addr; - } + *addrp = save_ire->ire_src_addr; ire_refrele(save_ire); addrp = (ipaddr_t *)((char *)areq + @@ -9076,14 +8854,10 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, ipaddr_t *addrp; mblk_t *first_mp; ire_t *save_ire = NULL; - ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER */ ipif_t *src_ipif = NULL; ushort_t ire_marks = 0; ill_t *dst_ill = NULL; - boolean_t mctl_present; - ipsec_out_t *io; ipha_t *ipha; - int ihandle = 0; mblk_t *saved_mp; ire_t *fire = NULL; mblk_t *copy_mp = NULL; @@ -9117,10 +8891,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst), ipif->ipif_ill->ill_name)); - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - if (mctl_present) - io = (ipsec_out_t *)first_mp->b_rptr; - + first_mp = mp; + if (DB_TYPE(mp) == M_CTL) + mp = mp->b_cont; ipha = (ipha_t *)mp->b_rptr; /* @@ -9161,64 +8934,29 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, (void *)ipif, ntohl(dst), (void *)fire)); } - if (mctl_present && io->ipsec_out_attach_if) { - attach_ill = ip_grab_attach_ill(NULL, first_mp, - io->ipsec_out_ill_index, B_FALSE, ipst); - - /* Failure case frees things for us. */ - if (attach_ill == NULL) { - ipif_refrele(ipif); - if (fire != NULL) - ire_refrele(fire); - return; - } + /* + * Note: While we pick a dst_ill we are really only + * interested in the ill for load spreading. The source + * ipif is determined by source address selection below. + */ + if (IS_IPMP(ipif->ipif_ill)) { + ipmp_illgrp_t *illg = ipif->ipif_ill->ill_grp; - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(attach_ill)) { - ire_marks = IRE_MARK_HIDDEN; - } - /* - * ip_wput passes the right ipif for IPIF_NOFAILOVER - * case. 
- */ - dst_ill = ipif->ipif_ill; - /* attach_ill has been refheld by ip_grab_attach_ill */ - ASSERT(dst_ill == attach_ill); + if (CLASSD(ipha_dst)) + dst_ill = ipmp_illgrp_hold_cast_ill(illg); + else + dst_ill = ipmp_illgrp_hold_next_ill(illg); } else { - /* - * If the interface belongs to an interface group, - * make sure the next possible interface in the group - * is used. This encourages load spreading among - * peers in an interface group. - * Note: load spreading is disabled for RTF_MULTIRT - * routes. - */ - if ((flags & RTF_MULTIRT) && (fire != NULL) && - (fire->ire_flags & RTF_MULTIRT)) { - /* - * Don't perform outbound load spreading - * in the case of an RTF_MULTIRT issued route, - * we actually typically want to replicate - * outgoing packets through particular - * interfaces. - */ - dst_ill = ipif->ipif_ill; - ill_refhold(dst_ill); - } else { - dst_ill = ip_newroute_get_dst_ill( - ipif->ipif_ill); - } - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_ipif: " - "no dst ill for dst %s\n", - AF_INET, &dst); - } - goto err_ret; + dst_ill = ipif->ipif_ill; + ill_refhold(dst_ill); + } + + if (dst_ill == NULL) { + if (ip_debug > 2) { + pr_addr_dbg("ip_newroute_ipif: no dst ill " + "for dst %s\n", AF_INET, &dst); } + goto err_ret; } /* @@ -9242,7 +8980,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, unspec_src = (connp != NULL && connp->conn_unspec_src); - if (((!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) || + if (!IS_UNDER_IPMP(ipif->ipif_ill) && + (IS_IPMP(ipif->ipif_ill) || + (!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) || (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_UP)) != IPIF_UP || (connp != NULL && ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES)) && @@ -9256,7 +8996,7 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, "no src for dst %s", AF_INET, &dst); } - ip1dbg((" through interface %s\n", + ip1dbg((" on interface %s\n", dst_ill->ill_name)); goto 
err_ret; } @@ -9291,12 +9031,7 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, goto err_ret; if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) goto err_ret; - /* - * ihandle is needed when the ire is added to - * cache table. - */ save_ire = ire; - ihandle = save_ire->ire_ihandle; ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, " "flags %04x\n", @@ -9328,10 +9063,6 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, ipha->ipha_src = fire->ire_src_addr; } } else { - ASSERT((connp == NULL) || - (connp->conn_outgoing_ill != NULL) || - (connp->conn_dontroute) || - infop->ip_opt_ill_index != 0); /* * The only ways we can come here are: * 1) IP_BOUND_IF socket option is set @@ -9340,6 +9071,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, * In all cases, the new ire will not be added * into cache table. */ + ASSERT(connp == NULL || connp->conn_dontroute || + connp->conn_outgoing_ill != NULL || + infop->ip_opt_ill_index != 0); ire_marks |= IRE_MARK_NOADD; } @@ -9374,7 +9108,8 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, (save_ire != NULL ? save_ire->ire_mask : 0), (fire != NULL) ? /* Parent handle */ fire->ire_phandle : 0, - ihandle, /* Interface handle */ + (save_ire != NULL) ? /* Interface handle */ + save_ire->ire_ihandle : 0, (fire != NULL) ? (fire->ire_flags & (RTF_SETSRC | RTF_MULTIRT)) : 0, @@ -9533,7 +9268,8 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, (save_ire != NULL ? save_ire->ire_mask : 0), (fire != NULL) ? /* Parent handle */ fire->ire_phandle : 0, - ihandle, /* Interface handle */ + (save_ire != NULL) ? /* Interface handle */ + save_ire->ire_ihandle : 0, (fire != NULL) ? /* flags if any */ (fire->ire_flags & (RTF_SETSRC | RTF_MULTIRT)) : 0, @@ -9593,12 +9329,20 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, /* * Fill in the source and dest addrs for the resolver. 
* NOTE: this depends on memory layouts imposed by - * ill_init(). + * ill_init(). There are corner cases above where we + * might've created the IRE with an INADDR_ANY source + * address (e.g., if the zeroth ipif on an underlying + * ill in an IPMP group is 0.0.0.0, but another ipif + * on the ill has a usable test address). If so, tell + * ARP to use ipha_src as its sender address. */ areq = (areq_t *)mp->b_rptr; addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset); - *addrp = ire->ire_src_addr; + if (ire->ire_src_addr != INADDR_ANY) + *addrp = ire->ire_src_addr; + else + *addrp = ipha->ipha_src; addrp = (ipaddr_t *)((char *)areq + areq->areq_target_addr_offset); *addrp = dst; @@ -10136,7 +9880,7 @@ ip_ipsec_load_complete(ipsec_stack_t *ipss) /* * Can't be used. Need to call svr4* -> optset directly. the leaf routine * determines the grp on which it has to become exclusive, queues the mp - * and sq draining restarts the optmgmt + * and IPSQ draining restarts the optmgmt */ static boolean_t ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp) @@ -10482,28 +10226,6 @@ ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option, } switch (option) { - case IP_DONTFAILOVER_IF: - /* - * This option is used by in.mpathd to ensure - * that IPMP probe packets only go out on the - * test interfaces. in.mpathd sets this option - * on the non-failover interfaces. - * For backward compatibility, this option - * implicitly sets IP_MULTICAST_IF, as used - * be done in bind(), so that ip_wput gets - * this ipif to send mcast packets. 
- */ - if (ipif != NULL) { - ASSERT(addr != INADDR_ANY); - connp->conn_nofailover_ill = ipif->ipif_ill; - connp->conn_multicast_ipif = ipif; - } else { - ASSERT(addr == INADDR_ANY); - connp->conn_nofailover_ill = NULL; - connp->conn_multicast_ipif = NULL; - } - break; - case IP_MULTICAST_IF: connp->conn_multicast_ipif = ipif; break; @@ -10551,7 +10273,7 @@ ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly, ill_refrele(ill); return (0); } - if (!ipif_lookup_zoneid_group(ill, connp->conn_zoneid, + if (!ipif_lookup_zoneid(ill, connp->conn_zoneid, 0, NULL)) { ill_refrele(ill); ill = NULL; @@ -10596,8 +10318,6 @@ setit: case IP_BOUND_IF: connp->conn_incoming_ill = ill; connp->conn_outgoing_ill = ill; - connp->conn_orig_bound_ifindex = (ill == NULL) ? - 0 : ifindex; break; case IP_MULTICAST_IF: @@ -10650,40 +10370,6 @@ setit: case IPV6_BOUND_IF: connp->conn_incoming_ill = ill; connp->conn_outgoing_ill = ill; - connp->conn_orig_bound_ifindex = (ill == NULL) ? - 0 : ifindex; - break; - - case IPV6_BOUND_PIF: - /* - * Limit all transmit to this ill. - * Unlike IPV6_BOUND_IF, using this option - * prevents load spreading and failover from - * happening when the interface is part of the - * group. That's why we don't need to remember - * the ifindex in orig_bound_ifindex as in - * IPV6_BOUND_IF. - */ - connp->conn_outgoing_pill = ill; - break; - - case IPV6_DONTFAILOVER_IF: - /* - * This option is used by in.mpathd to ensure - * that IPMP probe packets only go out on the - * test interfaces. in.mpathd sets this option - * on the non-failover interfaces. - */ - connp->conn_nofailover_ill = ill; - /* - * For backward compatibility, this option - * implicitly sets ip_multicast_ill as used in - * IPV6_MULTICAST_IF so that ip_wput gets - * this ill to send mcast packets. - */ - connp->conn_multicast_ill = ill; - connp->conn_orig_multicast_ifindex = (ill == NULL) ? 
- 0 : ifindex; break; case IPV6_MULTICAST_IF: @@ -10700,12 +10386,9 @@ setit: if (!checkonly) { if (ifindex == 0) { connp->conn_multicast_ill = NULL; - connp->conn_orig_multicast_ifindex = 0; connp->conn_multicast_ipif = NULL; } else if (ill != NULL) { connp->conn_multicast_ill = ill; - connp->conn_orig_multicast_ifindex = - ifindex; } } break; @@ -10867,8 +10550,7 @@ ip_opt_set(queue_t *q, uint_t optset_context, int level, int name, if (secpolicy_ip_config(cr, B_FALSE) != 0) return (EPERM); /* FALLTHRU */ - case IP_MULTICAST_IF: - case IP_DONTFAILOVER_IF: { + case IP_MULTICAST_IF: { ipaddr_t addr = *i1; error = ip_opt_set_ipif(connp, addr, checkonly, name, @@ -11189,8 +10871,6 @@ ip_opt_set(queue_t *q, uint_t optset_context, int level, int name, case IPPROTO_IPV6: switch (name) { case IPV6_BOUND_IF: - case IPV6_BOUND_PIF: - case IPV6_DONTFAILOVER_IF: error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, level, name, first_mp); if (error != 0) @@ -12288,11 +11968,10 @@ ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha, * frees mp on failure. */ static boolean_t -ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha, +ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, uint32_t *cksum_val, uint16_t *cksum_flags) { uint32_t frag_offset_flags; - ill_t *ill = (ill_t *)q->q_ptr; mblk_t *mp = *mpp; mblk_t *t_mp; ipaddr_t dst; @@ -12337,12 +12016,12 @@ ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha, /* * We utilize hardware computed checksum info only for UDP since - * IP fragmentation is a normal occurence for the protocol. In + * IP fragmentation is a normal occurrence for the protocol. In * addition, checksum offload support for IP fragments carrying * UDP payload is commonly implemented across network adapters. 
*/ - ASSERT(ill != NULL); - if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) && + ASSERT(recv_ill != NULL); + if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(recv_ill) && (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { mblk_t *mp1 = mp->b_cont; int32_t len; @@ -12808,7 +12487,7 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, goto ipoptions; /* Check the IP header checksum. */ - if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { + if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) { /* Clear the IP header h/w cksum flag */ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; } else if (!mctl_present) { @@ -12871,7 +12550,7 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, * Revert to software checksum calculation if the interface * isn't capable of checksum offload or if IPsec is present. */ - if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) + if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum) hck_flags = DB_CKSUMFLAGS(mp); if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) @@ -12958,8 +12637,11 @@ fragmented: * reassembled packet has a valid hardware computed * checksum information associated with it. */ - if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags)) + if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, &sum, + &reass_hck_flags)) { goto slow_done; + } + /* * Make sure that first_mp points back to mp as * the mp we came in with could have changed in @@ -13073,7 +12755,7 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, goto ipoptions; } else if (!mctl_present) { /* Check the IP header checksum. 
*/ - if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { + if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) { /* Clear the IP header h/w cksum flag */ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; } else if (!mctl_present) { @@ -13159,7 +12841,7 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, * Revert to software checksum calculation if the interface * isn't capable of checksum offload or if IPsec is present. */ - if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) + if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum) hck_flags = DB_CKSUMFLAGS(mp); if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) @@ -13386,7 +13068,7 @@ ipoptions: u1 = ntohs(ipha->ipha_fragment_offset_and_flags); if (u1 & (IPH_MF | IPH_OFFSET)) { fragmented: - if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { + if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) { if (mctl_present) freeb(first_mp); goto slow_done; @@ -13530,7 +13212,7 @@ ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, goto ipoptions; } else { /* Check the IP header checksum. */ - if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, ill) && + if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill) && !mctl_present) { #define uph ((uint16_t *)ipha) sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + @@ -13644,7 +13326,7 @@ ipoptions: u1 = ntohs(ipha->ipha_fragment_offset_and_flags); if (u1 & (IPH_MF | IPH_OFFSET)) { fragmented: - if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) + if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) goto slow_done; /* * Make sure that first_mp points back to mp as @@ -13877,6 +13559,11 @@ ip_check_and_align_header(queue_t *q, mblk_t *mp, ip_stack_t *ipst) return (B_TRUE); } +/* + * Handle the situation where a packet came in on `ill' but matched an IRE + * whose ire_rfq doesn't match `ill'. We return the IRE that should be used + * for interface statistics. 
+ */ ire_t * ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) { @@ -13887,16 +13574,22 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) boolean_t strict_check = B_FALSE; /* - * This packet came in on an interface other than the one associated - * with the first ire we found for the destination address. We do - * another ire lookup here, using the ingress ill, to see if the - * interface is in an interface group. + * IPMP common case: if IRE and ILL are in the same group, there's no + * issue (e.g. packet received on an underlying interface matched an + * IRE_LOCAL on its associated group interface). + */ + if (ire->ire_rfq != NULL && + IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) { + return (ire); + } + + /* + * Do another ire lookup here, using the ingress ill, to see if the + * interface is in a usesrc group. * As long as the ills belong to the same group, we don't consider * them to be arriving on the wrong interface. Thus, if the switch * is doing inbound load spreading, we won't drop packets when the - * ip*_strict_dst_multihoming switch is on. Note, the same holds true - * for 'usesrc groups' where the destination address may belong to - * another interface to allow multipathing to happen. + * ip*_strict_dst_multihoming switch is on. * We also need to check for IPIF_UNNUMBERED point2point interfaces * where the local address may not be unique. 
In this case we were * at the mercy of the initial ire cache lookup and the IRE_LOCAL it @@ -13910,18 +13603,18 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) strict_check = B_TRUE; new_ire = ire_ctable_lookup(*((ipaddr_t *)addr), 0, IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst); + (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst); } else { ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr)); if (ipst->ips_ipv6_strict_dst_multihoming) strict_check = B_TRUE; new_ire = ire_ctable_lookup_v6((in6_addr_t *)addr, NULL, IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst); + (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst); } /* * If the same ire that was returned in ip_input() is found then this - * is an indication that interface groups are in use. The packet + * is an indication that usesrc groups are in use. The packet * arrived on a different ill in the group than the one associated with * the destination address. If a different ire was found then the same * IP address must be hosted on multiple ills. This is possible with @@ -14075,11 +13768,10 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) /* * Forwarding fastpath exception case: - * If either of the follwoing case is true, we take - * the slowpath + * If any of the following are true, we take the slowpath: * o forwarding is not enabled - * o incoming and outgoing interface are the same, or the same - * IPMP group + * o incoming and outgoing interface are the same, or in the same + * IPMP group. 
* o corresponding ire is in incomplete state * o packet needs fragmentation * o ARP cache is not resolved @@ -14090,8 +13782,7 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) pkt_len = ntohs(ipha->ipha_length); stq_ill = (ill_t *)ire->ire_stq->q_ptr; if (!(stq_ill->ill_flags & ILLF_ROUTER) || - (ill == stq_ill) || - (ill->ill_group != NULL && ill->ill_group == stq_ill->ill_group) || + (ill == stq_ill) || IS_IN_SAME_ILLGRP(ill, stq_ill) || (ire->ire_nce == NULL) || (pkt_len > ire->ire_max_frag) || ((fpmp = ire->ire_nce->nce_fp_mp) == NULL) || @@ -14185,11 +13876,10 @@ static void ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, ill_t *ill, boolean_t ll_multicast, boolean_t from_ip_fast_forward) { - ill_group_t *ill_group; - ill_group_t *ire_group; queue_t *dev_q; ire_t *src_ire; ip_stack_t *ipst = ill->ill_ipst; + boolean_t same_illgrp = B_FALSE; ASSERT(ire->ire_stq != NULL); @@ -14200,11 +13890,8 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, * If the caller of this function is ip_fast_forward() skip the * next three checks as it does not apply. */ - if (from_ip_fast_forward) { - ill_group = ill->ill_group; - ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group; + if (from_ip_fast_forward) goto skip; - } if (ll_multicast != 0) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); @@ -14230,13 +13917,10 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, goto drop_pkt; } - ill_group = ill->ill_group; - ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group; /* * Check if we want to forward this one at this time. * We allow source routed packets on a host provided that - * they go out the same interface or same interface group - * as they came in on. + * they go out the same ill or illgrp as they came in on. 
* * XXX To be quicker, we may wish to not chase pointers to * get the ILLF_ROUTER flag and instead store the @@ -14245,11 +13929,12 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, * whenever the ILLF_ROUTER flag changes. */ skip: + same_illgrp = IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr); + if (((ill->ill_flags & - ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & - ILLF_ROUTER) == 0) && - !(ip_source_routed(ipha, ipst) && (ire->ire_rfq == q || - (ill_group != NULL && ill_group == ire_group)))) { + ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_ROUTER) == 0) && + !(ip_source_routed(ipha, ipst) && + (ire->ire_rfq == q || same_illgrp))) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); if (ip_source_routed(ipha, ipst)) { q = WR(q); @@ -14290,12 +13975,10 @@ skip: ire_t *nhop_ire = NULL; /* - * Check whether ire_rfq and q are from the same ill - * or if they are not same, they at least belong - * to the same group. If so, send redirects. + * Check whether ire_rfq and q are from the same ill or illgrp. + * If so, send redirects. */ - if ((ire->ire_rfq == q || - (ill_group != NULL && ill_group == ire_group)) && + if ((ire->ire_rfq == q || same_illgrp) && !ip_source_routed(ipha, ipst)) { nhop = (ire->ire_gateway_addr != 0 ? @@ -14396,26 +14079,15 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha, } /* * For multicast we have set dst to be INADDR_BROADCAST - * for delivering to all STREAMS. IRE_MARK_NORECV is really - * only for broadcast packets. + * for delivering to all STREAMS. */ if (!CLASSD(ipha->ipha_dst)) { ire_t *new_ire; ipif_t *ipif; - /* - * For ill groups, as the switch duplicates broadcasts - * across all the ports, we need to filter out and - * send up only one copy. There is one copy for every - * broadcast address on each ill. Thus, we look for a - * specific IRE on this ill and look at IRE_MARK_NORECV - * later to see whether this ill is eligible to receive - * them or not. 
ill_nominate_bcast_rcv() nominates only - * one set of IREs for receiving. - */ ipif = ipif_get_next_ipif(NULL, ill); if (ipif == NULL) { - ire_refrele(ire); +discard: ire_refrele(ire); freemsg(mp); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); return (NULL); @@ -14425,13 +14097,17 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha, ipif_refrele(ipif); if (new_ire != NULL) { - if (new_ire->ire_marks & IRE_MARK_NORECV) { - ire_refrele(ire); + /* + * If the matching IRE_BROADCAST is part of an IPMP + * group, then drop the packet unless our ill has been + * nominated to receive for the group. + */ + if (IS_IPMP(new_ire->ire_ipif->ipif_ill) && + new_ire->ire_rfq != q) { ire_refrele(new_ire); - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (NULL); + goto discard; } + /* * In the special case of multirouted broadcast * packets, we unconditionally need to "gateway" @@ -14571,6 +14247,13 @@ ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, ntohs(ipha->ipha_length)); /* + * So that we don't end up with dups, only one ill in an IPMP group is + * nominated to receive multicast traffic. + */ + if (IS_UNDER_IPMP(ill) && !ill->ill_nom_cast) + goto drop_pkt; + + /* * Forward packets only if we have joined the allmulti * group on this interface. */ @@ -14619,18 +14302,15 @@ ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, } } - ILM_WALKER_HOLD(ill); if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) { /* * This might just be caused by the fact that * multiple IP Multicast addresses map to the same * link layer multicast - no need to increment counter! */ - ILM_WALKER_RELE(ill); freemsg(mp); return (B_TRUE); } - ILM_WALKER_RELE(ill); done: ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp))); /* @@ -15498,8 +15178,8 @@ local: * broadcast ire. 
*/ if ((ire->ire_rfq != q) && (ire->ire_type != IRE_BROADCAST)) { - if ((ire = ip_check_multihome(&ipha->ipha_dst, ire, - ill)) == NULL) { + ire = ip_check_multihome(&ipha->ipha_dst, ire, ill); + if (ire == NULL) { /* Drop packet */ BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); @@ -15935,19 +15615,12 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ip1dbg(("ip_rput_dlpi_writer ..")); ill = (ill_t *)q->q_ptr; - ASSERT(ipsq == ill->ill_phyint->phyint_ipsq); - + ASSERT(ipsq->ipsq_xop == ill->ill_phyint->phyint_ipsq->ipsq_xop); ASSERT(IAM_WRITER_ILL(ill)); ipst = ill->ill_ipst; - /* - * ipsq_pending_mp and ipsq_pending_ipif track each other. i.e. - * both are null or non-null. However we can assert that only - * after grabbing the ipsq_lock. So we don't make any assertion - * here and in other places in the code. - */ - ipif = ipsq->ipsq_pending_ipif; + ipif = ipsq->ipsq_xop->ipx_pending_ipif; /* * The current ioctl could have been aborted by the user and a new * ioctl to bring up another ill could have started. We could still @@ -16045,9 +15718,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) */ ASSERT(connp != NULL); q = CONNP_TO_WQ(connp); - if (ill->ill_move_in_progress) { - ILL_CLEAR_MOVE(ill); - } (void) ipif_down(ipif, NULL, NULL); /* error is set below the switch */ } @@ -16196,45 +15866,31 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * ill_dl_up(), which stopped ipif_up()'s processing. */ if (ill->ill_isv6) { - /* - * v6 interfaces. - * Unlike ARP which has to do another bind - * and attach, once we get here we are - * done with NDP. Except in the case of - * ILLF_XRESOLV, in which case we send an - * AR_INTERFACE_UP to the external resolver. - * If all goes well, the ioctl will complete - * in ip_rput(). If there's an error, we - * complete it here. 
- */ - if ((err = ipif_ndp_up(ipif)) == 0) { - if (ill->ill_flags & ILLF_XRESOLV) { - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - success = ipsq_pending_mp_add( - connp, ipif, q, mp1, 0); - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - if (success) { - err = ipif_resolver_up(ipif, - Res_act_initial); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - ASSERT(err != 0); - mp1 = ipsq_pending_mp_get(ipsq, - &connp); - ASSERT(mp1 != NULL); - } else { - /* conn has started closing */ - err = EINTR; - } - } else { /* Non XRESOLV interface */ - (void) ipif_resolver_up(ipif, + if (ill->ill_flags & ILLF_XRESOLV) { + mutex_enter(&connp->conn_lock); + mutex_enter(&ill->ill_lock); + success = ipsq_pending_mp_add(connp, ipif, q, + mp1, 0); + mutex_exit(&ill->ill_lock); + mutex_exit(&connp->conn_lock); + if (success) { + err = ipif_resolver_up(ipif, Res_act_initial); - err = ipif_up_done_v6(ipif); + if (err == EINPROGRESS) { + freemsg(mp); + return; + } + ASSERT(err != 0); + mp1 = ipsq_pending_mp_get(ipsq, &connp); + ASSERT(mp1 != NULL); + } else { + /* conn has started closing */ + err = EINTR; } + } else { /* Non XRESOLV interface */ + (void) ipif_resolver_up(ipif, Res_act_initial); + if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0) + err = ipif_up_done_v6(ipif); } } else if (ill->ill_net_type == IRE_IF_RESOLVER) { /* @@ -16275,14 +15931,31 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } } - if (ill->ill_up_ipifs) { - ill_group_cleanup(ill); + /* + * If we have a moved ipif to bring up, and everything has + * succeeded to this point, bring it up on the IPMP ill. + * Otherwise, leave it down -- the admin can try to bring it + * up by hand if need be. 
+ */ + if (ill->ill_move_ipif != NULL) { + if (err != 0) { + ill->ill_move_ipif = NULL; + } else { + ipif = ill->ill_move_ipif; + ill->ill_move_ipif = NULL; + err = ipif_up(ipif, q, mp1); + if (err == EINPROGRESS) { + freemsg(mp); + return; + } + } } - break; + case DL_NOTIFY_IND: { dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr; ire_t *ire; + uint_t orig_mtu; boolean_t need_ire_walk_v4 = B_FALSE; boolean_t need_ire_walk_v6 = B_FALSE; @@ -16322,17 +15995,27 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * which it is being derived. */ mutex_enter(&ill->ill_lock); + + orig_mtu = ill->ill_max_mtu; ill->ill_max_frag = (uint_t)notify->dl_data; + ill->ill_max_mtu = (uint_t)notify->dl_data; + + /* + * If ill_user_mtu was set (via SIOCSLIFLNKINFO), + * clamp ill_max_mtu at it. + */ + if (ill->ill_user_mtu != 0 && + ill->ill_user_mtu < ill->ill_max_mtu) + ill->ill_max_mtu = ill->ill_user_mtu; /* - * If an SIOCSLIFLNKINFO has changed the ill_max_mtu - * leave it alone + * If the MTU is unchanged, we're done. */ - if (ill->ill_mtu_userspecified) { + if (orig_mtu == ill->ill_max_mtu) { mutex_exit(&ill->ill_lock); break; } - ill->ill_max_mtu = ill->ill_max_frag; + if (ill->ill_isv6) { if (ill->ill_max_mtu < IPV6_MIN_MTU) ill->ill_max_mtu = IPV6_MIN_MTU; @@ -16371,7 +16054,14 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) if (need_ire_walk_v6) ire_walk_v6(ill_mtu_change, (char *)ill, ALL_ZONES, ipst); + + /* + * Refresh IPMP meta-interface MTU if necessary. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_illgrp_refresh_mtu(ill->ill_grp); break; + case DL_NOTE_LINK_UP: case DL_NOTE_LINK_DOWN: { /* @@ -16385,9 +16075,17 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) went_up = notify->dl_notification == DL_NOTE_LINK_UP; mutex_enter(&phyint->phyint_lock); + new_phyint_flags = went_up ? 
phyint->phyint_flags | PHYI_RUNNING : phyint->phyint_flags & ~PHYI_RUNNING; + + if (IS_IPMP(ill)) { + new_phyint_flags = went_up ? + new_phyint_flags & ~PHYI_FAILED : + new_phyint_flags | PHYI_FAILED; + } + if (new_phyint_flags != phyint->phyint_flags) { phyint->phyint_flags = new_phyint_flags; changed = B_TRUE; @@ -16474,7 +16172,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * is invoked from an ill queue, conn_oper_pending_ill is not * available, but we know the ioctl is pending on ill_wq.) */ - uint_t paddrlen, paddroff; + uint_t paddrlen, paddroff; paddrreq = ill->ill_phys_addr_pend; paddrlen = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_length; @@ -16592,29 +16290,59 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } freemsg(mp); - if (mp1 != NULL) { + if (mp1 == NULL) + return; + + /* + * The operation must complete without EINPROGRESS since + * ipsq_pending_mp_get() has removed the mblk (mp1). Otherwise, + * the operation will be stuck forever inside the IPSQ. + */ + ASSERT(err != EINPROGRESS); + + switch (ipsq->ipsq_xop->ipx_current_ioctl) { + case 0: + ipsq_current_finish(ipsq); + break; + + case SIOCSLIFNAME: + case IF_UNITSEL: { + ill_t *ill_other = ILL_OTHER(ill); + /* - * The operation must complete without EINPROGRESS - * since ipsq_pending_mp_get() has removed the mblk - * from ipsq_pending_mp. Otherwise, the operation - * will be stuck forever in the ipsq. + * If SIOCSLIFNAME or IF_UNITSEL is about to succeed, and the + * ill has a peer which is in an IPMP group, then place ill + * into the same group. One catch: although ifconfig plumbs + * the appropriate IPMP meta-interface prior to plumbing this + * ill, it is possible for multiple ifconfig applications to + * race (or for another application to adjust plumbing), in + * which case the IPMP meta-interface we need will be missing. + * If so, kick the phyint out of the group. 
*/ - ASSERT(err != EINPROGRESS); + if (err == 0 && ill_other != NULL && IS_UNDER_IPMP(ill_other)) { + ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; + ipmp_illgrp_t *illg; - switch (ipsq->ipsq_current_ioctl) { - case 0: - ipsq_current_finish(ipsq); - break; + illg = ill->ill_isv6 ? grp->gr_v6 : grp->gr_v4; + if (illg == NULL) + ipmp_phyint_leave_grp(ill->ill_phyint); + else + ipmp_ill_join_illgrp(ill, illg); + } - case SIOCLIFADDIF: - case SIOCSLIFNAME: + if (ipsq->ipsq_xop->ipx_current_ioctl == IF_UNITSEL) + ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); + else ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq); - break; + break; + } + case SIOCLIFADDIF: + ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq); + break; - default: - ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); - break; - } + default: + ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); + break; } } @@ -16626,20 +16354,16 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) void ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) { - ill_t *ill; + ill_t *ill = q->q_ptr; struct iocblk *iocp; mblk_t *mp1; conn_t *connp = NULL; ip1dbg(("ip_rput_other ")); - ill = (ill_t *)q->q_ptr; - /* - * This routine is not a writer in the case of SIOCGTUNPARAM - * in which case ipsq is NULL. - */ if (ipsq != NULL) { ASSERT(IAM_WRITER_IPSQ(ipsq)); - ASSERT(ipsq == ill->ill_phyint->phyint_ipsq); + ASSERT(ipsq->ipsq_xop == + ill->ill_phyint->phyint_ipsq->ipsq_xop); } switch (mp->b_datap->db_type) { @@ -16752,7 +16476,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) case DL_IOC_HDR_INFO: /* - * If this was the first attempt turn of the + * If this was the first attempt, turn off the * fastpath probing. 
*/ mutex_enter(&ill->ill_lock); @@ -16768,7 +16492,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } freemsg(mp); break; - case SIOCSTUNPARAM: + case SIOCSTUNPARAM: case OSIOCSTUNPARAM: ASSERT(ipsq != NULL); /* @@ -17017,14 +16741,13 @@ ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif) /* * Find an IRE which matches the destination and the outgoing * queue in the cache table. All we need is an IRE_CACHE which - * is pointing at ipif->ipif_ill. If it is part of some ill group, - * then it is enough to have some IRE_CACHE in the group. + * is pointing at ipif->ipif_ill. */ if (ipif->ipif_flags & IPIF_POINTOPOINT) dst = ipif->ipif_pp_dst_addr; ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, MBLK_GETLABEL(mp), - MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR, ipst); + MATCH_IRE_ILL | MATCH_IRE_SECATTR, ipst); if (ire == NULL) { /* * Mark this packet to make it be delivered to @@ -17321,7 +17044,8 @@ ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire) */ mp->b_datap->db_type = M_DATA; icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp, - ip6h, icmp6, ill, B_TRUE, ii->ipsec_in_zoneid); + ip6h, icmp6, ill, recv_ill, B_TRUE, + ii->ipsec_in_zoneid); } if (ill_need_rele) ill_refrele(ill); @@ -17357,37 +17081,36 @@ ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire) } switch (ipha->ipha_protocol) { - case IPPROTO_UDP: - ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire, - recv_ill); - if (ire_need_rele) - ire_refrele(ire); - break; - case IPPROTO_TCP: - if (!ire_need_rele) - IRE_REFHOLD(ire); - mp = ip_tcp_input(mp, ipha, ill, B_TRUE, - ire, ipsec_mp, 0, ill->ill_rq, NULL); - IRE_REFRELE(ire); - if (mp != NULL) { - - SQUEUE_ENTER(GET_SQUEUE(mp), mp, - mp, 1, SQ_PROCESS, - SQTAG_IP_PROTO_AGAIN); - } - break; - case IPPROTO_SCTP: - if (!ire_need_rele) - IRE_REFHOLD(ire); - ip_sctp_input(mp, ipha, ill, B_TRUE, ire, - ipsec_mp, 0, ill->ill_rq, dst); - break; - default: - 
ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire, - recv_ill, 0); - if (ire_need_rele) - ire_refrele(ire); - break; + case IPPROTO_UDP: + ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire, + recv_ill); + if (ire_need_rele) + ire_refrele(ire); + break; + case IPPROTO_TCP: + if (!ire_need_rele) + IRE_REFHOLD(ire); + mp = ip_tcp_input(mp, ipha, ill, B_TRUE, + ire, ipsec_mp, 0, ill->ill_rq, NULL); + IRE_REFRELE(ire); + if (mp != NULL) { + SQUEUE_ENTER(GET_SQUEUE(mp), mp, + mp, 1, SQ_PROCESS, + SQTAG_IP_PROTO_AGAIN); + } + break; + case IPPROTO_SCTP: + if (!ire_need_rele) + IRE_REFHOLD(ire); + ip_sctp_input(mp, ipha, ill, B_TRUE, ire, + ipsec_mp, 0, ill->ill_rq, dst); + break; + default: + ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire, + recv_ill, 0); + if (ire_need_rele) + ire_refrele(ire); + break; } } else { uint32_t rput_flags = 0; @@ -17621,9 +17344,9 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, */ ASSERT(!mctl_present); ASSERT(first_mp == mp); - if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { + if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) return; - } + /* * Make sure that first_mp points back to mp as * the mp we came in with could have changed in @@ -17647,17 +17370,10 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, ilm_t *ilm; mblk_t *mp1; zoneid_t last_zoneid; + ilm_walker_t ilw; if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(recv_ill)) { ASSERT(ire->ire_type == IRE_BROADCAST); - /* - * Inactive/Failed interfaces are not supposed to - * respond to the multicast packets. - */ - if (ill_is_probeonly(ill)) { - freemsg(first_mp); - return; - } /* * In the multicast case, applications may have joined @@ -17680,11 +17396,9 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, * have been exhausted. 
*/ last_zoneid = -1; - ILM_WALKER_HOLD(recv_ill); - for (ilm = recv_ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if ((ilm->ilm_flags & ILM_DELETED) || - ipha->ipha_dst != ilm->ilm_addr || + ilm = ilm_walker_start(&ilw, recv_ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (ipha->ipha_dst != ilm->ilm_addr || ilm->ilm_zoneid == last_zoneid || ilm->ilm_zoneid == ire->ire_zoneid || ilm->ilm_zoneid == ALL_ZONES || @@ -17693,12 +17407,12 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, mp1 = ip_copymsg(first_mp); if (mp1 == NULL) continue; - icmp_inbound(q, mp1, B_TRUE, ill, + icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill, 0, sum, mctl_present, B_TRUE, recv_ill, ilm->ilm_zoneid); last_zoneid = ilm->ilm_zoneid; } - ILM_WALKER_RELE(recv_ill); + ilm_walker_finish(&ilw); } else if (ire->ire_type == IRE_BROADCAST) { /* * In the broadcast case, there may be many zones @@ -18580,14 +18294,13 @@ ip_snmp_get(queue_t *q, mblk_t *mpctl, int level) return (1); } - if ((mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, ipst)) == NULL) { + mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, level, ipst); + if (mpctl == NULL) return (1); - } - mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, ipst); - if (mpctl == NULL) { + mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, level, ipst); + if (mpctl == NULL) return (1); - } if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) { return (1); @@ -19048,6 +18761,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mblk_t *mp_tail = NULL; ill_walk_context_t ctx; zoneid_t zoneid; + ilm_walker_t ilw; /* * make a copy of the original message @@ -19064,7 +18778,10 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - ILM_WALKER_HOLD(ill); + if (IS_UNDER_IPMP(ill)) + continue; + + ilm = ilm_walker_start(&ilw, 
ill); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (ipif->ipif_zoneid != zoneid && @@ -19074,7 +18791,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) OCTET_LENGTH); ipm.ipGroupMemberIfIndex.o_length = mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes); - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(ilm->ilm_ipif != NULL); ASSERT(ilm->ilm_ill == NULL); if (ilm->ilm_ipif != ipif) @@ -19090,7 +18807,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) } } } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } rw_exit(&ipst->ips_ill_g_lock); optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); @@ -19112,6 +18829,7 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mblk_t *mp_tail = NULL; ill_walk_context_t ctx; zoneid_t zoneid; + ilm_walker_t ilw; /* * make a copy of the original message @@ -19127,9 +18845,12 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - ILM_WALKER_HOLD(ill); + if (IS_UNDER_IPMP(ill)) + continue; + + ilm = ilm_walker_start(&ilw, ill); ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex; - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(ilm->ilm_ipif == NULL); ASSERT(ilm->ilm_ill != NULL); if (ilm->ilm_zoneid != zoneid) @@ -19145,7 +18866,7 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) (uint_t)sizeof (ipm6))); } } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } rw_exit(&ipst->ips_ill_g_lock); @@ -19171,6 +18892,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) zoneid_t zoneid; int i; slist_t *sl; + ilm_walker_t ilw; /* * make a copy of the original message @@ -19187,7 
+18909,10 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - ILM_WALKER_HOLD(ill); + if (IS_UNDER_IPMP(ill)) + continue; + + ilm = ilm_walker_start(&ilw, ill); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (ipif->ipif_zoneid != zoneid) @@ -19196,7 +18921,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) OCTET_LENGTH); ips.ipGroupSourceIfIndex.o_length = mi_strlen(ips.ipGroupSourceIfIndex.o_bytes); - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(ilm->ilm_ipif != NULL); ASSERT(ilm->ilm_ill == NULL); sl = ilm->ilm_filter; @@ -19220,7 +18945,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) } } } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } rw_exit(&ipst->ips_ill_g_lock); optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); @@ -19244,6 +18969,7 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) zoneid_t zoneid; int i; slist_t *sl; + ilm_walker_t ilw; /* * make a copy of the original message @@ -19259,9 +18985,12 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - ILM_WALKER_HOLD(ill); + if (IS_UNDER_IPMP(ill)) + continue; + + ilm = ilm_walker_start(&ilw, ill); ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex; - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(ilm->ilm_ipif == NULL); ASSERT(ilm->ilm_ill != NULL); sl = ilm->ilm_filter; @@ -19279,7 +19008,7 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) } } } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } 
rw_exit(&ipst->ips_ill_g_lock); @@ -19345,7 +19074,8 @@ ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) * in one IRE walk. */ static mblk_t * -ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) +ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level, + ip_stack_t *ipst) { struct opthdr *optp; mblk_t *mp2ctl; /* Returned */ @@ -19377,6 +19107,14 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) ird.ird_route.lp_head = mpctl->b_cont; ird.ird_netmedia.lp_head = mp3ctl->b_cont; ird.ird_attrs.lp_head = mp4ctl->b_cont; + /* + * If the level has been set to the special EXPER_IP_AND_TESTHIDDEN + * value, then also include IRE_MARK_TESTHIDDEN IREs. This is + * intended as a temporary solution until a proper MIB API is provided + * that provides complete filtering/caller-opt-in. + */ + if (level == EXPER_IP_AND_TESTHIDDEN) + ird.ird_flags |= IRD_REPORT_TESTHIDDEN; zoneid = Q_TO_CONN(q)->conn_zoneid; ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst); @@ -19419,7 +19157,8 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) * ipv6NetToMediaEntryTable in an NDP walk. */ static mblk_t * -ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) +ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level, + ip_stack_t *ipst) { struct opthdr *optp; mblk_t *mp2ctl; /* Returned */ @@ -19451,6 +19190,14 @@ ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) ird.ird_route.lp_head = mpctl->b_cont; ird.ird_netmedia.lp_head = mp3ctl->b_cont; ird.ird_attrs.lp_head = mp4ctl->b_cont; + /* + * If the level has been set to the special EXPER_IP_AND_TESTHIDDEN + * value, then also include IRE_MARK_TESTHIDDEN IREs. This is + * intended as a temporary solution until a proper MIB API is provided + * that provides complete filtering/caller-opt-in. 
+ */ + if (level == EXPER_IP_AND_TESTHIDDEN) + ird.ird_flags |= IRD_REPORT_TESTHIDDEN; zoneid = Q_TO_CONN(q)->conn_zoneid; ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst); @@ -19671,6 +19418,11 @@ ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) ASSERT(ire->ire_ipversion == IPV4_VERSION); + if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) && + ire->ire_marks & IRE_MARK_TESTHIDDEN) { + return; + } + if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) return; @@ -19812,6 +19564,11 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) ASSERT(ire->ire_ipversion == IPV6_VERSION); + if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) && + ire->ire_marks & IRE_MARK_TESTHIDDEN) { + return; + } + if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) return; @@ -20518,8 +20275,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, boolean_t mctl_present; ipsec_out_t *io; int match_flags; - ill_t *attach_ill = NULL; - /* Bind to IPIF_NOFAILOVER ill etc. */ ill_t *xmit_ill = NULL; /* IP_PKTINFO etc. */ ipif_t *dst_ipif; boolean_t multirt_need_resolve = B_FALSE; @@ -20639,16 +20394,11 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, } /* - * IP_DONTFAILOVER_IF and IP_BOUND_IF have precedence over ill index - * passed in IP_PKTINFO. + * IP_BOUND_IF has precedence over the ill index passed in IP_PKTINFO. */ - if (infop->ip_opt_ill_index != 0 && - connp->conn_outgoing_ill == NULL && - connp->conn_nofailover_ill == NULL) { - - xmit_ill = ill_lookup_on_ifindex( - infop->ip_opt_ill_index, B_FALSE, NULL, NULL, NULL, NULL, - ipst); + if (infop->ip_opt_ill_index != 0 && connp->conn_outgoing_ill == NULL) { + xmit_ill = ill_lookup_on_ifindex(infop->ip_opt_ill_index, + B_FALSE, NULL, NULL, NULL, NULL, ipst); if (xmit_ill == NULL || IS_VNI(xmit_ill)) goto drop_pkt; @@ -20659,7 +20409,7 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, * accessible from all zones i.e has a valid ipif in * all zones. 
*/ - if (!ipif_lookup_zoneid_group(xmit_ill, zoneid, 0, NULL)) { + if (!ipif_lookup_zoneid(xmit_ill, zoneid, 0, NULL)) { goto drop_pkt; } } @@ -20696,18 +20446,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, goto version_hdrlen_check; dst = ipha->ipha_dst; - if (connp->conn_nofailover_ill != NULL) { - attach_ill = conn_get_held_ill(connp, - &connp->conn_nofailover_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - } - /* If IP_BOUND_IF has been set, use that ill. */ if (connp->conn_outgoing_ill != NULL) { xmit_ill = conn_get_held_ill(connp, @@ -20761,9 +20499,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, ire = NULL; } - if (attach_ill != NULL) - goto send_from_ill; - /* * We cache IRE_CACHEs to avoid lookups. We don't do * this for the tcp global queue and listen end point @@ -21074,45 +20809,21 @@ notdata: } ASSERT(first_mp != NULL); - /* - * ICMP echo replies attach an ipsec_out and set ipsec_out_attach_if - * to make sure that this packet goes out on the same interface it - * came in. We handle that here. - */ - if (mctl_present) { - uint_t ifindex; + if (mctl_present) { io = (ipsec_out_t *)first_mp->b_rptr; - if (io->ipsec_out_attach_if || io->ipsec_out_ip_nexthop) { + if (io->ipsec_out_ip_nexthop) { /* * We may have lost the conn context if we are * coming here from ip_newroute(). Copy the * nexthop information. 
*/ - if (io->ipsec_out_ip_nexthop) { - ip_nexthop = B_TRUE; - nexthop_addr = io->ipsec_out_nexthop_addr; + ip_nexthop = B_TRUE; + nexthop_addr = io->ipsec_out_nexthop_addr; - ipha = (ipha_t *)mp->b_rptr; - dst = ipha->ipha_dst; - goto send_from_ill; - } else { - ASSERT(io->ipsec_out_ill_index != 0); - ifindex = io->ipsec_out_ill_index; - attach_ill = ill_lookup_on_ifindex(ifindex, - B_FALSE, NULL, NULL, NULL, NULL, ipst); - if (attach_ill == NULL) { - ASSERT(xmit_ill == NULL); - ip1dbg(("ip_output: bad ifindex for " - "(BIND TO IPIF_NOFAILOVER) %d\n", - ifindex)); - freemsg(first_mp); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - ASSERT(!need_decref); - return; - } - } + ipha = (ipha_t *)mp->b_rptr; + dst = ipha->ipha_dst; + goto send_from_ill; } } @@ -21161,7 +20872,7 @@ hdrtoosmall: ipha = (ipha_t *)mp->b_rptr; if (first_mp == NULL) { - ASSERT(attach_ill == NULL && xmit_ill == NULL); + ASSERT(xmit_ill == NULL); /* * If we got here because of "goto hdrtoosmall" * We need to attach a IPSEC_OUT. @@ -21213,8 +20924,6 @@ version_hdrlen_check: */ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutWrongIPVersion); ASSERT(xmit_ill == NULL); - if (attach_ill != NULL) - ill_refrele(attach_ill); if (need_decref) mp->b_flag |= MSGHASREF; (void) ip_output_v6(arg, first_mp, arg2, caller); @@ -21255,8 +20964,6 @@ version_hdrlen_check: zoneid, ipst)) { ASSERT(xmit_ill == NULL); BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - if (attach_ill != NULL) - ill_refrele(attach_ill); TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, "ip_wput_end: q %p (%S)", q, "badopts"); if (need_decref) @@ -21295,22 +21002,6 @@ multicast: */ ill_t *ill = (ill_t *)q->q_ptr; - /* - * Don't honor attach_if for this case. If ill - * is part of the group, ipif could belong to - * any ill and we cannot maintain attach_ill - * and ipif_ill same anymore and the assert - * below would fail. 
- */ - if (mctl_present && io->ipsec_out_attach_if) { - io->ipsec_out_ill_index = 0; - io->ipsec_out_attach_if = B_FALSE; - ASSERT(attach_ill != NULL); - ill_refrele(attach_ill); - attach_ill = NULL; - } - - ASSERT(attach_ill == NULL); ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID); if (ipif == NULL) { if (need_decref) @@ -21429,25 +21120,11 @@ multicast: first_mp->b_cont = mp; mctl_present = B_TRUE; } - if (attach_ill != NULL) { - ASSERT(attach_ill == ipif->ipif_ill); - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(attach_ill)) { - match_flags |= MATCH_IRE_MARK_HIDDEN; - } - io->ipsec_out_ill_index = - attach_ill->ill_phyint->phyint_ifindex; - io->ipsec_out_attach_if = B_TRUE; - } else { - match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; - io->ipsec_out_ill_index = - ipif->ipif_ill->ill_phyint->phyint_ifindex; - } + match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; + io->ipsec_out_ill_index = + ipif->ipif_ill->ill_phyint->phyint_ifindex; + if (connp != NULL) { io->ipsec_out_multicast_loop = connp->conn_multicast_loop; @@ -21469,9 +21146,7 @@ multicast: * * NOTE : We need to do it for non-secure case also as * this might go out secure if there is a global policy - * match in ip_wput_ire. For bind to IPIF_NOFAILOVER - * address, the source should be initialized already and - * hence we won't be initializing here. + * match in ip_wput_ire. * * As we do not have the ire yet, it is possible that * we set the source address here and then later discover @@ -21507,14 +21182,6 @@ multicast: zoneid, MBLK_GETLABEL(mp), match_flags, ipst); } - /* - * refrele attach_ill as its not needed anymore. 
- */ - if (attach_ill != NULL) { - ill_refrele(attach_ill); - attach_ill = NULL; - } - if (ire == NULL) { /* * Multicast loopback and multicast forwarding is @@ -21630,33 +21297,9 @@ noroute: ipif_refrele(dst_ipif); } } - /* - * If we are bound to IPIF_NOFAILOVER address, look for - * an IRE_CACHE matching the ill. - */ -send_from_ill: - if (attach_ill != NULL) { - ipif_t *attach_ipif; - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(attach_ill)) { - match_flags |= MATCH_IRE_MARK_HIDDEN; - } - - attach_ipif = ipif_get_next_ipif(NULL, attach_ill); - if (attach_ipif == NULL) { - ip1dbg(("ip_wput: No ipif for attach_ill\n")); - goto discard_pkt; - } - ire = ire_ctable_lookup(dst, 0, 0, attach_ipif, - zoneid, MBLK_GETLABEL(mp), match_flags, ipst); - ipif_refrele(attach_ipif); - } else if (xmit_ill != NULL) { +send_from_ill: + if (xmit_ill != NULL) { ipif_t *ipif; /* @@ -21681,6 +21324,10 @@ send_from_ill: goto drop_pkt; } + match_flags = 0; + if (IS_UNDER_IPMP(xmit_ill)) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + /* * Look for a ire that is part of the group, * if found use it else call ip_newroute_ipif. @@ -21689,7 +21336,7 @@ send_from_ill: * ill is accessible from all zones i.e has a * valid ipif in all zones. */ - match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; + match_flags |= MATCH_IRE_ILL | MATCH_IRE_SECATTR; ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, MBLK_GETLABEL(mp), match_flags, ipst); /* @@ -21729,12 +21376,7 @@ send_from_ill: ipst); } if (!ire) { - /* - * Make sure we don't load spread if this - * is IPIF_NOFAILOVER case. 
- */ - if ((attach_ill != NULL) || - (ip_nexthop && !ignore_nexthop)) { + if (ip_nexthop && !ignore_nexthop) { if (mctl_present) { io = (ipsec_out_t *)first_mp->b_rptr; ASSERT(first_mp->b_datap->db_type == @@ -21764,15 +21406,8 @@ send_from_ill: first_mp->b_cont = mp; mctl_present = B_TRUE; } - if (attach_ill != NULL) { - io->ipsec_out_ill_index = attach_ill-> - ill_phyint->phyint_ifindex; - io->ipsec_out_attach_if = B_TRUE; - } else { - io->ipsec_out_ip_nexthop = ip_nexthop; - io->ipsec_out_nexthop_addr = - nexthop_addr; - } + io->ipsec_out_ip_nexthop = ip_nexthop; + io->ipsec_out_nexthop_addr = nexthop_addr; } noirefound: /* @@ -21787,8 +21422,6 @@ noirefound: ip_newroute(q, first_mp, dst, connp, zoneid, ipst); TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, "ip_wput_end: q %p (%S)", q, "newroute"); - if (attach_ill != NULL) - ill_refrele(attach_ill); if (xmit_ill != NULL) ill_refrele(xmit_ill); if (need_decref) @@ -21869,8 +21502,6 @@ noirefound: ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); } } - if (attach_ill != NULL) - ill_refrele(attach_ill); if (xmit_ill != NULL) ill_refrele(xmit_ill); if (need_decref) @@ -21896,8 +21527,6 @@ drop_pkt: if (need_decref) CONN_DEC_REF(connp); freemsg(first_mp); - if (attach_ill != NULL) - ill_refrele(attach_ill); if (xmit_ill != NULL) ill_refrele(xmit_ill); TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, @@ -21923,8 +21552,8 @@ ip_wput(queue_t *q, mblk_t *mp) /* * * The following rules must be observed when accessing any ipif or ill - * that has been cached in the conn. Typically conn_nofailover_ill, - * conn_outgoing_ill, conn_multicast_ipif and conn_multicast_ill. + * that has been cached in the conn. Typically conn_outgoing_ill, + * conn_multicast_ipif and conn_multicast_ill. 
* * Access: The ipif or ill pointed to from the conn can be accessed under * the protection of the conn_lock or after it has been refheld under the @@ -21944,10 +21573,8 @@ ip_wput(queue_t *q, mblk_t *mp) * The list of ipifs hanging off the ill is protected by ill_g_lock and ill_lock * On the other hand to access ipif->ipif_ill, we need one of either ill_g_lock * or a reference to the ipif or a reference to an ire that references the - * ipif. An ipif does not change its ill except for failover/failback. Since - * failover/failback happens only after bringing down the ipif and making sure - * the ipif refcnt has gone to zero and holding the ill_g_lock and ill_lock - * the above holds. + * ipif. An ipif only changes its ill when migrating from an underlying ill + * to an IPMP ill in ipif_up(). */ ipif_t * conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err) @@ -22302,96 +21929,6 @@ ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, zoneid)); } -ire_t * -conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill) -{ - ipaddr_t addr; - ire_t *save_ire; - irb_t *irb; - ill_group_t *illgrp; - int err; - - save_ire = ire; - addr = ire->ire_addr; - - ASSERT(ire->ire_type == IRE_BROADCAST); - - illgrp = connp->conn_outgoing_ill->ill_group; - if (illgrp == NULL) { - *conn_outgoing_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - ire_refrele(save_ire); - return (NULL); - } - return (save_ire); - } - /* - * If IP_BOUND_IF has been done, conn_outgoing_ill will be set. - * If it is part of the group, we need to send on the ire - * that has been cleared of IRE_MARK_NORECV and that belongs - * to this group. This is okay as IP_BOUND_IF really means - * any ill in the group. We depend on the fact that the - * first ire in the group is always cleared of IRE_MARK_NORECV - * if such an ire exists. 
This is possible only if you have - * at least one ill in the group that has not failed. - * - * First get to the ire that matches the address and group. - * - * We don't look for an ire with a matching zoneid because a given zone - * won't always have broadcast ires on all ills in the group. - */ - irb = ire->ire_bucket; - rw_enter(&irb->irb_lock, RW_READER); - if (ire->ire_marks & IRE_MARK_NORECV) { - /* - * If the current zone only has an ire broadcast for this - * address marked NORECV, the ire we want is ahead in the - * bucket, so we look it up deliberately ignoring the zoneid. - */ - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_addr != addr) - continue; - /* skip over deleted ires */ - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - } - } - while (ire != NULL) { - /* - * If a new interface is coming up, we could end up - * seeing the loopback ire and the non-loopback ire - * may not have been added yet. So check for ire_stq - */ - if (ire->ire_stq != NULL && (ire->ire_addr != addr || - ire->ire_ipif->ipif_ill->ill_group == illgrp)) { - break; - } - ire = ire->ire_next; - } - if (ire != NULL && ire->ire_addr == addr && - ire->ire_ipif->ipif_ill->ill_group == illgrp) { - IRE_REFHOLD(ire); - rw_exit(&irb->irb_lock); - ire_refrele(save_ire); - *conn_outgoing_ill = ire_to_ill(ire); - /* - * Refhold the ill to make the conn_outgoing_ill - * independent of the ire. ip_wput_ire goes in a loop - * and may refrele the ire. Since we have an ire at this - * point we don't need to use ILL_CAN_LOOKUP on the ill. - */ - ill_refhold(*conn_outgoing_ill); - return (ire); - } - rw_exit(&irb->irb_lock); - ip1dbg(("conn_set_outgoing_ill: No matching ire\n")); - /* - * If we can't find a suitable ire, return the original ire. - */ - return (save_ire); -} - /* * This function does the ire_refrele of the ire passed in as the * argument. 
As this function looks up more ires i.e broadcast ires, @@ -22401,7 +21938,6 @@ conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill) * IPQoS Notes: * IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for * IPsec packets are done in ipsec_out_process. - * */ void ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, @@ -22471,9 +22007,8 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, if ((first_ire->ire_flags & RTF_MULTIRT) && (first_ire->ire_addr == ire->ire_addr) && !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) break; - } } if ((first_ire != NULL) && (first_ire != ire)) { @@ -22489,36 +22024,15 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, * conn_outgoing_ill variable is used only in the broadcast loop. * for performance we don't grab the mutexs in the fastpath */ - if ((connp != NULL) && - (ire->ire_type == IRE_BROADCAST) && - ((connp->conn_nofailover_ill != NULL) || - (connp->conn_outgoing_ill != NULL))) { - /* - * Bind to IPIF_NOFAILOVER address overrides IP_BOUND_IF - * option. So, see if this endpoint is bound to a - * IPIF_NOFAILOVER address. If so, honor it. This implies - * that if the interface is failed, we will still send - * the packet on the same ill which is what we want. - */ + if (ire->ire_type == IRE_BROADCAST && connp != NULL && + connp->conn_outgoing_ill != NULL) { conn_outgoing_ill = conn_get_held_ill(connp, - &connp->conn_nofailover_ill, &err); + &connp->conn_outgoing_ill, &err); if (err == ILL_LOOKUP_FAILED) { ire_refrele(ire); freemsg(mp); return; } - if (conn_outgoing_ill == NULL) { - /* - * Choose a good ill in the group to send the - * packets on. 
- */ - ire = conn_set_outgoing_ill(connp, ire, - &conn_outgoing_ill); - if (ire == NULL) { - freemsg(mp); - return; - } - } } if (mp->b_datap->db_type != M_CTL) { @@ -22578,7 +22092,7 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, if (src_ire != NULL && !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) && (!ipst->ips_ip_restrict_interzone_loopback || - ire_local_same_ill_group(ire, src_ire))) { + ire_local_same_lan(ire, src_ire))) { if (ipha->ipha_src == INADDR_ANY && !unspec_src) ipha->ipha_src = src_ire->ire_src_addr; ire_refrele(src_ire); @@ -22741,39 +22255,7 @@ another:; */ ASSERT(ire->ire_ipversion == IPV4_VERSION); - /* - * With IP multipathing, broadcast packets are sent on the ire - * that has been cleared of IRE_MARK_NORECV and that belongs to - * the group. However, this ire might not be in the same zone so - * we can't always use its source address. We look for a - * broadcast ire in the same group and in the right zone. - */ - if (ire->ire_type == IRE_BROADCAST && - ire->ire_zoneid != zoneid) { - ire_t *src_ire = ire_ctable_lookup(dst, 0, - IRE_BROADCAST, ire->ire_ipif, zoneid, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst); - if (src_ire != NULL) { - src = src_ire->ire_src_addr; - ire_refrele(src_ire); - } else { - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - freemsg(first_mp); - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - } - return; - } - } else { - src = ire->ire_src_addr; - } - + src = ire->ire_src_addr; if (connp == NULL) { ip1dbg(("ip_wput_ire: no connp and no src " "address for dst 0x%x, using src 0x%x\n", @@ -22917,10 +22399,9 @@ another:; ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t)); io = (ipsec_out_t *)first_mp->b_rptr; - io->ipsec_out_ill_index = ((ill_t *)stq->q_ptr)-> - ill_phyint->phyint_ifindex; - - ipsec_out_process(q, first_mp, ire, ill_index); + 
io->ipsec_out_ill_index = + ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex; + ipsec_out_process(q, first_mp, ire, 0); ire_refrele(ire); if (conn_outgoing_ill != NULL) ill_refrele(conn_outgoing_ill); @@ -22960,7 +22441,7 @@ another:; if (ire1->ire_addr != ire->ire_addr) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) continue; /* Got one */ @@ -23147,71 +22628,16 @@ broadcast: * back outbound packets in different zones but on the * same ill, as the application would see duplicates. * - * If the interfaces are part of the same group, - * we would want to send only one copy out for - * whole group. - * * This logic assumes that ire_add_v4() groups the * IRE_BROADCAST entries so that those with the same - * ire_addr and ill_group are kept together. + * ire_addr are kept together. */ ire_ill = ire->ire_ipif->ipif_ill; - if (ire->ire_stq == NULL && ire1->ire_stq != NULL) { - if (ire_ill->ill_group != NULL && - (ire->ire_marks & IRE_MARK_NORECV)) { - /* - * If the current zone only has an ire - * broadcast for this address marked - * NORECV, the ire we want is ahead in - * the bucket, so we look it up - * deliberately ignoring the zoneid. 
- */ - for (ire1 = ire->ire_bucket->irb_ire; - ire1 != NULL; - ire1 = ire1->ire_next) { - ire1_ill = - ire1->ire_ipif->ipif_ill; - if (ire1->ire_addr != dst) - continue; - /* skip over the current ire */ - if (ire1 == ire) - continue; - /* skip over deleted ires */ - if (ire1->ire_marks & - IRE_MARK_CONDEMNED) - continue; - /* - * non-loopback ire in our - * group: use it for the next - * pass in the loop - */ - if (ire1->ire_stq != NULL && - ire1_ill->ill_group == - ire_ill->ill_group) - break; - } - } - } else { + if (ire->ire_stq != NULL || ire1->ire_stq == NULL) { while (ire1 != NULL && ire1->ire_addr == dst) { ire1_ill = ire1->ire_ipif->ipif_ill; - /* - * We can have two broadcast ires on the - * same ill in different zones; here - * we'll send a copy of the packet on - * each ill and the fanout code will - * call conn_wantpacket() to check that - * the zone has the broadcast address - * configured on the ill. If the two - * ires are in the same group we only - * send one copy up. - */ - if (ire1_ill != ire_ill && - (ire1_ill->ill_group == NULL || - ire_ill->ill_group == NULL || - ire1_ill->ill_group != - ire_ill->ill_group)) { + if (ire1_ill != ire_ill) break; - } ire1 = ire1->ire_next; } } @@ -23403,13 +22829,8 @@ multi_loopback: * logic. */ if (ill != NULL) { - ilm_t *ilm; - - ILM_WALKER_HOLD(ill); - ilm = ilm_lookup_ill(ill, ipha->ipha_dst, - ALL_ZONES); - ILM_WALKER_RELE(ill); - if (ilm != NULL) { + if (ilm_lookup_ill(ill, ipha->ipha_dst, + ALL_ZONES) != NULL) { /* * Pass along the virtual output q. 
* ip_wput_local() will distribute the @@ -23565,18 +22986,17 @@ checksumoptions: ire1 != NULL; ire1 = ire1->ire_next) { if (!(ire1->ire_flags & - RTF_MULTIRT)) { + RTF_MULTIRT)) continue; - } + if (ire1->ire_addr != - ire->ire_addr) { + ire->ire_addr) continue; - } + if (ire1->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) { + (IRE_MARK_CONDEMNED | + IRE_MARK_TESTHIDDEN)) continue; - } /* Got one */ IRE_REFHOLD(ire1); @@ -24743,9 +24163,8 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, if ((first_ire->ire_flags & RTF_MULTIRT) && (first_ire->ire_addr == ire->ire_addr) && !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) break; - } } if (first_ire != NULL) { @@ -24808,7 +24227,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, if (ire1->ire_addr != ire->ire_addr) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) continue; /* * Ensure we do not exceed the MTU @@ -25130,10 +24549,9 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, if (ire1->ire_addr != ire->ire_addr) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) { + (IRE_MARK_CONDEMNED | + IRE_MARK_TESTHIDDEN)) continue; - } /* * Ensure we do not exceed the MTU * of the next route. @@ -25500,6 +24918,7 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, ilm_t *ilm; mblk_t *mp1; zoneid_t last_zoneid; + ilm_walker_t ilw; if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(ill)) { ASSERT(ire_type == IRE_BROADCAST); @@ -25524,11 +24943,9 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, * have been exhausted. 
*/ last_zoneid = -1; - ILM_WALKER_HOLD(ill); - for (ilm = ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if ((ilm->ilm_flags & ILM_DELETED) || - ipha->ipha_dst != ilm->ilm_addr || + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (ipha->ipha_dst != ilm->ilm_addr || ilm->ilm_zoneid == last_zoneid || ilm->ilm_zoneid == zoneid || !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) @@ -25536,12 +24953,12 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, mp1 = ip_copymsg(first_mp); if (mp1 == NULL) continue; - icmp_inbound(q, mp1, B_TRUE, ill, 0, 0, - mctl_present, B_FALSE, ill, + icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill, + 0, 0, mctl_present, B_FALSE, ill, ilm->ilm_zoneid); last_zoneid = ilm->ilm_zoneid; } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); /* * Loopback case: the sending endpoint has * IP_MULTICAST_LOOP disabled, therefore we don't @@ -25859,14 +25276,9 @@ ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif, zoneid_t zoneid) * caller and hence matching on ILL (MATCH_IRE_ILL) would * be sufficient rather than MATCH_IRE_IPIF. * - * This function is used for sending IGMP packets. We need - * to make sure that we send the packet out of the interface - * (ipif->ipif_ill) where we joined the group. This is to - * prevent from switches doing IGMP snooping to send us multicast - * packets for a given group on the interface we have joined. - * If we can't find an ire, igmp_sendpkt has already initialized - * ipsec_out_attach_if so that this will not be load spread in - * ip_newroute_ipif. + * This function is used for sending IGMP packets. For IPMP, + * we sidestep IGMP snooping issues by sending all multicast + * traffic on a single interface in the IPMP group. 
*/ ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, NULL, MATCH_IRE_ILL, ipst); @@ -26035,7 +25447,7 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, ip6_t *ip6h1; uint_t ill_index; ipsec_out_t *io; - boolean_t attach_if, hwaccel; + boolean_t hwaccel; uint32_t flags = IP6_NO_IPPOLICY; int match_flags; zoneid_t zoneid; @@ -26052,42 +25464,22 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, if (io->ipsec_out_reachable) { flags |= IPV6_REACHABILITY_CONFIRMATION; } - attach_if = io->ipsec_out_attach_if; hwaccel = io->ipsec_out_accelerated; zoneid = io->ipsec_out_zoneid; ASSERT(zoneid != ALL_ZONES); - match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; + match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; /* Multicast addresses should have non-zero ill_index. */ v6dstp = &ip6h->ip6_dst; ASSERT(ip6h->ip6_nxt != IPPROTO_RAW); ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0); - ASSERT(!attach_if || ill_index != 0); - if (ill_index != 0) { - if (ill == NULL) { - ill = ip_grab_attach_ill(NULL, ipsec_mp, ill_index, - B_TRUE, ipst); - /* Failure case frees things for us. */ - if (ill == NULL) - return; - - ill_need_rele = B_TRUE; - } - /* - * If this packet needs to go out on a particular interface - * honor it. - */ - if (attach_if) { - match_flags = MATCH_IRE_ILL; + if (ill == NULL && ill_index != 0) { + ill = ip_grab_ill(ipsec_mp, ill_index, B_TRUE, ipst); + /* Failure case frees things for us. */ + if (ill == NULL) + return; - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. 
- */ - if (ill_is_probeonly(ill)) { - match_flags |= MATCH_IRE_MARK_HIDDEN; - } - } + ill_need_rele = B_TRUE; } ASSERT(mp != NULL); @@ -26138,32 +25530,15 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, return; } - ip_newroute_ipif_v6(q, ipsec_mp, ipif, *v6dstp, + ip_newroute_ipif_v6(q, ipsec_mp, ipif, v6dstp, &ip6h->ip6_src, unspec_src, zoneid); ipif_refrele(ipif); } else { - if (attach_if) { - ipif_t *ipif; - - ipif = ipif_get_next_ipif(NULL, ill); - if (ipif == NULL) { - if (ill_need_rele) - ill_refrele(ill); - freemsg(ipsec_mp); - return; - } - ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, - zoneid, MBLK_GETLABEL(mp), match_flags, ipst); - ire_need_rele = B_TRUE; - ipif_refrele(ipif); + if (ire_arg != NULL) { + ire = ire_arg; } else { - if (ire_arg != NULL) { - ire = ire_arg; - } else { - ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, - ipst); - ire_need_rele = B_TRUE; - } + ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, ipst); + ire_need_rele = B_TRUE; } if (ire != NULL) goto send; @@ -26350,7 +25725,6 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, ipha_t *ipha1; uint_t ill_index; ipsec_out_t *io; - boolean_t attach_if; int match_flags; irb_t *irb = NULL; boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE; @@ -26372,39 +25746,19 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, io = (ipsec_out_t *)ipsec_mp->b_rptr; ill_index = io->ipsec_out_ill_index; - attach_if = io->ipsec_out_attach_if; zoneid = io->ipsec_out_zoneid; ASSERT(zoneid != ALL_ZONES); ipst = io->ipsec_out_ns->netstack_ip; ASSERT(io->ipsec_out_ns != NULL); - match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; - if (ill_index != 0) { - if (ill == NULL) { - ill = ip_grab_attach_ill(NULL, ipsec_mp, - ill_index, B_FALSE, ipst); - - /* Failure case frees things for us. 
*/ - if (ill == NULL) - return; - - ill_need_rele = B_TRUE; - } - /* - * If this packet needs to go out on a particular interface - * honor it. - */ - if (attach_if) { - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; + match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; + if (ill == NULL && ill_index != 0) { + ill = ip_grab_ill(ipsec_mp, ill_index, B_FALSE, ipst); + /* Failure case frees things for us. */ + if (ill == NULL) + return; - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) { - match_flags |= MATCH_IRE_MARK_HIDDEN; - } - } + ill_need_rele = B_TRUE; } if (CLASSD(dst)) { @@ -26474,17 +25828,12 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT, zoneid, &zero_info); } else { - if (attach_if) { - ire = ire_ctable_lookup(dst, 0, 0, ill->ill_ipif, - zoneid, MBLK_GETLABEL(mp), match_flags, ipst); + if (ire_arg != NULL) { + ire = ire_arg; + ire_need_rele = B_FALSE; } else { - if (ire_arg != NULL) { - ire = ire_arg; - ire_need_rele = B_FALSE; - } else { - ire = ire_cache_lookup(dst, zoneid, - MBLK_GETLABEL(mp), ipst); - } + ire = ire_cache_lookup(dst, zoneid, + MBLK_GETLABEL(mp), ipst); } if (ire != NULL) { goto send; @@ -26613,11 +25962,9 @@ send: (void *)ire->ire_ipif, (void *)ipif)); /* - * Multiroute the secured packet, unless IPsec really - * requires the packet to go out only through a particular - * interface. + * Multiroute the secured packet. 
*/ - if ((ire->ire_flags & RTF_MULTIRT) && !attach_if) { + if (ire->ire_flags & RTF_MULTIRT) { ire_t *first_ire; irb = ire->ire_bucket; ASSERT(irb != NULL); @@ -26634,9 +25981,8 @@ send: if ((first_ire->ire_flags & RTF_MULTIRT) && (first_ire->ire_addr == ire->ire_addr) && !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) break; - } } if ((first_ire != NULL) && (first_ire != ire)) { @@ -26657,11 +26003,6 @@ send: multirt_send = B_TRUE; max_frag = ire->ire_max_frag; - } else { - if ((ire->ire_flags & RTF_MULTIRT) && attach_if) { - ip1dbg(("ip_wput_ipsec_out: ignoring multirouting " - "flag, attach_if %d\n", attach_if)); - } } /* @@ -26689,7 +26030,7 @@ send: if (ire1->ire_addr != ire->ire_addr) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) continue; /* No loopback here */ if (ire1->ire_stq == NULL) @@ -27155,10 +26496,8 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) * before sending it the accelerated packet. */ if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) { - int ifindex; ill = ire_to_ill(ire); - ifindex = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_capab_ill_index = ifindex; + io->ipsec_out_capab_ill_index = ill->ill_phyint->phyint_ifindex; } /* @@ -27284,17 +26623,18 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) } } /* - * We are done with IPsec processing. Send it over - * the wire. + * We are done with IPsec processing. Send it over the wire. 
*/ done: mp = ipsec_mp->b_cont; ipha = (ipha_t *)mp->b_rptr; if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - ip_wput_ipsec_out(q, ipsec_mp, ipha, ill, ire); + ip_wput_ipsec_out(q, ipsec_mp, ipha, ire->ire_ipif->ipif_ill, + ire); } else { ip6h = (ip6_t *)ipha; - ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ill, ire); + ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ire->ire_ipif->ipif_ill, + ire); } if (ill != NULL && ill_need_rele) ill_refrele(ill); @@ -27356,18 +26696,16 @@ ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ipip = ip_sioctl_lookup(iocp->ioc_cmd); if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) { /* - * Special case where ipsq_current_ipif is not set: + * Special case where ipx_current_ipif is not set: * ill_phyint_reinit merged the v4 and v6 into a single ipsq. - * ill could also have become part of a ipmp group in the - * process, we are here as were not able to complete the - * operation in ipif_set_values because we could not become - * exclusive on the new ipsq, In such a case ipsq_current_ipif - * will not be set so we need to set it. + * We are here as were not able to complete the operation in + * ipif_set_values because we could not become exclusive on + * the new ipsq. 
*/ ill_t *ill = q->q_ptr; ipsq_current_start(ipsq, ill->ill_ipif, ipip->ipi_cmd); } - ASSERT(ipsq->ipsq_current_ipif != NULL); + ASSERT(ipsq->ipsq_xop->ipx_current_ipif != NULL); if (ipip->ipi_cmd_type == IF_CMD) { /* This a old style SIOC[GS]IF* command */ @@ -27381,8 +26719,8 @@ ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) sin = NULL; } - err = (*ipip->ipi_func_restart)(ipsq->ipsq_current_ipif, sin, q, mp, - ipip, mp1->b_rptr); + err = (*ipip->ipi_func_restart)(ipsq->ipsq_xop->ipx_current_ipif, sin, + q, mp, ipip, mp1->b_rptr); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); } @@ -27424,6 +26762,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ip_extract_func_t *extract_funcp; cmd_info_t ci; int err; + boolean_t entered_ipsq = B_FALSE; ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd)); @@ -27505,18 +26844,21 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) return; } + ASSERT(ci.ci_ipif != NULL); + /* - * If ipsq is non-null, we are already being called exclusively on an - * ill but in the case of a failover in progress it is the "from" ill, - * rather than the "to" ill (which is the ill ptr passed in). - * In order to ensure we are exclusive on both ILLs we rerun - * ipsq_try_enter() here, ipsq's support recursive entry. + * If ipsq is non-NULL, we are already being called exclusively. */ ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); - ASSERT(ci.ci_ipif != NULL); - - ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl, - NEW_OP, B_TRUE); + if (ipsq == NULL) { + ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl, + NEW_OP, B_TRUE); + if (ipsq == NULL) { + ipif_refrele(ci.ci_ipif); + return; + } + entered_ipsq = B_TRUE; + } /* * Release the ipif so that ipif_down and friends that wait for @@ -27525,8 +26867,6 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) * the ipif. 
*/ ipif_refrele(ci.ci_ipif); - if (ipsq == NULL) - return; ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); @@ -27535,19 +26875,12 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) * where we set the IPIF_CHANGING flag. This ensures that there won't * be any new references to the ipif. This helps functions that go * through this path and end up trying to wait for the refcnts - * associated with the ipif to go down to zero. Some exceptions are - * Failover, Failback, and Groupname commands that operate on more than - * just the ci.ci_ipif. These commands internally determine the - * set of ipif's they operate on and set and clear the IPIF_CHANGING - * flags on that set. Another exception is the Removeif command that - * sets the IPIF_CONDEMNED flag internally after identifying the right - * ipif to operate on. + * associated with the ipif to go down to zero. The exception is + * SIOCSLIFREMOVEIF, which sets IPIF_CONDEMNED internally after + * identifying the right ipif to operate on. */ mutex_enter(&(ci.ci_ipif)->ipif_ill->ill_lock); - if (ipip->ipi_cmd != SIOCLIFREMOVEIF && - ipip->ipi_cmd != SIOCLIFFAILOVER && - ipip->ipi_cmd != SIOCLIFFAILBACK && - ipip->ipi_cmd != SIOCSLIFGROUPNAME) + if (ipip->ipi_cmd != SIOCLIFREMOVEIF) (ci.ci_ipif)->ipif_state_flags |= IPIF_CHANGING; mutex_exit(&(ci.ci_ipif)->ipif_ill->ill_lock); @@ -27560,7 +26893,8 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); - ipsq_exit(ipsq); + if (entered_ipsq) + ipsq_exit(ipsq); } /* @@ -27708,7 +27042,7 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * Refhold the conn, till the ioctl completes. This is * needed in case the ioctl ends up in the pending mp * list. Every mp in the ill_pending_mp list and - * the ipsq_pending_mp must have a refhold on the conn + * the ipx_pending_mp must have a refhold on the conn * to resume processing. 
The refhold is released when * the ioctl completes. (normally or abnormally) * In all cases ip_ioctl_finish is called to finish @@ -27753,8 +27087,25 @@ nak: if (CONN_Q(q)) goto nak; - /* Finish socket ioctls passed through to ARP. */ - ip_sioctl_iocack(q, mp); + /* + * Finish socket ioctls passed through to ARP. We use the + * ioc_cmd values we set in ip_sioctl_arp() to decide whether + * we need to become writer before calling ip_sioctl_iocack(). + * Note that qwriter_ip() will release the refhold, and that a + * refhold is OK without ILL_CAN_LOOKUP() since we're on the + * ill stream. + */ + iocp = (struct iocblk *)mp->b_rptr; + if (iocp->ioc_cmd == AR_ENTRY_SQUERY) { + ip_sioctl_iocack(NULL, q, mp, NULL); + return; + } + + ASSERT(iocp->ioc_cmd == AR_ENTRY_DELETE || + iocp->ioc_cmd == AR_ENTRY_ADD); + ill = q->q_ptr; + ill_refhold(ill); + qwriter_ip(ill, q, mp, ip_sioctl_iocack, CUR_OP, B_FALSE); return; case M_FLUSH: if (*mp->b_rptr & FLUSHW) @@ -28021,11 +27372,11 @@ nak: gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - nce = ndp_lookup_v6(ill, + nce = ndp_lookup_v6(ill, B_FALSE, &ire->ire_addr_v6, B_FALSE); } else { - nce = ndp_lookup_v6(ill, &gw_addr_v6, - B_FALSE); + nce = ndp_lookup_v6(ill, B_FALSE, + &gw_addr_v6, B_FALSE); } if (nce != NULL) { nce_resolv_failed(nce); @@ -28061,10 +27412,11 @@ nak: gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, - B_FALSE); + nce = ndp_lookup_v6(ill, B_FALSE, + &ire->ire_addr_v6, B_FALSE); } else { - nce = ndp_lookup_v6(ill, &gw_addr_v6, B_FALSE); + nce = ndp_lookup_v6(ill, B_FALSE, + &gw_addr_v6, B_FALSE); } if (nce != NULL) { /* @@ -28238,13 +27590,14 @@ nak: fake_ire = (ire_t *)mp->b_rptr; /* - * By the time we come back here from ARP the incomplete ire - * created in ire_forward() could have been removed. 
We use - * the parameters stored in the fake_ire to specify the real - * ire as explicitly as possible. This avoids problems when - * IPMP groups are configured as an ipif can 'float' - * across several ill queues. We can be confident that the - * the inability to find an ire is because it no longer exists. + * By the time we come back here from ARP the logical outgoing + * interface of the incomplete ire we added in ire_forward() + * could have disappeared, causing the incomplete ire to also + * disappear. So we need to retreive the proper ipif for the + * ire before looking in ctable. In the case of IPMP, the + * ipif may be on the IPMP ill, so look it up based on the + * ire_ipif_ifindex we stashed back in ire_init_common(). + * Then, we can verify that ire_ipif_seqid still exists. */ ill = ill_lookup_on_ifindex(fake_ire->ire_ipif_ifindex, B_FALSE, NULL, NULL, NULL, NULL, ipst); @@ -28299,6 +27652,7 @@ nak: freemsg(mp); /* fake ire */ return; } + nce = ire->ire_nce; DTRACE_PROBE2(ire__arpresolve__type, ire_t *, ire, nce_t *, nce); @@ -29030,7 +28384,7 @@ boolean_t conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, zoneid_t zoneid) { - ill_t *in_ill; + ill_t *bound_ill; boolean_t found; ipif_t *ipif; ire_t *ire; @@ -29045,32 +28399,15 @@ conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, * unicast, broadcast and multicast reception to * conn_incoming_ill. conn_wantpacket itself is called * only for BROADCAST and multicast. - * - * 1) ip_rput supresses duplicate broadcasts if the ill - * is part of a group. Hence, we should be receiving - * just one copy of broadcast for the whole group. - * Thus, if it is part of the group the packet could - * come on any ill of the group and hence we need a - * match on the group. Otherwise, match on ill should - * be sufficient. - * - * 2) ip_rput does not suppress duplicate multicast packets. 
- * If there are two interfaces in a ill group and we have - * 2 applications (conns) joined a multicast group G on - * both the interfaces, ilm_lookup_ill filter in ip_rput - * will give us two packets because we join G on both the - * interfaces rather than nominating just one interface - * for receiving multicast like broadcast above. So, - * we have to call ilg_lookup_ill to filter out duplicate - * copies, if ill is part of a group. - */ - in_ill = connp->conn_incoming_ill; - if (in_ill != NULL) { - if (in_ill->ill_group == NULL) { - if (in_ill != ill) + */ + bound_ill = connp->conn_incoming_ill; + if (bound_ill != NULL) { + if (IS_IPMP(bound_ill)) { + if (bound_ill->ill_grp != ill->ill_grp) + return (B_FALSE); + } else { + if (bound_ill != ill) return (B_FALSE); - } else if (in_ill->ill_group != ill->ill_group) { - return (B_FALSE); } } @@ -29079,15 +28416,14 @@ conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, return (B_TRUE); /* * The conn is in a different zone; we need to check that this - * broadcast address is configured in the application's zone and - * on one ill in the group. + * broadcast address is configured in the application's zone. 
*/ ipif = ipif_get_next_ipif(NULL, ill); if (ipif == NULL) return (B_FALSE); ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif, connp->conn_zoneid, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst); + (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); ipif_refrele(ipif); if (ire != NULL) { ire_refrele(ire); @@ -29171,7 +28507,7 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) } ipsq = ill->ill_phyint->phyint_ipsq; - ipif = ipsq->ipsq_pending_ipif; + ipif = ipsq->ipsq_xop->ipx_pending_ipif; mp1 = ipsq_pending_mp_get(ipsq, &connp); ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); if (mp1 == NULL) { @@ -29181,12 +28517,12 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) } /* - * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we + * If an IOCTL is waiting on this (ipx_current_ioctl != 0), then we * must have an associated conn_t. Otherwise, we're bringing this * interface back up as part of handling an asynchronous event (e.g., * physical address change). */ - if (ipsq->ipsq_current_ioctl != 0) { + if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { ASSERT(connp != NULL); q = CONNP_TO_WQ(connp); } else { @@ -29219,16 +28555,28 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) return; } - if (ill->ill_up_ipifs) - ill_group_cleanup(ill); + /* + * If we have a moved ipif to bring up, and everything has succeeded + * to this point, bring it up on the IPMP ill. Otherwise, leave it + * down -- the admin can try to bring it up by hand if need be. + */ + if (ill->ill_move_ipif != NULL) { + ipif = ill->ill_move_ipif; + ill->ill_move_ipif = NULL; + if (err == 0) { + err = ipif_up(ipif, q, mp1); + if (err == EINPROGRESS) + return; + } + } /* * The operation must complete without EINPROGRESS since - * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp. - * Otherwise, the operation will be stuck forever in the ipsq. + * ipsq_pending_mp_get() has removed the mblk. 
Otherwise, the + * operation will be stuck forever in the ipsq. */ ASSERT(err != EINPROGRESS); - if (ipsq->ipsq_current_ioctl != 0) + if (ipsq->ipsq_xop->ipx_current_ioctl != 0) ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); else ipsq_current_finish(ipsq); @@ -29649,124 +28997,6 @@ ip_int_set(queue_t *q, mblk_t *mp, char *value, return (0); } -/* - * Handle changes to ipmp_hook_emulation ndd variable. - * Need to update phyint_hook_ifindex. - * Also generate a nic plumb event should a new ifidex be assigned to a group. - */ -static void -ipmp_hook_emulation_changed(ip_stack_t *ipst) -{ - phyint_t *phyi; - phyint_t *phyi_tmp; - char *groupname; - int namelen; - ill_t *ill; - boolean_t new_group; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - /* - * Group indicies are stored in the phyint - a common structure - * to both IPv4 and IPv6. - */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - /* Ignore the ones that do not have a group */ - if (phyi->phyint_groupname_len == 0) - continue; - - /* - * Look for other phyint in group. - * Clear name/namelen so the lookup doesn't find ourselves. - */ - namelen = phyi->phyint_groupname_len; - groupname = phyi->phyint_groupname; - phyi->phyint_groupname_len = 0; - phyi->phyint_groupname = NULL; - - phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst); - /* Restore */ - phyi->phyint_groupname_len = namelen; - phyi->phyint_groupname = groupname; - - new_group = B_FALSE; - if (ipst->ips_ipmp_hook_emulation) { - /* - * If the group already exists and has already - * been assigned a group ifindex, we use the existing - * group_ifindex, otherwise we pick a new group_ifindex - * here. - */ - if (phyi_tmp != NULL && - phyi_tmp->phyint_group_ifindex != 0) { - phyi->phyint_group_ifindex = - phyi_tmp->phyint_group_ifindex; - } else { - /* XXX We need a recovery strategy here. 
*/ - if (!ip_assign_ifindex( - &phyi->phyint_group_ifindex, ipst)) - cmn_err(CE_PANIC, - "ip_assign_ifindex() failed"); - new_group = B_TRUE; - } - } else { - phyi->phyint_group_ifindex = 0; - } - if (ipst->ips_ipmp_hook_emulation) - phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex; - else - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - - /* - * For IP Filter to find out the relationship between - * names and interface indicies, we need to generate - * a NE_PLUMB event when a new group can appear. - * We always generate events when a new interface appears - * (even when ipmp_hook_emulation is set) so there - * is no need to generate NE_PLUMB events when - * ipmp_hook_emulation is turned off. - * And since it isn't critical for IP Filter to get - * the NE_UNPLUMB events we skip those here. - */ - if (new_group) { - /* - * First phyint in group - generate group PLUMB event. - * Since we are not running inside the ipsq we do - * the dispatch immediately. - */ - if (phyi->phyint_illv4 != NULL) - ill = phyi->phyint_illv4; - else - ill = phyi->phyint_illv6; - - if (ill != NULL) - ill_nic_event_plumb(ill, B_TRUE); - } - } - rw_exit(&ipst->ips_ill_g_lock); -} - -/* ARGSUSED */ -static int -ipmp_hook_emulation_set(queue_t *q, mblk_t *mp, char *value, - caddr_t addr, cred_t *cr) -{ - int *v = (int *)addr; - long new_value; - ip_stack_t *ipst = CONNQ_TO_IPST(q); - - if (ddi_strtol(value, NULL, 10, &new_value) != 0) - return (EINVAL); - - if (*v != new_value) { - *v = new_value; - ipmp_hook_emulation_changed(ipst); - } - return (0); -} - static void * ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp) { @@ -30448,12 +29678,12 @@ next_mp: arpce->nce_state = ND_INCOMPLETE; mutex_exit(&arpce->nce_lock); + /* * Note that ire_add() (called from ire_forward()) * holds a ref on the ire until ARP is completed. 
*/ - - ire_arpresolve(ire, ire_to_ill(ire)); + ire_arpresolve(ire); return (LOOKUP_IN_PROGRESS); default: ASSERT(0); @@ -30596,7 +29826,7 @@ ip_get_zoneid_v6(in6_addr_t *addr, mblk_t *mp, const ill_t *ill, return (ALL_ZONES); if (IN6_IS_ADDR_LINKLOCAL(addr)) { - ire_flags |= MATCH_IRE_ILL_GROUP; + ire_flags |= MATCH_IRE_ILL; ipif_arg = ill->ill_ipif; } if (lookup_zoneid != ALL_ZONES) @@ -30648,20 +29878,24 @@ void ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst, const ill_t *ill, int ipver, uint32_t hlen, ip_stack_t *ipst) { + mblk_t *mp2; ipobs_cb_t *ipobs_cb; + ipobs_hook_data_t *ihd; + uint64_t grifindex = 0; ASSERT(DB_TYPE(mp) == M_DATA); + if (IS_UNDER_IPMP(ill)) + grifindex = ipmp_ill_get_ipmp_ifindex(ill); + mutex_enter(&ipst->ips_ipobs_cb_lock); ipst->ips_ipobs_cb_nwalkers++; mutex_exit(&ipst->ips_ipobs_cb_lock); for (ipobs_cb = list_head(&ipst->ips_ipobs_cb_list); ipobs_cb != NULL; ipobs_cb = list_next(&ipst->ips_ipobs_cb_list, ipobs_cb)) { - mblk_t *mp2 = allocb(sizeof (ipobs_hook_data_t), - BPRI_HI); + mp2 = allocb(sizeof (ipobs_hook_data_t), BPRI_HI); if (mp2 != NULL) { - ipobs_hook_data_t *ihd = - (ipobs_hook_data_t *)mp2->b_rptr; + ihd = (ipobs_hook_data_t *)mp2->b_rptr; if (((ihd->ihd_mp = dupmsg(mp)) == NULL) && ((ihd->ihd_mp = copymsg(mp)) == NULL)) { freemsg(mp2); @@ -30673,6 +29907,7 @@ ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst, ihd->ihd_zsrc = zsrc; ihd->ihd_zdst = zdst; ihd->ihd_ifindex = ill->ill_phyint->phyint_ifindex; + ihd->ihd_grifindex = grifindex; ihd->ihd_stack = ipst->ips_netstack; mp2->b_wptr += sizeof (*ihd); ipobs_cb->ipobs_cbfunc(mp2); diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index fe326778c2..6e63af32b3 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ /* @@ -95,7 +95,6 @@ #include <sys/pattr.h> #include <inet/ipclassifier.h> #include <inet/ipsecah.h> -#include <inet/udp_impl.h> #include <inet/rawip_impl.h> #include <inet/rts_impl.h> #include <sys/squeue_impl.h> @@ -186,7 +185,7 @@ const in6_addr_t ipv6_solicited_node_mcast = #define IP6_MBLK_HDR_ERR 1 #define IP6_MBLK_LEN_ERR 2 -static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *ill, +static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *, ill_t *, boolean_t, zoneid_t); static void icmp_pkt_v6(queue_t *, mblk_t *, void *, size_t, const in6_addr_t *, boolean_t, zoneid_t, ip_stack_t *); @@ -208,11 +207,13 @@ static void ip_fanout_udp_v6(queue_t *, mblk_t *, ip6_t *, uint32_t, ill_t *, ill_t *, uint_t, boolean_t, zoneid_t); static int ip_process_options_v6(queue_t *, mblk_t *, ip6_t *, uint8_t *, uint_t, uint8_t, ip_stack_t *); -static mblk_t *ip_rput_frag_v6(queue_t *, mblk_t *, ip6_t *, +static mblk_t *ip_rput_frag_v6(ill_t *, ill_t *, mblk_t *, ip6_t *, ip6_frag_t *, uint_t, uint_t *, uint32_t *, uint16_t *); static boolean_t ip_source_routed_v6(ip6_t *, mblk_t *, ip_stack_t *); static void ip_wput_ire_v6(queue_t *, mblk_t *, ire_t *, int, int, - conn_t *, int, int, int, zoneid_t); + conn_t *, int, int, zoneid_t); +static boolean_t ipif_lookup_testaddr_v6(ill_t *, const in6_addr_t *, + ipif_t **); /* * A template for an IPv6 AR_ENTRY_QUERY @@ -248,15 +249,14 @@ static areq_t ipv6_areq_template = { * call icmp_inbound_v6() for each relevant zone. 
*/ static void -icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, - boolean_t mctl_present, uint_t flags, zoneid_t zoneid, mblk_t *dl_mp) +icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, + uint_t hdr_length, boolean_t mctl_present, uint_t flags, zoneid_t zoneid, + mblk_t *dl_mp) { icmp6_t *icmp6; ip6_t *ip6h; boolean_t interested; - ip6i_t *ip6i; in6_addr_t origsrc; - ire_t *ire; mblk_t *first_mp; ipsec_in_t *ii; ip_stack_t *ipst = ill->ill_ipst; @@ -344,7 +344,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, break; case ICMP6_PACKET_TOO_BIG: - icmp_inbound_too_big_v6(q, first_mp, ill, mctl_present, + icmp_inbound_too_big_v6(q, first_mp, ill, inill, mctl_present, zoneid); return; case ICMP6_ECHO_REQUEST: @@ -422,66 +422,6 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, * checksum field. The checksum is calculated in ip_wput_v6. */ icmp6->icmp6_cksum = ip6h->ip6_plen; - /* - * ICMP echo replies should go out on the same interface - * the request came on as probes used by in.mpathd for - * detecting NIC failures are ECHO packets. We turn-off load - * spreading by allocating a ip6i and setting ip6i_attach_if - * to B_TRUE which is handled both by ip_wput_v6 and - * ip_newroute_v6. If we don't turnoff load spreading, - * the packets might get dropped if there are no - * non-FAILED/INACTIVE interfaces for it to go out on and - * in.mpathd would wrongly detect a failure or mis-detect - * a NIC failure as a link failure. As load spreading can - * happen only if ill_group is not NULL, we do only for - * that case and this does not affect the normal case. - * - * We force this only on echo packets that came from on-link - * hosts. We restrict this to link-local addresses which - * is used by in.mpathd for probing. In the IPv6 case, - * default routes typically have an ire_ipif pointer and - * hence a MATCH_IRE_ILL later in ip_newroute_v6/ip_wput_v6 - * might work. 
As a default route out of this interface - * may not be present, enforcing this packet to go out in - * this case may not work. - */ - if (ill->ill_group != NULL && - IN6_IS_ADDR_LINKLOCAL(&origsrc)) { - /* - * If we are sending replies to ourselves, don't - * set ATTACH_IF as we may not be able to find - * the IRE_LOCAL on this ill i.e setting ATTACH_IF - * causes ip_wput_v6 to look for an IRE_LOCAL on - * "ill" which it may not find and will try to - * create an IRE_CACHE for our local address. Once - * we do this, we will try to forward all packets - * meant to our LOCAL address. - */ - ire = ire_cache_lookup_v6(&ip6h->ip6_dst, ALL_ZONES, - NULL, ipst); - if (ire == NULL || ire->ire_type != IRE_LOCAL) { - mp = ip_add_info_v6(mp, NULL, &ip6h->ip6_dst); - if (mp == NULL) { - BUMP_MIB(ill->ill_icmp6_mib, - ipv6IfIcmpInErrors); - if (ire != NULL) - ire_refrele(ire); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - ip6i = (ip6i_t *)mp->b_rptr; - ip6i->ip6i_flags = IP6I_ATTACH_IF; - ip6i->ip6i_ifindex = - ill->ill_phyint->phyint_ifindex; - } - if (ire != NULL) - ire_refrele(ire); - } if (!mctl_present) { /* @@ -529,7 +469,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, if (mctl_present) freeb(first_mp); /* XXX may wish to pass first_mp up to ndp_input someday. */ - ndp_input(ill, mp, dl_mp); + ndp_input(inill, mp, dl_mp); return; case ND_NEIGHBOR_ADVERT: @@ -538,7 +478,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, if (mctl_present) freeb(first_mp); /* XXX may wish to pass first_mp up to ndp_input someday. 
*/ - ndp_input(ill, mp, dl_mp); + ndp_input(inill, mp, dl_mp); return; case ND_REDIRECT: { @@ -579,7 +519,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, } if (interested) { icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, - mctl_present, zoneid); + inill, mctl_present, zoneid); } else { freemsg(first_mp); } @@ -592,7 +532,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, */ /* ARGSUSED */ static void -icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, +icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, boolean_t mctl_present, zoneid_t zoneid) { ip6_t *ip6h; @@ -658,11 +598,10 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, * sufficient. Same link local addresses for different ILL's is * possible. */ - if (IN6_IS_ADDR_LINKLOCAL(&inner_ip6h->ip6_dst)) { first_ire = ire_ctable_lookup_v6(&inner_ip6h->ip6_dst, NULL, IRE_CACHE, ill->ill_ipif, ALL_ZONES, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst); + MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); if (first_ire == NULL) { if (ip_debug > 2) { @@ -773,7 +712,7 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, } rw_exit(&irb->irb_lock); } - icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, + icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, inill, mctl_present, zoneid); } @@ -783,7 +722,8 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, */ void icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, - icmp6_t *icmp6, ill_t *ill, boolean_t mctl_present, zoneid_t zoneid) + icmp6_t *icmp6, ill_t *ill, ill_t *inill, boolean_t mctl_present, + zoneid_t zoneid) { uint16_t *up; /* Pointer to ports in ULP header */ uint32_t ports; /* reversed ports for fanout */ @@ -861,7 +801,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ((uint16_t *)&ports)[0] = up[1]; ((uint16_t *)&ports)[1] = up[0]; - ip_fanout_udp_v6(q, first_mp, &rip6h, ports, ill, ill, 
+ ip_fanout_udp_v6(q, first_mp, &rip6h, ports, ill, inill, IP6_NO_IPPOLICY, mctl_present, zoneid); return; } @@ -908,7 +848,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, up = (uint16_t *)((uchar_t *)ip6h + hdr_length); ((uint16_t *)&ports)[0] = up[1]; ((uint16_t *)&ports)[1] = up[0]; - ip_fanout_sctp(first_mp, ill, (ipha_t *)ip6h, ports, 0, + ip_fanout_sctp(first_mp, inill, (ipha_t *)ip6h, ports, 0, mctl_present, IP6_NO_IPPOLICY, zoneid); return; case IPPROTO_ESP: @@ -940,7 +880,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ASSERT(ill != NULL); ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = ii->ipsec_in_ill_index; + ii->ipsec_in_rill_index = + inill->ill_phyint->phyint_ifindex; first_mp->b_cont->b_datap->db_type = M_CTL; } else { /* @@ -970,7 +911,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, mp->b_datap->db_type = M_CTL; ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = ii->ipsec_in_ill_index; + ii->ipsec_in_rill_index = + inill->ill_phyint->phyint_ifindex; } if (!ipsec_loaded(ipss)) { @@ -985,7 +927,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, if (ipsec_rc == IPSEC_STATUS_FAILED) return; - ip_fanout_proto_again(first_mp, ill, ill, NULL); + ip_fanout_proto_again(first_mp, ill, inill, NULL); return; } case IPPROTO_ENCAP: @@ -1083,8 +1025,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, * doing here. 
*/ icmp_inbound_error_fanout_v6(q, first_mp, - (ip6_t *)mp->b_rptr, icmp6, ill, mctl_present, - zoneid); + (ip6_t *)mp->b_rptr, icmp6, ill, inill, + mctl_present, zoneid); return; } /* FALLTHRU */ @@ -1096,7 +1038,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, rip6h.ip6_src = ip6h->ip6_dst; rip6h.ip6_dst = ip6h->ip6_src; rip6h.ip6_nxt = nexthdr; - ip_fanout_proto_v6(q, first_mp, &rip6h, ill, ill, nexthdr, 0, + ip_fanout_proto_v6(q, first_mp, &rip6h, ill, inill, nexthdr, 0, IP6_NO_IPPOLICY, mctl_present, zoneid); return; } @@ -1194,9 +1136,8 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) * redirect packet.) */ - prev_ire = ire_route_lookup_v6(dst, 0, src, 0, ipif, NULL, - ALL_ZONES, NULL, MATCH_IRE_GW | MATCH_IRE_ILL_GROUP | - MATCH_IRE_DEFAULT, ipst); + prev_ire = ire_route_lookup_v6(dst, 0, src, 0, ipif, NULL, ALL_ZONES, + NULL, MATCH_IRE_GW | MATCH_IRE_ILL | MATCH_IRE_DEFAULT, ipst); /* * Check that @@ -1260,6 +1201,7 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) opt = ndp_get_option(opt, optlen, ND_OPT_TARGET_LINKADDR); if (opt != NULL) { err = ndp_lookup_then_add_v6(ill, + B_FALSE, /* don't match across illgrp */ (uchar_t *)&opt[1], /* Link layer address */ gateway, &ipv6_all_ones, /* prefix mask */ @@ -1367,8 +1309,7 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) */ redir_ire = ire_ftable_lookup_v6(dst, 0, src, IRE_HOST, ire->ire_ipif, NULL, ALL_ZONES, 0, NULL, - (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), - ipst); + (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); ire_refrele(ire); /* Held in ire_add_v6 */ @@ -1457,15 +1398,11 @@ icmp_pick_source_v6(queue_t *wq, in6_addr_t *origsrc, in6_addr_t *origdst, BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutNoRoutes); return (NULL); } - /* - * Does not matter whether we use ire_stq or ire_ipif here. - * Just pick an ill for ICMP replies. 
- */ ASSERT(ire->ire_ipif != NULL); ill = ire->ire_ipif->ipif_ill; ire_refrele(ire); } - ipif = ipif_select_source_v6(ill, origsrc, RESTRICT_TO_NONE, + ipif = ipif_select_source_v6(ill, origsrc, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); if (ipif != NULL) { *src = ipif->ipif_v6src_addr; @@ -1858,7 +1795,7 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp, mp = icmp_pkt_err_ok_v6(q, mp, llbcast, B_FALSE, ipst); if (mp == NULL) return; - nce = ndp_lookup_v6(ill, targetp, B_FALSE); + nce = ndp_lookup_v6(ill, B_TRUE, targetp, B_FALSE); if (nce != NULL && nce->nce_state != ND_INCOMPLETE) { ll_opt_len = (sizeof (nd_opt_hdr_t) + ill->ill_phys_addr_length + 7)/8 * 8; @@ -1908,31 +1845,8 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp, rdh->nd_opt_rh_reserved1 = 0; rdh->nd_opt_rh_reserved2 = 0; /* ipif_v6src_addr contains the link-local source address */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if (ill->ill_group != NULL) { - /* - * The receiver of the redirect will verify whether it - * had a route through us (srcp that we will use in - * the redirect) or not. As we load spread even link-locals, - * we don't know which source address the receiver of - * redirect has in its route for communicating with us. - * Thus we randomly choose a source here and finally we - * should get to the right one and it will eventually - * accept the redirect from us. We can't call - * ip_lookup_scope_v6 because we don't have the right - * link-local address here. Thus we randomly choose one. 
- */ - int cnt = ill->ill_group->illgrp_ill_count; + srcp = &ill->ill_ipif->ipif_v6src_addr; - ill = ill->ill_group->illgrp_ill; - cnt = ++ipst->ips_icmp_redirect_v6_src_index % cnt; - while (cnt--) - ill = ill->ill_group_next; - srcp = &ill->ill_ipif->ipif_v6src_addr; - } else { - srcp = &ill->ill_ipif->ipif_v6src_addr; - } - rw_exit(&ipst->ips_ill_g_lock); /* Redirects sent by router, and router is global zone */ icmp_pkt_v6(q, mp, buf, len, srcp, B_FALSE, GLOBAL_ZONEID, ipst); kmem_free(buf, len); @@ -2231,6 +2145,7 @@ ip_bind_post_handling_v6(conn_t *connp, mblk_t *mp, if (version_changed) { ip_setpktversion(connp, connp->conn_pkt_isv6, B_TRUE, ipst); } + /* * Pass the IPSEC headers size in ire_ipsec_overhead. * We can't do this in ip_bind_insert_ire because the policy @@ -2771,8 +2686,8 @@ ip_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, } if (ip6_asp_can_lookup(ipst)) { src_ipif = ipif_select_source_v6(dst_ill, - v6dst, RESTRICT_TO_NONE, - connp->conn_src_preferences, zoneid); + v6dst, B_FALSE, connp->conn_src_preferences, + zoneid); ip6_asp_table_refrele(ipst); if (src_ipif == NULL) { pr_addr_dbg("ip_bind_connected_v6: " @@ -3111,7 +3026,15 @@ ip_add_info_v6(mblk_t *mp, ill_t *ill, const in6_addr_t *dst) ip6i->ip6i_nxt = IPPROTO_RAW; if (ill != NULL) { ip6i->ip6i_flags = IP6I_IFINDEX; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; + /* + * If `ill' is in an IPMP group, make sure we use the IPMP + * interface index so that e.g. IPV6_RECVPKTINFO will get the + * IPMP interface index and not an underlying interface index. + */ + if (IS_UNDER_IPMP(ill)) + ip6i->ip6i_ifindex = ipmp_ill_get_ipmp_ifindex(ill); + else + ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; } else { ip6i->ip6i_flags = 0; } @@ -4257,33 +4180,6 @@ ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h) } /* - * Select an ill for the packet by considering load spreading across - * a different ill in the group if dst_ill is part of some group. 
- */ -static ill_t * -ip_newroute_get_dst_ill_v6(ill_t *dst_ill) -{ - ill_t *ill; - - /* - * We schedule irrespective of whether the source address is - * INADDR_UNSPECIED or not. - */ - ill = illgrp_scheduler(dst_ill); - if (ill == NULL) - return (NULL); - - /* - * For groups with names ip_sioctl_groupname ensures that all - * ills are of same type. For groups without names, ifgrp_insert - * ensures this. - */ - ASSERT(dst_ill->ill_type == ill->ill_type); - - return (ill); -} - -/* * IPv6 - * ip_newroute_v6 is called by ip_rput_data_v6 or ip_wput_v6 whenever we need * to send out a packet to a destination address for which we do not have @@ -4303,14 +4199,6 @@ ip_newroute_get_dst_ill_v6(ill_t *dst_ill) * node sits at a site boundary). * We create the cache entries in the regular ctable since * it can not "confuse" things for other destinations. - * table. - * - * When ill is part of a ill group, we subject the packets - * to load spreading even if the ill is specified by the - * means described above. We disable only for IPV6_BOUND_PIF - * and for the cases where IP6I_ATTACH_IF is set i.e NS/NA/ - * Echo replies to link-local destinations have IP6I_ATTACH_IF - * set. * * NOTE : These are the scopes of some of the variables that point at IRE, * which needs to be followed while making any future modifications @@ -4327,8 +4215,6 @@ ip_newroute_get_dst_ill_v6(ill_t *dst_ill) * * Thus on failures, we have to REFRELE only ire and sire, if they * are not NULL. - * - * v6srcp may be used in the future. Currently unused. 
*/ /* ARGSUSED */ void @@ -4346,10 +4232,8 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, int err = 0; mblk_t *first_mp; ipsec_out_t *io; - ill_t *attach_ill = NULL; ushort_t ire_marks = 0; int match_flags; - boolean_t ip6i_present; ire_t *first_sire = NULL; mblk_t *copy_mp = NULL; mblk_t *xmit_mp = NULL; @@ -4359,7 +4243,6 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, boolean_t multirt_is_resolvable; boolean_t multirt_resolve_next; boolean_t need_rele = B_FALSE; - boolean_t do_attach_ill = B_FALSE; boolean_t ip6_asp_table_held = B_FALSE; tsol_ire_gw_secattr_t *attrp = NULL; tsol_gcgrp_t *gcgrp = NULL; @@ -4376,39 +4259,12 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, io = NULL; } - /* - * If this end point is bound to IPIF_NOFAILOVER, set bnf_ill and - * bind_to_nofailover B_TRUE. We can't use conn to determine as it - * could be NULL. - * - * This information can appear either in an ip6i_t or an IPSEC_OUT - * message. - */ ip6h = (ip6_t *)mp->b_rptr; - ip6i_present = (ip6h->ip6_nxt == IPPROTO_RAW); - if (ip6i_present || (io != NULL && io->ipsec_out_attach_if)) { - if (!ip6i_present || - ((ip6i_t *)ip6h)->ip6i_flags & IP6I_ATTACH_IF) { - attach_ill = ip_grab_attach_ill(ill, first_mp, - (ip6i_present ? ((ip6i_t *)ip6h)->ip6i_ifindex : - io->ipsec_out_ill_index), B_TRUE, ipst); - /* Failure case frees things for us. */ - if (attach_ill == NULL) - return; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. 
- */ - if (ill_is_probeonly(attach_ill)) - ire_marks = IRE_MARK_HIDDEN; - } - } if (IN6_IS_ADDR_LOOPBACK(v6dstp)) { ip1dbg(("ip_newroute_v6: dst with loopback addr\n")); goto icmp_err_ret; - } else if ((v6srcp != NULL) && IN6_IS_ADDR_LOOPBACK(v6srcp)) { + } else if (IN6_IS_ADDR_LOOPBACK(v6srcp)) { ip1dbg(("ip_newroute_v6: src with loopback addr\n")); goto icmp_err_ret; } @@ -4436,30 +4292,24 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0, NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp), match_flags, ipst); + } else { + match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | + MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL; + match_flags |= MATCH_IRE_PARENT | MATCH_IRE_SECATTR; + /* - * ire_add_then_send -> ip_newroute_v6 in the CGTP case passes - * in a NULL ill, but the packet could be a neighbor - * solicitation/advertisment and could have a valid attach_ill. + * Because nce_xmit() calls ip_output_v6() and NCEs are always + * tied to an underlying interface, IS_UNDER_IPMP() may be + * true even when building IREs that will be used for data + * traffic. As such, use the packet's source address to + * determine whether the traffic is test traffic, and set + * MATCH_IRE_MARK_TESTHIDDEN if so. */ - if (attach_ill != NULL) - ill_refrele(attach_ill); - } else { - if (attach_ill != NULL) { - /* - * attach_ill is set only for communicating with - * on-link hosts. So, don't look for DEFAULT. - * ip_wput_v6 passes the right ill in this case and - * hence we can assert. 
- */ - ASSERT(ill == attach_ill); - ill_refrele(attach_ill); - do_attach_ill = B_TRUE; - match_flags = MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL; - } else { - match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL_GROUP; + if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) { + if (ipif_lookup_testaddr_v6(ill, v6srcp, NULL)) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; } - match_flags |= MATCH_IRE_PARENT | MATCH_IRE_SECATTR; + ire = ire_ftable_lookup_v6(v6dstp, NULL, NULL, 0, ill->ill_ipif, &sire, zoneid, 0, MBLK_GETLABEL(mp), match_flags, ipst); } @@ -4601,106 +4451,56 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, } /* - * We have a route to reach the destination. - * - * 1) If the interface is part of ill group, try to get a new - * ill taking load spreading into account. + * We have a route to reach the destination. Find the + * appropriate ill, then get a source address that matches the + * right scope via ipif_select_source_v6(). * - * 2) After selecting the ill, get a source address that might - * create good inbound load spreading and that matches the - * right scope. ipif_select_source_v6 does this for us. + * If we are here trying to create an IRE_CACHE for an offlink + * destination and have an IRE_CACHE entry for VNI, then use + * ire_stq instead since VNI's queue is a black hole. * - * If the application specified the ill (ifindex), we still - * load spread. Only if the packets needs to go out specifically - * on a given ill e.g. bind to IPIF_NOFAILOVER address, - * IPV6_BOUND_PIF we don't try to use a different ill for load - * spreading. + * Note: While we pick a dst_ill we are really only interested + * in the ill for load spreading. The source ipif is + * determined by source address selection below. */ - if (!do_attach_ill) { - /* - * If the interface belongs to an interface group, - * make sure the next possible interface in the group - * is used. 
This encourages load spreading among - * peers in an interface group. However, in the case - * of multirouting, load spreading is not used, as we - * actually want to replicate outgoing packets through - * particular interfaces. - * - * Note: While we pick a dst_ill we are really only - * interested in the ill for load spreading. - * The source ipif is determined by source address - * selection below. - */ - if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { - dst_ill = ire->ire_ipif->ipif_ill; - /* For uniformity do a refhold */ - ill_refhold(dst_ill); + if ((ire->ire_type == IRE_CACHE) && + IS_VNI(ire->ire_ipif->ipif_ill)) { + dst_ill = ire->ire_stq->q_ptr; + ill_refhold(dst_ill); + } else { + ill_t *ill = ire->ire_ipif->ipif_ill; + + if (IS_IPMP(ill)) { + dst_ill = + ipmp_illgrp_hold_next_ill(ill->ill_grp); } else { - /* - * If we are here trying to create an IRE_CACHE - * for an offlink destination and have the - * IRE_CACHE for the next hop and the latter is - * using virtual IP source address selection i.e - * it's ire->ire_ipif is pointing to a virtual - * network interface (vni) then - * ip_newroute_get_dst_ll() will return the vni - * interface as the dst_ill. Since the vni is - * virtual i.e not associated with any physical - * interface, it cannot be the dst_ill, hence - * in such a case call ip_newroute_get_dst_ll() - * with the stq_ill instead of the ire_ipif ILL. - * The function returns a refheld ill. 
- */ - if ((ire->ire_type == IRE_CACHE) && - IS_VNI(ire->ire_ipif->ipif_ill)) - dst_ill = ip_newroute_get_dst_ill_v6( - ire->ire_stq->q_ptr); - else - dst_ill = ip_newroute_get_dst_ill_v6( - ire->ire_ipif->ipif_ill); + dst_ill = ill; + ill_refhold(dst_ill); } - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_v6 : no dst " - "ill for dst %s\n", - AF_INET6, v6dstp); - } - goto icmp_err_ret; - } else if (dst_ill->ill_group == NULL && ill != NULL && - dst_ill != ill) { - /* - * If "ill" is not part of any group, we should - * have found a route matching "ill" as we - * called ire_ftable_lookup_v6 with - * MATCH_IRE_ILL_GROUP. - * Rather than asserting when there is a - * mismatch, we just drop the packet. - */ - ip0dbg(("ip_newroute_v6: BOUND_IF failed : " - "dst_ill %s ill %s\n", - dst_ill->ill_name, - ill->ill_name)); - goto icmp_err_ret; + } + + if (dst_ill == NULL) { + if (ip_debug > 2) { + pr_addr_dbg("ip_newroute_v6 : no dst " + "ill for dst %s\n", AF_INET6, v6dstp); } - } else { - dst_ill = ire->ire_ipif->ipif_ill; - /* For uniformity do refhold */ - ill_refhold(dst_ill); + goto icmp_err_ret; + } + + if (ill != NULL && dst_ill != ill && + !IS_IN_SAME_ILLGRP(dst_ill, ill)) { /* - * We should have found a route matching ill as we - * called ire_ftable_lookup_v6 with MATCH_IRE_ILL. - * Rather than asserting, while there is a mismatch, - * we just drop the packet. + * We should have found a route matching "ill" + * as we called ire_ftable_lookup_v6 with + * MATCH_IRE_ILL. Rather than asserting when + * there is a mismatch, we just drop the packet. 
*/ - if (dst_ill != ill) { - ip0dbg(("ip_newroute_v6: Packet dropped as " - "IP6I_ATTACH_IF ill is %s, " - "ire->ire_ipif->ipif_ill is %s\n", - ill->ill_name, - dst_ill->ill_name)); - goto icmp_err_ret; - } + ip0dbg(("ip_newroute_v6: BOUND_IF failed: " + "dst_ill %s ill %s\n", dst_ill->ill_name, + ill->ill_name)); + goto icmp_err_ret; } + /* * Pick a source address which matches the scope of the * destination address. @@ -4708,7 +4508,20 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, * parent ire (sire). */ ASSERT(src_ipif == NULL); - if (ire->ire_type == IRE_IF_RESOLVER && + + /* + * Because nce_xmit() calls ip_output_v6() and NCEs are always + * tied to the underlying interface, IS_UNDER_IPMP() may be + * true even when building IREs that will be used for data + * traffic. As such, see if the packet's source address is a + * test address, and if so use that test address's ipif for + * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in + * ire_add_v6() can work properly. 
+ */ + if (ill != NULL && IS_UNDER_IPMP(ill)) + (void) ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif); + + if (src_ipif == NULL && ire->ire_type == IRE_IF_RESOLVER && !IN6_IS_ADDR_UNSPECIFIED(&v6gw) && ip6_asp_can_lookup(ipst)) { /* @@ -4718,10 +4531,10 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, */ ip6_asp_table_held = B_TRUE; src_ipif = ipif_select_source_v6(dst_ill, &v6gw, - RESTRICT_TO_GROUP, IPV6_PREFER_SRC_DEFAULT, zoneid); + B_TRUE, IPV6_PREFER_SRC_DEFAULT, zoneid); if (src_ipif != NULL) ire_marks |= IRE_MARK_USESRC_CHECK; - } else { + } else if (src_ipif == NULL) { if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { /* * Check that the ipif matching the requested @@ -4732,14 +4545,9 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, NULL, NULL, NULL, NULL, ipst); } if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) { - uint_t restrict_ill = RESTRICT_TO_NONE; - - if (ip6i_present && ((ip6i_t *)ip6h)->ip6i_flags - & IP6I_ATTACH_IF) - restrict_ill = RESTRICT_TO_ILL; ip6_asp_table_held = B_TRUE; src_ipif = ipif_select_source_v6(dst_ill, - v6dstp, restrict_ill, + v6dstp, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); if (src_ipif != NULL) ire_marks |= IRE_MARK_USESRC_CHECK; @@ -4750,7 +4558,7 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg("ip_newroute_v6: no src for " - "dst %s\n, ", AF_INET6, v6dstp); + "dst %s\n", AF_INET6, v6dstp); printf("ip_newroute_v6: interface name %s\n", dst_ill->ill_name); } @@ -4837,14 +4645,7 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, "ire_ihandle_lookup_offlink_v6 failed\n")); goto icmp_err_ret; } - /* - * Assume DL_UNITDATA_REQ is same for all physical - * interfaces in the ifgrp. If it isn't, this code will - * have to be seriously rewhacked to allow the - * fastpath probing (such that I cache the link - * header in the IRE_CACHE) to work over ifgrps. - * We have what we need to build an IRE_CACHE. 
- */ + /* * Note: the new ire inherits RTF_SETSRC * and RTF_MULTIRT to propagate these flags from prefix @@ -5659,24 +5460,22 @@ icmp_err_ret: */ void ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, - in6_addr_t v6dst, int unspec_src, zoneid_t zoneid) + const in6_addr_t *v6dstp, const in6_addr_t *v6srcp, int unspec_src, + zoneid_t zoneid) { ire_t *ire = NULL; ipif_t *src_ipif = NULL; int err = 0; ill_t *dst_ill = NULL; ire_t *save_ire; - ushort_t ire_marks = 0; ipsec_out_t *io; - ill_t *attach_ill = NULL; ill_t *ill; - ip6_t *ip6h; mblk_t *first_mp; - boolean_t ip6i_present; ire_t *fire = NULL; mblk_t *copy_mp = NULL; + const in6_addr_t *ire_v6srcp; + boolean_t probe = B_FALSE; boolean_t multirt_resolve_next; - in6_addr_t *v6dstp = &v6dst; boolean_t ipif_held = B_FALSE; boolean_t ill_held = B_FALSE; boolean_t ip6_asp_table_held = B_FALSE; @@ -5728,35 +5527,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, if (!(ill->ill_flags & ILLF_MULTICAST)) { goto err_ret; } - /* - * If this end point is bound to IPIF_NOFAILOVER, set bnf_ill - * and bind_to_nofailover B_TRUE. We can't use conn to determine - * as it could be NULL. - * - * This information can appear either in an ip6i_t or an - * IPSEC_OUT message. - */ - ip6h = (ip6_t *)mp->b_rptr; - ip6i_present = (ip6h->ip6_nxt == IPPROTO_RAW); - if (ip6i_present || (io != NULL && io->ipsec_out_attach_if)) { - if (!ip6i_present || - ((ip6i_t *)ip6h)->ip6i_flags & IP6I_ATTACH_IF) { - attach_ill = ip_grab_attach_ill(ill, first_mp, - (ip6i_present ? - ((ip6i_t *)ip6h)->ip6i_ifindex : - io->ipsec_out_ill_index), B_TRUE, ipst); - /* Failure case frees things for us. */ - if (attach_ill == NULL) - return; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. 
- */ - if (ill_is_probeonly(attach_ill)) - ire_marks = IRE_MARK_HIDDEN; - } - } /* * We check if an IRE_OFFSUBNET for the addr that goes through @@ -5770,76 +5540,93 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, (void *)ipif, ntohl(V4_PART_OF_V6((*v6dstp))), (void *)fire)); + ASSERT(src_ipif == NULL); + /* - * If the application specified the ill (ifindex), we still - * load spread. Only if the packets needs to go out specifically - * on a given ill e.g. binding to IPIF_NOFAILOVER address or - * IPV6_BOUND_PIF, or there is a parent ire entry that specified - * multirouting, then we don't try to use a different ill for - * load spreading. + * Because nce_xmit() calls ip_output_v6() and NCEs are always + * tied to the underlying interface, IS_UNDER_IPMP() may be + * true even when building IREs that will be used for data + * traffic. As such, see if the packet's source address is a + * test address, and if so use that test address's ipif for + * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in + * ire_add_v6() can work properly. + */ + if (IS_UNDER_IPMP(ill)) + probe = ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif); + + /* + * Determine the outbound (destination) ill for this route. + * If IPMP is not in use, that's the same as our ill. If IPMP + * is in-use and we're on the IPMP interface, or we're on an + * underlying ill but sending data traffic, use a suitable + * destination ill from the group. The latter case covers a + * subtle edge condition with multicast: when we bring up an + * IPv6 data address, we will create an NCE on an underlying + * interface, and send solicitations to ff02::1, which would + * take us through here, and cause us to create an IRE for + * ff02::1. To meet our defined semantics for multicast (and + * ensure there aren't unexpected echoes), that IRE needs to + * use the IPMP group's nominated multicast interface. + * + * Note: the source ipif is determined by source address + * selection later. 
*/ - if (attach_ill == NULL) { - /* - * If the interface belongs to an interface group, - * make sure the next possible interface in the group - * is used. This encourages load spreading among peers - * in an interface group. - * - * Note: While we pick a dst_ill we are really only - * interested in the ill for load spreading. The source - * ipif is determined by source address selection below. - */ - if ((fire != NULL) && (fire->ire_flags & RTF_MULTIRT)) { - dst_ill = ipif->ipif_ill; - /* For uniformity do a refhold */ - ill_refhold(dst_ill); + if (IS_IPMP(ill) || (IS_UNDER_IPMP(ill) && !probe)) { + ill_t *ipmp_ill; + ipmp_illgrp_t *illg; + + if (IS_UNDER_IPMP(ill)) { + ipmp_ill = ipmp_ill_hold_ipmp_ill(ill); } else { - /* refheld by ip_newroute_get_dst_ill_v6 */ - dst_ill = - ip_newroute_get_dst_ill_v6(ipif->ipif_ill); + ipmp_ill = ill; + ill_refhold(ipmp_ill); /* for symmetry */ } - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_ipif_v6: " - "no dst ill for dst %s\n", - AF_INET6, v6dstp); - } + + if (ipmp_ill == NULL) goto err_ret; - } + + illg = ipmp_ill->ill_grp; + if (IN6_IS_ADDR_MULTICAST(v6dstp)) + dst_ill = ipmp_illgrp_hold_cast_ill(illg); + else + dst_ill = ipmp_illgrp_hold_next_ill(illg); + + ill_refrele(ipmp_ill); } else { - dst_ill = ipif->ipif_ill; - /* - * ip_wput_v6 passes the right ipif for IPIF_NOFAILOVER - * and IPV6_BOUND_PIF case. - */ - ASSERT(dst_ill == attach_ill); - /* attach_ill is already refheld */ + dst_ill = ill; + ill_refhold(dst_ill); /* for symmetry */ + } + + if (dst_ill == NULL) { + if (ip_debug > 2) { + pr_addr_dbg("ip_newroute_ipif_v6: " + "no dst ill for dst %s\n", + AF_INET6, v6dstp); + } + goto err_ret; } + /* * Pick a source address which matches the scope of the * destination address. * For RTF_SETSRC routes, the source address is imposed by the * parent ire (fire). 
*/ - ASSERT(src_ipif == NULL); - if ((fire != NULL) && (fire->ire_flags & RTF_SETSRC)) { + + if (src_ipif == NULL && fire != NULL && + (fire->ire_flags & RTF_SETSRC)) { /* * Check that the ipif matching the requested source * address still exists. */ - src_ipif = - ipif_lookup_addr_v6(&fire->ire_src_addr_v6, + src_ipif = ipif_lookup_addr_v6(&fire->ire_src_addr_v6, NULL, zoneid, NULL, NULL, NULL, NULL, ipst); } - if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) { - uint_t restrict_ill = RESTRICT_TO_NONE; - if (ip6i_present && ((ip6i_t *)ip6h)->ip6i_flags - & IP6I_ATTACH_IF) - restrict_ill = RESTRICT_TO_ILL; + if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) { ip6_asp_table_held = B_TRUE; src_ipif = ipif_select_source_v6(dst_ill, v6dstp, - restrict_ill, IPV6_PREFER_SRC_DEFAULT, zoneid); + B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); } if (src_ipif == NULL) { @@ -5847,16 +5634,20 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg("ip_newroute_ipif_v6: " - "no src for dst %s\n,", + "no src for dst %s\n", AF_INET6, v6dstp); printf(" through interface %s\n", dst_ill->ill_name); } goto err_ret; } + ire_v6srcp = &ipv6_all_zeros; src_ipif = ipif; ipif_refhold(src_ipif); + } else { + ire_v6srcp = &src_ipif->ipif_v6src_addr; } + ire = ipif_to_ire_v6(ipif); if (ire == NULL) { if (ip_debug > 2) { @@ -5903,7 +5694,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, } } - ASSERT((attach_ill == NULL) || (dst_ill == attach_ill)); switch (ire->ire_type) { case IRE_IF_NORESOLVER: { /* @@ -5921,7 +5711,7 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, ire = ire_create_v6( v6dstp, /* dest address */ &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, /* source address */ + ire_v6srcp, /* source address */ NULL, /* gateway address */ &save_ire->ire_max_frag, NULL, /* no src nce */ @@ -5946,8 +5736,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, break; } - ire->ire_marks |= ire_marks; - err 
= ndp_noresolver(dst_ill, v6dstp); if (err != 0) { ire_refrele(save_ire); @@ -6051,7 +5839,7 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, ire = ire_create_v6( v6dstp, /* dest address */ &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, /* source address */ + ire_v6srcp, /* source address */ NULL, /* gateway address */ &save_ire->ire_max_frag, NULL, /* src nce */ @@ -6076,8 +5864,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, break; } - ire->ire_marks |= ire_marks; - /* Resolve and add ire to the ctable */ err = ndp_resolver(dst_ill, v6dstp, first_mp, zoneid); switch (err) { @@ -6273,8 +6059,8 @@ err_ret: ipif_refrele(ipif); if (src_ipif != NULL) ipif_refrele(src_ipif); + /* Multicast - no point in trying to generate ICMP error */ - ASSERT((attach_ill == NULL) || (dst_ill == attach_ill)); if (dst_ill != NULL) { ill = dst_ill; ill_held = B_TRUE; @@ -6499,7 +6285,7 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, &ip6h->ip6_dst)) { ipif = ipif_select_source_v6( ill, &ip6h->ip6_src, - RESTRICT_TO_GROUP, + B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES); if (ipif != NULL) { @@ -7050,7 +6836,7 @@ ipsec_needs_processing_v6(mblk_t *mp, uint8_t *nexthdr) */ static boolean_t ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present, - ill_t *ill, mblk_t *hada_mp, zoneid_t zoneid) + ill_t *ill, ill_t *inill, mblk_t *hada_mp, zoneid_t zoneid) { mblk_t *mp; uint8_t nexthdr; @@ -7093,7 +6879,7 @@ ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present, */ ii = (ipsec_in_t *)first_mp->b_rptr; ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = ii->ipsec_in_ill_index; + ii->ipsec_in_rill_index = inill->ill_phyint->phyint_ifindex; first_mp->b_cont = mp; } /* @@ -7122,7 +6908,7 @@ ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present, switch (ipsec_rc) { case IPSEC_STATUS_SUCCESS: /* we're done with IPsec processing, send it up */ - 
ip_fanout_proto_again(first_mp, ill, ill, NULL); + ip_fanout_proto_again(first_mp, ill, inill, NULL); break; case IPSEC_STATUS_FAILED: BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); @@ -7225,7 +7011,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, ip6_hbh_t *hbhhdr; boolean_t ll_multicast = (flags & IP6_IN_LLMCAST); conn_t *connp; - ilm_t *ilm; uint32_t ports; zoneid_t zoneid = GLOBAL_ZONEID; uint16_t hck_flags, reass_hck_flags; @@ -7347,10 +7132,8 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, /* * XXX TODO Give to mrouted for multicast forwarding. */ - ILM_WALKER_HOLD(ill); - ilm = ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, ALL_ZONES); - ILM_WALKER_RELE(ill); - if (ilm == NULL) { + if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE, + ALL_ZONES) == NULL) { if (ip_debug > 3) { /* ip2dbg */ pr_addr_dbg("ip_rput_data_v6: got mcast packet" @@ -7405,7 +7188,7 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst)) { ire = ire_ctable_lookup_v6(&ip6h->ip6_dst, NULL, IRE_CACHE|IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst); + MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); } else { ire = ire_cache_lookup_v6(&ip6h->ip6_dst, ALL_ZONES, MBLK_GETLABEL(mp), ipst); @@ -7466,9 +7249,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, } /* we have a matching IRE */ if (ire->ire_stq != NULL) { - ill_group_t *ill_group; - ill_group_t *ire_group; - /* * To be quicker, we may wish not to chase pointers * (ire->ire_ipif->ipif_ill...) and instead store the @@ -7483,7 +7263,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, no_forward = ((ill->ill_flags & ire->ire_ipif->ipif_ill->ill_flags & ILLF_ROUTER) == 0); - ASSERT(first_mp == mp); /* * This ire has a send-to queue - forward the packet. 
@@ -7568,10 +7347,8 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, * we're forwarding onto the same link), conditionally send * a redirect message. */ - ill_group = ill->ill_group; - ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group; - if (ire->ire_rfq != q && (ill_group == NULL || - ill_group != ire_group)) { + if (ire->ire_rfq != q && + !IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) { if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst) || IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) { BUMP_MIB(ill->ill_ip_mib, @@ -8006,7 +7783,10 @@ tcp_fanout: * where there is no conn. */ if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - ASSERT(!IS_LOOPBACK((ill))); + ilm_t *ilm; + ilm_walker_t ilw; + + ASSERT(!IS_LOOPBACK(ill)); /* * In the multicast case, applications may have * joined the group from different zones, so we @@ -8015,32 +7795,32 @@ tcp_fanout: * structures (ilm) on the receive ill and send * a copy of the packet up each matching one. */ - ILM_WALKER_HOLD(ill); - for (ilm = ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) - continue; + ilm = ilm_walker_start(&ilw, inill); + for (; ilm != NULL; + ilm = ilm_walker_step(&ilw, ilm)) { if (!IN6_ARE_ADDR_EQUAL( &ilm->ilm_v6addr, &ip6h->ip6_dst)) continue; - if (!ipif_lookup_zoneid(ill, - ilm->ilm_zoneid, IPIF_UP, NULL)) + if (!ipif_lookup_zoneid( + ilw.ilw_walk_ill, ilm->ilm_zoneid, + IPIF_UP, NULL)) continue; first_mp1 = ip_copymsg(first_mp); if (first_mp1 == NULL) continue; - icmp_inbound_v6(q, first_mp1, ill, + icmp_inbound_v6(q, first_mp1, + ilw.ilw_walk_ill, inill, hdr_len, mctl_present, 0, ilm->ilm_zoneid, dl_mp); } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } else { first_mp1 = ip_copymsg(first_mp); if (first_mp1 != NULL) icmp_inbound_v6(q, first_mp1, ill, - hdr_len, mctl_present, 0, zoneid, - dl_mp); + inill, hdr_len, mctl_present, 0, + zoneid, dl_mp); } } /* FALLTHRU */ @@ -8082,7 +7862,7 @@ tcp_fanout: /* Check if AH is present. 
*/ if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill, - hada_mp, zoneid)) { + inill, hada_mp, zoneid)) { ip0dbg(("dst early hada drop\n")); return; } @@ -8206,7 +7986,7 @@ tcp_fanout: /* Restore the flags */ DB_CKSUMFLAGS(mp) = hck_flags; - mp = ip_rput_frag_v6(q, mp, ip6h, fraghdr, + mp = ip_rput_frag_v6(ill, inill, mp, ip6h, fraghdr, remlen - used, &prev_nexthdr_offset, &reass_sum, &reass_hck_flags); if (mp == NULL) { @@ -8249,7 +8029,7 @@ tcp_fanout: /* Check if AH is present. */ if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill, - hada_mp, zoneid)) { + inill, hada_mp, zoneid)) { ip0dbg(("routing hada drop\n")); return; } @@ -8322,7 +8102,7 @@ tcp_fanout: ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; ii->ipsec_in_rill_index = - ii->ipsec_in_ill_index; + inill->ill_phyint->phyint_ifindex; first_mp->b_cont = mp; /* * Cache hardware acceleration info. @@ -8480,11 +8260,10 @@ hada_drop: * nexthdr field when reassembly completes. */ static mblk_t * -ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, +ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset, uint32_t *cksum_val, uint16_t *cksum_flags) { - ill_t *ill = (ill_t *)q->q_ptr; uint32_t ident = ntohl(fraghdr->ip6f_ident); uint16_t offset; boolean_t more_frags; @@ -8518,8 +8297,8 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, * addition, checksum offload support for IP fragments carrying * UDP payload is commonly implemented across network adapters. 
*/ - ASSERT(ill != NULL); - if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) && + ASSERT(inill != NULL); + if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(inill) && (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { mblk_t *mp1 = mp->b_cont; int32_t len; @@ -8581,7 +8360,7 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, freemsg(mp); return (NULL); } - icmp_param_problem_v6(WR(q), mp, ICMP6_PARAMPROB_HEADER, + icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER, (uint32_t)((char *)&ip6h->ip6_plen - (char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst); return (NULL); @@ -8607,7 +8386,7 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, freemsg(mp); return (NULL); } - icmp_param_problem_v6(WR(q), mp, ICMP6_PARAMPROB_HEADER, + icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER, (uint32_t)((char *)&fraghdr->ip6f_offlg - (char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst); return (NULL); @@ -9204,16 +8983,14 @@ ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst) * The routine can handle an ICMPv6 header that is not in the first mblk. * * The order to determine the outgoing interface is as follows: - * 1. IPV6_BOUND_PIF is set, use that ill (conn_outgoing_pill) - * 2. If conn_nofailover_ill is set then use that ill. - * 3. If an ip6i_t with IP6I_IFINDEX set then use that ill. - * 4. If q is an ill queue and (link local or multicast destination) then + * 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. + * 2. If q is an ill queue and (link local or multicast destination) then * use that ill. - * 5. If IPV6_BOUND_IF has been set use that ill. - * 6. For multicast: if IPV6_MULTICAST_IF has been set use it. Otherwise + * 3. If IPV6_BOUND_IF has been set use that ill. + * 4. For multicast: if IPV6_MULTICAST_IF has been set use it. Otherwise * look for the best IRE match for the unspecified group to determine * the ill. - * 7. For unicast: Just do an IRE lookup for the best match. + * 5. 
For unicast: Just do an IRE lookup for the best match. * * arg2 is always a queue_t *. * When that queue is an ill_t (i.e. q_next != NULL), then arg must be @@ -9238,12 +9015,10 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) int unspec_src; boolean_t do_outrequests; /* Increment OutRequests? */ mib2_ipIfStatsEntry_t *mibptr; - int match_flags = MATCH_IRE_ILL_GROUP; - boolean_t attach_if = B_FALSE; + int match_flags = MATCH_IRE_ILL; mblk_t *first_mp; boolean_t mctl_present; ipsec_out_t *io; - boolean_t drop_if_delayed = B_FALSE; boolean_t multirt_need_resolve = B_FALSE; mblk_t *copy_mp = NULL; int err = 0; @@ -9574,16 +9349,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) */ mp->b_rptr = (uchar_t *)ip6h; - /* - * IP6I_ATTACH_IF is set in this function when we had a - * conn and it was either bound to the IPFF_NOFAILOVER address - * or IPV6_BOUND_PIF was set. These options override other - * options that set the ifindex. We come here with - * IP6I_ATTACH_IF set when we can't find the ire and - * ip_newroute_v6 is feeding the packet for second time. - */ - if ((ip6i->ip6i_flags & IP6I_IFINDEX) || - (ip6i->ip6i_flags & IP6I_ATTACH_IF)) { + if (ip6i->ip6i_flags & IP6I_IFINDEX) { ASSERT(ip6i->ip6i_ifindex != 0); if (ill != NULL) ill_refrele(ill); @@ -9603,33 +9369,13 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) return; } mibptr = ill->ill_ip_mib; - if (ip6i->ip6i_flags & IP6I_IFINDEX) { - /* - * Preserve the index so that when we return - * from IPSEC processing, we know where to - * send the packet. - */ - if (mctl_present) { - ASSERT(io != NULL); - io->ipsec_out_ill_index = - ip6i->ip6i_ifindex; - } - } - if (ip6i->ip6i_flags & IP6I_ATTACH_IF) { - /* - * This is a multipathing probe packet that has - * been delayed in ND resolution. 
Drop the - * packet for the reasons mentioned in - * nce_queue_mp() - */ - if ((ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) && - (ip6i->ip6i_flags & IP6I_ND_DELAYED)) { - freemsg(first_mp); - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } + /* + * Preserve the index so that when we return from + * IPSEC processing, we know where to send the packet. + */ + if (mctl_present) { + ASSERT(io != NULL); + io->ipsec_out_ill_index = ip6i->ip6i_ifindex; } } if (ip6i->ip6i_flags & IP6I_VERIFY_SRC) { @@ -9698,114 +9444,20 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) if (IN6_IS_ADDR_MULTICAST(v6dstp)) goto ipv6multicast; - /* 1. IPV6_BOUND_PIF takes precedence over all the ifindex settings. */ - if (connp != NULL && connp->conn_outgoing_pill != NULL) { - ill_t *conn_outgoing_pill; - - conn_outgoing_pill = conn_get_held_ill(connp, - &connp->conn_outgoing_pill, &err); - if (err == ILL_LOOKUP_FAILED) { - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - if (conn_outgoing_pill != NULL) { - if (ill != NULL) - ill_refrele(ill); - ill = conn_outgoing_pill; - attach_if = B_TRUE; - match_flags = MATCH_IRE_ILL; - mibptr = ill->ill_ip_mib; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - goto send_from_ill; - } - } - - /* 2. If ipc_nofailover_ill is set then use that ill. 
*/ - if (connp != NULL && connp->conn_nofailover_ill != NULL) { - ill_t *conn_nofailover_ill; - - conn_nofailover_ill = conn_get_held_ill(connp, - &connp->conn_nofailover_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - if (conn_nofailover_ill != NULL) { - if (ill != NULL) - ill_refrele(ill); - ill = conn_nofailover_ill; - attach_if = B_TRUE; - /* - * Assumes that ipc_nofailover_ill is used only for - * multipathing probe packets. These packets are better - * dropped, if they are delayed in ND resolution, for - * the reasons described in nce_queue_mp(). - * IP6I_DROP_IFDELAYED will be set later on in this - * function for this packet. - */ - drop_if_delayed = B_TRUE; - match_flags = MATCH_IRE_ILL; - mibptr = ill->ill_ip_mib; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - goto send_from_ill; - } - } - - /* - * Redo 1. If we did not find an IRE_CACHE the first time, we should - * have an ip6i_t with IP6I_ATTACH_IF if IPV6_BOUND_PIF or - * bind to the IPIF_NOFAILOVER address was used on this endpoint. - */ - if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_ATTACH_IF)) { - ASSERT(ip6i->ip6i_ifindex != 0); - attach_if = B_TRUE; - ASSERT(ill != NULL); - match_flags = MATCH_IRE_ILL; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - goto send_from_ill; - } - - /* 3. If an ip6i_t with IP6I_IFINDEX set then use that ill. */ + /* 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. */ if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) { ASSERT(ill != NULL); goto send_from_ill; } /* - * 4. If q is an ill queue and (link local or multicast destination) + * 2. 
If q is an ill queue and there's a link-local destination * then use that ill. */ - if (ill != NULL && IN6_IS_ADDR_LINKLOCAL(v6dstp)) { + if (ill != NULL && IN6_IS_ADDR_LINKLOCAL(v6dstp)) goto send_from_ill; - } - /* 5. If IPV6_BOUND_IF has been set use that ill. */ + /* 3. If IPV6_BOUND_IF has been set use that ill. */ if (connp != NULL && connp->conn_outgoing_ill != NULL) { ill_t *conn_outgoing_ill; @@ -9827,7 +9479,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) } /* - * 6. For unicast: Just do an IRE lookup for the best match. + * 4. For unicast: Just do an IRE lookup for the best match. * If we get here for a link-local address it is rather random * what interface we pick on a multihomed host. * *If* there is an IRE_CACHE (and the link-local address @@ -9913,7 +9565,6 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) } BUMP_MIB(mibptr, ipIfStatsHCOutRequests); } - ASSERT(!attach_if); /* * Check if the ire has the RTF_MULTIRT flag, inherited @@ -9966,7 +9617,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) } } ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request, - connp, caller, 0, ip6i_flags, zoneid); + connp, caller, ip6i_flags, zoneid); if (need_decref) { CONN_DEC_REF(connp); connp = NULL; @@ -10086,9 +9737,6 @@ ipv6multicast: ip2dbg(("ip_wput_v6: multicast\n")); /* - * 1. IPV6_BOUND_PIF takes precedence over all the ifindex settings - * 2. If conn_nofailover_ill is set then use that ill. - * * Hold the conn_lock till we refhold the ill of interest that is * pointed to from the conn. 
Since we cannot do an ill/ipif_refrele * while holding any locks, postpone the refrele until after the @@ -10100,79 +9748,12 @@ ipv6multicast: } else { conn_lock_held = B_FALSE; } - if (connp != NULL && connp->conn_outgoing_pill != NULL) { - err = ill_check_and_refhold(connp->conn_outgoing_pill); - if (err == ILL_LOOKUP_FAILED) { - ip1dbg(("ip_output_v6: multicast" - " conn_outgoing_pill no ipif\n")); -multicast_discard: - ASSERT(saved_ill == NULL); - if (conn_lock_held) - mutex_exit(&connp->conn_lock); - if (ill != NULL) - ill_refrele(ill); - freemsg(first_mp); - if (do_outrequests) - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - saved_ill = ill; - ill = connp->conn_outgoing_pill; - attach_if = B_TRUE; - match_flags = MATCH_IRE_ILL; - mibptr = ill->ill_ip_mib; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - } else if (connp != NULL && connp->conn_nofailover_ill != NULL) { - err = ill_check_and_refhold(connp->conn_nofailover_ill); - if (err == ILL_LOOKUP_FAILED) { - ip1dbg(("ip_output_v6: multicast" - " conn_nofailover_ill no ipif\n")); - goto multicast_discard; - } - saved_ill = ill; - ill = connp->conn_nofailover_ill; - attach_if = B_TRUE; - match_flags = MATCH_IRE_ILL; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - } else if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_ATTACH_IF)) { - /* - * Redo 1. If we did not find an IRE_CACHE the first time, - * we should have an ip6i_t with IP6I_ATTACH_IF if - * IPV6_BOUND_PIF or bind to the IPIF_NOFAILOVER address was - * used on this endpoint. 
- */ - ASSERT(ip6i->ip6i_ifindex != 0); - attach_if = B_TRUE; - ASSERT(ill != NULL); - match_flags = MATCH_IRE_ILL; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - } else if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) { - /* 3. If an ip6i_t with IP6I_IFINDEX set then use that ill. */ - + if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) { + /* 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. */ ASSERT(ill != NULL); } else if (ill != NULL) { /* - * 4. If q is an ill queue and (link local or multicast + * 2. If q is an ill queue and (link local or multicast * destination) then use that ill. * We don't need the ipif initialization here. * This useless assert below is just to prevent lint from @@ -10181,9 +9762,9 @@ multicast_discard: ASSERT(ill != NULL); } else if (connp != NULL) { /* - * 5. If IPV6_BOUND_IF has been set use that ill. + * 3. If IPV6_BOUND_IF has been set use that ill. * - * 6. For multicast: if IPV6_MULTICAST_IF has been set use it. + * 4. For multicast: if IPV6_MULTICAST_IF has been set use it. * Otherwise look for the best IRE match for the unspecified * group to determine the ill. 
* @@ -10198,7 +9779,18 @@ multicast_discard: if (err == ILL_LOOKUP_FAILED) { ip1dbg(("ip_output_v6: multicast" " conn_outgoing_ill no ipif\n")); - goto multicast_discard; +multicast_discard: + ASSERT(saved_ill == NULL); + if (conn_lock_held) + mutex_exit(&connp->conn_lock); + if (ill != NULL) + ill_refrele(ill); + freemsg(first_mp); + if (do_outrequests) + BUMP_MIB(mibptr, ipIfStatsOutDiscards); + if (need_decref) + CONN_DEC_REF(connp); + return; } ill = connp->conn_outgoing_ill; } else if (connp->conn_multicast_ill != NULL) { @@ -10239,8 +9831,6 @@ multicast_discard: */ mutex_enter(&connp->conn_lock); connp->conn_multicast_ill = ill; - connp->conn_orig_multicast_ifindex = - ill->ill_phyint->phyint_ifindex; mutex_exit(&connp->conn_lock); } } @@ -10307,11 +9897,55 @@ multicast_discard: send_from_ill: ASSERT(ill != NULL); ASSERT(mibptr == ill->ill_ip_mib); + if (do_outrequests) { BUMP_MIB(mibptr, ipIfStatsHCOutRequests); do_outrequests = B_FALSE; } + /* + * Because nce_xmit() calls ip_output_v6() and NCEs are always tied to + * an underlying interface, IS_UNDER_IPMP() may be true even when + * building IREs that will be used for data traffic. As such, use the + * packet's source address to determine whether the traffic is test + * traffic, and set MATCH_IRE_MARK_TESTHIDDEN if so. + * + * Separately, we also need to mark probe packets so that ND can + * process them specially; see the comments in nce_queue_mp_common(). 
+ */ + if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && + ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL)) { + if (ip6i == NULL) { + if ((mp = ip_add_info_v6(mp, NULL, v6dstp)) == NULL) { + if (mctl_present) + freeb(first_mp); + goto discard; + } + + if (mctl_present) + first_mp->b_cont = mp; + else + first_mp = mp; + + /* ndp_resolver() expects a pulled-up message */ + if (MBLKL(mp) == sizeof (ip6i_t) && + pullupmsg(mp, -1) == 0) { + ip1dbg(("ip_output_v6: pullupmsg failed\n")); +discard: BUMP_MIB(mibptr, ipIfStatsOutDiscards); + ill_refrele(ill); + if (need_decref) + CONN_DEC_REF(connp); + return; + } + ip6i = (ip6i_t *)mp->b_rptr; + ip6h = (ip6_t *)&ip6i[1]; + v6dstp = &ip6h->ip6_dst; + mp->b_rptr = (uchar_t *)ip6h; /* rewound below */ + } + ip6i->ip6i_flags |= IP6I_IPMP_PROBE; + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + } + if (io != NULL) io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; @@ -10390,9 +10024,7 @@ send_from_ill: ill->ill_name, (void *)ire, ill->ill_phyint->phyint_ifindex)); ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request, - connp, caller, - (attach_if ? ill->ill_phyint->phyint_ifindex : 0), - ip6i_flags, zoneid); + connp, caller, ip6i_flags, zoneid); ire_refrele(ire); if (need_decref) { CONN_DEC_REF(connp); @@ -10422,7 +10054,8 @@ send_from_ill: return; } ip_newroute_ipif_v6(q, copy_mp, ipif, - ip6h->ip6_dst, unspec_src, zoneid); + &ip6h->ip6_dst, &ip6h->ip6_src, unspec_src, + zoneid); ipif_refrele(ipif); } else { ip_newroute_v6(q, copy_mp, &ip6h->ip6_dst, @@ -10440,12 +10073,11 @@ send_from_ill: /* Update rptr if there was an ip6i_t header. */ if (ip6i != NULL) mp->b_rptr -= sizeof (ip6i_t); - if (unspec_src || attach_if) { + if (unspec_src) { if (ip6i == NULL) { /* * Add ip6i_t header to carry unspec_src - * or attach_if until the packet comes back in - * ip_wput_v6. + * until the packet comes back in ip_wput_v6. 
*/ if (mctl_present) { first_mp->b_cont = @@ -10481,28 +10113,15 @@ send_from_ill: ip6h = (ip6_t *)&ip6i[1]; v6dstp = &ip6h->ip6_dst; } - if (unspec_src) - ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; - if (attach_if) { - /* - * Bind to nofailover/BOUND_PIF overrides ifindex. - */ - ip6i->ip6i_flags |= IP6I_ATTACH_IF; - ip6i->ip6i_flags &= ~IP6I_IFINDEX; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; - if (drop_if_delayed) { - /* This is a multipathing probe packet */ - ip6i->ip6i_flags |= IP6I_DROP_IFDELAYED; - } - } + ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; if (mctl_present) { ASSERT(io != NULL); io->ipsec_out_unspec_src = unspec_src; } } if (IN6_IS_ADDR_MULTICAST(v6dstp)) { - ip_newroute_ipif_v6(q, first_mp, ill->ill_ipif, *v6dstp, - unspec_src, zoneid); + ip_newroute_ipif_v6(q, first_mp, ill->ill_ipif, v6dstp, + &ip6h->ip6_src, unspec_src, zoneid); } else { ip_newroute_v6(q, first_mp, v6dstp, &ip6h->ip6_src, ill, zoneid, ipst); @@ -10544,14 +10163,6 @@ ip_wput_v6(queue_t *q, mblk_t *mp) ip_output_v6(GLOBAL_ZONEID, mp, q, IP_WPUT); } -static void -ipsec_out_attach_if(ipsec_out_t *io, int attach_index) -{ - ASSERT(io->ipsec_out_type == IPSEC_OUT); - io->ipsec_out_attach_if = B_TRUE; - io->ipsec_out_ill_index = attach_index; -} - /* * NULL send-to queue - packet is to be delivered locally. */ @@ -10731,6 +10342,8 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, */ if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && !IS_LOOPBACK(ill)) { + ilm_walker_t ilw; + /* * In the multicast case, applications may have * joined the group from different zones, so we @@ -10742,11 +10355,9 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, * on the loopback interface (PHYI_LOOPBACK flag * set) as they must stay in the sender's zone. 
*/ - ILM_WALKER_HOLD(ill); - for (ilm = ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) - continue; + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; + ilm = ilm_walker_step(&ilw, ilm)) { if (!IN6_ARE_ADDR_EQUAL( &ilm->ilm_v6addr, &ip6h->ip6_dst)) continue; @@ -10754,23 +10365,24 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, IP_FF_NO_MCAST_LOOP) && ilm->ilm_zoneid == ire->ire_zoneid) continue; - if (!ipif_lookup_zoneid(ill, - ilm->ilm_zoneid, IPIF_UP, NULL)) + if (!ipif_lookup_zoneid( + ilw.ilw_walk_ill, ilm->ilm_zoneid, + IPIF_UP, NULL)) continue; first_mp1 = ip_copymsg(first_mp); if (first_mp1 == NULL) continue; - icmp_inbound_v6(q, first_mp1, ill, - hdr_length, mctl_present, - IP6_NO_IPPOLICY, ilm->ilm_zoneid, - NULL); + icmp_inbound_v6(q, first_mp1, + ilw.ilw_walk_ill, ill, hdr_length, + mctl_present, IP6_NO_IPPOLICY, + ilm->ilm_zoneid, NULL); } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } else { first_mp1 = ip_copymsg(first_mp); if (first_mp1 != NULL) - icmp_inbound_v6(q, first_mp1, ill, + icmp_inbound_v6(q, first_mp1, ill, ill, hdr_length, mctl_present, IP6_NO_IPPOLICY, ire->ire_zoneid, NULL); @@ -10823,8 +10435,7 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, */ static void ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, - int cksum_request, conn_t *connp, int caller, int attach_index, int flags, - zoneid_t zoneid) + int cksum_request, conn_t *connp, int caller, int flags, zoneid_t zoneid) { ip6_t *ip6h; uint8_t nexthdr; @@ -10917,7 +10528,7 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, if (src_ire != NULL && !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) && (!ipst->ips_ip_restrict_interzone_loopback || - ire_local_same_ill_group(ire, src_ire))) { + ire_local_same_lan(ire, src_ire))) { if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && !unspec_src) { ip6h->ip6_src = src_ire->ire_src_addr_v6; @@ 
-10974,20 +10585,14 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, /* * Select the source address using ipif_select_source_v6. */ - if (attach_index != 0) { - ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst, - RESTRICT_TO_ILL, IPV6_PREFER_SRC_DEFAULT, zoneid); - } else { - ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst, - RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, zoneid); - } + ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst, B_FALSE, + IPV6_PREFER_SRC_DEFAULT, zoneid); if (ipif == NULL) { if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg("ip_wput_ire_v6: no src for " - "dst %s\n, ", AF_INET6, &ip6h->ip6_dst); - printf("ip_wput_ire_v6: interface name %s\n", - ill->ill_name); + "dst %s\n", AF_INET6, &ip6h->ip6_dst); + printf("through interface %s\n", ill->ill_name); } freemsg(first_mp); return; @@ -10998,12 +10603,8 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { if ((connp != NULL && connp->conn_multicast_loop) || !IS_LOOPBACK(ill)) { - ilm_t *ilm; - - ILM_WALKER_HOLD(ill); - ilm = ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, ALL_ZONES); - ILM_WALKER_RELE(ill); - if (ilm != NULL) { + if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE, + ALL_ZONES) != NULL) { mblk_t *nmp; int fanout_flags = 0; @@ -11417,8 +11018,6 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, } /* Do IPSEC processing first */ if (mctl_present) { - if (attach_index != 0) - ipsec_out_attach_if(io, attach_index); ipsec_out_process(q, first_mp, ire, ill_index); return; } @@ -11456,8 +11055,6 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, max_frag, B_FALSE, B_TRUE, zoneid, ipst); return; } - if (attach_index != 0) - ipsec_out_attach_if(io, attach_index); ipsec_out_process(q, first_mp, ire, ill_index); return; } @@ -11948,8 +11545,8 @@ boolean_t conn_wantpacket_v6(conn_t *connp, ill_t *ill, ip6_t *ip6h, int fanout_flags, zoneid_t zoneid) { - ill_t *in_ill; - boolean_t 
wantpacket = B_TRUE; + ill_t *bound_ill; + boolean_t wantpacket; in6_addr_t *v6dst_ptr = &ip6h->ip6_dst; in6_addr_t *v6src_ptr = &ip6h->ip6_src; @@ -11958,42 +11555,16 @@ conn_wantpacket_v6(conn_t *connp, ill_t *ill, ip6_t *ip6h, int fanout_flags, * unicast and multicast reception to conn_incoming_ill. * conn_wantpacket_v6 is called both for unicast and * multicast. - * - * 1) The unicast copy of the packet can come anywhere in - * the ill group if it is part of the group. Thus, we - * need to check to see whether the ill group matches - * if in_ill is part of a group. - * - * 2) ip_rput does not suppress duplicate multicast packets. - * If there are two interfaces in a ill group and we have - * 2 applications (conns) joined a multicast group G on - * both the interfaces, ilm_lookup_ill filter in ip_rput - * will give us two packets because we join G on both the - * interfaces rather than nominating just one interface - * for receiving multicast like broadcast above. So, - * we have to call ilg_lookup_ill to filter out duplicate - * copies, if ill is part of a group, to supress duplicates. */ - in_ill = connp->conn_incoming_ill; - if (in_ill != NULL) { - mutex_enter(&connp->conn_lock); - in_ill = connp->conn_incoming_ill; - mutex_enter(&ill->ill_lock); - /* - * No IPMP, and the packet did not arrive on conn_incoming_ill - * OR, IPMP in use and the packet arrived on an IPMP group - * different from the conn_incoming_ill's IPMP group. - * Reject the packet. 
- */ - if ((in_ill->ill_group == NULL && in_ill != ill) || - (in_ill->ill_group != NULL && - in_ill->ill_group != ill->ill_group)) { - wantpacket = B_FALSE; + bound_ill = connp->conn_incoming_ill; + if (bound_ill != NULL) { + if (IS_IPMP(bound_ill)) { + if (bound_ill->ill_grp != ill->ill_grp) + return (B_FALSE); + } else { + if (bound_ill != ill) + return (B_FALSE); } - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - if (!wantpacket) - return (B_FALSE); } if (connp->conn_multi_router) @@ -12140,7 +11711,7 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, (IN6_ARE_ADDR_EQUAL(&first_ire->ire_addr_v6, &ire->ire_addr_v6)) && !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) break; } @@ -12204,8 +11775,7 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, &ire->ire_addr_v6)) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) + IRE_MARK_CONDEMNED) continue; /* Got one */ @@ -13279,3 +12849,31 @@ ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah) size += ehdrlen; } } + +/* + * Utility routine that checks if `v6srcp' is a valid address on underlying + * interface `ill'. If `ipifp' is non-NULL, it's set to a held ipif + * associated with `v6srcp' on success. NOTE: if this is not called from + * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the + * group during or after this lookup. 
+ */ +static boolean_t +ipif_lookup_testaddr_v6(ill_t *ill, const in6_addr_t *v6srcp, ipif_t **ipifp) +{ + ipif_t *ipif; + + ipif = ipif_lookup_addr_exact_v6(v6srcp, ill, ill->ill_ipst); + if (ipif != NULL) { + if (ipifp != NULL) + *ipifp = ipif; + else + ipif_refrele(ipif); + return (B_TRUE); + } + + if (ip_debug > 2) { + pr_addr_dbg("ipif_lookup_testaddr_v6: cannot find ipif for " + "src %s\n", AF_INET6, v6srcp); + } + return (B_FALSE); +} diff --git a/usr/src/uts/common/inet/ip/ip6_if.c b/usr/src/uts/common/inet/ip/ip6_if.c index 81447c2e30..c729118fec 100644 --- a/usr/src/uts/common/inet/ip/ip6_if.c +++ b/usr/src/uts/common/inet/ip/ip6_if.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -53,7 +53,6 @@ #include <netinet/igmp_var.h> #include <netinet/ip6.h> #include <netinet/icmp6.h> -#include <netinet/in.h> #include <inet/common.h> #include <inet/nd.h> @@ -178,10 +177,12 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -202,16 +203,12 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, } /* - * Look for an ipif with the specified address. For point-point links - * we look for matches on either the destination address and the local - * address, but we ignore the check on the local address if IPIF_UNNUMBERED - * is set. - * Matches on a specific ill if match_ill is set. + * Common function for ipif_lookup_addr_v6() and ipif_lookup_addr_exact_v6(). 
*/ -/* ARGSUSED */ -ipif_t * -ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +static ipif_t * +ipif_lookup_addr_common_v6(const in6_addr_t *addr, ill_t *match_ill, + boolean_t match_illgrp, zoneid_t zoneid, queue_t *q, mblk_t *mp, + ipsq_func_t func, int *error, ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; @@ -230,7 +227,8 @@ ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid, repeat: ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (match_ill != NULL && ill != match_ill) { + if (match_ill != NULL && ill != match_ill && + (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { continue; } GRAB_CONN_LOCK(q); @@ -257,10 +255,12 @@ repeat: } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -323,11 +323,41 @@ ip_addr_exists_v6(const in6_addr_t *addr, zoneid_t zoneid, } /* + * Lookup an ipif with the specified address. For point-to-point links we + * look for matches on either the destination address or the local address, + * but we skip the local address check if IPIF_UNNUMBERED is set. If the + * `match_ill' argument is non-NULL, the lookup is restricted to that ill + * (or illgrp if `match_ill' is in an IPMP group). 
+ */ +ipif_t * +ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid, + queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +{ + return (ipif_lookup_addr_common_v6(addr, match_ill, B_TRUE, zoneid, q, + mp, func, error, ipst)); +} + +/* + * Special abbreviated version of ipif_lookup_addr_v6() that doesn't match + * `match_ill' across the IPMP group. This function is only needed in some + * corner-cases; almost everything should use ipif_lookup_addr_v6(). + */ +ipif_t * +ipif_lookup_addr_exact_v6(const in6_addr_t *addr, ill_t *match_ill, + ip_stack_t *ipst) +{ + ASSERT(match_ill != NULL); + return (ipif_lookup_addr_common_v6(addr, match_ill, B_FALSE, ALL_ZONES, + NULL, NULL, NULL, NULL, ipst)); +} + +/* * Look for an ipif with the specified address. For point-point links * we look for matches on either the destination address and the local * address, but we ignore the check on the local address if IPIF_UNNUMBERED * is set. - * Matches on a specific ill if match_ill is set. + * If the `match_ill' argument is non-NULL, the lookup is restricted to that + * ill (or illgrp if `match_ill' is in an IPMP group). * Return the zoneid for the ipif. ALL_ZONES if none found. 
*/ zoneid_t @@ -348,7 +378,8 @@ ipif_lookup_addr_zoneid_v6(const in6_addr_t *addr, ill_t *match_ill, repeat: ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (match_ill != NULL && ill != match_ill) { + if (match_ill != NULL && ill != match_ill && + !IS_IN_SAME_ILLGRP(ill, match_ill)) { continue; } mutex_enter(&ill->ill_lock); @@ -1120,11 +1151,10 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, boolean_t ill_setdefaulttoken(ill_t *ill) { - int i; + int i; in6_addr_t v6addr, v6mask; - if (!MEDIA_V6INTFID(ill->ill_media, ill->ill_phys_addr_length, - ill->ill_phys_addr, &v6addr)) + if (!MEDIA_V6INTFID(ill->ill_media, ill, &v6addr)) return (B_FALSE); (void) ip_plen_to_mask_v6(IPV6_TOKEN_LEN, &v6mask); @@ -1161,7 +1191,7 @@ ipif_set_tun_auto_addr(ipif_t *ipif, struct iftun_req *ta) { sin6_t sin6; sin_t *sin; - ill_t *ill = ipif->ipif_ill; + ill_t *ill = ipif->ipif_ill; tun_t *tp = (tun_t *)ill->ill_wq->q_next->q_ptr; if (ta->ifta_saddr.ss_family != AF_INET || @@ -1227,7 +1257,7 @@ ipif_set_tun_llink(ill_t *ill, struct iftun_req *ta) if ((ta->ifta_flags & IFTUN_DST) && IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)) { - sin6_t sin6; + sin6_t sin6; ASSERT(!(ipif->ipif_flags & IPIF_UP)); bzero(&sin6, sizeof (sin6_t)); @@ -1344,13 +1374,22 @@ ipif_ndp_setup_multicast(ipif_t *ipif, nce_t **ret_nce) if (ret_nce != NULL) *ret_nce = NULL; + + /* + * IPMP meta-interfaces don't have any inherent multicast mappings, + * and instead use the ones on the underlying interfaces. + */ + if (IS_IPMP(ill)) + return (0); + /* * Delete the mapping nce. Normally these should not exist * as a previous ipif_down -> ipif_ndp_down should have deleted * all the nces. But they can exist if ip_rput_dlpi_writer - * calls this when PHYI_MULTI_BCAST is set. + * calls this when PHYI_MULTI_BCAST is set. Mappings are always + * tied to the underlying ill, so don't match across the illgrp. 
*/ - mnce = ndp_lookup_v6(ill, &v6_mcast_addr, B_FALSE); + mnce = ndp_lookup_v6(ill, B_FALSE, &v6_mcast_addr, B_FALSE); if (mnce != NULL) { ndp_delete(mnce); NCE_REFRELE(mnce); @@ -1424,13 +1463,15 @@ ipif_ndp_setup_multicast(ipif_t *ipif, nce_t **ret_nce) * Get the resolver set up for a new ipif. (Always called as writer.) */ int -ipif_ndp_up(ipif_t *ipif) +ipif_ndp_up(ipif_t *ipif, boolean_t initial) { ill_t *ill = ipif->ipif_ill; int err = 0; nce_t *nce = NULL; nce_t *mnce = NULL; + boolean_t added_ipif = B_FALSE; + ASSERT(IAM_WRITER_ILL(ill)); ip1dbg(("ipif_ndp_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); /* @@ -1464,7 +1505,10 @@ ipif_ndp_up(ipif_t *ipif) if ((ipif->ipif_flags & (IPIF_UNNUMBERED|IPIF_NOLOCAL)) == 0) { uint16_t flags; - uchar_t *hw_addr = NULL; + uint16_t state; + uchar_t *hw_addr = NULL; + ill_t *bound_ill; + ipmp_illgrp_t *illg = ill->ill_grp; /* Permanent entries don't need NUD */ flags = NCE_F_PERMANENT | NCE_F_NONUD; @@ -1474,26 +1518,65 @@ ipif_ndp_up(ipif_t *ipif) if (ipif->ipif_flags & IPIF_ANYCAST) flags |= NCE_F_ANYCAST; - if (ill->ill_net_type == IRE_IF_RESOLVER) { - hw_addr = ill->ill_nd_lla; - - if (ill->ill_move_in_progress) { - /* - * Addresses are failing over to this ill. - * Don't wait for NUD to see this change. - * Publish our new link-layer address. - */ - flags |= NCE_F_UNSOL_ADV; + if (IS_IPMP(ill)) { + ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); + /* + * If we're here via ipif_up(), then the ipif won't be + * bound yet -- add it to the group, which will bind + * it if possible. (We would add it in ipif_up(), but + * deleting on failure there is gruesome.) If we're + * here via ipmp_ill_bind_ipif(), then the ipif has + * already been added to the group and we just need to + * use the binding. + */ + if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) { + bound_ill = ipmp_illgrp_add_ipif(illg, ipif); + if (bound_ill == NULL) { + /* + * We couldn't bind the ipif to an ill + * yet, so we have nothing to publish. 
+ * Set ipif_addr_ready so that this + * address can be used locally for now. + * The routing socket message will be + * sent from ipif_up_done_v6(). + */ + ipif->ipif_addr_ready = 1; + return (0); + } + added_ipif = B_TRUE; } + hw_addr = bound_ill->ill_nd_lla; + } else { + bound_ill = ill; + if (ill->ill_net_type == IRE_IF_RESOLVER) + hw_addr = ill->ill_nd_lla; + } + + /* + * If this is an initial bring-up (or the ipif was never + * completely brought up), do DAD. Otherwise, we're here + * because IPMP has rebound an address to this ill: send + * unsolicited advertisements to inform others. + */ + if (initial || !ipif->ipif_addr_ready) { + state = ND_PROBE; + } else { + state = ND_REACHABLE; + flags |= NCE_F_UNSOL_ADV; } - err = ndp_lookup_then_add_v6(ill, + /* + * NOTE: for IPMP, local addresses are always associated with + * the ill they're bound to, so don't match across the illgrp. + */ + err = ndp_lookup_then_add_v6(bound_ill, + B_FALSE, hw_addr, &ipif->ipif_v6lcl_addr, &ipv6_all_ones, &ipv6_all_zeros, 0, flags, - ND_PROBE, /* Causes Duplicate Address Detection to run */ + state, &nce); switch (err) { case 0: @@ -1509,19 +1592,11 @@ ipif_ndp_up(ipif_t *ipif) NCE_REFRELE(nce); ip1dbg(("ipif_ndp_up: NCE already exists for %s\n", ill->ill_name)); - if (mnce != NULL) { - ndp_delete(mnce); - NCE_REFRELE(mnce); - } - return (err); + goto fail; default: - ip1dbg(("ipif_ndp_up: NCE creation failed %s\n", + ip1dbg(("ipif_ndp_up: NCE creation failed for %s\n", ill->ill_name)); - if (mnce != NULL) { - ndp_delete(mnce); - NCE_REFRELE(mnce); - } - return (err); + goto fail; } } else { /* No local NCE for this entry */ @@ -1532,6 +1607,15 @@ ipif_ndp_up(ipif_t *ipif) if (mnce != NULL) NCE_REFRELE(mnce); return (0); +fail: + if (mnce != NULL) { + ndp_delete(mnce); + NCE_REFRELE(mnce); + } + if (added_ipif) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + + return (err); } /* Remove all cache entries for this logical interface */ @@ -1539,23 +1623,42 @@ void 
ipif_ndp_down(ipif_t *ipif) { nce_t *nce; + ill_t *ill = ipif->ipif_ill; + + ASSERT(IAM_WRITER_ILL(ill)); if (ipif->ipif_isv6) { - nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, - B_FALSE); - if (nce != NULL) { - ndp_delete(nce); - NCE_REFRELE(nce); + ill_t *bound_ill; + + if (IS_IPMP(ill)) + bound_ill = ipmp_ipif_bound_ill(ipif); + else + bound_ill = ill; + + if (bound_ill != NULL) { + nce = ndp_lookup_v6(bound_ill, + B_FALSE, /* see comment in ipif_ndp_up() */ + &ipif->ipif_v6lcl_addr, + B_FALSE); + if (nce != NULL) { + ndp_delete(nce); + NCE_REFRELE(nce); + } } + + /* + * Make IPMP aware of the deleted data address. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); } + /* * Remove mapping and all other nces dependent on this ill * when the last ipif is going away. */ - if (ipif->ipif_ill->ill_ipif_up_count == 0) { - ndp_walk(ipif->ipif_ill, (pfi_t)ndp_delete_per_ill, - (uchar_t *)ipif->ipif_ill, ipif->ipif_ill->ill_ipst); - } + if (ill->ill_ipif_up_count == 0) + ndp_walk(ill, (pfi_t)ndp_delete_per_ill, ill, ill->ill_ipst); } /* @@ -1936,9 +2039,7 @@ rule_preferred(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, } /* - * Prefer source addresses that are assigned to the outgoing interface, or - * to an interface that is in the same IPMP group as the outgoing - * interface. + * Prefer source addresses that are assigned to the outgoing interface. 
*/ /* ARGSUSED3 */ static rule_res_t @@ -1955,15 +2056,11 @@ rule_interface(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, return (CAND_TIE); if (!bc->cand_matchedinterface_set) { - bc->cand_matchedinterface = (bc->cand_ill == dstill || - (dstill->ill_group != NULL && - dstill->ill_group == bc->cand_ill->ill_group)); + bc->cand_matchedinterface = bc->cand_ill == dstill; bc->cand_matchedinterface_set = B_TRUE; } - cc->cand_matchedinterface = (cc->cand_ill == dstill || - (dstill->ill_group != NULL && - dstill->ill_group == cc->cand_ill->ill_group)); + cc->cand_matchedinterface = cc->cand_ill == dstill; cc->cand_matchedinterface_set = B_TRUE; if (bc->cand_matchedinterface == cc->cand_matchedinterface) @@ -2134,6 +2231,13 @@ rule_addr_type(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, static rule_res_t rule_prefix(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, ip_stack_t *ipst) { + /* + * For IPMP, we always want to choose a random source address from + * among any equally usable addresses, so always report a tie. + */ + if (IS_IPMP(dstinfo->dst_ill)) + return (CAND_TIE); + if (!bc->cand_common_pref_set) { bc->cand_common_pref = ip_common_prefix_v6(&bc->cand_srcaddr, dstinfo->dst_addr); @@ -2177,10 +2281,9 @@ rule_must_be_last(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, * specification's algorithm could traverse the list of addresses once for * every rule). * - * The restrict_ill argument restricts the algorithm to chose a source - * address that is assigned to the destination ill or an ill in the same - * IPMP group as the destination ill. This is used when the destination - * address is a link-local or multicast address, and when + * The restrict_ill argument restricts the algorithm to choose a source + * address that is assigned to the destination ill. This is used when + * the destination address is a link-local or multicast address, and when * ipv6_strict_dst_multihoming is turned on. * * src_prefs is the caller's set of source address preferences. 
If source @@ -2192,13 +2295,13 @@ rule_must_be_last(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, */ ipif_t * ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, - uint_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid) + boolean_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid) { dstinfo_t dstinfo; char dstr[INET6_ADDRSTRLEN]; char sstr[INET6_ADDRSTRLEN]; - ipif_t *ipif; - ill_t *ill, *usesrc_ill = NULL; + ipif_t *ipif, *start_ipif, *next_ipif; + ill_t *ill, *usesrc_ill = NULL, *ipmp_ill = NULL; ill_walk_context_t ctx; cand_t best_c; /* The best candidate */ cand_t curr_c; /* The current candidate */ @@ -2247,6 +2350,16 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, } else { return (NULL); } + } else if (IS_UNDER_IPMP(dstill)) { + /* + * Test addresses should never be used for source address + * selection, so if we were passed an underlying ill, switch + * to the IPMP meta-interface. + */ + if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(dstill)) != NULL) + dstinfo.dst_ill = ipmp_ill; + else + return (NULL); } else { dstinfo.dst_ill = dstill; } @@ -2286,10 +2399,7 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, */ if (IN6_IS_ADDR_LINKLOCAL(dst) || IN6_IS_ADDR_MULTICAST(dst) || ipst->ips_ipv6_strict_dst_multihoming || usesrc_ill != NULL) { - if (restrict_ill == RESTRICT_TO_NONE) - dstinfo.dst_restrict_ill = RESTRICT_TO_GROUP; - else - dstinfo.dst_restrict_ill = restrict_ill; + dstinfo.dst_restrict_ill = B_TRUE; } else { dstinfo.dst_restrict_ill = restrict_ill; } @@ -2297,39 +2407,41 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, bzero(&best_c, sizeof (cand_t)); /* - * Take a pass through the list of IPv6 interfaces to chose the - * best possible source address. If restrict_ill is true, we only - * iterate through the ill's that are in the same IPMP group as the - * destination's outgoing ill. If restrict_ill is false, we walk - * the entire list of IPv6 ill's. 
+ * Take a pass through the list of IPv6 interfaces to choose the best + * possible source address. If restrict_ill is set, just use dst_ill. */ - if (dstinfo.dst_restrict_ill != RESTRICT_TO_NONE) { - if (dstinfo.dst_ill->ill_group != NULL && - dstinfo.dst_restrict_ill == RESTRICT_TO_GROUP) { - ill = dstinfo.dst_ill->ill_group->illgrp_ill; - } else { - ill = dstinfo.dst_ill; - } - } else { + if (dstinfo.dst_restrict_ill) + ill = dstinfo.dst_ill; + else ill = ILL_START_WALK_V6(&ctx, ipst); - } - while (ill != NULL) { + for (; ill != NULL; ill = ill_next(&ctx, ill)) { ASSERT(ill->ill_isv6); /* - * Avoid FAILED/OFFLINE ills. - * Global and site local addresses will failover and - * will be available on the new ill. - * But link local addresses don't move. + * Test addresses should never be used for source address + * selection, so ignore underlying ills. */ - if (dstinfo.dst_restrict_ill != RESTRICT_TO_ILL && - ill->ill_phyint->phyint_flags & - (PHYI_OFFLINE | PHYI_FAILED)) - goto next_ill; + if (IS_UNDER_IPMP(ill)) + continue; - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { + /* + * For source address selection, we treat the ipif list as + * circular and continue until we get back to where we + * started. This allows IPMP to vary source address selection + * (which improves inbound load spreading) by caching its last + * ending point and starting from there. NOTE: we don't have + * to worry about ill_src_ipif changing ills since that can't + * happen on the IPMP ill. + */ + start_ipif = ill->ill_ipif; + if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) + start_ipif = ill->ill_src_ipif; + + ipif = start_ipif; + do { + if ((next_ipif = ipif->ipif_next) == NULL) + next_ipif = ill->ill_ipif; if (!IPIF_VALID_IPV6_SOURCE(ipif)) continue; @@ -2387,9 +2499,8 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, */ for (index = 0; rules[index] != NULL; index++) { /* Apply a comparison rule. 
*/ - rule_result = - (rules[index])(&best_c, &curr_c, &dstinfo, - ipst); + rule_result = (rules[index])(&best_c, &curr_c, + &dstinfo, ipst); if (rule_result == CAND_AVOID) { /* * The best candidate is still the @@ -2417,21 +2528,29 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, * have been prefered as the best candidate so far. */ ASSERT(rule_result != CAND_TIE); + } while ((ipif = next_ipif) != start_ipif); + + /* + * For IPMP, update the source ipif rotor to the next ipif, + * provided we can look it up. (We must not use it if it's + * IPIF_CONDEMNED since we may have grabbed ill_g_lock after + * ipif_free() checked ill_src_ipif.) + */ + if (IS_IPMP(ill) && ipif != NULL) { + mutex_enter(&ipif->ipif_ill->ill_lock); + next_ipif = ipif->ipif_next; + if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif)) + ill->ill_src_ipif = next_ipif; + else + ill->ill_src_ipif = NULL; + mutex_exit(&ipif->ipif_ill->ill_lock); } /* - * We may be walking the linked-list of ill's in an - * IPMP group or traversing the IPv6 ill avl tree. If it is a - * usesrc ILL then it can't be part of IPMP group and we - * will exit the while loop. + * Only one ill to consider if dst_restrict_ill is set. */ -next_ill: - if (dstinfo.dst_restrict_ill == RESTRICT_TO_ILL) - ill = NULL; - else if (dstinfo.dst_restrict_ill == RESTRICT_TO_GROUP) - ill = ill->ill_group_next; - else - ill = ill_next(&ctx, ill); + if (dstinfo.dst_restrict_ill) + break; } ipif = best_c.cand_ipif; @@ -2444,6 +2563,9 @@ next_ill: if (usesrc_ill != NULL) ill_refrele(usesrc_ill); + if (ipmp_ill != NULL) + ill_refrele(ipmp_ill); + if (dst_rhtp != NULL) TPC_RELE(dst_rhtp); @@ -2474,8 +2596,7 @@ next_ill: * ipif_update_other_ipifs calls us. * * If old_ipif is NULL, just redo the source address selection - * if needed. This happens when illgrp_insert or ipif_up_done_v6 - * calls us. + * if needed. This happens when ipif_up_done_v6 calls us. 
*/ void ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif) @@ -2561,8 +2682,7 @@ ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif) if (ip6_asp_can_lookup(ipst)) { ip6_asp_table_held = B_TRUE; nipif = ipif_select_source_v6(ill, &ipif->ipif_v6subnet, - RESTRICT_TO_GROUP, IPV6_PREFER_SRC_DEFAULT, - ipif->ipif_zoneid); + B_TRUE, IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid); } if (nipif == NULL) { /* Last resort - all ipif's have IPIF_NOLOCAL */ @@ -2630,13 +2750,9 @@ ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif) * Find the IRE_INTERFACE for such ipif's and recreate them * to use an different source address following the rules in * ipif_up_done_v6. - * - * This function takes an illgrp as an argument so that illgrp_delete - * can call this to update source address even after deleting the - * old_ipif->ipif_ill from the ill group. */ void -ipif_update_other_ipifs_v6(ipif_t *old_ipif, ill_group_t *illgrp) +ipif_update_other_ipifs_v6(ipif_t *old_ipif) { ipif_t *ipif; ill_t *ill; @@ -2651,23 +2767,9 @@ ipif_update_other_ipifs_v6(ipif_t *old_ipif, ill_group_t *illgrp) inet_ntop(AF_INET6, &old_ipif->ipif_v6lcl_addr, buf, sizeof (buf)))); - /* - * If this part of a group, look at all ills as ipif_select_source - * borrows a source address across all the ills in the group. 
- */ - if (illgrp != NULL) - ill = illgrp->illgrp_ill; - - /* Don't need a lock since this is a writer */ - for (; ill != NULL; ill = ill->ill_group_next) { - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (ipif == old_ipif) - continue; - + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if (ipif != old_ipif) ipif_recreate_interface_routes_v6(old_ipif, ipif); - } } } @@ -2828,12 +2930,10 @@ ipif_up_done_v6(ipif_t *ipif) boolean_t flush_ire_cache = B_TRUE; int err; char buf[INET6_ADDRSTRLEN]; - phyint_t *phyi; ire_t **ipif_saved_irep = NULL; int ipif_saved_ire_cnt; int cnt; boolean_t src_ipif_held = B_FALSE; - boolean_t ire_added = B_FALSE; boolean_t loopback = B_FALSE; boolean_t ip6_asp_table_held = B_FALSE; ip_stack_t *ipst = ill->ill_ipst; @@ -2868,8 +2968,8 @@ ipif_up_done_v6(ipif_t *ipif) break; } if (flush_ire_cache) - ire_walk_ill_v6(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE, - IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); + ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, + IRE_CACHE, ill_ipif_cache_delete, ill, ill); /* * Figure out which way the send-to queue should go. Only @@ -2900,7 +3000,9 @@ ipif_up_done_v6(ipif_t *ipif) ipif->ipif_ire_type = IRE_LOCAL; } - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) { + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) || + ((ipif->ipif_flags & IPIF_DEPRECATED) && + !(ipif->ipif_flags & IPIF_NOFAILOVER))) { /* * Can't use our source address. 
Select a different * source address for the IRE_INTERFACE and IRE_LOCAL @@ -2908,7 +3010,7 @@ ipif_up_done_v6(ipif_t *ipif) if (ip6_asp_can_lookup(ipst)) { ip6_asp_table_held = B_TRUE; src_ipif = ipif_select_source_v6(ipif->ipif_ill, - &ipif->ipif_v6subnet, RESTRICT_TO_NONE, + &ipif->ipif_v6subnet, B_FALSE, IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid); } if (src_ipif == NULL) @@ -3090,9 +3192,9 @@ ipif_up_done_v6(ipif_t *ipif) ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); /* - * Need to atomically check for ip_addr_availablity_check - * now under ill_g_lock, and if it fails got bad, and remove - * from group also + * Need to atomically check for IP address availability under + * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new + * ills or new ipifs can be added while we are checking availability. */ rw_enter(&ipst->ips_ill_g_lock, RW_READER); mutex_enter(&ipst->ips_ip_addr_avail_lock); @@ -3125,9 +3227,7 @@ ipif_up_done_v6(ipif_t *ipif) } /* - * Add in all newly created IREs. We want to add before - * we call ifgrp_insert which wants to know whether - * IRE_IF_RESOLVER exists or not. + * Add in all newly created IREs. * * NOTE : We refrele the ire though we may branch to "bad" * later on where we do ire_delete. This is okay @@ -3148,36 +3248,6 @@ ipif_up_done_v6(ipif_t *ipif) ip6_asp_table_refrele(ipst); ip6_asp_table_held = B_FALSE; } - ire_added = B_TRUE; - - /* - * Form groups if possible. - * - * If we are supposed to be in a ill_group with a name, insert it - * now as we know that at least one ipif is UP. Otherwise form - * nameless groups. - * - * If ip_enable_group_ifs is set and ipif address is not ::0, insert - * this ipif into the appropriate interface group, or create a - * new one. If this is already in a nameless group, we try to form - * a bigger group looking at other ills potentially sharing this - * ipif's prefix. 
- */ - phyi = ill->ill_phyint; - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - if (ill->ill_ipif_up_count == 1) { - ASSERT(ill->ill_group == NULL); - err = illgrp_insert(&ipst->ips_illgrp_head_v6, ill, - phyi->phyint_groupname, NULL, B_TRUE); - if (err != 0) { - ip1dbg(("ipif_up_done_v6: illgrp allocation " - "failed, error %d\n", err)); - goto bad; - } - } - ASSERT(ill->ill_group != NULL); - } /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; @@ -3190,19 +3260,23 @@ ipif_up_done_v6(ipif_t *ipif) */ ill_recover_multicast(ill); } - /* Join the allhosts multicast address and the solicited node MC */ - ipif_multicast_up(ipif); - if (!loopback) { + if (ill->ill_ipif_up_count == 1) { /* - * See whether anybody else would benefit from the - * new ipif that we added. We call this always rather - * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST - * ipif for the benefit of illgrp_insert (done above) - * which does not do source address selection as it does - * not want to re-create interface routes that we are - * having reference to it here. + * Since the interface is now up, it may now be active. */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + } + + /* Join the allhosts multicast address and the solicited node MC */ + ipif_multicast_up(ipif); + + /* + * See if anybody else would benefit from our new ipif. + */ + if (!loopback && + !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { ill_update_source_selection(ill); } @@ -3238,29 +3312,11 @@ ipif_up_done_v6(ipif_t *ipif) bad: if (ip6_asp_table_held) ip6_asp_table_refrele(ipst); - /* - * We don't have to bother removing from ill groups because - * - * 1) For groups with names, we insert only when the first ipif - * comes up. In that case if it fails, it will not be in any - * group. So, we need not try to remove for that case. 
- * - * 2) For groups without names, either we tried to insert ipif_ill - * in a group as singleton or found some other group to become - * a bigger group. For the former, if it fails we don't have - * anything to do as ipif_ill is not in the group and for the - * latter, there are no failures in illgrp_insert/illgrp_delete - * (ENOMEM can't occur for this. Check ifgrp_insert). - */ while (irep > ire_array) { irep--; - if (*irep != NULL) { + if (*irep != NULL) ire_delete(*irep); - if (ire_added) - ire_refrele(*irep); - } - } (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); @@ -3272,8 +3328,7 @@ bad: ipif_refrele(src_ipif); ipif_ndp_down(ipif); - if (ipif->ipif_ill->ill_flags & ILLF_XRESOLV) - ipif_arp_down(ipif); + ipif_resolver_down(ipif); return (err); } @@ -3286,15 +3341,14 @@ int ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) { - in6_addr_t addr; sin6_t *sin6; nce_t *nce; struct lifreq *lifr; lif_nd_req_t *lnr; - mblk_t *mp1; + ill_t *ill = ipif->ipif_ill; + ire_t *ire; - mp1 = mp->b_cont->b_cont; - lifr = (struct lifreq *)mp1->b_rptr; + lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr; lnr = &lifr->lifr_nd; /* Only allow for logical unit zero i.e. not on "le0:17" */ if (ipif->ipif_id != 0) @@ -3307,8 +3361,28 @@ ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, return (EAFNOSUPPORT); sin6 = (sin6_t *)&lnr->lnr_addr; - addr = sin6->sin6_addr; - nce = ndp_lookup_v6(ipif->ipif_ill, &addr, B_FALSE); + + /* + * Since ND mappings must be consistent across an IPMP group, prohibit + * deleting ND mappings on underlying interfaces. Also, since ND + * mappings for IPMP data addresses are owned by IP itself, prohibit + * deleting them. 
+ */ + if (IS_UNDER_IPMP(ill)) + return (EPERM); + + if (IS_IPMP(ill)) { + ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL, + ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, + ill->ill_ipst); + if (ire != NULL) { + ire_refrele(ire); + return (EPERM); + } + } + + /* See comment in ndp_query() regarding IS_IPMP(ill) usage */ + nce = ndp_lookup_v6(ill, IS_IPMP(ill), &sin6->sin6_addr, B_FALSE); if (nce == NULL) return (ESRCH); ndp_delete(nce); @@ -3354,11 +3428,11 @@ int ip_siocsetndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) { + sin6_t *sin6; ill_t *ill = ipif->ipif_ill; struct lifreq *lifr; lif_nd_req_t *lnr; - - ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); + ire_t *ire; lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr; lnr = &lifr->lifr_nd; @@ -3372,5 +3446,26 @@ ip_siocsetndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, if (lnr->lnr_addr.ss_family != AF_INET6) return (EAFNOSUPPORT); + sin6 = (sin6_t *)&lnr->lnr_addr; + + /* + * Since ND mappings must be consistent across an IPMP group, prohibit + * updating ND mappings on underlying interfaces. Also, since ND + * mappings for IPMP data addresses are owned by IP itself, prohibit + * updating them. + */ + if (IS_UNDER_IPMP(ill)) + return (EPERM); + + if (IS_IPMP(ill)) { + ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL, + ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, + ill->ill_ipst); + if (ire != NULL) { + ire_refrele(ire); + return (EPERM); + } + } + return (ndp_sioc_update(ill, lnr)); } diff --git a/usr/src/uts/common/inet/ip/ip6_ire.c b/usr/src/uts/common/inet/ip/ip6_ire.c index 41461ca96f..0d0f3621f5 100644 --- a/usr/src/uts/common/inet/ip/ip6_ire.c +++ b/usr/src/uts/common/inet/ip/ip6_ire.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* @@ -73,7 +73,6 @@ static ire_t *ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *, const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *); static ire_t *ip6_ctable_lookup_impl(ire_ctable_args_t *); - /* * Initialize the ire that is specific to IPv6 part and call * ire_init_common to finish it. @@ -261,13 +260,11 @@ ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst) * Make sure we follow ire_ipif. * * We need to determine the interface route through - * which the gateway will be reached. We don't really - * care which interface is picked if the interface is - * part of a group. + * which the gateway will be reached. */ if (ire->ire_ipif != NULL) { ipif = ire->ire_ipif; - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; } switch (ire->ire_type) { @@ -409,35 +406,54 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) ire_t *ire = *ire_p; int error; ip_stack_t *ipst = ire->ire_ipst; + uint_t marks = 0; ASSERT(ire->ire_ipversion == IPV6_VERSION); ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */ ASSERT(ire->ire_nce == NULL); + /* + * IREs with source addresses hosted on interfaces that are under IPMP + * should be hidden so that applications don't accidentally end up + * sending packets with test addresses as their source addresses, or + * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here. + * (We let IREs with unspecified source addresses slip through since + * ire_send_v6() will delete them automatically.) + */ + if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && + !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) { + DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire); + marks |= IRE_MARK_TESTHIDDEN; + } + /* Find the appropriate list head. 
*/ switch (ire->ire_type) { case IRE_HOST: ire->ire_mask_v6 = ipv6_all_ones; ire->ire_masklen = IPV6_ABITS; + ire->ire_marks |= marks; if ((ire->ire_flags & RTF_SETSRC) == 0) ire->ire_src_addr_v6 = ipv6_all_zeros; break; case IRE_CACHE: + ire->ire_mask_v6 = ipv6_all_ones; + ire->ire_masklen = IPV6_ABITS; + ire->ire_marks |= marks; + break; case IRE_LOCAL: case IRE_LOOPBACK: ire->ire_mask_v6 = ipv6_all_ones; ire->ire_masklen = IPV6_ABITS; break; case IRE_PREFIX: - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr_v6 = ipv6_all_zeros; - break; case IRE_DEFAULT: + ire->ire_marks |= marks; if ((ire->ire_flags & RTF_SETSRC) == 0) ire->ire_src_addr_v6 = ipv6_all_zeros; break; case IRE_IF_RESOLVER: case IRE_IF_NORESOLVER: + ire->ire_marks |= marks; break; default: printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n", @@ -543,9 +559,8 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) * 2) We could have multiple packets trying to create * an IRE_CACHE for the same ill. * - * Moreover, IPIF_NOFAILOVER and IPV6_BOUND_PIF endpoints wants - * to go out on a particular ill. Rather than looking at the - * packet, we depend on the above for MATCH_IRE_ILL here. + * Rather than looking at the packet, we depend on the above for + * MATCH_IRE_ILL here. * * Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have * multiple IRE_CACHES for an ill for the same destination @@ -555,20 +570,15 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) */ if (ire->ire_ipif != NULL) flags |= MATCH_IRE_IPIF; + /* - * If we are creating hidden ires, make sure we search on - * this ill (MATCH_IRE_ILL) and a hidden ire, while we are - * searching for duplicates below. Otherwise we could - * potentially find an IRE on some other interface - * and it may not be a IRE marked with IRE_MARK_HIDDEN. We - * shouldn't do this as this will lead to an infinite loop as - * eventually we need an hidden ire for this packet to go - * out. 
MATCH_IRE_ILL is already marked above. + * If we are creating a hidden IRE, make sure we search for + * hidden IREs when searching for duplicates below. + * Otherwise, we might find an IRE on some other interface + * that's not marked hidden. */ - if (ire->ire_marks & IRE_MARK_HIDDEN) { - ASSERT(ire->ire_type == IRE_CACHE); - flags |= MATCH_IRE_MARK_HIDDEN; - } + if (ire->ire_marks & IRE_MARK_TESTHIDDEN) + flags |= MATCH_IRE_MARK_TESTHIDDEN; /* * Start the atomic add of the ire. Grab the ill locks, @@ -692,7 +702,7 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) } } if (ire->ire_type == IRE_CACHE) { - in6_addr_t gw_addr_v6; + const in6_addr_t *addr_v6; ill_t *ill = ire_to_ill(ire); char buf[INET6_ADDRSTRLEN]; nce_t *nce; @@ -712,12 +722,12 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) * time on the list and rts_setgwr_v6 could not * be changing this. */ - gw_addr_v6 = ire->ire_gateway_addr_v6; - if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, B_TRUE); - } else { - nce = ndp_lookup_v6(ill, &gw_addr_v6, B_TRUE); - } + addr_v6 = &ire->ire_gateway_addr_v6; + if (IN6_IS_ADDR_UNSPECIFIED(addr_v6)) + addr_v6 = &ire->ire_addr_v6; + + /* nce fastpath is per-ill; don't match across illgrp */ + nce = ndp_lookup_v6(ill, B_FALSE, addr_v6, B_TRUE); if (nce == NULL) goto failed; @@ -1217,28 +1227,29 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, in6_addr_t gw_addr_v6; ill_t *ire_ill = NULL, *dst_ill; ill_t *ipif_ill = NULL; - ill_group_t *ire_ill_group = NULL; - ill_group_t *ipif_ill_group = NULL; ipif_t *src_ipif; ASSERT(ire->ire_ipversion == IPV6_VERSION); ASSERT(addr != NULL); ASSERT(mask != NULL); ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL); - ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) || + ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ipif != NULL && ipif->ipif_isv6)); /* - * HIDDEN cache entries have to be looked up 
specifically with - * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set - * when the interface is FAILED or INACTIVE. In that case, - * any IRE_CACHES that exists should be marked with - * IRE_MARK_HIDDEN. So, we don't really need to match below - * for IRE_MARK_HIDDEN. But we do so for consistency. + * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it + * is in fact hidden, to ensure the caller gets the right one. One + * exception: if the caller passed MATCH_IRE_IHANDLE, then they + * already know the identity of the given IRE_INTERFACE entry and + * there's no point trying to hide it from them. */ - if (!(match_flags & MATCH_IRE_MARK_HIDDEN) && - (ire->ire_marks & IRE_MARK_HIDDEN)) - return (B_FALSE); + if (ire->ire_marks & IRE_MARK_TESTHIDDEN) { + if (match_flags & MATCH_IRE_IHANDLE) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + + if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) + return (B_FALSE); + } if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { @@ -1288,7 +1299,7 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, */ if ((dst_ill->ill_usesrc_ifindex != 0) && (src_ipif = ipif_select_source_v6(dst_ill, addr, - RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, zoneid)) + B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid)) != NULL) { ip3dbg(("ire_match_args: src_ipif %p" " dst_ill %p", (void *)src_ipif, @@ -1326,20 +1337,20 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); } + /* - * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that - * somebody wants to send out on a particular interface which - * is given by ire_stq and hence use ire_stq to derive the ill - * value. ire_ipif for IRE_CACHES is just the - * means of getting a source address i.e ire_src_addr_v6 = - * ire->ire_ipif->ipif_src_addr_v6. 
+ * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to + * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means + * of getting a source address -- i.e., ire_src_addr_v6 == + * ire->ire_ipif->ipif_v6src_addr). ire_to_ill() handles this. + * + * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group. + * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for + * IPMP test traffic), then the ill must match exactly. */ - if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) { + if (match_flags & MATCH_IRE_ILL) { ire_ill = ire_to_ill(ire); - if (ire_ill != NULL) - ire_ill_group = ire_ill->ill_group; ipif_ill = ipif->ipif_ill; - ipif_ill_group = ipif_ill->ill_group; } /* No ire_addr_v6 bits set past the mask */ @@ -1357,17 +1368,14 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, &ipif->ipif_v6src_addr)) && ((!(match_flags & MATCH_IRE_IPIF)) || (ire->ire_ipif == ipif)) && - ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) || - (ire->ire_type != IRE_CACHE || - ire->ire_marks & IRE_MARK_HIDDEN)) && + ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) || + (ire->ire_marks & IRE_MARK_TESTHIDDEN)) && ((!(match_flags & MATCH_IRE_ILL)) || - (ire_ill == ipif_ill)) && + (ire_ill == ipif_ill || + (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) && + ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) && ((!(match_flags & MATCH_IRE_IHANDLE)) || (ire->ire_ihandle == ihandle)) && - ((!(match_flags & MATCH_IRE_ILL_GROUP)) || - (ire_ill == ipif_ill) || - (ire_ill_group != NULL && - ire_ill_group == ipif_ill_group)) && ((!(match_flags & MATCH_IRE_SECATTR)) || (!is_system_labeled()) || (tsol_ire_match_gwattr(ire, tsl) == 0))) { @@ -1391,8 +1399,7 @@ ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or * MATCH_IRE_ILL is set. 
*/ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && - (ipif == NULL)) + if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) return (NULL); /* @@ -1477,8 +1484,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or * MATCH_IRE_ILL is set. */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && - (ipif == NULL)) + if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) return (NULL); /* @@ -1661,8 +1667,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); - match_flags = MATCH_IRE_ILL_GROUP | - MATCH_IRE_SECATTR; + match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; rire = ire_ctable_lookup_v6(&gw_addr_v6, NULL, 0, ire->ire_ipif, zoneid, tsl, match_flags, ipst); @@ -1703,7 +1708,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, if (ire->ire_ipif != NULL) { ire_match_flags |= - MATCH_IRE_ILL_GROUP; + MATCH_IRE_ILL; } rire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, IRE_INTERFACE, @@ -1791,21 +1796,8 @@ found_ire_held: */ saved_ire = ire; - /* - * Currently MATCH_IRE_ILL is never used with - * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while - * sending out packets as MATCH_IRE_ILL is used only - * for communicating with on-link hosts. We can't assert - * that here as RTM_GET calls this function with - * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE. - * We have already used the MATCH_IRE_ILL in determining - * the right prefix route at this point. To match the - * behavior of how we locate routes while sending out - * packets, we don't want to use MATCH_IRE_ILL below - * while locating the interface route. 
- */ if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; @@ -1958,9 +1950,7 @@ ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway, } /* - * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers - * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get - * to the hidden ones. + * Lookup cache. * * In general the zoneid has to match (where ALL_ZONES match all of them). * But for IRE_LOCAL we also need to handle the case where L2 should @@ -1968,8 +1958,7 @@ ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway, * Ethernet drivers nor Ethernet hardware loops back packets sent to their * own MAC address. This loopback is needed when the normal * routes (ignoring IREs with different zoneids) would send out the packet on - * the same ill (or ill group) as the ill with which this IRE_LOCAL is - * associated. + * the same ill as the ill with which this IRE_LOCAL is associated. * * Earlier versions of this code always matched an IRE_LOCAL independently of * the zoneid. We preserve that earlier behavior when @@ -1986,7 +1975,7 @@ ire_cache_lookup_v6(const in6_addr_t *addr, zoneid_t zoneid, ipst->ips_ip6_cache_table_size)]; rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) { - if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN)) + if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) continue; if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, addr)) { /* @@ -2125,13 +2114,8 @@ ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire) ASSERT(cire != NULL && pire != NULL); match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - /* - * ip_newroute_v6 calls ire_ftable_lookup with MATCH_IRE_ILL only - * for on-link hosts. We should never be here for onlink. - * Thus, use MATCH_IRE_ILL_GROUP. 
- */ if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; /* * We know that the mask of the interface ire equals cire->ire_cmask. * (When ip_newroute_v6() created 'cire' for an on-link destn. it set @@ -2168,7 +2152,7 @@ ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire) */ match_flags = MATCH_IRE_TYPE; if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; mutex_enter(&pire->ire_lock); gw_addr = pire->ire_gateway_addr_v6; @@ -2210,24 +2194,30 @@ ire_t * ipif_to_ire_v6(const ipif_t *ipif) { ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF; + + /* + * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN + * so that they aren't accidentally returned. However, if the + * caller's ipif is on an ill under IPMP, there's no need to hide 'em. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; ASSERT(ipif->ipif_isv6); if (ipif->ipif_ire_type == IRE_LOOPBACK) { ire = ire_ctable_lookup_v6(&ipif->ipif_v6lcl_addr, NULL, - IRE_LOOPBACK, ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ipst); + IRE_LOOPBACK, ipif, ALL_ZONES, NULL, match_flags, ipst); } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { /* In this case we need to lookup destination address. 
*/ ire = ire_ftable_lookup_v6(&ipif->ipif_v6pp_dst_addr, &ipv6_all_ones, NULL, IRE_INTERFACE, ipif, NULL, ALL_ZONES, - 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF | - MATCH_IRE_MASK), ipst); + 0, NULL, (match_flags | MATCH_IRE_MASK), ipst); } else { ire = ire_ftable_lookup_v6(&ipif->ipif_v6subnet, &ipif->ipif_v6net_mask, NULL, IRE_INTERFACE, ipif, NULL, - ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF | - MATCH_IRE_MASK), ipst); + ALL_ZONES, 0, NULL, (match_flags | MATCH_IRE_MASK), ipst); } return (ire); } @@ -2296,7 +2286,7 @@ ire_multirt_need_resolve_v6(const in6_addr_t *v6dstp, const ts_label_t *tsl, continue; if (!IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, v6dstp)) continue; - if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN)) + if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) continue; unres_cnt--; } @@ -2434,7 +2424,7 @@ ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, continue; if (cire->ire_marks & (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) + IRE_MARK_TESTHIDDEN)) continue; if (cire->ire_gw_secattr != NULL && @@ -2635,8 +2625,7 @@ ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, &cire->ire_addr_v6, &v6dst)) continue; if (cire->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) + IRE_MARK_CONDEMNED) continue; if (cire->ire_gw_secattr != NULL && @@ -2845,8 +2834,7 @@ ip6_ctable_lookup_impl(ire_ctable_args_t *margs) ire_t *ire; ip_stack_t *ipst = margs->ict_ipst; - if ((margs->ict_flags & - (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && + if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (margs->ict_ipif == NULL)) { return (NULL); } diff --git a/usr/src/uts/common/inet/ip/ip6_rts.c b/usr/src/uts/common/inet/ip/ip6_rts.c index 7d2ddd5c04..dcf429c8ba 100644 --- a/usr/src/uts/common/inet/ip/ip6_rts.c +++ b/usr/src/uts/common/inet/ip/ip6_rts.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. */ @@ -38,8 +38,6 @@ * @(#)rtsock.c 8.6 (Berkeley) 2/11/95 */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains routines that processes routing socket requests. */ @@ -216,5 +214,5 @@ ip_rts_change_v6(int type, const in6_addr_t *dst_addr, rtm->rtm_errno = error; rtm->rtm_flags |= RTF_DONE; rtm->rtm_addrs = rtm_addrs; - rts_queue_input(mp, NULL, AF_INET6, ipst); + rts_queue_input(mp, NULL, AF_INET6, RTSQ_ALL, ipst); } diff --git a/usr/src/uts/common/inet/ip/ip_ftable.c b/usr/src/uts/common/inet/ip/ip_ftable.c index 4fa3c7a74d..31f83c842d 100644 --- a/usr/src/uts/common/inet/ip/ip_ftable.c +++ b/usr/src/uts/common/inet/ip/ip_ftable.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,7 +67,6 @@ #include <net/pfkeyv2.h> #include <inet/ipsec_info.h> #include <inet/sadb.h> -#include <sys/kmem.h> #include <inet/tcp.h> #include <inet/ipclassifier.h> #include <sys/zone.h> @@ -159,8 +158,7 @@ ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, * ire_match_args() will dereference ipif MATCH_IRE_SRC or * MATCH_IRE_ILL is set. */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && - (ipif == NULL)) + if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) return (NULL); (void) memset(&rdst, 0, sizeof (rdst)); @@ -290,28 +288,16 @@ found_ire_held: */ save_ire = ire; + if (ire->ire_ipif != NULL) + match_flags |= MATCH_IRE_ILL; + /* - * Currently MATCH_IRE_ILL is never used with - * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while - * sending out packets as MATCH_IRE_ILL is used only - * for communicating with on-link hosts. We can't assert - * that here as RTM_GET calls this function with - * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE. 
- * We have already used the MATCH_IRE_ILL in determining - * the right prefix route at this point. To match the - * behavior of how we locate routes while sending out - * packets, we don't want to use MATCH_IRE_ILL below - * while locating the interface route. - * * ire_ftable_lookup may end up with an incomplete IRE_CACHE * entry for the gateway (i.e., one for which the * ire_nce->nce_state is not yet ND_REACHABLE). If the caller * has specified MATCH_IRE_COMPLETE, such entries will not * be returned; instead, we return the IF_RESOLVER ire. */ - if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; - ire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, 0, ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst); DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire, @@ -532,7 +518,7 @@ ire_ftable_lookup_simple(ipaddr_t addr, } } if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; ire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, 0, ire->ire_ipif, NULL, zoneid, NULL, match_flags, ipst); @@ -678,13 +664,11 @@ ire_lookup_multi(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst) * Make sure we follow ire_ipif. * * We need to determine the interface route through - * which the gateway will be reached. We don't really - * care which interface is picked if the interface is - * part of a group. + * which the gateway will be reached. */ if (ire->ire_ipif != NULL) { ipif = ire->ire_ipif; - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; } switch (ire->ire_type) { @@ -854,40 +838,26 @@ ire_get_next_default_ire(ire_t *ire, ire_t *ire_origin) } static ipif_t * -ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, ill_t *dst_ill, +ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, int zoneid, ushort_t *marks) { ipif_t *src_ipif; - ip_stack_t *ipst = dst_ill->ill_ipst; + ill_t *ill = ire->ire_ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; /* - * Pick the best source address from dst_ill. 
+ * Pick the best source address from ill. * - * 1) If it is part of a multipathing group, we would - * like to spread the inbound packets across different - * interfaces. ipif_select_source picks a random source - * across the different ills in the group. - * - * 2) If it is not part of a multipathing group, we try - * to pick the source address from the destination + * 1) Try to pick the source address from the destination * route. Clustering assumes that when we have multiple * prefixes hosted on an interface, the prefix of the * source address matches the prefix of the destination * route. We do this only if the address is not * DEPRECATED. * - * 3) If the conn is in a different zone than the ire, we + * 2) If the conn is in a different zone than the ire, we * need to pick a source address from the right zone. - * - * NOTE : If we hit case (1) above, the prefix of the source - * address picked may not match the prefix of the - * destination routes prefix as ipif_select_source - * does not look at "dst" while picking a source - * address. - * If we want the same behavior as (2), we will need - * to change the behavior of ipif_select_source. */ - if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { /* * The RTF_SETSRC flag is set in the parent ire (sire). 
@@ -899,13 +869,10 @@ ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, ill_t *dst_ill, return (src_ipif); } *marks |= IRE_MARK_USESRC_CHECK; - if ((dst_ill->ill_group != NULL) || + if (IS_IPMP(ill) || (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || - (dst_ill->ill_usesrc_ifindex != 0)) { - src_ipif = ipif_select_source(dst_ill, dst, zoneid); - if (src_ipif == NULL) - return (NULL); - + (ill->ill_usesrc_ifindex != 0)) { + src_ipif = ipif_select_source(ill, dst, zoneid); } else { src_ipif = ire->ire_ipif; ASSERT(src_ipif != NULL); @@ -1071,18 +1038,20 @@ create_irecache: sire->ire_last_used_time = lbolt; } - /* Obtain dst_ill */ - dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill); + dst_ill = ire->ire_ipif->ipif_ill; + if (IS_IPMP(dst_ill)) + dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp); + else + ill_refhold(dst_ill); + if (dst_ill == NULL) { - ip2dbg(("ire_forward no dst ill; ire 0x%p\n", - (void *)ire)); + ip2dbg(("ire_forward no dst ill; ire 0x%p\n", (void *)ire)); goto icmp_err_ret; } ASSERT(src_ipif == NULL); /* Now obtain the src_ipif */ - src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill, - zoneid, &ire_marks); + src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks); if (src_ipif == NULL) goto icmp_err_ret; @@ -1254,18 +1223,13 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action, ire_t *sire = NULL, *save_ire; ill_t *dst_ill = NULL; int error; - zoneid_t zoneid; + zoneid_t zoneid = GLOBAL_ZONEID; ipif_t *src_ipif = NULL; mblk_t *res_mp; ushort_t ire_marks = 0; - zoneid = GLOBAL_ZONEID; - - ire = ire_ftable_lookup_simple(dst, &sire, zoneid, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE, ipst); - + MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE, ipst); if (ire == NULL) { ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst); goto icmp_err_ret; @@ -1288,9 +1252,7 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action, * nexthop router, 
just hand over the cache entry * and we are done. */ - if (ire->ire_type & IRE_CACHE) { - /* * If we are using this ire cache entry as a * gateway to forward packets, chances are we @@ -1334,18 +1296,21 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action, UPDATE_OB_PKT_COUNT(sire); } - /* Obtain dst_ill */ - dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill); + dst_ill = ire->ire_ipif->ipif_ill; + if (IS_IPMP(dst_ill)) + dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp); + else + ill_refhold(dst_ill); /* for symmetry */ + if (dst_ill == NULL) { - ip2dbg(("ire_forward no dst ill; ire 0x%p\n", + ip2dbg(("ire_forward_simple: no dst ill; ire 0x%p\n", (void *)ire)); goto icmp_err_ret; } ASSERT(src_ipif == NULL); /* Now obtain the src_ipif */ - src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill, - zoneid, &ire_marks); + src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks); if (src_ipif == NULL) goto icmp_err_ret; @@ -1720,33 +1685,24 @@ ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE| MATCH_IRE_RJ_BHOLE| - MATCH_IRE_SECATTR); + MATCH_IRE_SECATTR | MATCH_IRE_ILL); /* * If supplied ifindex is non-null, the only valid - * nexthop is one off of the interface or group corresponding + * nexthop is one off of the interface corresponding * to the specified ifindex. 
*/ ill = ill_lookup_on_ifindex(ifindex, B_FALSE, NULL, NULL, NULL, NULL, ipst); if (ill != NULL) { - match_flags |= MATCH_IRE_ILL; + supplied_ipif = ipif_get_next_ipif(NULL, ill); } else { - /* Fallback to group names if hook_emulation set */ - if (ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_ifindex(ifindex, - B_FALSE, ipst); - } - if (ill == NULL) { - ip1dbg(("ipfil_sendpkt: Could not find" - " route to dst\n")); - value = ECOMM; - freemsg(mp); - goto discard; - } - match_flags |= MATCH_IRE_ILL_GROUP; + ip1dbg(("ipfil_sendpkt: Could not find" + " route to dst\n")); + value = ECOMM; + freemsg(mp); + goto discard; } - supplied_ipif = ipif_get_next_ipif(NULL, ill); ire = ire_route_lookup(dst, 0, 0, 0, supplied_ipif, &sire, zoneid, MBLK_GETLABEL(mp), match_flags, ipst); @@ -2325,9 +2281,9 @@ ire_round_robin(irb_t *irb_ptr, zoneid_t zoneid, ire_ftable_args_t *margs, * interested in routers that are * reachable through ipifs within our zone. */ - if (ire->ire_ipif != NULL) { - match_flags |= MATCH_IRE_ILL_GROUP; - } + if (ire->ire_ipif != NULL) + match_flags |= MATCH_IRE_ILL; + rire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, IRE_INTERFACE, ire->ire_ipif, NULL, zoneid, margs->ift_tsl, match_flags, ipst); diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index 0597245499..9771c87721 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -46,6 +46,7 @@ #include <sys/bitmap.h> #include <sys/cpuvar.h> #include <sys/time.h> +#include <sys/ctype.h> #include <sys/kmem.h> #include <sys/systm.h> #include <sys/param.h> @@ -61,10 +62,10 @@ #include <netinet/ip6.h> #include <netinet/icmp6.h> #include <netinet/igmp_var.h> -#include <sys/strsun.h> #include <sys/policy.h> #include <sys/ethernet.h> #include <sys/callb.h> +#include <sys/md5.h> #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ #include <inet/mi.h> @@ -85,7 +86,6 @@ #include <inet/tun.h> #include <inet/sctp_ip.h> #include <inet/ip_netinfo.h> -#include <inet/mib2.h> #include <net/pfkeyv2.h> #include <inet/ipsec_info.h> @@ -93,7 +93,6 @@ #include <inet/ipsec_impl.h> #include <sys/iphada.h> - #include <netinet/igmp.h> #include <inet/ip_listutils.h> #include <inet/ipclassifier.h> @@ -158,7 +157,7 @@ static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, static void ipsq_delete(ipsq_t *); static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type, - boolean_t initialize); + boolean_t initialize, boolean_t insert); static void ipif_check_bcast_ires(ipif_t *test_ipif); static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep); static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, @@ -169,7 +168,6 @@ static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); static void ipif_free(ipif_t *ipif); static void ipif_free_tail(ipif_t *ipif); static void ipif_mtu_change(ire_t *ire, char *ipif_arg); -static void ipif_multicast_down(ipif_t *ipif); static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif); static void ipif_set_default(ipif_t *ipif); static int ipif_set_values(queue_t *q, mblk_t *mp, @@ -179,8 +177,7 @@ static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int 
*error, ip_stack_t *); -static int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp); -static void ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp); +static void ipif_update_other_ipifs(ipif_t *old_ipif); static int ill_alloc_ppa(ill_if_t *, ill_t *); static int ill_arp_off(ill_t *ill); @@ -192,33 +189,18 @@ static void ill_down(ill_t *ill); static void ill_downi(ire_t *ire, char *ill_arg); static void ill_free_mib(ill_t *ill); static void ill_glist_delete(ill_t *); -static boolean_t ill_has_usable_ipif(ill_t *); -static int ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int); -static void ill_nominate_bcast_rcv(ill_group_t *illgrp); -static void ill_phyint_free(ill_t *ill); static void ill_phyint_reinit(ill_t *ill); static void ill_set_nce_router_flags(ill_t *, boolean_t); static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *); -static void ill_signal_ipsq_ills(ipsq_t *, boolean_t); -static boolean_t ill_split_ipsq(ipsq_t *cur_sq); -static void ill_stq_cache_delete(ire_t *, char *); - -static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *); -static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *); -static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - in6_addr_t *); -static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - ipaddr_t *); -static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *); -static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - in6_addr_t *); -static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - ipaddr_t *); - +static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid; +static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid; +static ip_v6mapinfo_func_t ip_ether_v6mapinfo, ip_ib_v6mapinfo; +static ip_v4mapinfo_func_t ip_ether_v4mapinfo, ip_ib_v4mapinfo; static void ipif_save_ire(ipif_t *, ire_t *); static void ipif_remove_ire(ipif_t *, ire_t *); static void ip_cgtp_bcast_add(ire_t *, 
ire_t *, ip_stack_t *); static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *); +static void phyint_free(phyint_t *); /* * Per-ill IPsec capabilities management. @@ -250,18 +232,14 @@ static void ill_capability_ack_thr(void *); static void ill_capability_lso_enable(ill_t *); static void ill_capability_send(ill_t *, mblk_t *); -static void illgrp_cache_delete(ire_t *, char *); -static void illgrp_delete(ill_t *ill); -static void illgrp_reset_schednext(ill_t *ill); - static ill_t *ill_prev_usesrc(ill_t *); static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); static void ill_disband_usesrc_group(ill_t *); static void conn_cleanup_stale_ire(conn_t *, caddr_t); #ifdef DEBUG -static void ill_trace_cleanup(const ill_t *); -static void ipif_trace_cleanup(const ipif_t *); +static void ill_trace_cleanup(const ill_t *); +static void ipif_trace_cleanup(const ipif_t *); #endif /* @@ -491,6 +469,7 @@ static nv_t ipif_nv_tbl[] = { { PHYI_STANDBY, "STANDBY" }, { PHYI_INACTIVE, "INACTIVE" }, { PHYI_OFFLINE, "OFFLINE" }, + { PHYI_IPMP, "IPMP" } }; static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; @@ -508,7 +487,8 @@ static ip_m_t ip_m_tbl[] = { ip_ether_v6intfid }, { DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo, ip_ib_v6intfid }, - { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL}, + { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL }, + { SUNW_DL_IPMP, IFT_OTHER, NULL, NULL, ip_ipmp_v6intfid }, { DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid } }; @@ -529,14 +509,6 @@ static ipif_t ipif_zero; */ uint_t ill_no_arena = 12; /* Setable in /etc/system */ -static uint_t -ipif_rand(ip_stack_t *ipst) -{ - ipst->ips_ipif_src_random = ipst->ips_ipif_src_random * 1103515245 + - 12345; - return ((ipst->ips_ipif_src_random >> 16) & 0x7fff); -} - /* * Allocate per-interface mibs. * Returns true if ok. False otherwise. @@ -623,7 +595,7 @@ ill_allocate_mibs(ill_t *ill) * (Always called as writer.) 
*/ mblk_t * -ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr) +ill_arp_alloc(ill_t *ill, const uchar_t *template, caddr_t addr) { arc_t *arc = (arc_t *)template; char *cp; @@ -669,17 +641,69 @@ ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr) } mblk_t * -ipif_area_alloc(ipif_t *ipif) +ipif_area_alloc(ipif_t *ipif, uint_t optflags) { - return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template, - (char *)&ipif->ipif_lcl_addr)); + caddr_t addr; + mblk_t *mp; + area_t *area; + uchar_t *areap; + ill_t *ill = ipif->ipif_ill; + + if (ill->ill_isv6) { + ASSERT(ill->ill_flags & ILLF_XRESOLV); + addr = (caddr_t)&ipif->ipif_v6lcl_addr; + areap = (uchar_t *)&ip6_area_template; + } else { + addr = (caddr_t)&ipif->ipif_lcl_addr; + areap = (uchar_t *)&ip_area_template; + } + + if ((mp = ill_arp_alloc(ill, areap, addr)) == NULL) + return (NULL); + + /* + * IPMP requires that the hardware address be included in all + * AR_ENTRY_ADD requests so that ARP can deduce the arl to send on. + * If there are no active underlying ills in the group (and thus no + * hardware address), DAD will be deferred until an underlying ill + * becomes active. 
+ */ + if (IS_IPMP(ill)) { + if ((ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { + freemsg(mp); + return (NULL); + } + } else { + ill_refhold(ill); + } + + area = (area_t *)mp->b_rptr; + area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR; + area->area_flags |= optflags; + area->area_hw_addr_length = ill->ill_phys_addr_length; + bcopy(ill->ill_phys_addr, mp->b_rptr + area->area_hw_addr_offset, + area->area_hw_addr_length); + + ill_refrele(ill); + return (mp); } mblk_t * ipif_ared_alloc(ipif_t *ipif) { - return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template, - (char *)&ipif->ipif_lcl_addr)); + caddr_t addr; + uchar_t *aredp; + + if (ipif->ipif_ill->ill_isv6) { + ASSERT(ipif->ipif_ill->ill_flags & ILLF_XRESOLV); + addr = (caddr_t)&ipif->ipif_v6lcl_addr; + aredp = (uchar_t *)&ip6_ared_template; + } else { + addr = (caddr_t)&ipif->ipif_lcl_addr; + aredp = (uchar_t *)&ip_ared_template; + } + + return (ill_arp_alloc(ipif->ipif_ill, aredp, addr)); } mblk_t * @@ -689,6 +713,19 @@ ill_ared_alloc(ill_t *ill, ipaddr_t addr) (char *)&addr)); } +mblk_t * +ill_arie_alloc(ill_t *ill, const char *grifname, const void *template) +{ + mblk_t *mp = ill_arp_alloc(ill, template, 0); + arie_t *arie; + + if (mp != NULL) { + arie = (arie_t *)mp->b_rptr; + (void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ); + } + return (mp); +} + /* * Completely vaporize a lower level tap and all associated interfaces. * ill_delete is called only out of ip_close when the device control @@ -751,6 +788,12 @@ ill_delete(ill_t *ill) ip_purge_allmulti(ill); /* + * If the ill being deleted is under IPMP, boot it out of the illgrp. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_leave_illgrp(ill); + + /* * ill_down will arrange to blow off any IRE's dependent on this * ILL, and shut down fragmentation reassembly. */ @@ -890,8 +933,19 @@ ill_delete_tail(ill_t *ill) * ill references. */ ASSERT(ilm_walk_ill(ill) == 0); + /* - * Take us out of the list of ILLs. 
ill_glist_delete -> ill_phyint_free + * If this ill is an IPMP meta-interface, blow away the illgrp. This + * is safe to do because the illgrp has already been unlinked from the + * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it. + */ + if (IS_IPMP(ill)) { + ipmp_illgrp_destroy(ill->ill_grp); + ill->ill_grp = NULL; + } + + /* + * Take us out of the list of ILLs. ill_glist_delete -> phyint_free * could free the phyint. No more reference to the phyint after this * point. */ @@ -1139,7 +1193,7 @@ ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id) * Add the pending mp to the list. There can be only 1 pending mp * in the list. Any exclusive ioctl that needs to wait for a response * from another module or driver needs to use this function to set - * the ipsq_pending_mp to the ioctl mblk and wait for the response from + * the ipx_pending_mp to the ioctl mblk and wait for the response from * the other module/driver. This is also used while waiting for the * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. */ @@ -1147,19 +1201,19 @@ boolean_t ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, int waitfor) { - ipsq_t *ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; + ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop; ASSERT(IAM_WRITER_IPIF(ipif)); ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); - ASSERT(ipsq->ipsq_pending_mp == NULL); + ASSERT(ipx->ipx_pending_mp == NULL); /* * The caller may be using a different ipif than the one passed into * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT - * that `ipsq_current_ipif == ipif'. + * that `ipx_current_ipif == ipif'. 
*/ - ASSERT(ipsq->ipsq_current_ipif != NULL); + ASSERT(ipx->ipx_current_ipif != NULL); /* * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls, @@ -1180,8 +1234,8 @@ ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, if (connp->conn_state_flags & CONN_CLOSING) return (B_FALSE); } - mutex_enter(&ipsq->ipsq_lock); - ipsq->ipsq_pending_ipif = ipif; + mutex_enter(&ipx->ipx_lock); + ipx->ipx_pending_ipif = ipif; /* * Note down the queue in b_queue. This will be returned by * ipsq_pending_mp_get. Caller will then use these values to restart @@ -1189,38 +1243,40 @@ ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, */ add_mp->b_next = NULL; add_mp->b_queue = q; - ipsq->ipsq_pending_mp = add_mp; - ipsq->ipsq_waitfor = waitfor; + ipx->ipx_pending_mp = add_mp; + ipx->ipx_waitfor = waitfor; + mutex_exit(&ipx->ipx_lock); if (connp != NULL) connp->conn_oper_pending_ill = ipif->ipif_ill; - mutex_exit(&ipsq->ipsq_lock); + return (B_TRUE); } /* - * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp + * Retrieve the ipx_pending_mp and return it. There can be only 1 mp * queued in the list. 
*/ mblk_t * ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) { mblk_t *curr = NULL; + ipxop_t *ipx = ipsq->ipsq_xop; - mutex_enter(&ipsq->ipsq_lock); *connpp = NULL; - if (ipsq->ipsq_pending_mp == NULL) { - mutex_exit(&ipsq->ipsq_lock); + mutex_enter(&ipx->ipx_lock); + if (ipx->ipx_pending_mp == NULL) { + mutex_exit(&ipx->ipx_lock); return (NULL); } /* There can be only 1 such excl message */ - curr = ipsq->ipsq_pending_mp; - ASSERT(curr != NULL && curr->b_next == NULL); - ipsq->ipsq_pending_ipif = NULL; - ipsq->ipsq_pending_mp = NULL; - ipsq->ipsq_waitfor = 0; - mutex_exit(&ipsq->ipsq_lock); + curr = ipx->ipx_pending_mp; + ASSERT(curr->b_next == NULL); + ipx->ipx_pending_ipif = NULL; + ipx->ipx_pending_mp = NULL; + ipx->ipx_waitfor = 0; + mutex_exit(&ipx->ipx_lock); if (CONN_Q(curr->b_queue)) { /* @@ -1237,7 +1293,7 @@ ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) } /* - * Cleanup the ioctl mp queued in ipsq_pending_mp + * Cleanup the ioctl mp queued in ipx_pending_mp * - Called in the ill_delete path * - Called in the M_ERROR or M_HANGUP path on the ill. * - Called in the conn close path. @@ -1246,48 +1302,41 @@ boolean_t ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) { mblk_t *mp; - ipsq_t *ipsq; + ipxop_t *ipx; queue_t *q; ipif_t *ipif; ASSERT(IAM_WRITER_ILL(ill)); - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); + ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; + /* - * If connp is null, unconditionally clean up the ipsq_pending_mp. + * If connp is null, unconditionally clean up the ipx_pending_mp. * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl * even if it is meant for another ill, since we have to enqueue - * a new mp now in ipsq_pending_mp to complete the ipif_down. + * a new mp now in ipx_pending_mp to complete the ipif_down. * If connp is non-null we are called from the conn close path. 
*/ - mp = ipsq->ipsq_pending_mp; + mutex_enter(&ipx->ipx_lock); + mp = ipx->ipx_pending_mp; if (mp == NULL || (connp != NULL && mp->b_queue != CONNP_TO_WQ(connp))) { - mutex_exit(&ipsq->ipsq_lock); + mutex_exit(&ipx->ipx_lock); return (B_FALSE); } - /* Now remove from the ipsq_pending_mp */ - ipsq->ipsq_pending_mp = NULL; + /* Now remove from the ipx_pending_mp */ + ipx->ipx_pending_mp = NULL; q = mp->b_queue; mp->b_next = NULL; mp->b_prev = NULL; mp->b_queue = NULL; - /* If MOVE was in progress, clear the move_in_progress fields also. */ - ill = ipsq->ipsq_pending_ipif->ipif_ill; - if (ill->ill_move_in_progress) { - ILL_CLEAR_MOVE(ill); - } else if (ill->ill_up_ipifs) { - ill_group_cleanup(ill); - } - - ipif = ipsq->ipsq_pending_ipif; - ipsq->ipsq_pending_ipif = NULL; - ipsq->ipsq_waitfor = 0; - ipsq->ipsq_current_ipif = NULL; - ipsq->ipsq_current_ioctl = 0; - ipsq->ipsq_current_done = B_TRUE; - mutex_exit(&ipsq->ipsq_lock); + ipif = ipx->ipx_pending_ipif; + ipx->ipx_pending_ipif = NULL; + ipx->ipx_waitfor = 0; + ipx->ipx_current_ipif = NULL; + ipx->ipx_current_ioctl = 0; + ipx->ipx_current_done = B_TRUE; + mutex_exit(&ipx->ipx_lock); if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { if (connp == NULL) { @@ -1437,7 +1486,7 @@ conn_ioctl_cleanup(conn_t *connp) * Is any exclusive ioctl pending ? If so clean it up. If the * ioctl has not yet started, the mp is pending in the list headed by * ipsq_xopq_head. If the ioctl has started the mp could be present in - * ipsq_pending_mp. If the ioctl timed out in the streamhead but + * ipx_pending_mp. If the ioctl timed out in the streamhead but * is currently executing now the mp is not queued anywhere but * conn_oper_pending_ill is null. The conn close will wait * till the conn_ref drops to zero. @@ -1468,9 +1517,9 @@ conn_ioctl_cleanup(conn_t *connp) ill_waiter_dcr(ill); /* * Check whether this ioctl has started and is - * pending now in ipsq_pending_mp. 
If it is not - * found there then check whether this ioctl has - * not even started and is in the ipsq_xopq list. + * pending. If it is not found there then check + * whether this ioctl has not even started and is in + * the ipsq_xopq list. */ if (!ipsq_pending_mp_cleanup(ill, connp)) ipsq_xopq_mp_cleanup(ill, connp); @@ -1506,16 +1555,11 @@ conn_cleanup_ill(conn_t *connp, caddr_t arg) if (connp->conn_multicast_ill == ill) { /* Revert to late binding */ connp->conn_multicast_ill = NULL; - connp->conn_orig_multicast_ifindex = 0; } if (connp->conn_incoming_ill == ill) connp->conn_incoming_ill = NULL; if (connp->conn_outgoing_ill == ill) connp->conn_outgoing_ill = NULL; - if (connp->conn_outgoing_pill == ill) - connp->conn_outgoing_pill = NULL; - if (connp->conn_nofailover_ill == ill) - connp->conn_nofailover_ill = NULL; if (connp->conn_dhcpinit_ill == ill) { connp->conn_dhcpinit_ill = NULL; ASSERT(ill->ill_dhcpinit != 0); @@ -1524,11 +1568,11 @@ conn_cleanup_ill(conn_t *connp, caddr_t arg) if (connp->conn_ire_cache != NULL) { ire = connp->conn_ire_cache; /* - * ip_newroute creates IRE_CACHE with ire_stq coming from - * interface X and ipif coming from interface Y, if interface - * X and Y are part of the same IPMPgroup. Thus whenever - * interface X goes down, remove all references to it by - * checking both on ire_ipif and ire_stq. + * Source address selection makes it possible for IRE_CACHE + * entries to be created with ire_stq coming from interface X + * and ipif coming from interface Y. Thus whenever interface + * X goes down, remove all references to it by checking both + * on ire_ipif and ire_stq. */ if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || (ire->ire_type == IRE_CACHE && @@ -1601,14 +1645,10 @@ ill_down(ill_t *ill) ip_stack_t *ipst = ill->ill_ipst; /* Blow off any IREs dependent on this ILL. 
*/ - ire_walk(ill_downi, (char *)ill, ipst); + ire_walk(ill_downi, ill, ipst); /* Remove any conn_*_ill depending on this ill */ ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst); - - if (ill->ill_group != NULL) { - illgrp_delete(ill); - } } /* @@ -1621,9 +1661,9 @@ ill_downi(ire_t *ire, char *ill_arg) ill_t *ill = (ill_t *)ill_arg; /* - * ip_newroute creates IRE_CACHE with ire_stq coming from - * interface X and ipif coming from interface Y, if interface - * X and Y are part of the same IPMP group. Thus whenever interface + * Source address selection makes it possible for IRE_CACHE + * entries to be created with ire_stq coming from interface X + * and ipif coming from interface Y. Thus whenever interface * X goes down, remove all references to it by checking both * on ire_ipif and ire_stq. */ @@ -3696,16 +3736,39 @@ nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, } /* - * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an - * IPMP group, make sure all ill's in the group adopt the new policy. Send - * up RTS_IFINFO routing socket messages for each interface whose flags we - * change. + * Helper function for ill_forward_set(). + */ +static void +ill_forward_set_on_ill(ill_t *ill, boolean_t enable) +{ + ip_stack_t *ipst = ill->ill_ipst; + + ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); + + ip1dbg(("ill_forward_set: %s %s forwarding on %s", + (enable ? "Enabling" : "Disabling"), + (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); + mutex_enter(&ill->ill_lock); + if (enable) + ill->ill_flags |= ILLF_ROUTER; + else + ill->ill_flags &= ~ILLF_ROUTER; + mutex_exit(&ill->ill_lock); + if (ill->ill_isv6) + ill_set_nce_router_flags(ill, enable); + /* Notify routing socket listeners of this change. */ + ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); +} + +/* + * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing + * socket messages for each interface whose flags we change. 
*/ int ill_forward_set(ill_t *ill, boolean_t enable) { - ill_group_t *illgrp; - ip_stack_t *ipst = ill->ill_ipst; + ipmp_illgrp_t *illg; + ip_stack_t *ipst = ill->ill_ipst; ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); @@ -3716,47 +3779,23 @@ ill_forward_set(ill_t *ill, boolean_t enable) if (IS_LOOPBACK(ill)) return (EINVAL); - /* - * If the ill is in an IPMP group, set the forwarding policy on all - * members of the group to the same value. - */ - illgrp = ill->ill_group; - if (illgrp != NULL) { - ill_t *tmp_ill; + if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { + /* + * Update all of the interfaces in the group. + */ + illg = ill->ill_grp; + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) + ill_forward_set_on_ill(ill, enable); - for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL; - tmp_ill = tmp_ill->ill_group_next) { - ip1dbg(("ill_forward_set: %s %s forwarding on %s", - (enable ? "Enabling" : "Disabling"), - (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"), - tmp_ill->ill_name)); - mutex_enter(&tmp_ill->ill_lock); - if (enable) - tmp_ill->ill_flags |= ILLF_ROUTER; - else - tmp_ill->ill_flags &= ~ILLF_ROUTER; - mutex_exit(&tmp_ill->ill_lock); - if (tmp_ill->ill_isv6) - ill_set_nce_router_flags(tmp_ill, enable); - /* Notify routing socket listeners of this change. */ - ip_rts_ifmsg(tmp_ill->ill_ipif); - } - } else { - ip1dbg(("ill_forward_set: %s %s forwarding on %s", - (enable ? "Enabling" : "Disabling"), - (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); - mutex_enter(&ill->ill_lock); - if (enable) - ill->ill_flags |= ILLF_ROUTER; - else - ill->ill_flags &= ~ILLF_ROUTER; - mutex_exit(&ill->ill_lock); - if (ill->ill_isv6) - ill_set_nce_router_flags(ill, enable); - /* Notify routing socket listeners of this change. */ - ip_rts_ifmsg(ill->ill_ipif); + /* + * Update the IPMP meta-interface. 
+ */ + ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); + return (0); } + ill_forward_set_on_ill(ill, enable); return (0); } @@ -3772,7 +3811,12 @@ ill_set_nce_router_flags(ill_t *ill, boolean_t enable) nce_t *nce; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE); + /* + * NOTE: we're called separately for each ill in an illgrp, + * so don't match across the illgrp. + */ + nce = ndp_lookup_v6(ill, B_FALSE, &ipif->ipif_v6lcl_addr, + B_FALSE); if (nce != NULL) { mutex_enter(&nce->nce_lock); if (enable) @@ -3928,36 +3972,45 @@ ill_next(ill_walk_context_t *ctx, ill_t *lastill) } /* - * Check interface name for correct format which is name+ppa. - * name can contain characters and digits, the right most digits - * make up the ppa number. use of octal is not allowed, name must contain - * a ppa, return pointer to the start of ppa. - * In case of error return NULL. + * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+ + * The final number (PPA) must not have any leading zeros. Upon success, a + * pointer to the start of the PPA is returned; otherwise NULL is returned. */ static char * ill_get_ppa_ptr(char *name) { - int namelen = mi_strlen(name); + int namelen = strlen(name); + int end_ndx = namelen - 1; + int ppa_ndx, i; - int len = namelen; + /* + * Check that the first character is [a-zA-Z], and that the last + * character is [0-9]. + */ + if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx])) + return (NULL); - name += len; - while (len > 0) { - name--; - if (*name < '0' || *name > '9') + /* + * Set `ppa_ndx' to the PPA start, and check for leading zeroes. 
+ */ + for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--) + if (!isdigit(name[ppa_ndx - 1])) break; - len--; - } - /* empty string, all digits, or no trailing digits */ - if (len == 0 || len == (int)namelen) + if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx) return (NULL); - name++; - /* check for attempted use of octal */ - if (*name == '0' && len != (int)namelen - 1) - return (NULL); - return (name); + /* + * Check that the intermediate characters are [a-zA-Z0-9._] + */ + for (i = 1; i < ppa_ndx; i++) { + if (!isalpha(name[i]) && !isdigit(name[i]) && + name[i] != '.' && name[i] != '_') { + return (NULL); + } + } + + return (name + ppa_ndx); } /* @@ -4037,8 +4090,10 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, } else if (ILL_CAN_WAIT(ill, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -4102,6 +4157,7 @@ static void ill_glist_delete(ill_t *ill) { ip_stack_t *ipst; + phyint_t *phyi; if (ill == NULL) return; @@ -4139,8 +4195,41 @@ ill_glist_delete(ill_t *ill) ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name, ill->ill_name_length); - ill_phyint_free(ill); + ASSERT(ill->ill_phyint != NULL); + phyi = ill->ill_phyint; + ill->ill_phyint = NULL; + + /* + * ill_init allocates a phyint always to store the copy + * of flags relevant to phyint. At that point in time, we could + * not assign the name and hence phyint_illv4/v6 could not be + * initialized. Later in ipif_set_values, we assign the name to + * the ill, at which point in time we assign phyint_illv4/v6. + * Thus we don't rely on phyint_illv6 to be initialized always. 
+ */ + if (ill->ill_flags & ILLF_IPV6) + phyi->phyint_illv6 = NULL; + else + phyi->phyint_illv4 = NULL; + + if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + return; + } + + /* + * There are no ills left on this phyint; pull it out of the phyint + * avl trees, and free it. + */ + if (phyi->phyint_ifindex > 0) { + avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + phyi); + avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, + phyi); + } rw_exit(&ipst->ips_ill_g_lock); + + phyint_free(phyi); } /* @@ -4367,30 +4456,32 @@ ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) return (0); } -/* Initialize the per phyint (per IPMP group) ipsq used for serialization */ +/* Initialize the per phyint ipsq used for serialization */ static boolean_t -ipsq_init(ill_t *ill) +ipsq_init(ill_t *ill, boolean_t enter) { ipsq_t *ipsq; + ipxop_t *ipx; - /* Init the ipsq and impicitly enter as writer */ - ill->ill_phyint->phyint_ipsq = - kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); - if (ill->ill_phyint->phyint_ipsq == NULL) + if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL) return (B_FALSE); - ipsq = ill->ill_phyint->phyint_ipsq; - ipsq->ipsq_phyint_list = ill->ill_phyint; - ill->ill_phyint->phyint_ipsq_next = NULL; + + ill->ill_phyint->phyint_ipsq = ipsq; + ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop; + ipx->ipx_ipsq = ipsq; + ipsq->ipsq_next = ipsq; + ipsq->ipsq_phyint = ill->ill_phyint; mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); - ipsq->ipsq_refs = 1; - ipsq->ipsq_writer = curthread; - ipsq->ipsq_reentry_cnt = 1; + mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0); ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ + if (enter) { + ipx->ipx_writer = curthread; + ipx->ipx_forced = B_FALSE; + ipx->ipx_reentry_cnt = 1; #ifdef DEBUG - ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack, - IPSQ_STACK_DEPTH); + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); #endif - 
(void) strcpy(ipsq->ipsq_name, ill->ill_name); + } return (B_TRUE); } @@ -4468,7 +4559,7 @@ ill_init(queue_t *q, ill_t *ill) ill->ill_ppa = UINT_MAX; ill->ill_fastpath_list = &ill->ill_fastpath_list; - if (!ipsq_init(ill)) { + if (!ipsq_init(ill, B_TRUE)) { freemsg(info_mp); mi_free(frag_ptr); mi_free(ill->ill_phyint); @@ -4589,29 +4680,16 @@ loopback_kstat_update(kstat_t *ksp, int rw) } /* - * Has ifindex been plumbed already. - * Compares both phyint_ifindex and phyint_group_ifindex. + * Has ifindex been plumbed already? */ static boolean_t phyint_exists(uint_t index, ip_stack_t *ipst) { - phyint_t *phyi; - ASSERT(index != 0); ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - /* - * Indexes are stored in the phyint - a common structure - * to both IPv4 and IPv6. - */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - if (phyi->phyint_ifindex == index || - phyi->phyint_group_ifindex == index) - return (B_TRUE); - } - return (B_FALSE); + + return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + &index, NULL) != NULL); } /* Pick a unique ifindex */ @@ -4675,9 +4753,9 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, { ill_t *ill; ipif_t *ipif; + ipsq_t *ipsq; kstat_named_t *kn; boolean_t isloopback; - ipsq_t *old_ipsq; in6_addr_t ov6addr; isloopback = mi_strcmp(name, ipif_loopback_name) == 0; @@ -4761,16 +4839,10 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, ill->ill_net_type = IRE_LOOPBACK; /* Initialize the ipsq */ - if (!ipsq_init(ill)) + if (!ipsq_init(ill, B_FALSE)) goto done; - ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL; - ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--; - ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0); -#ifdef DEBUG - ill->ill_phyint->phyint_ipsq->ipsq_depth = 0; -#endif - ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE); + ipif 
= ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE); if (ipif == NULL) goto done; @@ -4807,7 +4879,7 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, ill->ill_frag_free_num_pkts = 0; ill->ill_last_frag_clean_time = 0; - old_ipsq = ill->ill_phyint->phyint_ipsq; + ipsq = ill->ill_phyint->phyint_ipsq; if (ill_glist_insert(ill, "lo", isv6) != 0) cmn_err(CE_PANIC, "cannot insert loopback interface"); @@ -4824,13 +4896,11 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, sctp_update_ipif_addr(ipif, ov6addr); /* - * If the ipsq was changed in ill_phyint_reinit free the old ipsq. + * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs. + * If so, free our original one. */ - if (old_ipsq != ill->ill_phyint->phyint_ipsq) { - /* Loopback ills aren't in any IPMP group */ - ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP)); - ipsq_delete(old_ipsq); - } + if (ipsq != ill->ill_phyint->phyint_ipsq) + ipsq_delete(ipsq); /* * Delay this till the ipif is allocated as ipif_allocate @@ -4871,12 +4941,10 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, done: if (ill != NULL) { if (ill->ill_phyint != NULL) { - ipsq_t *ipsq; - ipsq = ill->ill_phyint->phyint_ipsq; if (ipsq != NULL) { - ipsq->ipsq_ipst = NULL; - kmem_free(ipsq, sizeof (ipsq_t)); + ipsq->ipsq_phyint = NULL; + ipsq_delete(ipsq); } mi_free(ill->ill_phyint); } @@ -4954,9 +5022,11 @@ ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, } else if (ILL_CAN_WAIT(ill, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); rw_exit(&ipst->ips_ill_g_lock); mutex_exit(&ill->ill_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (err != NULL) @@ -5294,6 +5364,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) dl_info_ack_t *dlia; ip_m_t *ipm; dl_qos_cl_sel1_t *sel1; + int min_mtu; 
ASSERT(IAM_WRITER_ILL(ill)); @@ -5336,7 +5407,14 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) ill->ill_bcast_addr_length = brdcst_addr_length; ill->ill_phys_addr_length = phys_addr_length; ill->ill_sap_length = sap_length; - ill->ill_max_frag = dlia->dl_max_sdu; + + /* + * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU, + * but we must ensure a minimum IP MTU is used since other bits of + * IP will fly apart otherwise. + */ + min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; + ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); ill->ill_max_mtu = ill->ill_max_frag; ill->ill_type = ipm->ip_m_type; @@ -5358,7 +5436,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) * the wakeup. */ (void) ipif_allocate(ill, 0, IRE_LOCAL, - dlia->dl_provider_style == DL_STYLE2 ? B_FALSE : B_TRUE); + dlia->dl_provider_style != DL_STYLE2, B_TRUE); mutex_enter(&ill->ill_lock); ASSERT(ill->ill_dlpi_style_set == 0); ill->ill_dlpi_style_set = 1; @@ -5397,8 +5475,13 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) /* * Free ill_resolver_mp and ill_bcast_mp as things could have * changed now. + * + * NOTE: The IPMP meta-interface is special-cased because it starts + * with no underlying interfaces (and thus an unknown broadcast + * address length), but we enforce that an interface is broadcast- + * capable as part of allowing it to join a group. 
*/ - if (ill->ill_bcast_addr_length == 0) { + if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) { if (ill->ill_resolver_mp != NULL) freemsg(ill->ill_resolver_mp); if (ill->ill_bcast_mp != NULL) @@ -5451,6 +5534,11 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) if (!ill->ill_isv6) ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; } + + /* For IPMP, PHYI_IPMP should already be set by ipif_allocate() */ + if (ill->ill_mactype == SUNW_DL_IPMP) + ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP); + /* By default an interface does not support any CoS marking */ ill->ill_flags &= ~ILLF_COS_ENABLED; @@ -5552,16 +5640,18 @@ ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) } /* - * Find any non-virtual, not condemned, and up multicast capable interface - * given an IP instance and zoneid. Order of preference is: + * Find a multicast-capable ipif given an IP instance and zoneid. + * The ipif must be up, and its ill must be multicast-capable, not + * condemned, not an underlying interface in an IPMP group, and + * not a VNI interface. Order of preference: * - * 1. normal - * 1.1 normal, but deprecated - * 2. point to point - * 2.1 point to point, but deprecated - * 3. link local - * 3.1 link local, but deprecated - * 4. loopback. + * 1a. normal + * 1b. normal, but deprecated + * 2a. point to point + * 2b. point to point, but deprecated + * 3a. link local + * 3b. link local, but deprecated + * 4. loopback. 
*/ ipif_t * ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) @@ -5580,7 +5670,7 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) for (; ill != NULL; ill = ill_next(&ctx, ill)) { mutex_enter(&ill->ill_lock); - if (IS_VNI(ill) || !ILL_CAN_LOOKUP(ill) || + if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || !ILL_CAN_LOOKUP(ill) || !(ill->ill_flags & ILLF_MULTICAST)) { mutex_exit(&ill->ill_lock); continue; @@ -5736,10 +5826,12 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -5761,15 +5853,12 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, } /* - * Look for an ipif with the specified address. For point-point links - * we look for matches on either the destination address and the local - * address, but we ignore the check on the local address if IPIF_UNNUMBERED - * is set. - * Matches on a specific ill if match_ill is set. + * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). 
*/ -ipif_t * -ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, - mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +static ipif_t * +ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, boolean_t match_illgrp, + zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, + ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; @@ -5788,7 +5877,8 @@ ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, repeat: ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (match_ill != NULL && ill != match_ill) { + if (match_ill != NULL && ill != match_ill && + (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { continue; } GRAB_CONN_LOCK(q); @@ -5817,10 +5907,12 @@ repeat: } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -5894,11 +5986,40 @@ ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) } /* + * Lookup an ipif with the specified address. For point-to-point links we + * look for matches on either the destination address or the local address, + * but we skip the local address check if IPIF_UNNUMBERED is set. If the + * `match_ill' argument is non-NULL, the lookup is restricted to that ill + * (or illgrp if `match_ill' is in an IPMP group). + */ +ipif_t * +ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, + mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +{ + return (ipif_lookup_addr_common(addr, match_ill, B_TRUE, zoneid, q, mp, + func, error, ipst)); +} + +/* + * Special abbreviated version of ipif_lookup_addr() that doesn't match + * `match_ill' across the IPMP group. 
This function is only needed in some + * corner-cases; almost everything should use ipif_lookup_addr(). + */ +static ipif_t * +ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) +{ + ASSERT(match_ill != NULL); + return (ipif_lookup_addr_common(addr, match_ill, B_FALSE, ALL_ZONES, + NULL, NULL, NULL, NULL, ipst)); +} + +/* * Look for an ipif with the specified address. For point-point links * we look for matches on either the destination address and the local * address, but we ignore the check on the local address if IPIF_UNNUMBERED * is set. - * Matches on a specific ill if match_ill is set. + * If the `match_ill' argument is non-NULL, the lookup is restricted to that + * ill (or illgrp if `match_ill' is in an IPMP group). * Return the zoneid for the ipif which matches. ALL_ZONES if no match. */ zoneid_t @@ -5918,7 +6039,8 @@ ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) repeat: ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (match_ill != NULL && ill != match_ill) { + if (match_ill != NULL && ill != match_ill && + !IS_IN_SAME_ILLGRP(ill, match_ill)) { continue; } mutex_enter(&ill->ill_lock); @@ -6008,7 +6130,7 @@ ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) /* * The callers of this function wants to know the * interface on which they have to send the replies - * back. For IRE_CACHES that have ire_stq and ire_ipif + * back. For IREs that have ire_stq and ire_ipif * derived from different ills, we really don't care * what we return here. */ @@ -6109,30 +6231,6 @@ ipif_is_freeable(ipif_t *ipif) } /* - * This func does not prevent refcnt from increasing. But if - * the caller has taken steps to that effect, then this func - * can be used to determine whether the ipifs marked with IPIF_MOVING - * have become quiescent and can be moved in a failover/failback. 
- */ -static ipif_t * -ill_quiescent_to_move(ill_t *ill) -{ - ipif_t *ipif; - - ASSERT(MUTEX_HELD(&ill->ill_lock)); - - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ipif->ipif_state_flags & IPIF_MOVING) { - if (ipif->ipif_refcnt != 0 || - !IPIF_DOWN_OK(ipif)) { - return (ipif); - } - } - } - return (NULL); -} - -/* * The ipif/ill/ire has been refreled. Do the tail processing. * Determine if the ipif or ill in question has become quiescent and if so * wakeup close and/or restart any queued pending ioctl that is waiting @@ -6144,87 +6242,61 @@ ipif_ill_refrele_tail(ill_t *ill) mblk_t *mp; conn_t *connp; ipsq_t *ipsq; + ipxop_t *ipx; ipif_t *ipif; dl_notify_ind_t *dlindp; ASSERT(MUTEX_HELD(&ill->ill_lock)); - if ((ill->ill_state_flags & ILL_CONDEMNED) && - ill_is_freeable(ill)) { - /* ill_close may be waiting */ + if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) { + /* ip_modclose() may be waiting */ cv_broadcast(&ill->ill_cv); } - /* ipsq can't change because ill_lock is held */ ipsq = ill->ill_phyint->phyint_ipsq; - if (ipsq->ipsq_waitfor == 0) { - /* Not waiting for anything, just return. */ - mutex_exit(&ill->ill_lock); - return; - } - ASSERT(ipsq->ipsq_pending_mp != NULL && - ipsq->ipsq_pending_ipif != NULL); - /* - * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF. - * Last ipif going down needs to down the ill, so ill_ire_cnt must - * be zero for restarting an ioctl that ends up downing the ill. - */ - ipif = ipsq->ipsq_pending_ipif; - if (ipif->ipif_ill != ill) { - /* The ioctl is pending on some other ill. 
*/ - mutex_exit(&ill->ill_lock); - return; - } + mutex_enter(&ipsq->ipsq_lock); + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */ + goto unlock; + + ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL); + + ipif = ipx->ipx_pending_ipif; + if (ipif->ipif_ill != ill) /* wait is for another ill; bail */ + goto unlock; - switch (ipsq->ipsq_waitfor) { + switch (ipx->ipx_waitfor) { case IPIF_DOWN: - if (!ipif_is_quiescent(ipif)) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ipif_is_quiescent(ipif)) + goto unlock; break; case IPIF_FREE: - if (!ipif_is_freeable(ipif)) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ipif_is_freeable(ipif)) + goto unlock; break; - case ILL_DOWN: - if (!ill_is_quiescent(ill)) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ill_is_quiescent(ill)) + goto unlock; break; case ILL_FREE: /* - * case ILL_FREE arises only for loopback. otherwise ill_delete - * waits synchronously in ip_close, and no message is queued in - * ipsq_pending_mp at all in this case + * ILL_FREE is only for loopback; normal ill teardown waits + * synchronously in ip_modclose() without using ipx_waitfor, + * handled by the cv_broadcast() at the top of this function. 
*/ - if (!ill_is_freeable(ill)) { - mutex_exit(&ill->ill_lock); - return; - } - break; - - case ILL_MOVE_OK: - if (ill_quiescent_to_move(ill) != NULL) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ill_is_freeable(ill)) + goto unlock; break; default: - cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n", - (void *)ipsq, ipsq->ipsq_waitfor); + cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n", + (void *)ipsq, ipx->ipx_waitfor); } - /* - * Incr refcnt for the qwriter_ip call below which - * does a refrele - */ - ill_refhold_locked(ill); + ill_refhold_locked(ill); /* for qwriter_ip() call below */ + mutex_exit(&ipx->ipx_lock); mp = ipsq_pending_mp_get(ipsq, &connp); + mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); ASSERT(mp != NULL); @@ -6249,6 +6321,7 @@ ipif_ill_refrele_tail(ill_t *ill) return; default: ASSERT(0); + ill_refrele(ill); } break; @@ -6268,6 +6341,11 @@ ipif_ill_refrele_tail(ill_t *ill) cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " "db_type %d\n", (void *)mp, mp->b_datap->db_type); } + return; +unlock: + mutex_exit(&ipsq->ipsq_lock); + mutex_exit(&ipx->ipx_lock); + mutex_exit(&ill->ill_lock); } #ifdef DEBUG @@ -6902,10 +6980,23 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, ipif = ipif_arg; if (ipif_arg != NULL) match_flags |= MATCH_IRE_ILL; +again: gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); - if (gw_ire == NULL) + if (gw_ire == NULL) { + /* + * With IPMP, we allow host routes to influence in.mpathd's + * target selection. However, if the test addresses are on + * their own network, the above lookup will fail since the + * underlying IRE_INTERFACEs are marked hidden. So allow + * hidden test IREs to be found and try again. 
+ */ + if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) { + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + goto again; + } return (ENETUNREACH); + } /* * We create one of three types of IREs as a result of this request @@ -7355,9 +7446,11 @@ void ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, ill_t *pending_ill) { - conn_t *connp = NULL; + conn_t *connp; + ipxop_t *ipx = ipsq->ipsq_xop; ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); + ASSERT(MUTEX_HELD(&ipx->ipx_lock)); ASSERT(func != NULL); mp->b_queue = q; @@ -7366,14 +7459,14 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, switch (type) { case CUR_OP: - if (ipsq->ipsq_mptail != NULL) { - ASSERT(ipsq->ipsq_mphead != NULL); - ipsq->ipsq_mptail->b_next = mp; + if (ipx->ipx_mptail != NULL) { + ASSERT(ipx->ipx_mphead != NULL); + ipx->ipx_mptail->b_next = mp; } else { - ASSERT(ipsq->ipsq_mphead == NULL); - ipsq->ipsq_mphead = mp; + ASSERT(ipx->ipx_mphead == NULL); + ipx->ipx_mphead = mp; } - ipsq->ipsq_mptail = mp; + ipx->ipx_mptail = mp; break; case NEW_OP: @@ -7385,6 +7478,15 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, ipsq->ipsq_xopq_mphead = mp; } ipsq->ipsq_xopq_mptail = mp; + ipx->ipx_ipsq_queued = B_TRUE; + break; + + case SWITCH_OP: + ASSERT(ipsq->ipsq_swxop != NULL); + /* only one switch operation is currently allowed */ + ASSERT(ipsq->ipsq_switch_mp == NULL); + ipsq->ipsq_switch_mp = mp; + ipx->ipx_ipsq_queued = B_TRUE; break; default: cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); @@ -7392,55 +7494,273 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, if (CONN_Q(q) && pending_ill != NULL) { connp = Q_TO_CONN(q); - ASSERT(MUTEX_HELD(&connp->conn_lock)); connp->conn_oper_pending_ill = pending_ill; } } /* - * Return the mp at the head of the ipsq. After emptying the ipsq - * look at the next ioctl, if this ioctl is complete. Otherwise - * return, we will resume when we complete the current ioctl. 
- * The current ioctl will wait till it gets a response from the - * driver below. + * Dequeue the next message that requested exclusive access to this IPSQ's + * xop. Specifically: + * + * 1. If we're still processing the current operation on `ipsq', then + * dequeue the next message for the operation (from ipx_mphead), or + * return NULL if there are no queued messages for the operation. + * These messages are queued via CUR_OP to qwriter_ip() and friends. + * + * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is + * not set) see if the ipsq has requested an xop switch. If so, switch + * `ipsq' to a different xop. Xop switches only happen when joining or + * leaving IPMP groups and require a careful dance -- see the comments + * in-line below for details. If we're leaving a group xop or if we're + * joining a group xop and become writer on it, then we proceed to (3). + * Otherwise, we return NULL and exit the xop. + * + * 3. For each IPSQ in the xop, return any switch operation stored on + * ipsq_switch_mp (set via SWITCH_OP); these must be processed before + * any other messages queued on the IPSQ. Otherwise, dequeue the next + * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead. + * Note that if the phyint tied to `ipsq' is not using IPMP there will + * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for + * each phyint in the group, including the IPMP meta-interface phyint. */ static mblk_t * ipsq_dq(ipsq_t *ipsq) { + ill_t *illv4, *illv6; mblk_t *mp; + ipsq_t *xopipsq; + ipsq_t *leftipsq = NULL; + ipxop_t *ipx; + phyint_t *phyi = ipsq->ipsq_phyint; + ip_stack_t *ipst = ipsq->ipsq_ipst; + boolean_t emptied = B_FALSE; - ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); + /* + * Grab all the locks we need in the defined order (ill_g_lock -> + * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next. + */ + rw_enter(&ipst->ips_ill_g_lock, + ipsq->ipsq_swxop != NULL ? 
RW_WRITER : RW_READER); + mutex_enter(&ipsq->ipsq_lock); + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); - mp = ipsq->ipsq_mphead; - if (mp != NULL) { - ipsq->ipsq_mphead = mp->b_next; - if (ipsq->ipsq_mphead == NULL) - ipsq->ipsq_mptail = NULL; - mp->b_next = NULL; - return (mp); + /* + * Dequeue the next message associated with the current exclusive + * operation, if any. + */ + if ((mp = ipx->ipx_mphead) != NULL) { + ipx->ipx_mphead = mp->b_next; + if (ipx->ipx_mphead == NULL) + ipx->ipx_mptail = NULL; + mp->b_next = (void *)ipsq; + goto out; } - if (ipsq->ipsq_current_ipif != NULL) - return (NULL); - mp = ipsq->ipsq_xopq_mphead; - if (mp != NULL) { - ipsq->ipsq_xopq_mphead = mp->b_next; - if (ipsq->ipsq_xopq_mphead == NULL) - ipsq->ipsq_xopq_mptail = NULL; - mp->b_next = NULL; - return (mp); + + if (ipx->ipx_current_ipif != NULL) + goto empty; + + if (ipsq->ipsq_swxop != NULL) { + /* + * The exclusive operation that is now being completed has + * requested a switch to a different xop. This happens + * when an interface joins or leaves an IPMP group. Joins + * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()). + * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb + * (phyint_free()), or interface plumb for an ill type + * not in the IPMP group (ip_rput_dlpi_writer()). + * + * Xop switches are not allowed on the IPMP meta-interface. + */ + ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP)); + ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); + DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq); + + if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) { + /* + * We're switching back to our own xop, so we have two + * xop's to drain/exit: our own, and the group xop + * that we are leaving. + * + * First, pull ourselves out of the group ipsq list. + * This is safe since we're writer on ill_g_lock. 
+ */ + ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop); + + xopipsq = ipx->ipx_ipsq; + while (xopipsq->ipsq_next != ipsq) + xopipsq = xopipsq->ipsq_next; + + xopipsq->ipsq_next = ipsq->ipsq_next; + ipsq->ipsq_next = ipsq; + ipsq->ipsq_xop = ipsq->ipsq_swxop; + ipsq->ipsq_swxop = NULL; + + /* + * Second, prepare to exit the group xop. The actual + * ipsq_exit() is done at the end of this function + * since we cannot hold any locks across ipsq_exit(). + * Note that although we drop the group's ipx_lock, no + * threads can proceed since we're still ipx_writer. + */ + leftipsq = xopipsq; + mutex_exit(&ipx->ipx_lock); + + /* + * Third, set ipx to point to our own xop (which was + * inactive and therefore can be entered). + */ + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + ASSERT(ipx->ipx_writer == NULL); + ASSERT(ipx->ipx_current_ipif == NULL); + } else { + /* + * We're switching from our own xop to a group xop. + * The requestor of the switch must ensure that the + * group xop cannot go away (e.g. by ensuring the + * phyint associated with the xop cannot go away). + * + * If we can become writer on our new xop, then we'll + * do the drain. Otherwise, the current writer of our + * new xop will do the drain when it exits. + * + * First, splice ourselves into the group IPSQ list. + * This is safe since we're writer on ill_g_lock. + */ + ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); + + xopipsq = ipsq->ipsq_swxop->ipx_ipsq; + while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq) + xopipsq = xopipsq->ipsq_next; + + xopipsq->ipsq_next = ipsq; + ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq; + ipsq->ipsq_xop = ipsq->ipsq_swxop; + ipsq->ipsq_swxop = NULL; + + /* + * Second, exit our own xop, since it's now unused. + * This is safe since we've got the only reference. 
+ */ + ASSERT(ipx->ipx_writer == curthread); + ipx->ipx_writer = NULL; + VERIFY(--ipx->ipx_reentry_cnt == 0); + ipx->ipx_ipsq_queued = B_FALSE; + mutex_exit(&ipx->ipx_lock); + + /* + * Third, set ipx to point to our new xop, and check + * if we can become writer on it. If we cannot, then + * the current writer will drain the IPSQ group when + * it exits. Our ipsq_xop is guaranteed to be stable + * because we're still holding ipsq_lock. + */ + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + if (ipx->ipx_writer != NULL || + ipx->ipx_current_ipif != NULL) { + goto out; + } + } + + /* + * Fourth, become writer on our new ipx before we continue + * with the drain. Note that we never dropped ipsq_lock + * above, so no other thread could've raced with us to + * become writer first. Also, we're holding ipx_lock, so + * no other thread can examine the ipx right now. + */ + ASSERT(ipx->ipx_current_ipif == NULL); + ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); + VERIFY(ipx->ipx_reentry_cnt++ == 0); + ipx->ipx_writer = curthread; + ipx->ipx_forced = B_FALSE; +#ifdef DEBUG + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); +#endif } - return (NULL); + + xopipsq = ipsq; + do { + /* + * So that other operations operate on a consistent and + * complete phyint, a switch message on an IPSQ must be + * handled prior to any other operations on that IPSQ. + */ + if ((mp = xopipsq->ipsq_switch_mp) != NULL) { + xopipsq->ipsq_switch_mp = NULL; + ASSERT(mp->b_next == NULL); + mp->b_next = (void *)xopipsq; + goto out; + } + + if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) { + xopipsq->ipsq_xopq_mphead = mp->b_next; + if (xopipsq->ipsq_xopq_mphead == NULL) + xopipsq->ipsq_xopq_mptail = NULL; + mp->b_next = (void *)xopipsq; + goto out; + } + } while ((xopipsq = xopipsq->ipsq_next) != ipsq); +empty: + /* + * There are no messages. Further, we are holding ipx_lock, hence no + * new messages can end up on any IPSQ in the xop. 
+ */ + ipx->ipx_writer = NULL; + ipx->ipx_forced = B_FALSE; + VERIFY(--ipx->ipx_reentry_cnt == 0); + ipx->ipx_ipsq_queued = B_FALSE; + emptied = B_TRUE; +#ifdef DEBUG + ipx->ipx_depth = 0; +#endif +out: + mutex_exit(&ipx->ipx_lock); + mutex_exit(&ipsq->ipsq_lock); + + /* + * If we completely emptied the xop, then wake up any threads waiting + * to enter any of the IPSQ's associated with it. + */ + if (emptied) { + xopipsq = ipsq; + do { + if ((phyi = xopipsq->ipsq_phyint) == NULL) + continue; + + illv4 = phyi->phyint_illv4; + illv6 = phyi->phyint_illv6; + + GRAB_ILL_LOCKS(illv4, illv6); + if (illv4 != NULL) + cv_broadcast(&illv4->ill_cv); + if (illv6 != NULL) + cv_broadcast(&illv6->ill_cv); + RELEASE_ILL_LOCKS(illv4, illv6); + } while ((xopipsq = xopipsq->ipsq_next) != ipsq); + } + rw_exit(&ipst->ips_ill_g_lock); + + /* + * Now that all locks are dropped, exit the IPSQ we left. + */ + if (leftipsq != NULL) + ipsq_exit(leftipsq); + + return (mp); } /* * Enter the ipsq corresponding to ill, by waiting synchronously till * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq * will have to drain completely before ipsq_enter returns success. - * ipsq_current_ipif will be set if some exclusive ioctl is in progress, - * and the ipsq_exit logic will start the next enqueued ioctl after - * completion of the current ioctl. If 'force' is used, we don't wait - * for the enqueued ioctls. This is needed when a conn_close wants to + * ipx_current_ipif will be set if some exclusive op is in progress, + * and the ipsq_exit logic will start the next enqueued op after + * completion of the current op. If 'force' is used, we don't wait + * for the enqueued ops. This is needed when a conn_close wants to * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb * of an ill can also use this option. But we don't use it currently. 
*/ @@ -7449,13 +7769,16 @@ boolean_t ipsq_enter(ill_t *ill, boolean_t force, int type) { ipsq_t *ipsq; + ipxop_t *ipx; boolean_t waited_enough = B_FALSE; /* - * Holding the ill_lock prevents <ill-ipsq> assocs from changing. - * Since the <ill-ipsq> assocs could change while we wait for the - * writer, it is easier to wait on a fixed global rather than try to - * cv_wait on a changing ipsq. + * Note that the relationship between ill and ipsq is fixed as long as + * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the + * relationship between the IPSQ and xop cannot change. However, + * since we cannot hold ipsq_lock across the cv_wait(), it may change + * while we're waiting. We wait on ill_cv and rely on ipsq_exit() + * waking up all ills in the xop when it becomes available. */ mutex_enter(&ill->ill_lock); for (;;) { @@ -7466,34 +7789,35 @@ ipsq_enter(ill_t *ill, boolean_t force, int type) ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); - if (ipsq->ipsq_writer == NULL && - (type == CUR_OP || ipsq->ipsq_current_ipif == NULL || - waited_enough)) { + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + + if (ipx->ipx_writer == NULL && (type == CUR_OP || + ipx->ipx_current_ipif == NULL || waited_enough)) break; - } else if (ipsq->ipsq_writer != NULL) { + + if (!force || ipx->ipx_writer != NULL) { + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); cv_wait(&ill->ill_cv, &ill->ill_lock); } else { + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); - if (force) { - (void) cv_timedwait(&ill->ill_cv, - &ill->ill_lock, - lbolt + ENTER_SQ_WAIT_TICKS); - waited_enough = B_TRUE; - continue; - } else { - cv_wait(&ill->ill_cv, &ill->ill_lock); - } + (void) cv_timedwait(&ill->ill_cv, + &ill->ill_lock, lbolt + ENTER_SQ_WAIT_TICKS); + waited_enough = B_TRUE; } } - ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL); - ASSERT(ipsq->ipsq_reentry_cnt == 0); - ipsq->ipsq_writer = curthread; - ipsq->ipsq_reentry_cnt++; + 
ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); + ASSERT(ipx->ipx_reentry_cnt == 0); + ipx->ipx_writer = curthread; + ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); + ipx->ipx_reentry_cnt++; #ifdef DEBUG - ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IPSQ_STACK_DEPTH); + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); #endif + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); return (B_TRUE); @@ -7513,14 +7837,13 @@ ill_perim_exit(ill_t *ill) /* * The ipsq_t (ipsq) is the synchronization data structure used to serialize - * certain critical operations like plumbing (i.e. most set ioctls), - * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP - * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per - * IPMP group. The ipsq serializes exclusive ioctls issued by applications - * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple - * threads executing in the ipsq. Responses from the driver pertain to the - * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated - * as part of bringing up the interface) and are enqueued in ipsq_mphead. + * certain critical operations like plumbing (i.e. most set ioctls), multicast + * joins, igmp/mld timers, etc. There is one ipsq per phyint. The ipsq + * serializes exclusive ioctls issued by applications on a per ipsq basis in + * ipsq_xopq_mphead. It also protects against multiple threads executing in + * the ipsq. Responses from the driver pertain to the current ioctl (say a + * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing + * up the interface) and are enqueued in ipx_mphead. * * If a thread does not want to reenter the ipsq when it is already writer, * it must make sure that the specified reentry point to be called later @@ -7528,29 +7851,33 @@ ill_perim_exit(ill_t *ill) * point must never ever try to enter the ipsq again. 
Otherwise it can lead * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example. * When the thread that is currently exclusive finishes, it (ipsq_exit) - * dequeues the requests waiting to become exclusive in ipsq_mphead and calls - * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit + * dequeues the requests waiting to become exclusive in ipx_mphead and calls + * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next * ioctl if the current ioctl has completed. If the current ioctl is still * in progress it simply returns. The current ioctl could be waiting for - * a response from another module (arp_ or the driver or could be waiting for - * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp - * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the + * a response from another module (arp or the driver or could be waiting for + * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp + * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the * execution of the ioctl and ipsq_exit does not start the next ioctl unless - * ipsq_current_ipif is clear which happens only on ioctl completion. + * ipx_current_ipif is NULL which happens only once the ioctl is complete and + * all associated DLPI operations have completed. */ /* - * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of - * ipif or ill can be specified). The caller ensures ipif or ill is valid by - * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued - * completion. + * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif' + * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ + * on success, or NULL on failure. The caller ensures ipif/ill is valid by + * refholding it as necessary. 
If the IPSQ cannot be entered and `func' is + * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ + * can be entered. If `func' is NULL, then `q' and `mp' are ignored. */ ipsq_t * ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, boolean_t reentry_ok) { ipsq_t *ipsq; + ipxop_t *ipx; /* Only 1 of ipif or ill can be specified */ ASSERT((ipif != NULL) ^ (ill != NULL)); @@ -7558,13 +7885,15 @@ ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, ill = ipif->ipif_ill; /* - * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock - * ipsq of an ill can't change when ill_lock is held. + * lock ordering: conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. + * ipx of an ipsq can't change when ipsq_lock is held. */ GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); /* * 1. Enter the ipsq if we are already writer and reentry is ok. @@ -7572,30 +7901,32 @@ ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, * 'func' nor any of its callees must ever attempt to enter the ipsq * again. Otherwise it can lead to an infinite loop * 2. Enter the ipsq if there is no current writer and this attempted - * entry is part of the current ioctl or operation + * entry is part of the current operation * 3. 
Enter the ipsq if there is no current writer and this is a new - * ioctl (or operation) and the ioctl (or operation) queue is - * empty and there is no ioctl (or operation) currently in progress + * operation and the operation queue is empty and there is no + * operation currently in progress */ - if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) || - (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL && - ipsq->ipsq_current_ipif == NULL))) || - (ipsq->ipsq_writer == curthread && reentry_ok)) { + if ((ipx->ipx_writer == curthread && reentry_ok) || + (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && + !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL)))) { /* Success. */ - ipsq->ipsq_reentry_cnt++; - ipsq->ipsq_writer = curthread; + ipx->ipx_reentry_cnt++; + ipx->ipx_writer = curthread; + ipx->ipx_forced = B_FALSE; + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); RELEASE_CONN_LOCK(q); #ifdef DEBUG - ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, - IPSQ_STACK_DEPTH); + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); #endif return (ipsq); } - ipsq_enq(ipsq, q, mp, func, type, ill); + if (func != NULL) + ipsq_enq(ipsq, q, mp, func, type, ill); + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); RELEASE_CONN_LOCK(q); @@ -7630,188 +7961,58 @@ qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, } /* - * If there are more than ILL_GRP_CNT ills in a group, - * we use kmem alloc'd buffers, else use the stack - */ -#define ILL_GRP_CNT 14 -/* - * Drain the ipsq, if there are messages on it, and then leave the ipsq. - * Called by a thread that is currently exclusive on this ipsq. + * Exit the specified IPSQ. If this is the final exit on it then drain it + * prior to exiting. Caller must be writer on the specified IPSQ. 
*/ void ipsq_exit(ipsq_t *ipsq) { + mblk_t *mp; + ipsq_t *mp_ipsq; queue_t *q; - mblk_t *mp; - ipsq_func_t func; - int next; - ill_t **ill_list = NULL; - size_t ill_list_size = 0; - int cnt = 0; - boolean_t need_ipsq_free = B_FALSE; - ip_stack_t *ipst = ipsq->ipsq_ipst; + phyint_t *phyi; + ipsq_func_t func; ASSERT(IAM_WRITER_IPSQ(ipsq)); - mutex_enter(&ipsq->ipsq_lock); - ASSERT(ipsq->ipsq_reentry_cnt >= 1); - if (ipsq->ipsq_reentry_cnt != 1) { - ipsq->ipsq_reentry_cnt--; - mutex_exit(&ipsq->ipsq_lock); + + ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); + if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { + ipsq->ipsq_xop->ipx_reentry_cnt--; return; } - mp = ipsq_dq(ipsq); - while (mp != NULL) { -again: - mutex_exit(&ipsq->ipsq_lock); - func = (ipsq_func_t)mp->b_prev; - q = (queue_t *)mp->b_queue; - mp->b_prev = NULL; - mp->b_queue = NULL; - - /* - * If 'q' is an conn queue, it is valid, since we did a - * a refhold on the connp, at the start of the ioctl. - * If 'q' is an ill queue, it is valid, since close of an - * ill will clean up the 'ipsq'. - */ - (*func)(ipsq, q, mp, NULL); - - mutex_enter(&ipsq->ipsq_lock); + for (;;) { + phyi = ipsq->ipsq_phyint; mp = ipsq_dq(ipsq); - } - - mutex_exit(&ipsq->ipsq_lock); - - /* - * Need to grab the locks in the right order. Need to - * atomically check (under ipsq_lock) that there are no - * messages before relinquishing the ipsq. Also need to - * atomically wakeup waiters on ill_cv while holding ill_lock. - * Holding ill_g_lock ensures that ipsq list of ills is stable. - * If we need to call ill_split_ipsq and change <ill-ipsq> we need - * to grab ill_g_lock as writer. - */ - rw_enter(&ipst->ips_ill_g_lock, - ipsq->ipsq_split ? RW_WRITER : RW_READER); + mp_ipsq = (mp == NULL) ? 
NULL : (ipsq_t *)mp->b_next; - /* ipsq_refs can't change while ill_g_lock is held as reader */ - if (ipsq->ipsq_refs != 0) { - /* At most 2 ills v4/v6 per phyint */ - cnt = ipsq->ipsq_refs << 1; - ill_list_size = cnt * sizeof (ill_t *); /* - * If memory allocation fails, we will do the split - * the next time ipsq_exit is called for whatever reason. - * As long as the ipsq_split flag is set the need to - * split is remembered. + * If we've changed to a new IPSQ, and the phyint associated + * with the old one has gone away, free the old IPSQ. Note + * that this cannot happen while the IPSQ is in a group. */ - ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); - if (ill_list != NULL) - cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt); - } - mutex_enter(&ipsq->ipsq_lock); - mp = ipsq_dq(ipsq); - if (mp != NULL) { - /* oops, some message has landed up, we can't get out */ - if (ill_list != NULL) - ill_unlock_ills(ill_list, cnt); - rw_exit(&ipst->ips_ill_g_lock); - if (ill_list != NULL) - kmem_free(ill_list, ill_list_size); - ill_list = NULL; - ill_list_size = 0; - cnt = 0; - goto again; - } + if (mp_ipsq != ipsq && phyi == NULL) { + ASSERT(ipsq->ipsq_next == ipsq); + ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); + ipsq_delete(ipsq); + } - /* - * Split only if no ioctl is pending and if memory alloc succeeded - * above. - */ - if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL && - ill_list != NULL) { - /* - * No new ill can join this ipsq since we are holding the - * ill_g_lock. Hence ill_split_ipsq can safely traverse the - * ipsq. ill_split_ipsq may fail due to memory shortage. - * If so we will retry on the next ipsq_exit. - */ - ipsq->ipsq_split = ill_split_ipsq(ipsq); - } + if (mp == NULL) + break; - /* - * We are holding the ipsq lock, hence no new messages can - * land up on the ipsq, and there are no messages currently. - * Now safe to get out. Wake up waiters and relinquish ipsq - * atomically while holding ill locks. 
- */ - ipsq->ipsq_writer = NULL; - ipsq->ipsq_reentry_cnt--; - ASSERT(ipsq->ipsq_reentry_cnt == 0); -#ifdef DEBUG - ipsq->ipsq_depth = 0; -#endif - mutex_exit(&ipsq->ipsq_lock); - /* - * For IPMP this should wake up all ills in this ipsq. - * We need to hold the ill_lock while waking up waiters to - * avoid missed wakeups. But there is no need to acquire all - * the ill locks and then wakeup. If we have not acquired all - * the locks (due to memory failure above) ill_signal_ipsq_ills - * wakes up ills one at a time after getting the right ill_lock - */ - ill_signal_ipsq_ills(ipsq, ill_list != NULL); - if (ill_list != NULL) - ill_unlock_ills(ill_list, cnt); - if (ipsq->ipsq_refs == 0) - need_ipsq_free = B_TRUE; - rw_exit(&ipst->ips_ill_g_lock); - if (ill_list != 0) - kmem_free(ill_list, ill_list_size); + q = mp->b_queue; + func = (ipsq_func_t)mp->b_prev; + ipsq = mp_ipsq; + mp->b_next = mp->b_prev = NULL; + mp->b_queue = NULL; - if (need_ipsq_free) { /* - * Free the ipsq. ipsq_refs can't increase because ipsq can't be - * looked up. ipsq can be looked up only thru ill or phyint - * and there are no ills/phyint on this ipsq. + * If 'q' is an conn queue, it is valid, since we did a + * a refhold on the conn at the start of the ioctl. + * If 'q' is an ill queue, it is valid, since close of an + * ill will clean up its IPSQ. */ - ipsq_delete(ipsq); - } - - /* - * Now that we're outside the IPSQ, start any IGMP/MLD timers. We - * can't start these inside the IPSQ since e.g. igmp_start_timers() -> - * untimeout() (inside the IPSQ, waiting for an executing timeout to - * finish) could deadlock with igmp_timeout_handler() -> ipsq_enter() - * (executing the timeout, waiting to get inside the IPSQ). - * - * However, there is one exception to the above: if this thread *is* - * the IGMP/MLD timeout handler thread, then we must not start its - * timer until the current handler is done. 
- */ - mutex_enter(&ipst->ips_igmp_timer_lock); - if (curthread != ipst->ips_igmp_timer_thread) { - next = ipst->ips_igmp_deferred_next; - ipst->ips_igmp_deferred_next = INFINITY; - mutex_exit(&ipst->ips_igmp_timer_lock); - - if (next != INFINITY) - igmp_start_timers(next, ipst); - } else { - mutex_exit(&ipst->ips_igmp_timer_lock); - } - - mutex_enter(&ipst->ips_mld_timer_lock); - if (curthread != ipst->ips_mld_timer_thread) { - next = ipst->ips_mld_deferred_next; - ipst->ips_mld_deferred_next = INFINITY; - mutex_exit(&ipst->ips_mld_timer_lock); - - if (next != INFINITY) - mld_start_timers(next, ipst); - } else { - mutex_exit(&ipst->ips_mld_timer_lock); + (*func)(ipsq, q, mp, NULL); } } @@ -7822,15 +8023,17 @@ again: void ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) { + ipxop_t *ipx = ipsq->ipsq_xop; + ASSERT(IAM_WRITER_IPSQ(ipsq)); + ASSERT(ipx->ipx_current_ipif == NULL); + ASSERT(ipx->ipx_current_ioctl == 0); - mutex_enter(&ipsq->ipsq_lock); - ASSERT(ipsq->ipsq_current_ipif == NULL); - ASSERT(ipsq->ipsq_current_ioctl == 0); - ipsq->ipsq_current_done = B_FALSE; - ipsq->ipsq_current_ipif = ipif; - ipsq->ipsq_current_ioctl = ioccmd; - mutex_exit(&ipsq->ipsq_lock); + ipx->ipx_current_done = B_FALSE; + ipx->ipx_current_ioctl = ioccmd; + mutex_enter(&ipx->ipx_lock); + ipx->ipx_current_ipif = ipif; + mutex_exit(&ipx->ipx_lock); } /* @@ -7844,17 +8047,18 @@ ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) void ipsq_current_finish(ipsq_t *ipsq) { - ipif_t *ipif = ipsq->ipsq_current_ipif; + ipxop_t *ipx = ipsq->ipsq_xop; t_uscalar_t dlpi_pending = DL_PRIM_INVAL; + ipif_t *ipif = ipx->ipx_current_ipif; ASSERT(IAM_WRITER_IPSQ(ipsq)); /* - * For SIOCSLIFREMOVEIF, the ipif has been already been blown away + * For SIOCLIFREMOVEIF, the ipif has been already been blown away * (but in that case, IPIF_CHANGING will already be clear and no * pending DLPI messages can remain). 
*/ - if (ipsq->ipsq_current_ioctl != SIOCLIFREMOVEIF) { + if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) { ill_t *ill = ipif->ipif_ill; mutex_enter(&ill->ill_lock); @@ -7863,12 +8067,14 @@ ipsq_current_finish(ipsq_t *ipsq) mutex_exit(&ill->ill_lock); } - mutex_enter(&ipsq->ipsq_lock); - ipsq->ipsq_current_ioctl = 0; - ipsq->ipsq_current_done = B_TRUE; - if (dlpi_pending == DL_PRIM_INVAL) - ipsq->ipsq_current_ipif = NULL; - mutex_exit(&ipsq->ipsq_lock); + ASSERT(!ipx->ipx_current_done); + ipx->ipx_current_done = B_TRUE; + ipx->ipx_current_ioctl = 0; + if (dlpi_pending == DL_PRIM_INVAL) { + mutex_enter(&ipx->ipx_lock); + ipx->ipx_current_ipif = NULL; + mutex_exit(&ipx->ipx_lock); + } } /* @@ -7884,123 +8090,38 @@ ipsq_flush(ill_t *ill) mblk_t *prev; mblk_t *mp; mblk_t *mp_next; - ipsq_t *ipsq; + ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; ASSERT(IAM_WRITER_ILL(ill)); - ipsq = ill->ill_phyint->phyint_ipsq; + /* * Flush any messages sent up by the driver. */ - mutex_enter(&ipsq->ipsq_lock); - for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) { + mutex_enter(&ipx->ipx_lock); + for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) { mp_next = mp->b_next; q = mp->b_queue; if (q == ill->ill_rq || q == ill->ill_wq) { - /* Remove the mp from the ipsq */ + /* dequeue mp */ if (prev == NULL) - ipsq->ipsq_mphead = mp->b_next; + ipx->ipx_mphead = mp->b_next; else prev->b_next = mp->b_next; - if (ipsq->ipsq_mptail == mp) { + if (ipx->ipx_mptail == mp) { ASSERT(mp_next == NULL); - ipsq->ipsq_mptail = prev; + ipx->ipx_mptail = prev; } inet_freemsg(mp); } else { prev = mp; } } - mutex_exit(&ipsq->ipsq_lock); + mutex_exit(&ipx->ipx_lock); (void) ipsq_pending_mp_cleanup(ill, NULL); ipsq_xopq_mp_cleanup(ill, NULL); ill_pending_mp_cleanup(ill); } -/* ARGSUSED */ -int -ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *ifreq) -{ - ill_t *ill; - struct lifreq *lifr = (struct lifreq *)ifreq; - 
boolean_t isv6; - conn_t *connp; - ip_stack_t *ipst; - - connp = Q_TO_CONN(q); - ipst = connp->conn_netstack->netstack_ip; - isv6 = connp->conn_af_isv6; - /* - * Set original index. - * Failover and failback move logical interfaces - * from one physical interface to another. The - * original index indicates the parent of a logical - * interface, in other words, the physical interface - * the logical interface will be moved back to on - * failback. - */ - - /* - * Don't allow the original index to be changed - * for non-failover addresses, autoconfigured - * addresses, or IPv6 link local addresses. - */ - if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != NULL) || - (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) { - return (EINVAL); - } - /* - * The new original index must be in use by some - * physical interface. - */ - ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL, - NULL, NULL, ipst); - if (ill == NULL) - return (ENXIO); - ill_refrele(ill); - - ipif->ipif_orig_ifindex = lifr->lifr_index; - /* - * When this ipif gets failed back, don't - * preserve the original id, as it is no - * longer applicable. - */ - ipif->ipif_orig_ipifid = 0; - /* - * For IPv4, change the original index of any - * multicast addresses associated with the - * ipif to the new value. - */ - if (!isv6) { - ilm_t *ilm; - - mutex_enter(&ipif->ipif_ill->ill_lock); - for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_ipif == ipif) { - ilm->ilm_orig_ifindex = lifr->lifr_index; - } - } - mutex_exit(&ipif->ipif_ill->ill_lock); - } - return (0); -} - -/* ARGSUSED */ -int -ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *ifreq) -{ - struct lifreq *lifr = (struct lifreq *)ifreq; - - /* - * Get the original interface index i.e the one - * before FAILOVER if it ever happened. 
- */ - lifr->lifr_index = ipif->ipif_orig_ifindex; - return (0); -} - /* * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls, * refhold and return the associated ipif @@ -8087,8 +8208,6 @@ int ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, cmd_info_t *ci, ipsq_func_t func) { - sin_t *sin; - sin6_t *sin6; char *name; struct ifreq *ifr; struct lifreq *lifr; @@ -8132,9 +8251,8 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, * be trusted. */ ifr->ifr_name[IFNAMSIZ - 1] = '\0'; - sin = (sin_t *)&ifr->ifr_addr; name = ifr->ifr_name; - ci->ci_sin = sin; + ci->ci_sin = (sin_t *)&ifr->ifr_addr; ci->ci_sin6 = NULL; ci->ci_lifr = (struct lifreq *)ifr; } else { @@ -8148,14 +8266,8 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, */ lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; name = lifr->lifr_name; - sin = (sin_t *)&lifr->lifr_addr; - sin6 = (sin6_t *)&lifr->lifr_addr; - if (ipip->ipi_cmd == SIOCSLIFGROUPNAME) { - (void) strncpy(ci->ci_groupname, lifr->lifr_groupname, - LIFNAMSIZ); - } - ci->ci_sin = sin; - ci->ci_sin6 = sin6; + ci->ci_sin = (sin_t *)&lifr->lifr_addr; + ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr; ci->ci_lifr = lifr; } @@ -8181,21 +8293,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, if (ipif == NULL) { if (err == EINPROGRESS) return (err); - if (ipip->ipi_cmd == SIOCLIFFAILOVER || - ipip->ipi_cmd == SIOCLIFFAILBACK) { - /* - * Need to try both v4 and v6 since this - * ioctl can come down either v4 or v6 - * socket. The lifreq.lifr_family passed - * down by this ioctl is AF_UNSPEC. - */ - ipif = ipif_lookup_on_name(name, - mi_strlen(name), B_FALSE, &exists, !isv6, - zoneid, (connp == NULL) ? 
q : - CONNP_TO_WQ(connp), mp, func, &err, ipst); - if (err == EINPROGRESS) - return (err); - } err = 0; /* Ensure we don't use it below */ } } @@ -8221,15 +8318,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, if (ipif == NULL) return (ENXIO); - /* - * Allow only GET operations if this ipif has been created - * temporarily due to a MOVE operation. - */ - if (ipif->ipif_replace_zero && !(ipip->ipi_flags & IPI_REPL)) { - ipif_refrele(ipif); - return (EINVAL); - } - ci->ci_ipif = ipif; return (0); } @@ -8247,15 +8335,15 @@ ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); - - while (ill != NULL) { + for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill)) + continue; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (ipif->ipif_zoneid == zoneid || ipif->ipif_zoneid == ALL_ZONES) numifs++; } - ill = ill_next(&ctx, ill); } rw_exit(&ipst->ips_ill_g_lock); return (numifs); @@ -8283,6 +8371,9 @@ ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) ill = ILL_START_WALK_ALL(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) + continue; + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if ((ipif->ipif_flags & IPIF_NOXMIT) && @@ -8491,6 +8582,8 @@ ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill)) + continue; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (zoneid != ipif->ipif_zoneid && @@ -8760,6 +8853,9 @@ ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ill_first(list, list, &ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if 
(IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP)) + continue; + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if ((ipif->ipif_flags & IPIF_NOXMIT) && @@ -8795,6 +8891,7 @@ ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name)); + lifr->lifr_type = ill->ill_type; if (ipif->ipif_isv6) { sin6 = (sin6_t *)&lifr->lifr_addr; *sin6 = sin6_null; @@ -8828,23 +8925,6 @@ lif_copydone: return (0); } -/* ARGSUSED */ -int -ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin, - queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) -{ - ip_stack_t *ipst; - - if (q->q_next == NULL) - ipst = CONNQ_TO_IPST(q); - else - ipst = ILLQ_TO_IPST(q); - - /* Existence of b_cont->b_cont checked in ip_wput_nondata */ - ipst->ips_ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr; - return (0); -} - static void ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) { @@ -9038,8 +9118,7 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); } else { src_ipif = ipif_select_source_v6(dst_ill, - daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, - zoneid); + daddr, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); } if (src_ipif == NULL) goto next_dst; @@ -9325,10 +9404,14 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, struct arpreq *ar; struct xarpreq *xar; int flags, alength; - char *lladdr; - ip_stack_t *ipst; + uchar_t *lladdr; + ire_t *ire; + ip_stack_t *ipst; ill_t *ill = ipif->ipif_ill; + ill_t *proxy_ill = NULL; + ipmp_arpent_t *entp = NULL; boolean_t if_arp_ioctl = B_FALSE; + boolean_t proxyarp = B_FALSE; ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); connp = Q_TO_CONN(q); @@ -9340,7 +9423,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ar = NULL; flags = xar->xarp_flags; - lladdr = LLADDR(&xar->xarp_ha); + lladdr = (uchar_t *)LLADDR(&xar->xarp_ha); if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 
0); /* * Validate against user's link layer address length @@ -9359,7 +9442,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, xar = NULL; flags = ar->arp_flags; - lladdr = ar->arp_ha.sa_data; + lladdr = (uchar_t *)ar->arp_ha.sa_data; /* * Theoretically, the sa_family could tell us what link * layer type this operation is trying to deal with. By @@ -9379,6 +9462,51 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } } + ipaddr = sin->sin_addr.s_addr; + + /* + * IPMP ARP special handling: + * + * 1. Since ARP mappings must appear consistent across the group, + * prohibit changing ARP mappings on the underlying interfaces. + * + * 2. Since ARP mappings for IPMP data addresses are maintained by + * IP itself, prohibit changing them. + * + * 3. For proxy ARP, use a functioning hardware address in the group, + * provided one exists. If one doesn't, just add the entry as-is; + * ipmp_illgrp_refresh_arpent() will refresh it if things change. + */ + if (IS_UNDER_IPMP(ill)) { + if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP) + return (EPERM); + } + if (IS_IPMP(ill)) { + ipmp_illgrp_t *illg = ill->ill_grp; + + switch (ipip->ipi_cmd) { + case SIOCSARP: + case SIOCSXARP: + proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength); + if (proxy_ill != NULL) { + proxyarp = B_TRUE; + if (!ipmp_ill_is_active(proxy_ill)) + proxy_ill = ipmp_illgrp_next_ill(illg); + if (proxy_ill != NULL) + lladdr = proxy_ill->ill_phys_addr; + } + /* FALLTHRU */ + case SIOCDARP: + case SIOCDXARP: + ire = ire_ctable_lookup(ipaddr, 0, IRE_LOCAL, NULL, + ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); + if (ire != NULL) { + ire_refrele(ire); + return (EPERM); + } + } + } + /* * We are going to pass up to ARP a packet chain that looks * like: @@ -9400,8 +9528,6 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (ENOMEM); } - ipaddr = sin->sin_addr.s_addr; - mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, (caddr_t)&ipaddr); if (mp2 == 
NULL) { @@ -9481,6 +9607,30 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, area->area_flags |= ACE_F_AUTHORITY; /* + * If this is a permanent AR_ENTRY_ADD on the IPMP interface, track it + * so that IP can update ARP as the active ills in the group change. + */ + if (IS_IPMP(ill) && area->area_cmd == AR_ENTRY_ADD && + (area->area_flags & ACE_F_PERMANENT)) { + entp = ipmp_illgrp_create_arpent(ill->ill_grp, mp2, proxyarp); + + /* + * The second part of the conditional below handles a corner + * case: if this is proxy ARP and the IPMP group has no active + * interfaces, we can't send the request to ARP now since it + * won't be able to build an ACE. So we return success and + * notify ARP about the proxy ARP entry once an interface + * becomes active. + */ + if (entp == NULL || (proxyarp && proxy_ill == NULL)) { + mp2->b_cont = NULL; + inet_freemsg(mp1); + inet_freemsg(pending_mp); + return (entp == NULL ? ENOMEM : 0); + } + } + + /* * Before sending 'mp' to ARP, we have to clear the b_next * and b_prev. Otherwise if STREAMS encounters such a message * in freemsg(), (because ARP can close any time) it can cause @@ -9497,7 +9647,12 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, mutex_enter(&connp->conn_lock); mutex_enter(&ill->ill_lock); /* conn has not yet started closing, hence this can't fail */ - VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0); + if (ipip->ipi_flags & IPI_WR) { + VERIFY(ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), + pending_mp, 0) != 0); + } else { + VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0); + } mutex_exit(&ill->ill_lock); mutex_exit(&connp->conn_lock); @@ -9506,6 +9661,13 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * M_IOCACK, and will be handed to ip_sioctl_iocack() for completion. */ putnext(ill->ill_rq, mp1); + + /* + * If we created an IPMP ARP entry, mark that we've notified ARP. 
+ */ + if (entp != NULL) + ipmp_illgrp_mark_arpent(ill->ill_grp, entp); + return (EINPROGRESS); } @@ -9564,55 +9726,114 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, mp, func, &err, ipst); if (ipif == NULL) return (err); - if (ipif->ipif_id != 0 || - ipif->ipif_net_type != IRE_IF_RESOLVER) { + if (ipif->ipif_id != 0) { ipif_refrele(ipif); return (ENXIO); } } else { /* - * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with sdl_nlen == - * 0: use the IP address to figure out the ill. In the IPMP - * case, a simple forwarding table lookup will return the - * IRE_IF_RESOLVER for the first interface in the group, which - * might not be the interface on which the requested IP - * address was resolved due to the ill selection algorithm - * (see ip_newroute_get_dst_ill()). So we do a cache table - * lookup first: if the IRE cache entry for the IP address is - * still there, it will contain the ill pointer for the right - * interface, so we use that. If the cache entry has been - * flushed, we fall back to the forwarding table lookup. This - * should be rare enough since IRE cache entries have a longer - * life expectancy than ARP cache entries. + * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen + * of 0: use the IP address to find the ipif. If the IP + * address is an IPMP test address, ire_ftable_lookup() will + * find the wrong ill, so we first do an ipif_lookup_addr(). 
*/ - ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL, - ipst); - if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || - ((ill = ire_to_ill(ire)) == NULL) || - (ill->ill_net_type != IRE_IF_RESOLVER)) { - if (ire != NULL) - ire_refrele(ire); - ire = ire_ftable_lookup(sin->sin_addr.s_addr, - 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, - NULL, MATCH_IRE_TYPE, ipst); + ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES, + CONNP_TO_WQ(connp), mp, func, &err, ipst); + if (ipif == NULL) { + ire = ire_ftable_lookup(sin->sin_addr.s_addr, 0, 0, + IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, NULL, + MATCH_IRE_TYPE, ipst); if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) { - if (ire != NULL) ire_refrele(ire); return (ENXIO); } + ipif = ill->ill_ipif; + ipif_refhold(ipif); + ire_refrele(ire); } - ASSERT(ire != NULL && ill != NULL); - ipif = ill->ill_ipif; - ipif_refhold(ipif); - ire_refrele(ire); } + + if (ipif->ipif_net_type != IRE_IF_RESOLVER) { + ipif_refrele(ipif); + return (ENXIO); + } + ci->ci_sin = sin; ci->ci_ipif = ipif; return (0); } /* + * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the + * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is + * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it + * up and thus an ill can join that illgrp. + * + * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than + * open()/close() primarily because close() is not allowed to fail or block + * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason + * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure + * symmetric behavior (e.g., doing an I_PLINK after and I_PUNLINK undoes the + * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts + * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent + * state if I_UNLINK didn't occur. 
+ * + * Note that for each plumb/unplumb operation, we may end up here more than + * once because of the way ifconfig works. However, it's OK to link the same + * illgrp more than once, or unlink an illgrp that's already unlinked. + */ +static int +ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) +{ + int err; + ip_stack_t *ipst = ill->ill_ipst; + + ASSERT(IS_IPMP(ill)); + ASSERT(IAM_WRITER_ILL(ill)); + + switch (ioccmd) { + case I_LINK: + return (ENOTSUP); + + case I_PLINK: + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp); + rw_exit(&ipst->ips_ipmp_lock); + break; + + case I_PUNLINK: + /* + * Require all UP ipifs be brought down prior to unlinking the + * illgrp so any associated IREs (and other state) is torched. + */ + if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) + return (EBUSY); + + /* + * NOTE: We hold ipmp_lock across the unlink to prevent a race + * with an SIOCSLIFGROUPNAME request from an ill trying to + * join this group. Specifically: ills trying to join grab + * ipmp_lock and bump a "pending join" counter checked by + * ipmp_illgrp_unlink_grp(). During the unlink no new pending + * joins can occur (since we have ipmp_lock). Once we drop + * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not + * find the illgrp (since we unlinked it) and will return + * EAFNOSUPPORT. This will then take them back through the + * IPMP meta-interface plumbing logic in ifconfig, and thus + * back through I_PLINK above. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + err = ipmp_illgrp_unlink_grp(ill->ill_grp); + rw_exit(&ipst->ips_ipmp_lock); + return (err); + default: + break; + } + return (0); +} + +/* * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also * atomically set/clear the muxids. Also complete the ioctl by acking or * naking it. 
Note that the code is structured such that the link type, @@ -9697,7 +9918,7 @@ ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) if (ipsq == NULL) { ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, - NEW_OP, B_TRUE); + NEW_OP, B_FALSE); if (ipsq == NULL) { ill_refrele(ill); return; @@ -9728,6 +9949,11 @@ ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) err = EINVAL; goto done; } + + if (IS_IPMP(ill) && + (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) + goto done; + ill->ill_arp_muxid = islink ? li->l_index : 0; } else { /* @@ -9763,6 +9989,7 @@ static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, struct linkblk *li, boolean_t doconsist) { + int err = 0; ill_t *ill; queue_t *ipwq, *dwq; const char *name; @@ -9796,7 +10023,7 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, if (ipsq == NULL) { ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, - NEW_OP, B_TRUE); + NEW_OP, B_FALSE); if (ipsq == NULL) return (EINPROGRESS); entered_ipsq = B_TRUE; @@ -9811,12 +10038,14 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, */ if ((islink && ill->ill_ip_muxid != 0) || (!islink && ill->ill_arp_muxid != 0)) { - if (entered_ipsq) - ipsq_exit(ipsq); - return (EINVAL); + err = EINVAL; + goto done; } } + if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) + goto done; + /* * As part of I_{P}LINKing, stash the number of downstream modules and * the read queue of the module immediately below IP in the ill. 
@@ -9853,11 +10082,11 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, ill_capability_reset(ill, B_FALSE); } ipsq_current_finish(ipsq); - +done: if (entered_ipsq) ipsq_exit(ipsq); - return (0); + return (err); } /* @@ -10124,8 +10353,9 @@ nak: } /* ip_wput hands off ARP IOCTL responses to us */ +/* ARGSUSED3 */ void -ip_sioctl_iocack(queue_t *q, mblk_t *mp) +ip_sioctl_iocack(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) { struct arpreq *ar; struct xarpreq *xar; @@ -10136,7 +10366,6 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) struct iocblk *orig_iocp; ill_t *ill; conn_t *connp = NULL; - uint_t ioc_id; mblk_t *pending_mp; int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE; int *flagsp; @@ -10146,6 +10375,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) int err; ip_stack_t *ipst; + ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); ill = q->q_ptr; ASSERT(ill != NULL); ipst = ill->ill_ipst; @@ -10185,10 +10415,14 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) iocp = (struct iocblk *)mp->b_rptr; /* - * Pick out the originating queue based on the ioc_id. + * Find the pending message; if we're exclusive, it'll be on our IPSQ. + * Otherwise, we can find it from our ioc_id. */ - ioc_id = iocp->ioc_id; - pending_mp = ill_pending_mp_get(ill, &connp, ioc_id); + if (ipsq != NULL) + pending_mp = ipsq_pending_mp_get(ipsq, &connp); + else + pending_mp = ill_pending_mp_get(ill, &connp, iocp->ioc_id); + if (pending_mp == NULL) { ASSERT(connp == NULL); inet_freemsg(mp); @@ -10271,7 +10505,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) ire_refrele(ire); freemsg(mp); ip_ioctl_finish(q, orig_ioc_mp, - EINVAL, NO_COPYOUT, NULL); + EINVAL, NO_COPYOUT, ipsq); return; } *flagsp |= ATF_COM; @@ -10297,12 +10531,27 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) /* Ditch the internal IOCTL. 
*/ freemsg(mp); ire_refrele(ire); - ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); + ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); return; } } /* + * If this was a failed AR_ENTRY_ADD or a successful AR_ENTRY_DELETE + * on the IPMP meta-interface, ensure any ARP entries added in + * ip_sioctl_arp() are deleted. + */ + if (IS_IPMP(ill) && + ((iocp->ioc_error != 0 && iocp->ioc_cmd == AR_ENTRY_ADD) || + ((iocp->ioc_error == 0 && iocp->ioc_cmd == AR_ENTRY_DELETE)))) { + ipmp_illgrp_t *illg = ill->ill_grp; + ipmp_arpent_t *entp; + + if ((entp = ipmp_illgrp_lookup_arpent(illg, &addr)) != NULL) + ipmp_illgrp_destroy_arpent(illg, entp); + } + + /* * Delete the coresponding IRE_CACHE if any. * Reset the error if there was one (in case there was no entry * in arp.) @@ -10341,7 +10590,7 @@ errack: if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { err = iocp->ioc_error; freemsg(mp); - ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL); + ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, ipsq); return; } @@ -10355,7 +10604,7 @@ errack: sizeof (xar->xarp_ha.sdl_data)) { freemsg(mp); ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT, - NULL); + ipsq); return; } } @@ -10382,7 +10631,7 @@ errack: /* Ditch the internal IOCTL. */ freemsg(mp); /* Complete the original. */ - ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); + ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); } /* @@ -10397,7 +10646,7 @@ errack: * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. * - * Executed as a writer on the ill or ill group. + * Executed as a writer on the ill. * So no lock is needed to traverse the ipif chain, or examine the * phyint flags. 
*/ @@ -10423,7 +10672,6 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, boolean_t found_sep = B_FALSE; conn_t *connp; zoneid_t zoneid; - int orig_ifindex = 0; ip_stack_t *ipst = CONNQ_TO_IPST(q); ASSERT(q->q_next == NULL); @@ -10513,61 +10761,10 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, if (ipsq == NULL) return (EINPROGRESS); - /* - * If the interface is failed, inactive or offlined, look for a working - * interface in the ill group and create the ipif there. If we can't - * find a good interface, create the ipif anyway so that in.mpathd can - * move it to the first repaired interface. - */ - if ((ill->ill_phyint->phyint_flags & - (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && - ill->ill_phyint->phyint_groupname_len != 0) { - phyint_t *phyi; - char *groupname = ill->ill_phyint->phyint_groupname; - - /* - * We're looking for a working interface, but it doesn't matter - * if it's up or down; so instead of following the group lists, - * we look at each physical interface and compare the groupname. - * We're only interested in interfaces with IPv4 (resp. IPv6) - * plumbed when we're adding an IPv4 (resp. IPv6) ipif. - * Otherwise we create the ipif on the failed interface. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - phyi = avl_first(&ipst->ips_phyint_g_list-> - phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list-> - phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - if (phyi->phyint_groupname_len == 0) - continue; - ASSERT(phyi->phyint_groupname != NULL); - if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 && - !(phyi->phyint_flags & - (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && - (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) : - (phyi->phyint_illv4 != NULL))) { - break; - } - } - rw_exit(&ipst->ips_ill_g_lock); - - if (phyi != NULL) { - orig_ifindex = ill->ill_phyint->phyint_ifindex; - ill = (ill->ill_isv6 ? 
phyi->phyint_illv6 : - phyi->phyint_illv4); - } - } - - /* - * We are now exclusive on the ipsq, so an ill move will be serialized - * before or after us. - */ + /* We are now exclusive on the IPSQ */ ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ill->ill_move_in_progress == B_FALSE); - if (found_sep && orig_ifindex == 0) { + if (found_sep) { /* Now see if there is an IPIF with this unit number. */ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -10580,14 +10777,11 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, /* * We use IRE_LOCAL for lo0:1 etc. for "receive only" use - * of lo0. We never come here when we plumb lo0:0. It - * happens in ipif_lookup_on_name. - * The specified unit number is ignored when we create the ipif on a - * different interface. However, we save it in ipif_orig_ipifid below so - * that the ipif fails back to the right position. - */ - if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ? - id : -1, IRE_LOCAL, B_TRUE)) == NULL) { + * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name() + * instead. + */ + if ((ipif = ipif_allocate(ill, found_sep ? 
id : -1, IRE_LOCAL, + B_TRUE, B_TRUE)) == NULL) { err = ENOBUFS; goto done; } @@ -10604,14 +10798,6 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); } - /* Set ifindex and unit number for failback */ - if (err == 0 && orig_ifindex != 0) { - ipif->ipif_orig_ifindex = orig_ifindex; - if (found_sep) { - ipif->ipif_orig_ipifid = id; - } - } - done: ipsq_exit(ipsq); return (err); @@ -10672,7 +10858,6 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill_delete(ill); mutex_enter(&connp->conn_lock); mutex_enter(&ill->ill_lock); - ASSERT(ill->ill_group == NULL); /* Are any references to this ill active */ if (ill_is_freeable(ill)) { @@ -10693,14 +10878,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } } - /* - * We are exclusive on the ipsq, so an ill move will be serialized - * before or after us. - */ - ASSERT(ill->ill_move_in_progress == B_FALSE); - if (ipif->ipif_id == 0) { - ipsq_t *ipsq; /* Find based on address */ @@ -10712,35 +10890,15 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, sin6 = (sin6_t *)sin; /* We are a writer, so we should be able to lookup */ - ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, - ill, ALL_ZONES, NULL, NULL, NULL, NULL, ipst); - if (ipif == NULL) { - /* - * Maybe the address in on another interface in - * the same IPMP group? We check this below. - */ - ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, - NULL, ALL_ZONES, NULL, NULL, NULL, NULL, - ipst); - } + ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill, + ipst); } else { - ipaddr_t addr; - if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); - addr = sin->sin_addr.s_addr; /* We are a writer, so we should be able to lookup */ - ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL, - NULL, NULL, NULL, ipst); - if (ipif == NULL) { - /* - * Maybe the address in on another interface in - * the same IPMP group? We check this below. 
- */ - ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES, - NULL, NULL, NULL, NULL, ipst); - } + ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill, + ipst); } if (ipif == NULL) { return (EADDRNOTAVAIL); @@ -10750,32 +10908,11 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * It is possible for a user to send an SIOCLIFREMOVEIF with * lifr_name of the physical interface but with an ip address * lifr_addr of a logical interface plumbed over it. - * So update ipsq_current_ipif once ipif points to the - * correct interface after doing ipif_lookup_addr(). + * So update ipx_current_ipif now that ipif points to the + * correct one. */ ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; - ASSERT(ipsq != NULL); - - mutex_enter(&ipsq->ipsq_lock); - ipsq->ipsq_current_ipif = ipif; - mutex_exit(&ipsq->ipsq_lock); - - /* - * When the address to be removed is hosted on a different - * interface, we check if the interface is in the same IPMP - * group as the specified one; if so we proceed with the - * removal. - * ill->ill_group is NULL when the ill is down, so we have to - * compare the group names instead. 
- */ - if (ipif->ipif_ill != ill && - (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 || - ill->ill_phyint->phyint_groupname_len == 0 || - mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname, - ill->ill_phyint->phyint_groupname) != 0)) { - ipif_refrele(ipif); - return (EADDRNOTAVAIL); - } + ipsq->ipsq_xop->ipx_current_ipif = ipif; /* This is a writer */ ipif_refrele(ipif); @@ -11072,7 +11209,7 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (need_dl_down) ill_dl_down(ill); if (need_arp_down) - ipif_arp_down(ipif); + ipif_resolver_down(ipif); return (err); } @@ -11272,9 +11409,9 @@ ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (need_dl_down) ill_dl_down(ill); - if (need_arp_down) - ipif_arp_down(ipif); + ipif_resolver_down(ipif); + return (err); } @@ -11323,144 +11460,8 @@ ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } /* - * part of ipmp, make this func return the active/inactive state and - * caller can set once atomically instead of multiple mutex_enter/mutex_exit - */ -/* - * This function either sets or clears the IFF_INACTIVE flag. - * - * As long as there are some addresses or multicast memberships on the - * IPv4 or IPv6 interface of the "phyi" that does not belong in here, we - * will consider it to be ACTIVE (clear IFF_INACTIVE) i.e the interface - * will be used for outbound packets. - * - * Caller needs to verify the validity of setting IFF_INACTIVE. 
- */ -static void -phyint_inactive(phyint_t *phyi) -{ - ill_t *ill_v4; - ill_t *ill_v6; - ipif_t *ipif; - ilm_t *ilm; - - ill_v4 = phyi->phyint_illv4; - ill_v6 = phyi->phyint_illv6; - - /* - * No need for a lock while traversing the list since iam - * a writer - */ - if (ill_v4 != NULL) { - ASSERT(IAM_WRITER_ILL(ill_v4)); - for (ipif = ill_v4->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - for (ilm = ill_v4->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - } - if (ill_v6 != NULL) { - ill_v6 = phyi->phyint_illv6; - for (ipif = ill_v6->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - for (ilm = ill_v6->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - } - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags |= PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); -} - -/* - * This function is called only when the phyint flags change. Currently - * called from ip_sioctl_flags. We re-do the broadcast nomination so - * that we can select a good ill. 
- */ -static void -ip_redo_nomination(phyint_t *phyi) -{ - ill_t *ill_v4; - - ill_v4 = phyi->phyint_illv4; - - if (ill_v4 != NULL && ill_v4->ill_group != NULL) { - ASSERT(IAM_WRITER_ILL(ill_v4)); - if (ill_v4->ill_group->illgrp_ill_count > 1) - ill_nominate_bcast_rcv(ill_v4->ill_group); - } -} - -/* - * Heuristic to check if ill is INACTIVE. - * Checks if ill has an ipif with an usable ip address. - * - * Return values: - * B_TRUE - ill is INACTIVE; has no usable ipif - * B_FALSE - ill is not INACTIVE; ill has at least one usable ipif - */ -static boolean_t -ill_is_inactive(ill_t *ill) -{ - ipif_t *ipif; - - /* Check whether it is in an IPMP group */ - if (ill->ill_phyint->phyint_groupname == NULL) - return (B_FALSE); - - if (ill->ill_ipif_up_count == 0) - return (B_TRUE); - - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - uint64_t flags = ipif->ipif_flags; - - /* - * This ipif is usable if it is IPIF_UP and not a - * dedicated test address. A dedicated test address - * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED - * (note in particular that V6 test addresses are - * link-local data addresses and thus are marked - * IPIF_NOFAILOVER but not IPIF_DEPRECATED). - */ - if ((flags & IPIF_UP) && - ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) != - (IPIF_DEPRECATED|IPIF_NOFAILOVER))) - return (B_FALSE); - } - return (B_TRUE); -} - -/* - * Set interface flags. - * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, - * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST, - * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE. + * Set interface flags. Many flags require special handling (e.g., + * bringing the interface down); see below for details. * * NOTE : We really don't enforce that ipif_id zero should be used * for setting any flags other than IFF_LOGINT_FLAGS. 
This @@ -11478,17 +11479,16 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, { uint64_t turn_on; uint64_t turn_off; - int err; + int err = 0; phyint_t *phyi; ill_t *ill; - uint64_t intf_flags; + uint64_t intf_flags, cantchange_flags; boolean_t phyint_flags_modified = B_FALSE; uint64_t flags; struct ifreq *ifr; struct lifreq *lifr; boolean_t set_linklocal = B_FALSE; boolean_t zero_source = B_FALSE; - ip_stack_t *ipst; ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -11497,11 +11497,10 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill = ipif->ipif_ill; phyi = ill->ill_phyint; - ipst = ill->ill_ipst; if (ipip->ipi_cmd_type == IF_CMD) { ifr = (struct ifreq *)if_req; - flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); + flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); } else { lifr = (struct lifreq *)if_req; flags = lifr->lifr_flags; @@ -11524,25 +11523,60 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, flags |= intf_flags & ~0xFFFF; /* - * First check which bits will change and then which will - * go on and off + * Explicitly fail attempts to change flags that are always invalid on + * an IPMP meta-interface. */ - turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE; - if (!turn_on) + if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID)) + return (EINVAL); + + /* + * Check which flags will change; silently ignore flags which userland + * is not allowed to control. (Because these flags may change between + * SIOCGLIFFLAGS and SIOCSLIFFLAGS, and that's outside of userland's + * control, we need to silently ignore them rather than fail.) 
+ */ + cantchange_flags = IFF_CANTCHANGE; + if (IS_IPMP(ill)) + cantchange_flags |= IFF_IPMP_CANTCHANGE; + + turn_on = (flags ^ intf_flags) & ~cantchange_flags; + if (turn_on == 0) return (0); /* No change */ turn_off = intf_flags & turn_on; turn_on ^= turn_off; - err = 0; /* - * Don't allow any bits belonging to the logical interface - * to be set or cleared on the replacement ipif that was - * created temporarily during a MOVE. + * All test addresses must be IFF_DEPRECATED (to ensure source address + * selection avoids them) -- so force IFF_DEPRECATED on, and do not + * allow it to be turned off. */ - if (ipif->ipif_replace_zero && - ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) { + if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED && + (turn_on|intf_flags) & IFF_NOFAILOVER) return (EINVAL); + + if (turn_on & IFF_NOFAILOVER) { + turn_on |= IFF_DEPRECATED; + flags |= IFF_DEPRECATED; + } + + /* + * On underlying interfaces, only allow applications to manage test + * addresses -- otherwise, they may get confused when the address + * moves as part of being brought up. Likewise, prevent an + * application-managed test address from being converted to a data + * address. To prevent migration of administratively up addresses in + * the kernel, we don't allow them to be converted either. + */ + if (IS_UNDER_IPMP(ill)) { + const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF; + + if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER)) + return (EINVAL); + + if ((turn_off & IFF_NOFAILOVER) && + (flags & (appflags | IFF_UP | IFF_DUPLICATE))) + return (EINVAL); } /* @@ -11583,16 +11617,6 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } /* - * ILL cannot be part of a usesrc group and and IPMP group at the - * same time. 
No need to grab ill_g_usesrc_lock here, see - * synchronization notes in ip.c - */ - if (turn_on & PHYI_STANDBY && - ipif->ipif_ill->ill_usesrc_grp_next != NULL) { - return (EINVAL); - } - - /* * If we modify physical interface flags, we'll potentially need to * send up two routing socket messages for the changes (one for the * IPv4 ill, and another for the IPv6 ill). Note that here. @@ -11601,98 +11625,44 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, phyint_flags_modified = B_TRUE; /* - * If we are setting or clearing FAILED or STANDBY or OFFLINE, - * we need to flush the IRE_CACHES belonging to this ill. - * We handle this case here without doing the DOWN/UP dance - * like it is done for other flags. If some other flags are - * being turned on/off with FAILED/STANDBY/OFFLINE, the code - * below will handle it by bringing it down and then - * bringing it UP. + * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE + * (otherwise, we'd immediately use them, defeating standby). Also, + * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not + * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already + * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We + * also don't allow PHYI_STANDBY if VNI is enabled since its semantics + * will not be honored. */ - if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) { - ill_t *ill_v4, *ill_v6; - - ill_v4 = phyi->phyint_illv4; - ill_v6 = phyi->phyint_illv6; - + if (turn_on & PHYI_STANDBY) { /* - * First set the INACTIVE flag if needed. Then delete the ires. - * ire_add will atomically prevent creating new IRE_CACHEs - * unless hidden flag is set. - * PHYI_FAILED and PHYI_INACTIVE are exclusive + * No need to grab ill_g_usesrc_lock here; see the + * synchronization notes in ip.c. 
*/ - if ((turn_on & PHYI_FAILED) && - ((intf_flags & PHYI_STANDBY) || - !ipst->ips_ipmp_enable_failback)) { - /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */ - phyi->phyint_flags &= ~PHYI_INACTIVE; - } - if ((turn_off & PHYI_FAILED) && - ((intf_flags & PHYI_STANDBY) || - (!ipst->ips_ipmp_enable_failback && - ill_is_inactive(ill)))) { - phyint_inactive(phyi); - } - - if (turn_on & PHYI_STANDBY) { - /* - * We implicitly set INACTIVE only when STANDBY is set. - * INACTIVE is also set on non-STANDBY phyint when user - * disables FAILBACK using configuration file. - * Do not allow STANDBY to be set on such INACTIVE - * phyint - */ - if (phyi->phyint_flags & PHYI_INACTIVE) - return (EINVAL); - if (!(phyi->phyint_flags & PHYI_FAILED)) - phyint_inactive(phyi); - } - if (turn_off & PHYI_STANDBY) { - if (ipst->ips_ipmp_enable_failback) { - /* - * Reset PHYI_INACTIVE. - */ - phyi->phyint_flags &= ~PHYI_INACTIVE; - } else if (ill_is_inactive(ill) && - !(phyi->phyint_flags & PHYI_FAILED)) { - /* - * Need to set INACTIVE, when user sets - * STANDBY on a non-STANDBY phyint and - * later resets STANDBY - */ - phyint_inactive(phyi); - } + if (ill->ill_usesrc_grp_next != NULL || + intf_flags & PHYI_INACTIVE) + return (EINVAL); + if (!(flags & PHYI_FAILED)) { + flags |= PHYI_INACTIVE; + turn_on |= PHYI_INACTIVE; } - /* - * We should always send up a message so that the - * daemons come to know of it. Note that the zeroth - * interface can be down and the check below for IPIF_UP - * will not make sense as we are actually setting - * a phyint flag here. We assume that the ipif used - * is always the zeroth ipif. (ip_rts_ifmsg does not - * send up any message for non-zero ipifs). 
- */ - phyint_flags_modified = B_TRUE; + } - if (ill_v4 != NULL) { - ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, ill_stq_cache_delete, - (char *)ill_v4, ill_v4); - illgrp_reset_schednext(ill_v4); - } - if (ill_v6 != NULL) { - ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, ill_stq_cache_delete, - (char *)ill_v6, ill_v6); - illgrp_reset_schednext(ill_v6); - } + if (turn_off & PHYI_STANDBY) { + flags &= ~PHYI_INACTIVE; + turn_off |= PHYI_INACTIVE; } /* + * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both + * would end up on. + */ + if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) == + (PHYI_FAILED | PHYI_INACTIVE)) + return (EINVAL); + + /* * If ILLF_ROUTER changes, we need to change the ip forwarding - * status of the interface and, if the interface is part of an IPMP - * group, all other interfaces that are part of the same IPMP - * group. + * status of the interface. */ if ((turn_on | turn_off) & ILLF_ROUTER) (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); @@ -11718,33 +11688,31 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, mutex_exit(&ill->ill_phyint->phyint_lock); /* - * We do the broadcast and nomination here rather - * than waiting for a FAILOVER/FAILBACK to happen. In - * the case of FAILBACK from INACTIVE standby to the - * interface that has been repaired, PHYI_FAILED has not - * been cleared yet. If there are only two interfaces in - * that group, all we have is a FAILED and INACTIVE - * interface. If we do the nomination soon after a failback, - * the broadcast nomination code would select the - * INACTIVE interface for receiving broadcasts as FAILED is - * not yet cleared. As we don't want STANDBY/INACTIVE to - * receive broadcast packets, we need to redo nomination - * when the FAILED is cleared here. Thus, in general we - * always do the nomination here for FAILED, STANDBY - * and OFFLINE. 
+ * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the + * same to the kernel: if any of them has been set by + * userland, the interface cannot be used for data traffic. */ - if (((turn_on | turn_off) & - (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) { - ip_redo_nomination(phyi); + if ((turn_on|turn_off) & + (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { + ASSERT(!IS_IPMP(ill)); + /* + * It's possible the ill is part of an "anonymous" + * IPMP group rather than a real group. In that case, + * there are no other interfaces in the group and thus + * no need to call ipmp_phyint_refresh_active(). + */ + if (IS_UNDER_IPMP(ill)) + ipmp_phyint_refresh_active(phyi); } + if (phyint_flags_modified) { if (phyi->phyint_illv4 != NULL) { ip_rts_ifmsg(phyi->phyint_illv4-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } if (phyi->phyint_illv6 != NULL) { ip_rts_ifmsg(phyi->phyint_illv6-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } } return (0); @@ -11785,15 +11753,17 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } /* - * The only flag changes that we currently take specific action on - * is IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, - * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and - * IPIF_PREFERRED. This is done by bring the ipif down, changing - * the flags and bringing it back up again. + * The only flag changes that we currently take specific action on are + * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP, + * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and + * IPIF_NOFAILOVER. This is done by bring the ipif down, changing the + * flags and bringing it back up again. For IPIF_NOFAILOVER, the act + * of bringing it back up will trigger the address to be moved. 
*/ if ((turn_on|turn_off) & (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| - ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) { + ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| + IPIF_NOFAILOVER)) { /* * Taking this ipif down, make sure we have * valid net and subnet bcast ire's for other @@ -11822,9 +11792,8 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) { ill_t *ill; phyint_t *phyi; - uint64_t turn_on; - uint64_t turn_off; - uint64_t intf_flags; + uint64_t turn_on, turn_off; + uint64_t intf_flags, cantchange_flags; boolean_t phyint_flags_modified = B_FALSE; int err = 0; boolean_t set_linklocal = B_FALSE; @@ -11839,12 +11808,15 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) phyi = ill->ill_phyint; intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; - turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP); + cantchange_flags = IFF_CANTCHANGE | IFF_UP; + if (IS_IPMP(ill)) + cantchange_flags |= IFF_IPMP_CANTCHANGE; + turn_on = (flags ^ intf_flags) & ~cantchange_flags; turn_off = intf_flags & turn_on; turn_on ^= turn_off; - if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) + if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) phyint_flags_modified = B_TRUE; /* @@ -11870,9 +11842,6 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) mutex_exit(&ill->ill_lock); mutex_exit(&phyi->phyint_lock); - if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) - ip_redo_nomination(phyi); - if (set_linklocal) (void) ipif_setlinklocal(ipif); @@ -11881,12 +11850,29 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) else ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; + /* + * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to + * the kernel: if any of them has been set by userland, the interface + * cannot be used for data traffic. 
+ */ + if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { + ASSERT(!IS_IPMP(ill)); + /* + * It's possible the ill is part of an "anonymous" IPMP group + * rather than a real group. In that case, there are no other + * interfaces in the group and thus no need for us to call + * ipmp_phyint_refresh_active(). + */ + if (IS_UNDER_IPMP(ill)) + ipmp_phyint_refresh_active(phyi); + } + if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) { /* * XXX ipif_up really does not know whether a phyint flags * was modified or not. So, it sends up information on * only one routing sockets message. As we don't bring up - * the interface and also set STANDBY/FAILED simultaneously + * the interface and also set PHYI_ flags simultaneously * it should be okay. */ err = ipif_up(ipif, q, mp); @@ -11898,14 +11884,14 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) if (phyint_flags_modified) { if (phyi->phyint_illv4 != NULL) { ip_rts_ifmsg(phyi->phyint_illv4-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } if (phyi->phyint_illv6 != NULL) { ip_rts_ifmsg(phyi->phyint_illv6-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } } else { - ip_rts_ifmsg(ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); } /* * Update the flags in SCTP's IPIF list, ipif_up() will do @@ -12101,10 +12087,7 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * broadcast address makes sense. If it does, * there should be an IRE for it already. * Don't match on ipif, only on the ill - * since we are sharing these now. Don't use - * MATCH_IRE_ILL_GROUP as we are looking for - * the broadcast ire on this ill and each ill - * in the group has its own broadcast ire. + * since we are sharing these now. 
*/ ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif, ALL_ZONES, NULL, @@ -12302,9 +12285,16 @@ int ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *if_req) { - ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); + + /* + * Since no applications should ever be setting metrics on underlying + * interfaces, we explicitly fail to smoke 'em out. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + return (EINVAL); + /* * Set interface metric. We don't use this for * anything but we keep track of it in case it is @@ -12332,6 +12322,7 @@ ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* Get interface metric. */ ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); + if (ipip->ipi_cmd_type == IF_CMD) { struct ifreq *ifr; @@ -12766,13 +12757,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, nipif->ipif_state_flags |= IPIF_CHANGING; } - mutex_exit(&ill->ill_lock); - if (lir->lir_maxmtu != 0) { ill->ill_max_mtu = lir->lir_maxmtu; - ill->ill_mtu_userspecified = 1; + ill->ill_user_mtu = lir->lir_maxmtu; mtu_walk = B_TRUE; } + mutex_exit(&ill->ill_lock); if (lir->lir_reachtime != 0) ill->ill_reachable_time = lir->lir_reachtime; @@ -12821,6 +12811,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ILL_UNMARK_CHANGING(ill); mutex_exit(&ill->ill_lock); + /* + * Refresh IPMP meta-interface MTU if necessary. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_illgrp_refresh_mtu(ill->ill_grp); + return (0); } @@ -13032,13 +13028,117 @@ ipif_assign_seqid(ipif_t *ipif) } /* + * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are + * administratively down (i.e., no DAD), of the same type, and locked. Note + * that the clone is complete -- including the seqid -- and the expectation is + * that the caller will either free or overwrite `sipif' before it's unlocked. 
+ */ +static void +ipif_clone(const ipif_t *sipif, ipif_t *dipif) +{ + ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock)); + ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock)); + ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); + ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); + ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); + ASSERT(sipif->ipif_arp_del_mp == NULL); + ASSERT(dipif->ipif_arp_del_mp == NULL); + ASSERT(sipif->ipif_igmp_rpt == NULL); + ASSERT(dipif->ipif_igmp_rpt == NULL); + ASSERT(sipif->ipif_multicast_up == 0); + ASSERT(dipif->ipif_multicast_up == 0); + ASSERT(sipif->ipif_joined_allhosts == 0); + ASSERT(dipif->ipif_joined_allhosts == 0); + + dipif->ipif_mtu = sipif->ipif_mtu; + dipif->ipif_flags = sipif->ipif_flags; + dipif->ipif_metric = sipif->ipif_metric; + dipif->ipif_zoneid = sipif->ipif_zoneid; + dipif->ipif_v6subnet = sipif->ipif_v6subnet; + dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; + dipif->ipif_v6src_addr = sipif->ipif_v6src_addr; + dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; + dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; + dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; + + /* + * While dipif is down right now, it might've been up before. Since + * it's changing identity, its packet counters need to be reset. + */ + dipif->ipif_ib_pkt_count = 0; + dipif->ipif_ob_pkt_count = 0; + dipif->ipif_fo_pkt_count = 0; + + /* + * As per the comment atop the function, we assume that these sipif + * fields will be changed before sipif is unlocked. + */ + dipif->ipif_seqid = sipif->ipif_seqid; + dipif->ipif_saved_ire_mp = sipif->ipif_saved_ire_mp; + dipif->ipif_saved_ire_cnt = sipif->ipif_saved_ire_cnt; + dipif->ipif_state_flags = sipif->ipif_state_flags; +} + +/* + * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif' + * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin + * (unreferenced) ipif. 
Also, if `sipif' is used by the current xop, then + * transfer the xop to `dipif'. Requires that all ipifs are administratively + * down (i.e., no DAD), of the same type, and unlocked. + */ +static void +ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) +{ + ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq; + int ipx_current_ioctl; + + ASSERT(sipif != dipif); + ASSERT(sipif != virgipif); + + /* + * Grab all of the locks that protect the ipif in a defined order. + */ + GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); + if (sipif > dipif) { + mutex_enter(&sipif->ipif_saved_ire_lock); + mutex_enter(&dipif->ipif_saved_ire_lock); + } else { + mutex_enter(&dipif->ipif_saved_ire_lock); + mutex_enter(&sipif->ipif_saved_ire_lock); + } + + ipif_clone(sipif, dipif); + if (virgipif != NULL) { + ipif_clone(virgipif, sipif); + mi_free(virgipif); + } + + mutex_exit(&sipif->ipif_saved_ire_lock); + mutex_exit(&dipif->ipif_saved_ire_lock); + RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); + + /* + * Transfer ownership of the current xop, if necessary. + */ + if (ipsq->ipsq_xop->ipx_current_ipif == sipif) { + ASSERT(ipsq->ipsq_xop->ipx_pending_ipif == NULL); + ipx_current_ioctl = ipsq->ipsq_xop->ipx_current_ioctl; + ipsq_current_finish(ipsq); + ipsq_current_start(ipsq, dipif, ipx_current_ioctl); + } + + if (virgipif == NULL) + mi_free(sipif); +} + +/* * Insert the ipif, so that the list of ipifs on the ill will be sorted * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will * be inserted into the first space available in the list. The value of * ipif_id will then be set to the appropriate value for its position. 
*/ static int -ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) +ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock) { ill_t *ill; ipif_t *tipif; @@ -13056,12 +13156,11 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) /* * In the case of lo0:0 we already hold the ill_g_lock. * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> - * ipif_insert. Another such caller is ipif_move. + * ipif_insert. */ if (acquire_g_lock) rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - if (acquire_ill_lock) - mutex_enter(&ill->ill_lock); + mutex_enter(&ill->ill_lock); id = ipif->ipif_id; tipifp = &(ill->ill_ipif); if (id == -1) { /* need to find a real id */ @@ -13075,8 +13174,7 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) } /* limit number of logical interfaces */ if (id >= ipst->ips_ip_addrs_per_if) { - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); if (acquire_g_lock) rw_exit(&ipst->ips_ill_g_lock); return (-1); @@ -13091,8 +13189,7 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) tipifp = &(tipif->ipif_next); } } else { - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); if (acquire_g_lock) rw_exit(&ipst->ips_ill_g_lock); return (-1); @@ -13102,25 +13199,22 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) ipif->ipif_next = tipif; *tipifp = ipif; - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); if (acquire_g_lock) rw_exit(&ipst->ips_ill_g_lock); + return (0); } static void -ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) +ipif_remove(ipif_t *ipif) { ipif_t **ipifp; ill_t *ill = ipif->ipif_ill; ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock)); - if (acquire_ill_lock) - mutex_enter(&ill->ill_lock); - else - ASSERT(MUTEX_HELD(&ill->ill_lock)); + mutex_enter(&ill->ill_lock); ipifp = &ill->ill_ipif; for 
(; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { if (*ipifp == ipif) { @@ -13128,9 +13222,7 @@ ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) break; } } - - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); } /* @@ -13149,10 +13241,12 @@ ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) * second DL_INFO_ACK comes in from the driver. */ static ipif_t * -ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) +ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, + boolean_t insert) { ipif_t *ipif; - phyint_t *phyi; + phyint_t *phyi = ill->ill_phyint; + ip_stack_t *ipst = ill->ill_ipst; ip1dbg(("ipif_allocate(%s:%d ill %p)\n", ill->ill_name, id, (void *)ill)); @@ -13175,23 +13269,61 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) ipif->ipif_refcnt = 0; ipif->ipif_saved_ire_cnt = 0; - if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) { - mi_free(ipif); - return (NULL); + if (insert) { + if (ipif_insert(ipif, ire_type != IRE_LOOPBACK) != 0) { + mi_free(ipif); + return (NULL); + } + /* -1 id should have been replaced by real id */ + id = ipif->ipif_id; + ASSERT(id >= 0); } - /* -1 id should have been replaced by real id */ - id = ipif->ipif_id; - ASSERT(id >= 0); if (ill->ill_name[0] != '\0') ipif_assign_seqid(ipif); /* - * Keep a copy of original id in ipif_orig_ipifid. Failback - * will attempt to restore the original id. The SIOCSLIFOINDEX - * ioctl sets ipif_orig_ipifid to zero. + * If this is ipif zero, configure ill/phyint-wide information. + * Defer most configuration until we're guaranteed we're attached. */ - ipif->ipif_orig_ipifid = id; + if (id == 0) { + if (ill->ill_mactype == SUNW_DL_IPMP) { + /* + * Set PHYI_IPMP and also set PHYI_FAILED since there + * are no active interfaces. Similarly, PHYI_RUNNING + * isn't set until the group has an active interface. 
+ */ + mutex_enter(&phyi->phyint_lock); + phyi->phyint_flags |= (PHYI_IPMP | PHYI_FAILED); + mutex_exit(&phyi->phyint_lock); + + /* + * Create the illgrp (which must not exist yet because + * the zeroth ipif is created once per ill). However, + * do not not link it to the ipmp_grp_t until I_PLINK + * is called; see ip_sioctl_plink_ipmp() for details. + */ + if (ipmp_illgrp_create(ill) == NULL) { + if (insert) { + rw_enter(&ipst->ips_ill_g_lock, + RW_WRITER); + ipif_remove(ipif); + rw_exit(&ipst->ips_ill_g_lock); + } + mi_free(ipif); + return (NULL); + } + } else { + /* + * By default, PHYI_RUNNING is set when the zeroth + * ipif is created. For other ipifs, we don't touch + * it since DLPI notifications may have changed it. + */ + mutex_enter(&phyi->phyint_lock); + phyi->phyint_flags |= PHYI_RUNNING; + mutex_exit(&phyi->phyint_lock); + } + } /* * We grab the ill_lock and phyint_lock to protect the flag changes. @@ -13199,18 +13331,9 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) * ioctl completes and the IPIF_CHANGING flag is cleared. */ mutex_enter(&ill->ill_lock); - mutex_enter(&ill->ill_phyint->phyint_lock); - /* - * Set the running flag when logical interface zero is created. - * For subsequent logical interfaces, a DLPI link down - * notification message may have cleared the running flag to - * indicate the link is down, so we shouldn't just blindly set it. - */ - if (id == 0) - ill->ill_phyint->phyint_flags |= PHYI_RUNNING; + mutex_enter(&phyi->phyint_lock); + ipif->ipif_ire_type = ire_type; - phyi = ill->ill_phyint; - ipif->ipif_orig_ifindex = phyi->phyint_ifindex; if (ipif->ipif_isv6) { ill->ill_flags |= ILLF_IPV6; @@ -13238,14 +13361,18 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) * Don't set the interface flags etc. now, will do it in * ip_ll_subnet_defaults. 
*/ - if (!initialize) { - mutex_exit(&ill->ill_lock); - mutex_exit(&ill->ill_phyint->phyint_lock); - return (ipif); - } + if (!initialize) + goto out; + ipif->ipif_mtu = ill->ill_max_mtu; - if (ill->ill_bcast_addr_length != 0) { + /* + * NOTE: The IPMP meta-interface is special-cased because it starts + * with no underlying interfaces (and thus an unknown broadcast + * address length), but all interfaces that can be placed into an IPMP + * group are required to be broadcast-capable. + */ + if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) { /* * Later detect lack of DLPI driver multicast * capability by catching DL_ENABMULTI errors in @@ -13269,8 +13396,7 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) ill->ill_flags |= ILLF_NOARP; } if (ill->ill_phys_addr_length == 0) { - if (ill->ill_media && - ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { + if (ill->ill_mactype == SUNW_DL_VNI) { ipif->ipif_flags |= IPIF_NOXMIT; phyi->phyint_flags |= PHYI_VIRTUAL; } else { @@ -13285,8 +13411,9 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) } } } +out: + mutex_exit(&phyi->phyint_lock); mutex_exit(&ill->ill_lock); - mutex_exit(&ill->ill_phyint->phyint_lock); return (ipif); } @@ -13300,34 +13427,49 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) * for details. 
*/ void -ipif_arp_down(ipif_t *ipif) +ipif_resolver_down(ipif_t *ipif) { mblk_t *mp; ill_t *ill = ipif->ipif_ill; - ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); + ip1dbg(("ipif_resolver_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); ASSERT(IAM_WRITER_IPIF(ipif)); + if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) + return; + /* Delete the mapping for the local address */ mp = ipif->ipif_arp_del_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, mp); ipif->ipif_arp_del_mp = NULL; } /* + * Make IPMP aware of the deleted data address. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + + /* * If this is the last ipif that is going down and there are no * duplicate addresses we may yet attempt to re-probe, then we need to * clean up ARP completely. */ if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) { + /* + * If this was the last ipif on an IPMP interface, purge any + * IPMP ARP entries associated with it. 
+ */ + if (IS_IPMP(ill)) + ipmp_illgrp_refresh_arpent(ill->ill_grp); /* Send up AR_INTERFACE_DOWN message */ mp = ill->ill_arp_down_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, mp); @@ -13337,7 +13479,7 @@ ipif_arp_down(ipif_t *ipif) /* Tell ARP to delete the multicast mappings */ mp = ill->ill_arp_del_mapping_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, mp); @@ -13377,6 +13519,13 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) return (0); /* + * IPMP meta-interfaces don't have any inherent multicast mappings, + * and instead use the ones on the underlying interfaces. + */ + if (IS_IPMP(ill)) + return (0); + + /* * Delete the existing mapping from ARP. Normally ipif_down * -> ipif_arp_down should send this up to ARP. The only * reason we would find this when we are switching from @@ -13473,26 +13622,23 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) } /* - * Get the resolver set up for a new interface address. - * (Always called as writer.) - * Called both for IPv4 and IPv6 interfaces, - * though it only sets up the resolver for v6 - * if it's an xresolv interface (one using an external resolver). - * Honors ILLF_NOARP. - * The enumerated value res_act is used to tune the behavior. - * If set to Res_act_initial, then we set up all the resolver - * structures for a new interface. If set to Res_act_move, then - * we just send an AR_ENTRY_ADD message up to ARP for IPv4 - * interfaces; this is called by ip_rput_dlpi_writer() to handle - * asynchronous hardware address change notification. 
If set to - * Res_act_defend, then we tell ARP that it needs to send a single - * gratuitous message in defense of the address. + * Get the resolver set up for a new IP address. (Always called as writer.) + * Called both for IPv4 and IPv6 interfaces, though it only sets up the + * resolver for v6 if it's an ILLF_XRESOLV interface. Honors ILLF_NOARP. + * + * The enumerated value res_act tunes the behavior: + * * Res_act_initial: set up all the resolver structures for a new + * IP address. + * * Res_act_defend: tell ARP that it needs to send a single gratuitous + * ARP message in defense of the address. + * * Res_act_rebind: tell ARP to change the hardware address for an IP + * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif(). + * * Returns error on failure. */ int ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) { - caddr_t addr; mblk_t *arp_up_mp = NULL; mblk_t *arp_down_mp = NULL; mblk_t *arp_add_mp = NULL; @@ -13500,9 +13646,9 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) mblk_t *arp_add_mapping_mp = NULL; mblk_t *arp_del_mapping_mp = NULL; ill_t *ill = ipif->ipif_ill; - uchar_t *area_p = NULL; - uchar_t *ared_p = NULL; int err = ENOMEM; + boolean_t added_ipif = B_FALSE; + boolean_t publish; boolean_t was_dup; ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", @@ -13540,11 +13686,7 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) * External resolver for IPv6 */ ASSERT(res_act == Res_act_initial); - if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { - addr = (caddr_t)&ipif->ipif_v6lcl_addr; - area_p = (uchar_t *)&ip6_area_template; - ared_p = (uchar_t *)&ip6_ared_template; - } + publish = !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr); } else { /* * IPv4 arp case. 
If the ARP stream has already started @@ -13562,41 +13704,39 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) ill->ill_arp_bringup_pending = 1; mutex_exit(&ill->ill_lock); } - if (ipif->ipif_lcl_addr != INADDR_ANY) { - addr = (caddr_t)&ipif->ipif_lcl_addr; - area_p = (uchar_t *)&ip_area_template; - ared_p = (uchar_t *)&ip_ared_template; + publish = (ipif->ipif_lcl_addr != INADDR_ANY); + } + + if (IS_IPMP(ill) && publish) { + /* + * If we're here via ipif_up(), then the ipif won't be bound + * yet -- add it to the group, which will bind it if possible. + * (We would add it in ipif_up(), but deleting on failure + * there is gruesome.) If we're here via ipmp_ill_bind_ipif(), + * then the ipif has already been added to the group and we + * just need to use the binding. + */ + if (ipmp_ipif_bound_ill(ipif) == NULL) { + if (ipmp_illgrp_add_ipif(ill->ill_grp, ipif) == NULL) { + /* + * We couldn't bind the ipif to an ill yet, + * so we have nothing to publish. + */ + publish = B_FALSE; + } + added_ipif = B_TRUE; } } /* * Add an entry for the local address in ARP only if it - * is not UNNUMBERED and the address is not INADDR_ANY. + * is not UNNUMBERED and it is suitable for publishing. */ - if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) { - area_t *area; - - /* Now ask ARP to publish our address. */ - arp_add_mp = ill_arp_alloc(ill, area_p, addr); - if (arp_add_mp == NULL) - goto failed; - area = (area_t *)arp_add_mp->b_rptr; - if (res_act != Res_act_initial) { - /* - * Copy the new hardware address and length into - * arp_add_mp to be sent to ARP. 
- */ - area->area_hw_addr_length = ill->ill_phys_addr_length; - bcopy(ill->ill_phys_addr, - ((char *)area + area->area_hw_addr_offset), - area->area_hw_addr_length); - } - - area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | - ACE_F_MYADDR; - + if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && publish) { if (res_act == Res_act_defend) { - area->area_flags |= ACE_F_DEFEND; + arp_add_mp = ipif_area_alloc(ipif, ACE_F_DEFEND); + if (arp_add_mp == NULL) + goto failed; /* * If we're just defending our address now, then * there's no need to set up ARP multicast mappings. @@ -13605,17 +13745,18 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) goto done; } - if (res_act != Res_act_initial) - goto arp_setup_multicast; - /* - * Allocate an ARP deletion message so we know we can tell ARP - * when the interface goes down. + * Allocate an ARP add message and an ARP delete message (the + * latter is saved for use when the address goes down). */ - arp_del_mp = ill_arp_alloc(ill, ared_p, addr); - if (arp_del_mp == NULL) + if ((arp_add_mp = ipif_area_alloc(ipif, 0)) == NULL) + goto failed; + + if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) goto failed; + if (res_act != Res_act_initial) + goto arp_setup_multicast; } else { if (res_act != Res_act_initial) goto done; @@ -13624,14 +13765,11 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) * Need to bring up ARP or setup multicast mapping only * when the first interface is coming UP. */ - if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || - was_dup) { + if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0 || was_dup) goto done; - } /* - * Allocate an ARP down message (to be saved) and an ARP up - * message. + * Allocate an ARP down message (to be saved) and an ARP up message. */ arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0); if (arp_down_mp == NULL) @@ -13648,33 +13786,21 @@ arp_setup_multicast: /* * Setup the multicast mappings. 
This function initializes * ill_arp_del_mapping_mp also. This does not need to be done for - * IPv6. + * IPv6, or for the IPMP interface (since it has no link-layer). */ - if (!ill->ill_isv6) { + if (!ill->ill_isv6 && !IS_IPMP(ill)) { err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp); if (err != 0) goto failed; ASSERT(ill->ill_arp_del_mapping_mp != NULL); ASSERT(arp_add_mapping_mp != NULL); } - done: - if (arp_del_mp != NULL) { - ASSERT(ipif->ipif_arp_del_mp == NULL); - ipif->ipif_arp_del_mp = arp_del_mp; - } - if (arp_down_mp != NULL) { - ASSERT(ill->ill_arp_down_mp == NULL); - ill->ill_arp_down_mp = arp_down_mp; - } - if (arp_del_mapping_mp != NULL) { - ASSERT(ill->ill_arp_del_mapping_mp == NULL); - ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; - } if (arp_up_mp != NULL) { ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, arp_up_mp); + arp_up_mp = NULL; } if (arp_add_mp != NULL) { ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", @@ -13686,6 +13812,7 @@ done: if (!ill->ill_arp_extend) ipif->ipif_addr_ready = 1; putnext(ill->ill_rq, arp_add_mp); + arp_add_mp = NULL; } else { ipif->ipif_addr_ready = 1; } @@ -13693,29 +13820,40 @@ done: ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, arp_add_mapping_mp); + arp_add_mapping_mp = NULL; } - if (res_act != Res_act_initial) - return (0); - if (ill->ill_flags & ILLF_NOARP) - err = ill_arp_off(ill); - else - err = ill_arp_on(ill); - if (err != 0) { - ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err)); - freemsg(ipif->ipif_arp_del_mp); - freemsg(ill->ill_arp_down_mp); - freemsg(ill->ill_arp_del_mapping_mp); - ipif->ipif_arp_del_mp = NULL; - ill->ill_arp_down_mp = NULL; - ill->ill_arp_del_mapping_mp = NULL; - return (err); + if (res_act == Res_act_initial) { + if (ill->ill_flags & ILLF_NOARP) + err = ill_arp_off(ill); + else + err = ill_arp_on(ill); + if (err != 0) { + ip0dbg(("ipif_resolver_up: 
arp_on/off failed %d\n", + err)); + goto failed; + } } + + if (arp_del_mp != NULL) { + ASSERT(ipif->ipif_arp_del_mp == NULL); + ipif->ipif_arp_del_mp = arp_del_mp; + } + if (arp_down_mp != NULL) { + ASSERT(ill->ill_arp_down_mp == NULL); + ill->ill_arp_down_mp = arp_down_mp; + } + if (arp_del_mapping_mp != NULL) { + ASSERT(ill->ill_arp_del_mapping_mp == NULL); + ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; + } + return ((ill->ill_ipif_up_count != 0 || was_dup || ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS); - failed: ip1dbg(("ipif_resolver_up: FAILED\n")); + if (added_ipif) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); freemsg(arp_add_mp); freemsg(arp_del_mp); freemsg(arp_add_mapping_mp); @@ -13734,13 +13872,12 @@ ipif_arp_start_dad(ipif_t *ipif) { ill_t *ill = ipif->ipif_ill; mblk_t *arp_add_mp; - area_t *area; + /* ACE_F_UNVERIFIED restarts DAD */ if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || (ipif->ipif_flags & IPIF_UNNUMBERED) || ipif->ipif_lcl_addr == INADDR_ANY || - (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, - (char *)&ipif->ipif_lcl_addr)) == NULL) { + (arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) { /* * If we can't contact ARP for some reason, that's not really a * problem. 
Just send out the routing socket notification that @@ -13752,10 +13889,6 @@ ipif_arp_start_dad(ipif_t *ipif) return; } - /* Setting the 'unverified' flag restarts DAD */ - area = (area_t *)arp_add_mp->b_rptr; - area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | - ACE_F_UNVERIFIED; putnext(ill->ill_rq, arp_add_mp); } @@ -13764,7 +13897,8 @@ ipif_ndp_start_dad(ipif_t *ipif) { nce_t *nce; - nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE); + nce = ndp_lookup_v6(ipif->ipif_ill, B_TRUE, &ipif->ipif_v6lcl_addr, + B_FALSE); if (nce == NULL) return; @@ -13805,7 +13939,7 @@ ill_restart_dad(ill_t *ill, boolean_t went_up) */ if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || (!ill->ill_isv6 && !ill->ill_arp_extend)) { - ip_rts_ifmsg(ill->ill_ipif); + ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); return; } @@ -13838,8 +13972,10 @@ ill_restart_dad(ill_t *ill, boolean_t went_up) * we'll handle eventual routing socket * notification via DAD completion.) */ - if (ipif == ill->ill_ipif) - ip_rts_ifmsg(ill->ill_ipif); + if (ipif == ill->ill_ipif) { + ip_rts_ifmsg(ill->ill_ipif, + RTSQ_DEFAULT); + } } } else { /* @@ -13855,285 +13991,30 @@ ill_restart_dad(ill_t *ill, boolean_t went_up) * If we've torn down links, then notify the user right away. */ if (!went_up) - ip_rts_ifmsg(ill->ill_ipif); -} - -/* - * Wakeup all threads waiting to enter the ipsq, and sleeping - * on any of the ills in this ipsq. 
The ill_lock of the ill - * must be held so that waiters don't miss wakeups - */ -static void -ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock) -{ - phyint_t *phyint; - - phyint = ipsq->ipsq_phyint_list; - while (phyint != NULL) { - if (phyint->phyint_illv4) { - if (!caller_holds_lock) - mutex_enter(&phyint->phyint_illv4->ill_lock); - ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); - cv_broadcast(&phyint->phyint_illv4->ill_cv); - if (!caller_holds_lock) - mutex_exit(&phyint->phyint_illv4->ill_lock); - } - if (phyint->phyint_illv6) { - if (!caller_holds_lock) - mutex_enter(&phyint->phyint_illv6->ill_lock); - ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); - cv_broadcast(&phyint->phyint_illv6->ill_cv); - if (!caller_holds_lock) - mutex_exit(&phyint->phyint_illv6->ill_lock); - } - phyint = phyint->phyint_ipsq_next; - } -} - -static ipsq_t * -ipsq_create(char *groupname, ip_stack_t *ipst) -{ - ipsq_t *ipsq; - - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); - if (ipsq == NULL) { - return (NULL); - } - - if (groupname != NULL) - (void) strcpy(ipsq->ipsq_name, groupname); - else - ipsq->ipsq_name[0] = '\0'; - - mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL); - ipsq->ipsq_flags |= IPSQ_GROUP; - ipsq->ipsq_next = ipst->ips_ipsq_g_head; - ipst->ips_ipsq_g_head = ipsq; - ipsq->ipsq_ipst = ipst; /* No netstack_hold */ - return (ipsq); -} - -/* - * Return an ipsq correspoding to the groupname. If 'create' is true - * allocate a new ipsq if one does not exist. Usually an ipsq is associated - * uniquely with an IPMP group. However during IPMP groupname operations, - * multiple IPMP groups may be associated with a single ipsq. But no - * IPMP group can be associated with more than 1 ipsq at any time. 
- * For example - * Interfaces IPMP grpname ipsq ipsq_name ipsq_refs - * hme1, hme2 mpk17-84 ipsq1 mpk17-84 2 - * hme3, hme4 mpk17-85 ipsq2 mpk17-85 2 - * - * Now the command ifconfig hme3 group mpk17-84 results in the temporary - * status shown below during the execution of the above command. - * hme1, hme2, hme3, hme4 mpk17-84, mpk17-85 ipsq1 mpk17-84 4 - * - * After the completion of the above groupname command we return to the stable - * state shown below. - * hme1, hme2, hme3 mpk17-84 ipsq1 mpk17-84 3 - * hme4 mpk17-85 ipsq2 mpk17-85 1 - * - * Because of the above, we don't search based on the ipsq_name since that - * would miss the correct ipsq during certain windows as shown above. - * The ipsq_name is only used during split of an ipsq to return the ipsq to its - * natural state. - */ -static ipsq_t * -ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq, - ip_stack_t *ipst) -{ - ipsq_t *ipsq; - int group_len; - phyint_t *phyint; - - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - - group_len = strlen(groupname); - ASSERT(group_len != 0); - group_len++; - - for (ipsq = ipst->ips_ipsq_g_head; - ipsq != NULL; - ipsq = ipsq->ipsq_next) { - /* - * When an ipsq is being split, and ill_split_ipsq - * calls this function, we exclude it from being considered. - */ - if (ipsq == exclude_ipsq) - continue; - - /* - * Compare against the ipsq_name. The groupname change happens - * in 2 phases. The 1st phase merges the from group into - * the to group's ipsq, by calling ill_merge_groups and restarts - * the ioctl. The 2nd phase then locates the ipsq again thru - * ipsq_name. At this point the phyint_groupname has not been - * updated. - */ - if ((group_len == strlen(ipsq->ipsq_name) + 1) && - (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) { - /* - * Verify that an ipmp groupname is exactly - * part of 1 ipsq and is not found in any other - * ipsq. 
- */ - ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, ipst) == - NULL); - return (ipsq); - } - - /* - * Comparison against ipsq_name alone is not sufficient. - * In the case when groups are currently being - * merged, the ipsq could hold other IPMP groups temporarily. - * so we walk the phyint list and compare against the - * phyint_groupname as well. - */ - phyint = ipsq->ipsq_phyint_list; - while (phyint != NULL) { - if ((group_len == phyint->phyint_groupname_len) && - (bcmp(phyint->phyint_groupname, groupname, - group_len) == 0)) { - /* - * Verify that an ipmp groupname is exactly - * part of 1 ipsq and is not found in any other - * ipsq. - */ - ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, - ipst) == NULL); - return (ipsq); - } - phyint = phyint->phyint_ipsq_next; - } - } - if (create) - ipsq = ipsq_create(groupname, ipst); - return (ipsq); + ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); } static void ipsq_delete(ipsq_t *ipsq) { - ipsq_t *nipsq; - ipsq_t *pipsq = NULL; - ip_stack_t *ipst = ipsq->ipsq_ipst; - - /* - * We don't hold the ipsq lock, but we are sure no new - * messages can land up, since the ipsq_refs is zero. - * i.e. this ipsq is unnamed and no phyint or phyint group - * is associated with this ipsq. (Lookups are based on ill_name - * or phyint_groupname) - */ - ASSERT(ipsq->ipsq_refs == 0); - ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL); - ASSERT(ipsq->ipsq_pending_mp == NULL); - if (!(ipsq->ipsq_flags & IPSQ_GROUP)) { - /* - * This is not the ipsq of an IPMP group. - */ - ipsq->ipsq_ipst = NULL; - kmem_free(ipsq, sizeof (ipsq_t)); - return; - } - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - - /* - * Locate the ipsq before we can remove it from - * the singly linked list of ipsq's. 
- */ - for (nipsq = ipst->ips_ipsq_g_head; nipsq != NULL; - nipsq = nipsq->ipsq_next) { - if (nipsq == ipsq) { - break; - } - pipsq = nipsq; - } - - ASSERT(nipsq == ipsq); + ipxop_t *ipx = ipsq->ipsq_xop; - /* unlink ipsq from the list */ - if (pipsq != NULL) - pipsq->ipsq_next = ipsq->ipsq_next; - else - ipst->ips_ipsq_g_head = ipsq->ipsq_next; ipsq->ipsq_ipst = NULL; + ASSERT(ipsq->ipsq_phyint == NULL); + ASSERT(ipsq->ipsq_xop != NULL); + ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL); + ASSERT(ipx->ipx_pending_mp == NULL); kmem_free(ipsq, sizeof (ipsq_t)); - rw_exit(&ipst->ips_ill_g_lock); -} - -static void -ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp, - queue_t *q) -{ - ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock)); - ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL); - ASSERT(old_ipsq->ipsq_pending_ipif == NULL); - ASSERT(old_ipsq->ipsq_pending_mp == NULL); - ASSERT(current_mp != NULL); - - ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl, - NEW_OP, NULL); - - ASSERT(new_ipsq->ipsq_xopq_mptail != NULL && - new_ipsq->ipsq_xopq_mphead != NULL); - - /* - * move from old ipsq to the new ipsq. 
- */ - new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead; - if (old_ipsq->ipsq_xopq_mphead != NULL) - new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail; - - old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL; } -void -ill_group_cleanup(ill_t *ill) -{ - ill_t *ill_v4; - ill_t *ill_v6; - ipif_t *ipif; - - ill_v4 = ill->ill_phyint->phyint_illv4; - ill_v6 = ill->ill_phyint->phyint_illv6; - - if (ill_v4 != NULL) { - mutex_enter(&ill_v4->ill_lock); - for (ipif = ill_v4->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - IPIF_UNMARK_MOVING(ipif); - } - ill_v4->ill_up_ipifs = B_FALSE; - mutex_exit(&ill_v4->ill_lock); - } - - if (ill_v6 != NULL) { - mutex_enter(&ill_v6->ill_lock); - for (ipif = ill_v6->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - IPIF_UNMARK_MOVING(ipif); - } - ill_v6->ill_up_ipifs = B_FALSE; - mutex_exit(&ill_v6->ill_lock); - } -} -/* - * This function is called when an ill has had a change in its group status - * to bring up all the ipifs that were up before the change. - */ -int -ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) +static int +ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) { + int err; ipif_t *ipif; - ill_t *ill_v4; - ill_t *ill_v6; - ill_t *from_ill; - int err = 0; - ASSERT(IAM_WRITER_ILL(ill)); + if (ill == NULL) + return (0); /* * Except for ipif_state_flags and ill_state_flags the other @@ -14142,389 +14023,86 @@ ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) * even an ipif that was already down, in ill_down_ipifs. So we * just blindly clear the IPIF_CHANGING flag here on all ipifs. 
*/ - ill_v4 = ill->ill_phyint->phyint_illv4; - ill_v6 = ill->ill_phyint->phyint_illv6; - if (ill_v4 != NULL) { - ill_v4->ill_up_ipifs = B_TRUE; - for (ipif = ill_v4->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - mutex_enter(&ill_v4->ill_lock); - ipif->ipif_state_flags &= ~IPIF_CHANGING; - IPIF_UNMARK_MOVING(ipif); - mutex_exit(&ill_v4->ill_lock); - if (ipif->ipif_was_up) { - if (!(ipif->ipif_flags & IPIF_UP)) - err = ipif_up(ipif, q, mp); - ipif->ipif_was_up = B_FALSE; - if (err != 0) { - /* - * Can there be any other error ? - */ - ASSERT(err == EINPROGRESS); - return (err); - } - } - } - mutex_enter(&ill_v4->ill_lock); - ill_v4->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&ill_v4->ill_lock); - ill_v4->ill_up_ipifs = B_FALSE; - if (ill_v4->ill_move_in_progress) { - ASSERT(ill_v4->ill_move_peer != NULL); - ill_v4->ill_move_in_progress = B_FALSE; - from_ill = ill_v4->ill_move_peer; - from_ill->ill_move_in_progress = B_FALSE; - from_ill->ill_move_peer = NULL; - mutex_enter(&from_ill->ill_lock); - from_ill->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&from_ill->ill_lock); - if (ill_v6 == NULL) { - if (from_ill->ill_phyint->phyint_flags & - PHYI_STANDBY) { - phyint_inactive(from_ill->ill_phyint); - } - if (ill_v4->ill_phyint->phyint_flags & - PHYI_STANDBY) { - phyint_inactive(ill_v4->ill_phyint); - } - } - ill_v4->ill_move_peer = NULL; - } - } + ASSERT(IAM_WRITER_ILL(ill)); - if (ill_v6 != NULL) { - ill_v6->ill_up_ipifs = B_TRUE; - for (ipif = ill_v6->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - mutex_enter(&ill_v6->ill_lock); - ipif->ipif_state_flags &= ~IPIF_CHANGING; - IPIF_UNMARK_MOVING(ipif); - mutex_exit(&ill_v6->ill_lock); - if (ipif->ipif_was_up) { - if (!(ipif->ipif_flags & IPIF_UP)) - err = ipif_up(ipif, q, mp); - ipif->ipif_was_up = B_FALSE; - if (err != 0) { - /* - * Can there be any other error ? 
- */ - ASSERT(err == EINPROGRESS); - return (err); - } - } - } - mutex_enter(&ill_v6->ill_lock); - ill_v6->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&ill_v6->ill_lock); - ill_v6->ill_up_ipifs = B_FALSE; - if (ill_v6->ill_move_in_progress) { - ASSERT(ill_v6->ill_move_peer != NULL); - ill_v6->ill_move_in_progress = B_FALSE; - from_ill = ill_v6->ill_move_peer; - from_ill->ill_move_in_progress = B_FALSE; - from_ill->ill_move_peer = NULL; - mutex_enter(&from_ill->ill_lock); - from_ill->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&from_ill->ill_lock); - if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { - phyint_inactive(from_ill->ill_phyint); - } - if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) { - phyint_inactive(ill_v6->ill_phyint); + ill->ill_up_ipifs = B_TRUE; + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + mutex_enter(&ill->ill_lock); + ipif->ipif_state_flags &= ~IPIF_CHANGING; + mutex_exit(&ill->ill_lock); + if (ipif->ipif_was_up) { + if (!(ipif->ipif_flags & IPIF_UP)) + err = ipif_up(ipif, q, mp); + ipif->ipif_was_up = B_FALSE; + if (err != 0) { + ASSERT(err == EINPROGRESS); + return (err); } - ill_v6->ill_move_peer = NULL; } } + mutex_enter(&ill->ill_lock); + ill->ill_state_flags &= ~ILL_CHANGING; + mutex_exit(&ill->ill_lock); + ill->ill_up_ipifs = B_FALSE; return (0); } /* - * bring down all the approriate ipifs. + * This function is called to bring up all the ipifs that were up before + * bringing the ill down via ill_down_ipifs(). 
*/ -/* ARGSUSED */ -static void -ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover) +int +ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) { - ipif_t *ipif; + int err; ASSERT(IAM_WRITER_ILL(ill)); - /* - * Except for ipif_state_flags the other fields of the ipif/ill that - * are modified below are protected implicitly since we are a writer - */ - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER)) - continue; - /* - * Don't bring down the LINK LOCAL addresses as they are tied - * to physical interface and they don't move. Treat them as - * IPIF_NOFAILOVER. - */ - if (chk_nofailover && ill->ill_isv6 && - IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) - continue; - if (index == 0 || index == ipif->ipif_orig_ifindex) { - /* - * We go through the ipif_down logic even if the ipif - * is already down, since routes can be added based - * on down ipifs. Going through ipif_down once again - * will delete any IREs created based on these routes. - */ - if (ipif->ipif_flags & IPIF_UP) - ipif->ipif_was_up = B_TRUE; - /* - * If called with chk_nofailover true ipif is moving. - */ - mutex_enter(&ill->ill_lock); - if (chk_nofailover) { - ipif->ipif_state_flags |= - IPIF_MOVING | IPIF_CHANGING; - } else { - ipif->ipif_state_flags |= IPIF_CHANGING; - } - mutex_exit(&ill->ill_lock); - /* - * Need to re-create net/subnet bcast ires if - * they are dependent on ipif. 
- */ - if (!ipif->ipif_isv6) - ipif_check_bcast_ires(ipif); - (void) ipif_logical_down(ipif, NULL, NULL); - ipif_non_duplicate(ipif); - ipif_down_tail(ipif); - } - } -} - -#define IPSQ_INC_REF(ipsq, ipst) { \ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \ - (ipsq)->ipsq_refs++; \ -} + err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp); + if (err != 0) + return (err); -#define IPSQ_DEC_REF(ipsq, ipst) { \ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \ - (ipsq)->ipsq_refs--; \ - if ((ipsq)->ipsq_refs == 0) \ - (ipsq)->ipsq_name[0] = '\0'; \ + return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp)); } /* - * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to - * new_ipsq. + * Bring down any IPIF_UP ipifs on ill. */ static void -ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq, ip_stack_t *ipst) +ill_down_ipifs(ill_t *ill) { - phyint_t *phyint; - phyint_t *next_phyint; - - /* - * To change the ipsq of an ill, we need to hold the ill_g_lock as - * writer and the ill_lock of the ill in question. Also the dest - * ipsq can't vanish while we hold the ill_g_lock as writer. - */ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - - phyint = cur_ipsq->ipsq_phyint_list; - cur_ipsq->ipsq_phyint_list = NULL; - while (phyint != NULL) { - next_phyint = phyint->phyint_ipsq_next; - IPSQ_DEC_REF(cur_ipsq, ipst); - phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list; - new_ipsq->ipsq_phyint_list = phyint; - IPSQ_INC_REF(new_ipsq, ipst); - phyint->phyint_ipsq = new_ipsq; - phyint = next_phyint; - } -} - -#define SPLIT_SUCCESS 0 -#define SPLIT_NOT_NEEDED 1 -#define SPLIT_FAILED 2 - -int -ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry, - ip_stack_t *ipst) -{ - ipsq_t *newipsq = NULL; - - /* - * Assertions denote pre-requisites for changing the ipsq of - * a phyint - */ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - /* - * <ill-phyint> assocs can't change while ill_g_lock - * is held as writer. 
See ill_phyint_reinit() - */ - ASSERT(phyint->phyint_illv4 == NULL || - MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); - ASSERT(phyint->phyint_illv6 == NULL || - MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); - - if ((phyint->phyint_groupname_len != - (strlen(cur_ipsq->ipsq_name) + 1) || - bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name, - phyint->phyint_groupname_len) != 0)) { - /* - * Once we fail in creating a new ipsq due to memory shortage, - * don't attempt to create new ipsq again, based on another - * phyint, since we want all phyints belonging to an IPMP group - * to be in the same ipsq even in the event of mem alloc fails. - */ - newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry, - cur_ipsq, ipst); - if (newipsq == NULL) { - /* Memory allocation failure */ - return (SPLIT_FAILED); - } else { - /* ipsq_refs protected by ill_g_lock (writer) */ - IPSQ_DEC_REF(cur_ipsq, ipst); - phyint->phyint_ipsq = newipsq; - phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list; - newipsq->ipsq_phyint_list = phyint; - IPSQ_INC_REF(newipsq, ipst); - return (SPLIT_SUCCESS); - } - } - return (SPLIT_NOT_NEEDED); -} + ipif_t *ipif; -/* - * The ill locks of the phyint and the ill_g_lock (writer) must be held - * to do this split - */ -static int -ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, ip_stack_t *ipst) -{ - ipsq_t *newipsq; + ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); /* - * <ill-phyint> assocs can't change while ill_g_lock - * is held as writer. See ill_phyint_reinit() + * Except for ipif_state_flags the other fields of the ipif/ill that + * are modified below are protected implicitly since we are a writer */ - - ASSERT(phyint->phyint_illv4 == NULL || - MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); - ASSERT(phyint->phyint_illv6 == NULL || - MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); - - if (!ipsq_init((phyint->phyint_illv4 != NULL) ? 
- phyint->phyint_illv4: phyint->phyint_illv6)) { + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { /* - * ipsq_init failed due to no memory - * caller will use the same ipsq + * We go through the ipif_down logic even if the ipif + * is already down, since routes can be added based + * on down ipifs. Going through ipif_down once again + * will delete any IREs created based on these routes. */ - return (SPLIT_FAILED); - } - - /* ipsq_ref is protected by ill_g_lock (writer) */ - IPSQ_DEC_REF(cur_ipsq, ipst); - - /* - * This is a new ipsq that is unknown to the world. - * So we don't need to hold ipsq_lock, - */ - newipsq = phyint->phyint_ipsq; - newipsq->ipsq_writer = NULL; - newipsq->ipsq_reentry_cnt--; - ASSERT(newipsq->ipsq_reentry_cnt == 0); -#ifdef DEBUG - newipsq->ipsq_depth = 0; -#endif - - return (SPLIT_SUCCESS); -} + if (ipif->ipif_flags & IPIF_UP) + ipif->ipif_was_up = B_TRUE; -/* - * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to - * ipsq's representing their individual groups or themselves. Return - * whether split needs to be retried again later. - */ -static boolean_t -ill_split_ipsq(ipsq_t *cur_ipsq) -{ - phyint_t *phyint; - phyint_t *next_phyint; - int error; - boolean_t need_retry = B_FALSE; - ip_stack_t *ipst = cur_ipsq->ipsq_ipst; + mutex_enter(&ill->ill_lock); + ipif->ipif_state_flags |= IPIF_CHANGING; + mutex_exit(&ill->ill_lock); - phyint = cur_ipsq->ipsq_phyint_list; - cur_ipsq->ipsq_phyint_list = NULL; - while (phyint != NULL) { - next_phyint = phyint->phyint_ipsq_next; /* - * 'created' will tell us whether the callee actually - * created an ipsq. Lack of memory may force the callee - * to return without creating an ipsq. + * Need to re-create net/subnet bcast ires if + * they are dependent on ipif. 
*/ - if (phyint->phyint_groupname == NULL) { - error = ill_split_to_own_ipsq(phyint, cur_ipsq, ipst); - } else { - error = ill_split_to_grp_ipsq(phyint, cur_ipsq, - need_retry, ipst); - } - - switch (error) { - case SPLIT_FAILED: - need_retry = B_TRUE; - /* FALLTHRU */ - case SPLIT_NOT_NEEDED: - /* - * Keep it on the list. - */ - phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list; - cur_ipsq->ipsq_phyint_list = phyint; - break; - case SPLIT_SUCCESS: - break; - default: - ASSERT(0); - } - - phyint = next_phyint; - } - return (need_retry); -} - -/* - * given an ipsq 'ipsq' lock all ills associated with this ipsq. - * and return the ills in the list. This list will be - * needed to unlock all the ills later on by the caller. - * The <ill-ipsq> associations could change between the - * lock and unlock. Hence the unlock can't traverse the - * ipsq to get the list of ills. - */ -static int -ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max) -{ - int cnt = 0; - phyint_t *phyint; - ip_stack_t *ipst = ipsq->ipsq_ipst; - - /* - * The caller holds ill_g_lock to ensure that the ill memberships - * of the ipsq don't change - */ - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - - phyint = ipsq->ipsq_phyint_list; - while (phyint != NULL) { - if (phyint->phyint_illv4 != NULL) { - ASSERT(cnt < list_max); - list[cnt++] = phyint->phyint_illv4; - } - if (phyint->phyint_illv6 != NULL) { - ASSERT(cnt < list_max); - list[cnt++] = phyint->phyint_illv6; - } - phyint = phyint->phyint_ipsq_next; + if (!ipif->ipif_isv6) + ipif_check_bcast_ires(ipif); + (void) ipif_logical_down(ipif, NULL, NULL); + ipif_non_duplicate(ipif); + ipif_down_tail(ipif); } - ill_lock_ills(list, cnt); - return (cnt); } void @@ -14577,3504 +14155,251 @@ ill_unlock_ills(ill_t **list, int cnt) } /* - * Merge all the ills from 1 ipsq group into another ipsq group. - * The source ipsq group is specified by the ipsq associated with - * 'from_ill'. 
The destination ipsq group is specified by the ipsq - * associated with 'to_ill' or 'groupname' respectively. - * Note that ipsq itself does not have a reference count mechanism - * and functions don't look up an ipsq and pass it around. Instead - * functions pass around an ill or groupname, and the ipsq is looked - * up from the ill or groupname and the required operation performed - * atomically with the lookup on the ipsq. + * Redo source address selection. This is called when a + * non-NOLOCAL/DEPRECATED/ANYCAST ipif comes up. */ -static int -ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp, - queue_t *q) -{ - ipsq_t *old_ipsq; - ipsq_t *new_ipsq; - ill_t **ill_list; - int cnt; - size_t ill_list_size; - boolean_t became_writer_on_new_sq = B_FALSE; - ip_stack_t *ipst = from_ill->ill_ipst; - - ASSERT(to_ill == NULL || ipst == to_ill->ill_ipst); - /* Exactly 1 of 'to_ill' and groupname can be specified. */ - ASSERT((to_ill != NULL) ^ (groupname != NULL)); - - /* - * Need to hold ill_g_lock as writer and also the ill_lock to - * change the <ill-ipsq> assoc of an ill. Need to hold the - * ipsq_lock to prevent new messages from landing on an ipsq. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - - old_ipsq = from_ill->ill_phyint->phyint_ipsq; - if (groupname != NULL) - new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL, ipst); - else { - new_ipsq = to_ill->ill_phyint->phyint_ipsq; - } - - ASSERT(old_ipsq != NULL && new_ipsq != NULL); - - /* - * both groups are on the same ipsq. 
- */ - if (old_ipsq == new_ipsq) { - rw_exit(&ipst->ips_ill_g_lock); - return (0); - } - - cnt = old_ipsq->ipsq_refs << 1; - ill_list_size = cnt * sizeof (ill_t *); - ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); - if (ill_list == NULL) { - rw_exit(&ipst->ips_ill_g_lock); - return (ENOMEM); - } - cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt); - - /* Need ipsq lock to enque messages on new ipsq or to become writer */ - mutex_enter(&new_ipsq->ipsq_lock); - if ((new_ipsq->ipsq_writer == NULL && - new_ipsq->ipsq_current_ipif == NULL) || - (new_ipsq->ipsq_writer == curthread)) { - new_ipsq->ipsq_writer = curthread; - new_ipsq->ipsq_reentry_cnt++; - became_writer_on_new_sq = B_TRUE; - } - - /* - * We are holding ill_g_lock as writer and all the ill locks of - * the old ipsq. So the old_ipsq can't be looked up, and hence no new - * message can land up on the old ipsq even though we don't hold the - * ipsq_lock of the old_ipsq. Now move all messages to the newipsq. - */ - ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q); - - /* - * now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'. - * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq> - * assocs. till we release the ill_g_lock, and hence it can't vanish. - */ - ill_merge_ipsq(old_ipsq, new_ipsq, ipst); - - /* - * Mark the new ipsq as needing a split since it is currently - * being shared by more than 1 IPMP group. The split will - * occur at the end of ipsq_exit - */ - new_ipsq->ipsq_split = B_TRUE; - - /* Now release all the locks */ - mutex_exit(&new_ipsq->ipsq_lock); - ill_unlock_ills(ill_list, cnt); - rw_exit(&ipst->ips_ill_g_lock); - - kmem_free(ill_list, ill_list_size); - - /* - * If we succeeded in becoming writer on the new ipsq, then - * drain the new ipsq and start processing all enqueued messages - * including the current ioctl we are processing which is either - * a set groupname or failover/failback. 
- */ - if (became_writer_on_new_sq) - ipsq_exit(new_ipsq); - - /* - * syncq has been changed and all the messages have been moved. - */ - mutex_enter(&old_ipsq->ipsq_lock); - old_ipsq->ipsq_current_ipif = NULL; - old_ipsq->ipsq_current_ioctl = 0; - old_ipsq->ipsq_current_done = B_TRUE; - mutex_exit(&old_ipsq->ipsq_lock); - return (EINPROGRESS); -} - -/* - * Delete and add the loopback copy and non-loopback copy of - * the BROADCAST ire corresponding to ill and addr. Used to - * group broadcast ires together when ill becomes part of - * a group. - * - * This function is also called when ill is leaving the group - * so that the ires belonging to the group gets re-grouped. - */ -static void -ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr) -{ - ire_t *ire, *nire, *nire_next, *ire_head = NULL; - ire_t **ire_ptpn = &ire_head; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * The loopback and non-loopback IREs are inserted in the order in which - * they're found, on the basis that they are correctly ordered (loopback - * first). - */ - for (;;) { - ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, - ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - if (ire == NULL) - break; - - /* - * we are passing in KM_SLEEP because it is not easy to - * go back to a sane state in case of memory failure. - */ - nire = kmem_cache_alloc(ire_cache, KM_SLEEP); - ASSERT(nire != NULL); - bzero(nire, sizeof (ire_t)); - /* - * Don't use ire_max_frag directly since we don't - * hold on to 'ire' until we add the new ire 'nire' and - * we don't want the new ire to have a dangling reference - * to 'ire'. The ire_max_frag of a broadcast ire must - * be in sync with the ipif_mtu of the associate ipif. - * For eg. this happens as a result of SIOCSLIFNAME, - * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE inititated by - * the driver. 
A change in ire_max_frag triggered as - * as a result of path mtu discovery, or due to an - * IP_IOC_IRE_ADVISE_NOREPLY from the transport or due a - * route change -mtu command does not apply to broadcast ires. - * - * XXX We need a recovery strategy here if ire_init fails - */ - if (ire_init(nire, - (uchar_t *)&ire->ire_addr, - (uchar_t *)&ire->ire_mask, - (uchar_t *)&ire->ire_src_addr, - (uchar_t *)&ire->ire_gateway_addr, - ire->ire_stq == NULL ? &ip_loopback_mtu : - &ire->ire_ipif->ipif_mtu, - ire->ire_nce, - ire->ire_rfq, - ire->ire_stq, - ire->ire_type, - ire->ire_ipif, - ire->ire_cmask, - ire->ire_phandle, - ire->ire_ihandle, - ire->ire_flags, - &ire->ire_uinfo, - NULL, - NULL, - ipst) == NULL) { - cmn_err(CE_PANIC, "ire_init() failed"); - } - ire_delete(ire); - ire_refrele(ire); - - /* - * The newly created IREs are inserted at the tail of the list - * starting with ire_head. As we've just allocated them no one - * knows about them so it's safe. - */ - *ire_ptpn = nire; - ire_ptpn = &nire->ire_next; - } - - for (nire = ire_head; nire != NULL; nire = nire_next) { - int error; - ire_t *oire; - /* unlink the IRE from our list before calling ire_add() */ - nire_next = nire->ire_next; - nire->ire_next = NULL; - - /* ire_add adds the ire at the right place in the list */ - oire = nire; - error = ire_add(&nire, NULL, NULL, NULL, B_FALSE); - ASSERT(error == 0); - ASSERT(oire == nire); - ire_refrele(nire); /* Held in ire_add */ - } -} - -/* - * This function is usually called when an ill is inserted in - * a group and all the ipifs are already UP. As all the ipifs - * are already UP, the broadcast ires have already been created - * and been inserted. But, ire_add_v4 would not have grouped properly. - * We need to re-group for the benefit of ip_wput_ire which - * expects BROADCAST ires to be grouped properly to avoid sending - * more than one copy of the broadcast packet per group. 
- * - * NOTE : We don't check for ill_ipif_up_count to be non-zero here - * because when ipif_up_done ends up calling this, ires have - * already been added before illgrp_insert i.e before ill_group - * has been initialized. - */ -static void -ill_group_bcast_for_xmit(ill_t *ill) +void +ill_update_source_selection(ill_t *ill) { - ill_group_t *illgrp; ipif_t *ipif; - ipaddr_t addr; - ipaddr_t net_mask; - ipaddr_t subnet_netmask; - illgrp = ill->ill_group; + ASSERT(IAM_WRITER_ILL(ill)); /* - * This function is called even when an ill is deleted from - * the group. Hence, illgrp could be null. + * Underlying interfaces are only used for test traffic and thus + * should always send with their (deprecated) source addresses. */ - if (illgrp != NULL && illgrp->illgrp_ill_count == 1) + if (IS_UNDER_IPMP(ill)) return; - /* - * Delete all the BROADCAST ires matching this ill and add - * them back. This time, ire_add_v4 should take care of - * grouping them with others because ill is part of the - * group. - */ - ill_bcast_delete_and_add(ill, 0); - ill_bcast_delete_and_add(ill, INADDR_BROADCAST); - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - ill_bcast_delete_and_add(ill, addr); - ill_bcast_delete_and_add(ill, ~net_mask | addr); - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - ill_bcast_delete_and_add(ill, addr); - ill_bcast_delete_and_add(ill, ~subnet_netmask | addr); - } -} - -/* - * This function is called from illgrp_delete when ill is being deleted - * from the group. - * - * As ill is not there in the group anymore, any address belonging - * to this ill should be cleared of IRE_MARK_NORECV. 
- */ -static void -ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr) -{ - ire_t *ire; - irb_t *irb; - ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(ill->ill_group == NULL); - - ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, - ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - - if (ire != NULL) { - /* - * IPMP and plumbing operations are serialized on the ipsq, so - * no one will insert or delete a broadcast ire under our feet. - */ - irb = ire->ire_bucket; - rw_enter(&irb->irb_lock, RW_READER); - ire_refrele(ire); - - for (; ire != NULL; ire = ire->ire_next) { - if (ire->ire_addr != addr) - break; - if (ire_to_ill(ire) != ill) - continue; - - ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED)); - ire->ire_marks &= ~IRE_MARK_NORECV; - } - rw_exit(&irb->irb_lock); - } -} - -ire_t * -irep_insert(ill_group_t *illgrp, ipaddr_t addr, ire_t *ire, ire_t ***pirep) -{ - boolean_t first = B_TRUE; - ire_t *clear_ire = NULL; - ire_t *start_ire = NULL; - uint64_t match_flags; - uint64_t phyi_flags; - boolean_t fallback = B_FALSE; - - /* - * irb_lock must be held by the caller. - * Get to the first ire matching the address and the - * group. If the address does not match we are done - * as we could not find the IRE. If the address matches - * we should get to the first one matching the group. - */ - while (ire != NULL) { - if (ire->ire_addr != addr || - ire->ire_ipif->ipif_ill->ill_group == illgrp) { - break; - } - ire = ire->ire_next; - } - match_flags = PHYI_FAILED | PHYI_INACTIVE; - start_ire = ire; -redo: - while (ire != NULL && ire->ire_addr == addr && - ire->ire_ipif->ipif_ill->ill_group == illgrp) { - /* - * The first ire for any address within a group - * should always be the one with IRE_MARK_NORECV cleared - * so that ip_wput_ire can avoid searching for one. - * Note down the insertion point which will be used - * later. - */ - if (first && (*pirep == NULL)) - *pirep = ire->ire_ptpn; - /* - * PHYI_FAILED is set when the interface fails. 
- * This interface might have become good, but the - * daemon has not yet detected. We should still - * not receive on this. PHYI_OFFLINE should never - * be picked as this has been offlined and soon - * be removed. - */ - phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags; - if (phyi_flags & PHYI_OFFLINE) { - ire->ire_marks |= IRE_MARK_NORECV; - ire = ire->ire_next; - continue; - } - if (phyi_flags & match_flags) { - ire->ire_marks |= IRE_MARK_NORECV; - ire = ire->ire_next; - if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) == - PHYI_INACTIVE) { - fallback = B_TRUE; - } - continue; - } - if (first) { - /* - * We will move this to the front of the list later - * on. - */ - clear_ire = ire; - ire->ire_marks &= ~IRE_MARK_NORECV; - } else { - ire->ire_marks |= IRE_MARK_NORECV; - } - first = B_FALSE; - ire = ire->ire_next; - } - /* - * If we never nominated anybody, try nominating at least - * an INACTIVE, if we found one. Do it only once though. - */ - if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) && - fallback) { - match_flags = PHYI_FAILED; - ire = start_ire; - *pirep = NULL; - goto redo; - } - return (clear_ire); -} - -/* - * This function must be called only after the broadcast ires - * have been grouped together. For a given address addr, nominate - * only one of the ires whose interface is not FAILED or OFFLINE. - * - * This is also called when an ipif goes down, so that we can nominate - * a different ire with the same address for receiving. 
- */ -static void -ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr, ip_stack_t *ipst) -{ - irb_t *irb; - ire_t *ire; - ire_t *ire1; - ire_t *save_ire; - ire_t **irep = NULL; - ire_t *clear_ire = NULL; - ire_t *new_lb_ire; - ire_t *new_nlb_ire; - boolean_t new_lb_ire_used = B_FALSE; - boolean_t new_nlb_ire_used = B_FALSE; - boolean_t refrele_lb_ire = B_FALSE; - boolean_t refrele_nlb_ire = B_FALSE; - uint_t max_frag; - - ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES, - NULL, MATCH_IRE_TYPE, ipst); - /* - * We may not be able to find some ires if a previous - * ire_create failed. This happens when an ipif goes - * down and we are unable to create BROADCAST ires due - * to memory failure. Thus, we have to check for NULL - * below. This should handle the case for LOOPBACK, - * POINTOPOINT and interfaces with some POINTOPOINT - * logicals for which there are no BROADCAST ires. - */ - if (ire == NULL) - return; - /* - * Currently IRE_BROADCASTS are deleted when an ipif - * goes down which runs exclusively. Thus, setting - * IRE_MARK_RCVD should not race with ire_delete marking - * IRE_MARK_CONDEMNED. We grab the lock below just to - * be consistent with other parts of the code that walks - * a given bucket. - */ - save_ire = ire; - irb = ire->ire_bucket; - new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); - if (new_lb_ire == NULL) { - ire_refrele(ire); - return; - } - new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); - if (new_nlb_ire == NULL) { - ire_refrele(ire); - kmem_cache_free(ire_cache, new_lb_ire); - return; - } - IRB_REFHOLD(irb); - rw_enter(&irb->irb_lock, RW_WRITER); - clear_ire = irep_insert(illgrp, addr, ire, &irep); - - /* - * irep non-NULL indicates that we entered the while loop - * above. If clear_ire is at the insertion point, we don't - * have to do anything. clear_ire will be NULL if all the - * interfaces are failed. 
- * - * We cannot unlink and reinsert the ire at the right place - * in the list since there can be other walkers of this bucket. - * Instead we delete and recreate the ire - */ - if (clear_ire != NULL && irep != NULL && *irep != clear_ire) { - ire_t *clear_ire_stq = NULL; - ire_t *clr_ire = NULL; - ire_t *ire_next = NULL; - - if (clear_ire->ire_stq == NULL) - ire_next = clear_ire->ire_next; - - rw_exit(&irb->irb_lock); - - bzero(new_lb_ire, sizeof (ire_t)); - /* XXX We need a recovery strategy here. */ - if (ire_init(new_lb_ire, - (uchar_t *)&clear_ire->ire_addr, - (uchar_t *)&clear_ire->ire_mask, - (uchar_t *)&clear_ire->ire_src_addr, - (uchar_t *)&clear_ire->ire_gateway_addr, - &clear_ire->ire_max_frag, - NULL, /* let ire_nce_init derive the resolver info */ - clear_ire->ire_rfq, - clear_ire->ire_stq, - clear_ire->ire_type, - clear_ire->ire_ipif, - clear_ire->ire_cmask, - clear_ire->ire_phandle, - clear_ire->ire_ihandle, - clear_ire->ire_flags, - &clear_ire->ire_uinfo, - NULL, - NULL, - ipst) == NULL) - cmn_err(CE_PANIC, "ire_init() failed"); - - refrele_lb_ire = B_TRUE; - - if (ire_next != NULL && - ire_next->ire_stq != NULL && - ire_next->ire_addr == clear_ire->ire_addr && - ire_next->ire_ipif->ipif_ill == - clear_ire->ire_ipif->ipif_ill) { - clear_ire_stq = ire_next; - - bzero(new_nlb_ire, sizeof (ire_t)); - /* XXX We need a recovery strategy here. 
*/ - if (ire_init(new_nlb_ire, - (uchar_t *)&clear_ire_stq->ire_addr, - (uchar_t *)&clear_ire_stq->ire_mask, - (uchar_t *)&clear_ire_stq->ire_src_addr, - (uchar_t *)&clear_ire_stq->ire_gateway_addr, - &clear_ire_stq->ire_max_frag, - NULL, - clear_ire_stq->ire_rfq, - clear_ire_stq->ire_stq, - clear_ire_stq->ire_type, - clear_ire_stq->ire_ipif, - clear_ire_stq->ire_cmask, - clear_ire_stq->ire_phandle, - clear_ire_stq->ire_ihandle, - clear_ire_stq->ire_flags, - &clear_ire_stq->ire_uinfo, - NULL, - NULL, - ipst) == NULL) - cmn_err(CE_PANIC, "ire_init() failed"); - - refrele_nlb_ire = B_TRUE; - } - - rw_enter(&irb->irb_lock, RW_WRITER); - /* - * irb_lock was dropped across call to ire_init() due to - * lock ordering issue with ipst->ips_ndp{4,6}->ndp_g_lock - * mutex lock. Therefore irep could have changed. call - * irep_insert() to get the new insertion point (irep) and - * recheck all known conditions. - */ - irep = NULL; - clr_ire = irep_insert(illgrp, addr, save_ire, &irep); - if ((irep != NULL) && (*irep != clear_ire) && - (clr_ire == clear_ire)) { - if ((clear_ire_stq != NULL) && - (clr_ire->ire_next != clear_ire_stq)) - clear_ire_stq = NULL; - /* - * Delete the ire. We can't call ire_delete() since - * we are holding the bucket lock. We can't release the - * bucket lock since we can't allow irep to change. - * So just mark it CONDEMNED. - * The IRB_REFRELE will delete the ire from the list - * and do the refrele. - */ - clear_ire->ire_marks |= IRE_MARK_CONDEMNED; - irb->irb_marks |= IRB_MARK_CONDEMNED; - - if (clear_ire_stq != NULL && - clear_ire_stq->ire_nce != NULL) { - nce_fastpath_list_delete( - clear_ire_stq->ire_nce); - clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED; - } - - /* - * Also take care of otherfields like ib/ob pkt count - * etc. Need to dup them. 
- * ditto in ill_bcast_delete_and_add - */ - - /* Set the max_frag before adding the ire */ - max_frag = *new_lb_ire->ire_max_fragp; - new_lb_ire->ire_max_fragp = NULL; - new_lb_ire->ire_max_frag = max_frag; - - /* Add the new ire's. Insert at *irep */ - new_lb_ire->ire_bucket = clear_ire->ire_bucket; - ire1 = *irep; - if (ire1 != NULL) - ire1->ire_ptpn = &new_lb_ire->ire_next; - new_lb_ire->ire_next = ire1; - /* Link the new one in. */ - new_lb_ire->ire_ptpn = irep; - membar_producer(); - *irep = new_lb_ire; - new_lb_ire_used = B_TRUE; - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, - ire_stats_inserted); - new_lb_ire->ire_bucket->irb_ire_cnt++; - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), - new_lb_ire->ire_ipif, - (char *), "ire", (void *), new_lb_ire); - new_lb_ire->ire_ipif->ipif_ire_cnt++; - - if (clear_ire_stq != NULL) { - ill_t *ire_ill; - /* Set the max_frag before adding the ire */ - max_frag = *new_nlb_ire->ire_max_fragp; - new_nlb_ire->ire_max_fragp = NULL; - new_nlb_ire->ire_max_frag = max_frag; - - new_nlb_ire->ire_bucket = clear_ire->ire_bucket; - irep = &new_lb_ire->ire_next; - /* Add the new ire. Insert at *irep */ - ire1 = *irep; - if (ire1 != NULL) - ire1->ire_ptpn = &new_nlb_ire->ire_next; - new_nlb_ire->ire_next = ire1; - /* Link the new one in. 
*/ - new_nlb_ire->ire_ptpn = irep; - membar_producer(); - *irep = new_nlb_ire; - new_nlb_ire_used = B_TRUE; - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, - ire_stats_inserted); - new_nlb_ire->ire_bucket->irb_ire_cnt++; - DTRACE_PROBE3(ipif__incr__cnt, - (ipif_t *), new_nlb_ire->ire_ipif, - (char *), "ire", (void *), new_nlb_ire); - new_nlb_ire->ire_ipif->ipif_ire_cnt++; - DTRACE_PROBE3(ill__incr__cnt, - (ill_t *), new_nlb_ire->ire_stq->q_ptr, - (char *), "ire", (void *), new_nlb_ire); - ire_ill = (ill_t *)new_nlb_ire->ire_stq->q_ptr; - ire_ill->ill_ire_cnt++; - } - } - } - ire_refrele(save_ire); - rw_exit(&irb->irb_lock); - /* - * Since we dropped the irb_lock across call to ire_init() - * and rechecking known conditions, it is possible that - * the checks might fail, therefore undo the work done by - * ire_init() by calling ire_refrele() on the newly created ire. - */ - if (!new_lb_ire_used) { - if (refrele_lb_ire) { - ire_refrele(new_lb_ire); - } else { - kmem_cache_free(ire_cache, new_lb_ire); - } - } - if (!new_nlb_ire_used) { - if (refrele_nlb_ire) { - ire_refrele(new_nlb_ire); - } else { - kmem_cache_free(ire_cache, new_nlb_ire); - } - } - IRB_REFRELE(irb); -} - -/* - * Whenever an ipif goes down we have to renominate a different - * broadcast ire to receive. Whenever an ipif comes up, we need - * to make sure that we have only one nominated to receive. - */ -static void -ipif_renominate_bcast(ipif_t *ipif) -{ - ill_t *ill = ipif->ipif_ill; - ipaddr_t subnet_addr; - ipaddr_t net_addr; - ipaddr_t net_mask = 0; - ipaddr_t subnet_netmask; - ipaddr_t addr; - ill_group_t *illgrp; - ip_stack_t *ipst = ill->ill_ipst; - - illgrp = ill->ill_group; - /* - * If this is the last ipif going down, it might take - * the ill out of the group. In that case ipif_down -> - * illgrp_delete takes care of doing the nomination. - * ipif_down does not call for this case. 
- */ - ASSERT(illgrp != NULL); - - /* There could not have been any ires associated with this */ - if (ipif->ipif_subnet == 0) - return; - - ill_mark_bcast(illgrp, 0, ipst); - ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst); - - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - ill_mark_bcast(illgrp, addr, ipst); - - net_addr = ~net_mask | addr; - ill_mark_bcast(illgrp, net_addr, ipst); - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - ill_mark_bcast(illgrp, addr, ipst); - - subnet_addr = ~subnet_netmask | addr; - ill_mark_bcast(illgrp, subnet_addr, ipst); -} - -/* - * Whenever we form or delete ill groups, we need to nominate one set of - * BROADCAST ires for receiving in the group. - * - * 1) When ipif_up_done -> ilgrp_insert calls this function, BROADCAST ires - * have been added, but ill_ipif_up_count is 0. Thus, we don't assert - * for ill_ipif_up_count to be non-zero. This is the only case where - * ill_ipif_up_count is zero and we would still find the ires. - * - * 2) ip_sioctl_group_name/ifgrp_insert calls this function, at least one - * ipif is UP and we just have to do the nomination. - * - * 3) When ill_handoff_responsibility calls us, some ill has been removed - * from the group. So, we have to do the nomination. - * - * Because of (3), there could be just one ill in the group. But we have - * to nominate still as IRE_MARK_NORCV may have been marked on this. - * Thus, this function does not optimize when there is only one ill as - * it is not correct for (3). 
- */ -static void -ill_nominate_bcast_rcv(ill_group_t *illgrp) -{ - ill_t *ill; - ipif_t *ipif; - ipaddr_t subnet_addr; - ipaddr_t prev_subnet_addr = 0; - ipaddr_t net_addr; - ipaddr_t prev_net_addr = 0; - ipaddr_t net_mask = 0; - ipaddr_t subnet_netmask; - ipaddr_t addr; - ip_stack_t *ipst; - - /* - * When the last memeber is leaving, there is nothing to - * nominate. - */ - if (illgrp->illgrp_ill_count == 0) { - ASSERT(illgrp->illgrp_ill == NULL); - return; - } - - ill = illgrp->illgrp_ill; - ASSERT(!ill->ill_isv6); - ipst = ill->ill_ipst; - /* - * We assume that ires with same address and belonging to the - * same group, has been grouped together. Nominating a *single* - * ill in the group for sending and receiving broadcast is done - * by making sure that the first BROADCAST ire (which will be - * the one returned by ire_ctable_lookup for ip_rput and the - * one that will be used in ip_wput_ire) will be the one that - * will not have IRE_MARK_NORECV set. - * - * 1) ip_rput checks and discards packets received on ires marked - * with IRE_MARK_NORECV. Thus, we don't send up duplicate - * broadcast packets. We need to clear IRE_MARK_NORECV on the - * first ire in the group for every broadcast address in the group. - * ip_rput will accept packets only on the first ire i.e only - * one copy of the ill. - * - * 2) ip_wput_ire needs to send out just one copy of the broadcast - * packet for the whole group. It needs to send out on the ill - * whose ire has not been marked with IRE_MARK_NORECV. If it sends - * on the one marked with IRE_MARK_NORECV, ip_rput will accept - * the copy echoed back on other port where the ire is not marked - * with IRE_MARK_NORECV. - * - * Note that we just need to have the first IRE either loopback or - * non-loopback (either of them may not exist if ire_create failed - * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will - * always hit the first one and hence will always accept one copy. 
- * - * We have a broadcast ire per ill for all the unique prefixes - * hosted on that ill. As we don't have a way of knowing the - * unique prefixes on a given ill and hence in the whole group, - * we just call ill_mark_bcast on all the prefixes that exist - * in the group. For the common case of one prefix, the code - * below optimizes by remebering the last address used for - * markng. In the case of multiple prefixes, this will still - * optimize depending the order of prefixes. - * - * The only unique address across the whole group is 0.0.0.0 and - * 255.255.255.255 and thus we call only once. ill_mark_bcast enables - * the first ire in the bucket for receiving and disables the - * others. - */ - ill_mark_bcast(illgrp, 0, ipst); - ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst); - for (; ill != NULL; ill = ill->ill_group_next) { - - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (!(ipif->ipif_flags & IPIF_UP) || - ipif->ipif_subnet == 0) { - continue; - } - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - if (prev_net_addr == 0 || prev_net_addr != addr) { - ill_mark_bcast(illgrp, addr, ipst); - net_addr = ~net_mask | addr; - ill_mark_bcast(illgrp, net_addr, ipst); - } - prev_net_addr = addr; - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - if (prev_subnet_addr == 0 || - prev_subnet_addr != addr) { - ill_mark_bcast(illgrp, addr, ipst); - subnet_addr = ~subnet_netmask | addr; - ill_mark_bcast(illgrp, subnet_addr, ipst); - } - prev_subnet_addr = addr; - } - } -} - -/* - * This function is called while forming ill groups. - * - * Currently, we handle only allmulti groups. We want to join - * allmulti on only one of the ills in the groups. 
In future, - * when we have link aggregation, we may have to join normal - * multicast groups on multiple ills as switch does inbound load - * balancing. Following are the functions that calls this - * function : - * - * 1) ill_recover_multicast : Interface is coming back UP. - * When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6 - * will call ill_recover_multicast to recover all the multicast - * groups. We need to make sure that only one member is joined - * in the ill group. - * - * 2) ip_addmulti/ip_addmulti_v6 : ill groups has already been formed. - * Somebody is joining allmulti. We need to make sure that only one - * member is joined in the group. - * - * 3) illgrp_insert : If allmulti has already joined, we need to make - * sure that only one member is joined in the group. - * - * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving - * allmulti who we have nominated. We need to pick someother ill. - * - * 5) illgrp_delete : The ill we nominated is leaving the group, - * we need to pick a new ill to join the group. - * - * For (1), (2), (5) - we just have to check whether there is - * a good ill joined in the group. If we could not find any ills - * joined the group, we should join. - * - * For (4), the one that was nominated to receive, left the group. - * There could be nobody joined in the group when this function is - * called. - * - * For (3) - we need to explicitly check whether there are multiple - * ills joined in the group. - * - * For simplicity, we don't differentiate any of the above cases. We - * just leave the group if it is joined on any of them and join on - * the first good ill. - */ -int -ill_nominate_mcast_rcv(ill_group_t *illgrp) -{ - ilm_t *ilm; - ill_t *ill; - ill_t *fallback_inactive_ill = NULL; - ill_t *fallback_failed_ill = NULL; - int ret = 0; - - /* - * Leave the allmulti on all the ills and start fresh. 
- */ - for (ill = illgrp->illgrp_ill; ill != NULL; - ill = ill->ill_group_next) { - if (ill->ill_join_allmulti) - ill_leave_allmulti(ill); - } - - /* - * Choose a good ill. Fallback to inactive or failed if - * none available. We need to fallback to FAILED in the - * case where we have 2 interfaces in a group - where - * one of them is failed and another is a good one and - * the good one (not marked inactive) is leaving the group. - */ - for (ill = illgrp->illgrp_ill; ill != NULL; ill = ill->ill_group_next) { - if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE) - continue; - if (ill->ill_phyint->phyint_flags & PHYI_FAILED) { - fallback_failed_ill = ill; - continue; - } - if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) { - fallback_inactive_ill = ill; - continue; - } - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - ret = ill_join_allmulti(ill); - /* - * ill_join_allmulti() can fail because of - * memory failures so make sure we join at - * least on one ill. - */ - if (ill->ill_join_allmulti) - return (0); - } - } - } - if (ret != 0) { - /* - * If we tried nominating above and failed to do so, - * return error. We might have tried multiple times. - * But, return the latest error. - */ - return (ret); - } - if ((ill = fallback_inactive_ill) != NULL) { - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) - return (ill_join_allmulti(ill)); - } - } else if ((ill = fallback_failed_ill) != NULL) { - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) - return (ill_join_allmulti(ill)); - } - } - return (0); -} - -/* - * This function is called from illgrp_delete after it is - * deleted from the group to reschedule responsibilities - * to a different ill. 
- */ -static void -ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp) -{ - ilm_t *ilm; - ipif_t *ipif; - ipaddr_t subnet_addr; - ipaddr_t net_addr; - ipaddr_t net_mask = 0; - ipaddr_t subnet_netmask; - ipaddr_t addr; - ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(ill->ill_group == NULL); - /* - * Broadcast Responsibility: - * - * 1. If this ill has been nominated for receiving broadcast - * packets, we need to find a new one. Before we find a new - * one, we need to re-group the ires that are part of this new - * group (assumed by ill_nominate_bcast_rcv). We do this by - * calling ill_group_bcast_for_xmit(ill) which will do the right - * thing for us. - * - * 2. If this ill was not nominated for receiving broadcast - * packets, we need to clear the IRE_MARK_NORECV flag - * so that we continue to send up broadcast packets. - */ - if (!ill->ill_isv6) { - /* - * Case 1 above : No optimization here. Just redo the - * nomination. - */ - ill_group_bcast_for_xmit(ill); - ill_nominate_bcast_rcv(illgrp); - - /* - * Case 2 above : Lookup and clear IRE_MARK_NORECV. - */ - ill_clear_bcast_mark(ill, 0); - ill_clear_bcast_mark(ill, INADDR_BROADCAST); - - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (!(ipif->ipif_flags & IPIF_UP) || - ipif->ipif_subnet == 0) { - continue; - } - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - ill_clear_bcast_mark(ill, addr); - - net_addr = ~net_mask | addr; - ill_clear_bcast_mark(ill, net_addr); - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - ill_clear_bcast_mark(ill, addr); - - subnet_addr = ~subnet_netmask | addr; - ill_clear_bcast_mark(ill, subnet_addr); - } - } - - /* - * Multicast Responsibility. - * - * If we have joined allmulti on this one, find a new member - * in the group to join allmulti. 
As this ill is already part - * of allmulti, we don't have to join on this one. - * - * If we have not joined allmulti on this one, there is no - * responsibility to handoff. But we need to take new - * responsibility i.e, join allmulti on this one if we need - * to. - */ - if (ill->ill_join_allmulti) { - (void) ill_nominate_mcast_rcv(illgrp); - } else { - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - (void) ill_join_allmulti(ill); - break; - } - } - } - - /* - * We intentionally do the flushing of IRE_CACHES only matching - * on the ill and not on groups. Note that we are already deleted - * from the group. - * - * This will make sure that all IRE_CACHES whose stq is pointing - * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get - * deleted and IRE_CACHES that are not pointing at this ill will - * be left alone. - */ - ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, - illgrp_cache_delete, ill, ill); - - /* - * Some conn may have cached one of the IREs deleted above. By removing - * the ire reference, we clean up the extra reference to the ill held in - * ire->ire_stq. - */ - ipcl_walk(conn_cleanup_stale_ire, NULL, ipst); - - /* - * Re-do source address selection for all the members in the - * group, if they borrowed source address from one of the ipifs - * in this ill. - */ - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ill->ill_isv6) { - ipif_update_other_ipifs_v6(ipif, illgrp); - } else { - ipif_update_other_ipifs(ipif, illgrp); - } + if (ill->ill_isv6) + ipif_recreate_interface_routes_v6(NULL, ipif); + else + ipif_recreate_interface_routes(NULL, ipif); } } /* - * Delete the ill from the group. The caller makes sure that it is - * in a group and it okay to delete from the group. So, we always - * delete here. + * Finish the group join started in ip_sioctl_groupname(). 
*/ +/* ARGSUSED */ static void -illgrp_delete(ill_t *ill) -{ - ill_group_t *illgrp; - ill_group_t *tmpg; - ill_t *tmp_ill; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * Reset illgrp_ill_schednext if it was pointing at us. - * We need to do this before we set ill_group to NULL. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - mutex_enter(&ill->ill_lock); - - illgrp_reset_schednext(ill); - - illgrp = ill->ill_group; - - /* Delete the ill from illgrp. */ - if (illgrp->illgrp_ill == ill) { - illgrp->illgrp_ill = ill->ill_group_next; - } else { - tmp_ill = illgrp->illgrp_ill; - while (tmp_ill->ill_group_next != ill) { - tmp_ill = tmp_ill->ill_group_next; - ASSERT(tmp_ill != NULL); - } - tmp_ill->ill_group_next = ill->ill_group_next; - } - ill->ill_group = NULL; - ill->ill_group_next = NULL; - - illgrp->illgrp_ill_count--; - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - - /* - * As this ill is leaving the group, we need to hand off - * the responsibilities to the other ills in the group, if - * this ill had some responsibilities. 
- */ - - ill_handoff_responsibility(ill, illgrp); - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - - if (illgrp->illgrp_ill_count == 0) { - - ASSERT(illgrp->illgrp_ill == NULL); - if (ill->ill_isv6) { - if (illgrp == ipst->ips_illgrp_head_v6) { - ipst->ips_illgrp_head_v6 = illgrp->illgrp_next; - } else { - tmpg = ipst->ips_illgrp_head_v6; - while (tmpg->illgrp_next != illgrp) { - tmpg = tmpg->illgrp_next; - ASSERT(tmpg != NULL); - } - tmpg->illgrp_next = illgrp->illgrp_next; - } - } else { - if (illgrp == ipst->ips_illgrp_head_v4) { - ipst->ips_illgrp_head_v4 = illgrp->illgrp_next; - } else { - tmpg = ipst->ips_illgrp_head_v4; - while (tmpg->illgrp_next != illgrp) { - tmpg = tmpg->illgrp_next; - ASSERT(tmpg != NULL); - } - tmpg->illgrp_next = illgrp->illgrp_next; - } - } - mutex_destroy(&illgrp->illgrp_lock); - mi_free(illgrp); - } - rw_exit(&ipst->ips_ill_g_lock); - - /* - * Even though the ill is out of the group its not necessary - * to set ipsq_split as TRUE as the ipifs could be down temporarily - * We will split the ipsq when phyint_groupname is set to NULL. - */ - - /* - * Send a routing sockets message if we are deleting from - * groups with names. - */ - if (ill->ill_phyint->phyint_groupname_len != 0) - ip_rts_ifmsg(ill->ill_ipif); -} - -/* - * Re-do source address selection. This is normally called when - * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST - * ipif comes up. - */ -void -ill_update_source_selection(ill_t *ill) +ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) { - ipif_t *ipif; - - ASSERT(IAM_WRITER_ILL(ill)); - - if (ill->ill_group != NULL) - ill = ill->ill_group->illgrp_ill; - - for (; ill != NULL; ill = ill->ill_group_next) { - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ill->ill_isv6) - ipif_recreate_interface_routes_v6(NULL, ipif); - else - ipif_recreate_interface_routes(NULL, ipif); - } - } -} - -/* - * Insert ill in a group headed by illgrp_head. 
The caller can either - * pass a groupname in which case we search for a group with the - * same name to insert in or pass a group to insert in. This function - * would only search groups with names. - * - * NOTE : The caller should make sure that there is at least one ipif - * UP on this ill so that illgrp_scheduler can pick this ill - * for outbound packets. If ill_ipif_up_count is zero, we have - * already sent a DL_UNBIND to the driver and we don't want to - * send anymore packets. We don't assert for ipif_up_count - * to be greater than zero, because ipif_up_done wants to call - * this function before bumping up the ipif_up_count. See - * ipif_up_done() for details. - */ -int -illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname, - ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up) -{ - ill_group_t *illgrp; - ill_t *prev_ill; - phyint_t *phyi; + ill_t *ill = q->q_ptr; + phyint_t *phyi = ill->ill_phyint; + ipmp_grp_t *grp = phyi->phyint_grp; ip_stack_t *ipst = ill->ill_ipst; - ASSERT(ill->ill_group == NULL); - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - mutex_enter(&ill->ill_lock); - - if (groupname != NULL) { - /* - * Look for a group with a matching groupname to insert. - */ - for (illgrp = *illgrp_head; illgrp != NULL; - illgrp = illgrp->illgrp_next) { - - ill_t *tmp_ill; - - /* - * If we have an ill_group_t in the list which has - * no ill_t assigned then we must be in the process of - * removing this group. We skip this as illgrp_delete() - * will remove it from the list. - */ - if ((tmp_ill = illgrp->illgrp_ill) == NULL) { - ASSERT(illgrp->illgrp_ill_count == 0); - continue; - } - - ASSERT(tmp_ill->ill_phyint != NULL); - phyi = tmp_ill->ill_phyint; - /* - * Look at groups which has names only. - */ - if (phyi->phyint_groupname_len == 0) - continue; - /* - * Names are stored in the phyint common to both - * IPv4 and IPv6. 
- */ - if (mi_strcmp(phyi->phyint_groupname, - groupname) == 0) { - break; - } - } - } else { - /* - * If the caller passes in a NULL "grp_to_insert", we - * allocate one below and insert this singleton. - */ - illgrp = grp_to_insert; - } - - ill->ill_group_next = NULL; - - if (illgrp == NULL) { - illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t)); - if (illgrp == NULL) { - return (ENOMEM); - } - illgrp->illgrp_next = *illgrp_head; - *illgrp_head = illgrp; - illgrp->illgrp_ill = ill; - illgrp->illgrp_ill_count = 1; - ill->ill_group = illgrp; - /* - * Used in illgrp_scheduler to protect multiple threads - * from traversing the list. - */ - mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0); - } else { - ASSERT(ill->ill_net_type == - illgrp->illgrp_ill->ill_net_type); - ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type); - - /* Insert ill at tail of this group */ - prev_ill = illgrp->illgrp_ill; - while (prev_ill->ill_group_next != NULL) - prev_ill = prev_ill->ill_group_next; - prev_ill->ill_group_next = ill; - ill->ill_group = illgrp; - illgrp->illgrp_ill_count++; - /* - * Inherit group properties. Currently only forwarding - * is the property we try to keep the same with all the - * ills. When there are more, we will abstract this into - * a function. - */ - ill->ill_flags &= ~ILLF_ROUTER; - ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER); - } - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - - /* - * 1) When ipif_up_done() calls this function, ipif_up_count - * may be zero as it has not yet been bumped. But the ires - * have already been added. So, we do the nomination here - * itself. But, when ip_sioctl_groupname calls this, it checks - * for ill_ipif_up_count != 0. Thus we don't check for - * ill_ipif_up_count here while nominating broadcast ires for - * receive. 
- * - * 2) Similarly, we need to call ill_group_bcast_for_xmit here - * to group them properly as ire_add() has already happened - * in the ipif_up_done() case. For ip_sioctl_groupname/ifgrp_insert - * case, we need to do it here anyway. - */ - if (!ill->ill_isv6) { - ill_group_bcast_for_xmit(ill); - ill_nominate_bcast_rcv(illgrp); - } - - if (!ipif_is_coming_up) { - /* - * When ipif_up_done() calls this function, the multicast - * groups have not been joined yet. So, there is no point in - * nomination. ill_join_allmulti() will handle groups when - * ill_recover_multicast() is called from ipif_up_done() later. - */ - (void) ill_nominate_mcast_rcv(illgrp); - /* - * ipif_up_done calls ill_update_source_selection - * anyway. Moreover, we don't want to re-create - * interface routes while ipif_up_done() still has reference - * to them. Refer to ipif_up_done() for more details. - */ - ill_update_source_selection(ill); - } - - /* - * Send a routing sockets message if we are inserting into - * groups with names. - */ - if (groupname != NULL) - ip_rts_ifmsg(ill->ill_ipif); - return (0); -} - -/* - * Return the first phyint matching the groupname. There could - * be more than one when there are ill groups. - * - * If 'usable' is set, then we exclude ones that are marked with any of - * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE). - * Needs work: called only from ip_sioctl_groupname and from the ipmp/netinfo - * emulation of ipmp. - */ -phyint_t * -phyint_lookup_group(char *groupname, boolean_t usable, ip_stack_t *ipst) -{ - phyint_t *phyi; - - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - /* - * Group names are stored in the phyint - a common structure - * to both IPv4 and IPv6. 
- */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - if (phyi->phyint_groupname_len == 0) - continue; - /* - * Skip the ones that should not be used since the callers - * sometime use this for sending packets. - */ - if (usable && (phyi->phyint_flags & - (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE))) - continue; + /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */ + ASSERT(!IS_IPMP(ill) && grp != NULL); + ASSERT(IAM_WRITER_IPSQ(ipsq)); - ASSERT(phyi->phyint_groupname != NULL); - if (mi_strcmp(groupname, phyi->phyint_groupname) == 0) - return (phyi); + if (phyi->phyint_illv4 != NULL) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + VERIFY(grp->gr_pendv4-- > 0); + rw_exit(&ipst->ips_ipmp_lock); + ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4); } - return (NULL); -} - - -/* - * Return the first usable phyint matching the group index. By 'usable' - * we exclude ones that are marked ununsable with any of - * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE). - * - * Used only for the ipmp/netinfo emulation of ipmp. - */ -phyint_t * -phyint_lookup_group_ifindex(uint_t group_ifindex, ip_stack_t *ipst) -{ - phyint_t *phyi; - - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - - if (!ipst->ips_ipmp_hook_emulation) - return (NULL); - - /* - * Group indicies are stored in the phyint - a common structure - * to both IPv4 and IPv6. - */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - /* Ignore the ones that do not have a group */ - if (phyi->phyint_groupname_len == 0) - continue; - - ASSERT(phyi->phyint_group_ifindex != 0); - /* - * Skip the ones that should not be used since the callers - * sometime use this for sending packets. 
- */ - if (phyi->phyint_flags & - (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE)) - continue; - if (phyi->phyint_group_ifindex == group_ifindex) - return (phyi); + if (phyi->phyint_illv6 != NULL) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + VERIFY(grp->gr_pendv6-- > 0); + rw_exit(&ipst->ips_ipmp_lock); + ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6); } - return (NULL); + freemsg(mp); } /* - * MT notes on creation and deletion of IPMP groups - * - * Creation and deletion of IPMP groups introduce the need to merge or - * split the associated serialization objects i.e the ipsq's. Normally all - * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled - * an ill-pair(v4, v6) i.e. phyint would map to a single ipsq. However during - * the execution of the SIOCSLIFGROUPNAME command the picture changes. There - * is a need to change the <ill-ipsq> association and we have to operate on both - * the source and destination IPMP groups. For eg. attempting to set the - * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to - * handle 2 IPMP groups and 2 ipsqs. All the ills belonging to either of the - * source or destination IPMP group are mapped to a single ipsq for executing - * the SIOCSLIFGROUPNAME command. This is termed as a merge of the ipsq's. - * The <ill-ipsq> mapping is restored back to normal at a later point. This is - * termed as a split of the ipsq. The converse of the merge i.e. a split of the - * ipsq happens while unwinding from ipsq_exit. If at least 1 set groupname - * occurred on the ipsq, then the ipsq_split flag is set. This indicates the - * ipsq has to be examined for redoing the <ill-ipsq> associations. - * - * In the above example the ioctl handling code locates the current ipsq of hme0 - * which is ipsq(mpk17-84). It then enters the above ipsq immediately or - * eventually (after queueing the ioctl in ipsq(mpk17-84)). 
Then it locates - * the destination ipsq which is ipsq(mpk17-85) and merges the source ipsq into - * the destination ipsq. If the destination ipsq is not busy, it also enters - * the destination ipsq exclusively. Now the actual groupname setting operation - * can proceed. If the destination ipsq is busy, the operation is enqueued - * on the destination (merged) ipsq and will be handled in the unwind from - * ipsq_exit. - * - * To prevent other threads accessing the ill while the group name change is - * in progres, we bring down the ipifs which also removes the ill from the - * group. The group is changed in phyint and when the first ipif on the ill - * is brought up, the ill is inserted into the right IPMP group by - * illgrp_insert. + * Process an SIOCSLIFGROUPNAME request. */ /* ARGSUSED */ int ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) { - int i; - char *tmp; - int namelen; - ill_t *ill = ipif->ipif_ill; - ill_t *ill_v4, *ill_v6; - int err = 0; - phyint_t *phyi; - phyint_t *phyi_tmp; - struct lifreq *lifr; - mblk_t *mp1; - char *groupname; - ipsq_t *ipsq; + struct lifreq *lifr = ifreq; + ill_t *ill = ipif->ipif_ill; ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(IAM_WRITER_IPIF(ipif)); - - /* Existance verified in ip_wput_nondata */ - mp1 = mp->b_cont->b_cont; - lifr = (struct lifreq *)mp1->b_rptr; - groupname = lifr->lifr_groupname; - - if (ipif->ipif_id != 0) - return (EINVAL); - - phyi = ill->ill_phyint; - ASSERT(phyi != NULL); - - if (phyi->phyint_flags & PHYI_VIRTUAL) - return (EINVAL); - - tmp = groupname; - for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++) - ; - - if (i == LIFNAMSIZ) { - /* no null termination */ - return (EINVAL); - } + phyint_t *phyi = ill->ill_phyint; + ipmp_grp_t *grp = phyi->phyint_grp; + mblk_t *ipsq_mp; + int err = 0; /* - * Calculate the namelen exclusive of the null - * termination character. + * Note that phyint_grp can only change here, where we're exclusive. 
*/ - namelen = tmp - groupname; - - ill_v4 = phyi->phyint_illv4; - ill_v6 = phyi->phyint_illv6; + ASSERT(IAM_WRITER_ILL(ill)); - /* - * ILL cannot be part of a usesrc group and and IPMP group at the - * same time. No need to grab the ill_g_usesrc_lock here, see - * synchronization notes in ip.c - */ - if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) { + if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL || + (phyi->phyint_flags & PHYI_VIRTUAL)) return (EINVAL); - } - - /* - * mark the ill as changing. - * this should queue all new requests on the syncq. - */ - GRAB_ILL_LOCKS(ill_v4, ill_v6); - - if (ill_v4 != NULL) - ill_v4->ill_state_flags |= ILL_CHANGING; - if (ill_v6 != NULL) - ill_v6->ill_state_flags |= ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - - if (namelen == 0) { - /* - * Null string means remove this interface from the - * existing group. - */ - if (phyi->phyint_groupname_len == 0) { - /* - * Never was in a group. - */ - err = 0; - goto done; - } - - /* - * IPv4 or IPv6 may be temporarily out of the group when all - * the ipifs are down. Thus, we need to check for ill_group to - * be non-NULL. 
- */ - if (ill_v4 != NULL && ill_v4->ill_group != NULL) { - ill_down_ipifs(ill_v4, mp, 0, B_FALSE); - mutex_enter(&ill_v4->ill_lock); - if (!ill_is_quiescent(ill_v4)) { - /* - * ipsq_pending_mp_add will not fail since - * connp is NULL - */ - (void) ipsq_pending_mp_add(NULL, - ill_v4->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v4->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v4->ill_lock); - } - - if (ill_v6 != NULL && ill_v6->ill_group != NULL) { - ill_down_ipifs(ill_v6, mp, 0, B_FALSE); - mutex_enter(&ill_v6->ill_lock); - if (!ill_is_quiescent(ill_v6)) { - (void) ipsq_pending_mp_add(NULL, - ill_v6->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v6->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v6->ill_lock); - } - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - GRAB_ILL_LOCKS(ill_v4, ill_v6); - mutex_enter(&phyi->phyint_lock); - ASSERT(phyi->phyint_groupname != NULL); - mi_free(phyi->phyint_groupname); - phyi->phyint_groupname = NULL; - phyi->phyint_groupname_len = 0; - - /* Restore the ifindex used to be the per interface one */ - phyi->phyint_group_ifindex = 0; - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - mutex_exit(&phyi->phyint_lock); - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - rw_exit(&ipst->ips_ill_g_lock); - err = ill_up_ipifs(ill, q, mp); - /* - * set the split flag so that the ipsq can be split - */ - mutex_enter(&phyi->phyint_ipsq->ipsq_lock); - phyi->phyint_ipsq->ipsq_split = B_TRUE; - mutex_exit(&phyi->phyint_ipsq->ipsq_lock); + lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0'; - } else { - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - /* Are we inserting in the same group ? */ - if (mi_strcmp(groupname, - phyi->phyint_groupname) == 0) { - err = 0; - goto done; - } - } + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - /* - * Merge ipsq for the group's. 
- * This check is here as multiple groups/ills might be - * sharing the same ipsq. - * If we have to merege than the operation is restarted - * on the new ipsq. - */ - ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL, ipst); - if (phyi->phyint_ipsq != ipsq) { - rw_exit(&ipst->ips_ill_g_lock); - err = ill_merge_groups(ill, NULL, groupname, mp, q); - goto done; - } - /* - * Running exclusive on new ipsq. - */ - - ASSERT(ipsq != NULL); - ASSERT(ipsq->ipsq_writer == curthread); - - /* - * Check whether the ill_type and ill_net_type matches before - * we allocate any memory so that the cleanup is easier. - * - * We can't group dissimilar ones as we can't load spread - * packets across the group because of potential link-level - * header differences. - */ - phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst); - if (phyi_tmp != NULL) { - if ((ill_v4 != NULL && - phyi_tmp->phyint_illv4 != NULL) && - ((ill_v4->ill_net_type != - phyi_tmp->phyint_illv4->ill_net_type) || - (ill_v4->ill_type != - phyi_tmp->phyint_illv4->ill_type))) { - mutex_enter(&phyi->phyint_ipsq->ipsq_lock); - phyi->phyint_ipsq->ipsq_split = B_TRUE; - mutex_exit(&phyi->phyint_ipsq->ipsq_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (EINVAL); - } - if ((ill_v6 != NULL && - phyi_tmp->phyint_illv6 != NULL) && - ((ill_v6->ill_net_type != - phyi_tmp->phyint_illv6->ill_net_type) || - (ill_v6->ill_type != - phyi_tmp->phyint_illv6->ill_type))) { - mutex_enter(&phyi->phyint_ipsq->ipsq_lock); - phyi->phyint_ipsq->ipsq_split = B_TRUE; - mutex_exit(&phyi->phyint_ipsq->ipsq_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (EINVAL); - } - } - - rw_exit(&ipst->ips_ill_g_lock); - - /* - * bring down all v4 ipifs. - */ - if (ill_v4 != NULL) { - ill_down_ipifs(ill_v4, mp, 0, B_FALSE); - } - - /* - * bring down all v6 ipifs. - */ - if (ill_v6 != NULL) { - ill_down_ipifs(ill_v6, mp, 0, B_FALSE); - } - - /* - * make sure all ipifs are down and there are no active - * references. 
Call to ipsq_pending_mp_add will not fail - * since connp is NULL. - */ - if (ill_v4 != NULL) { - mutex_enter(&ill_v4->ill_lock); - if (!ill_is_quiescent(ill_v4)) { - (void) ipsq_pending_mp_add(NULL, - ill_v4->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v4->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v4->ill_lock); - } - - if (ill_v6 != NULL) { - mutex_enter(&ill_v6->ill_lock); - if (!ill_is_quiescent(ill_v6)) { - (void) ipsq_pending_mp_add(NULL, - ill_v6->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v6->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v6->ill_lock); - } - - /* - * allocate including space for null terminator - * before we insert. - */ - tmp = (char *)mi_alloc(namelen + 1, BPRI_MED); - if (tmp == NULL) - return (ENOMEM); - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - GRAB_ILL_LOCKS(ill_v4, ill_v6); - mutex_enter(&phyi->phyint_lock); - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - mi_free(phyi->phyint_groupname); - } - - /* - * setup the new group name. - */ - phyi->phyint_groupname = tmp; - bcopy(groupname, phyi->phyint_groupname, namelen + 1); - phyi->phyint_groupname_len = namelen + 1; - - if (ipst->ips_ipmp_hook_emulation) { - /* - * If the group already exists we use the existing - * group_ifindex, otherwise we pick a new index here. - */ - if (phyi_tmp != NULL) { - phyi->phyint_group_ifindex = - phyi_tmp->phyint_group_ifindex; - } else { - /* XXX We need a recovery strategy here. */ - if (!ip_assign_ifindex( - &phyi->phyint_group_ifindex, ipst)) - cmn_err(CE_PANIC, - "ip_assign_ifindex() failed"); - } - } - /* - * Select whether the netinfo and hook use the per-interface - * or per-group ifindex. 
- */ - if (ipst->ips_ipmp_hook_emulation) - phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex; - else - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - - if (ipst->ips_ipmp_hook_emulation && - phyi_tmp != NULL) { - /* First phyint in group - group PLUMB event */ - ill_nic_event_plumb(ill, B_TRUE); - } - mutex_exit(&phyi->phyint_lock); - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - rw_exit(&ipst->ips_ill_g_lock); - - err = ill_up_ipifs(ill, q, mp); - } - -done: /* - * normally ILL_CHANGING is cleared in ill_up_ipifs. + * If the name hasn't changed, there's nothing to do. */ - if (err != EINPROGRESS) { - GRAB_ILL_LOCKS(ill_v4, ill_v6); - if (ill_v4 != NULL) - ill_v4->ill_state_flags &= ~ILL_CHANGING; - if (ill_v6 != NULL) - ill_v6->ill_state_flags &= ~ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - } - return (err); -} + if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0) + goto unlock; -/* ARGSUSED */ -int -ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, - mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) -{ - ill_t *ill; - phyint_t *phyi; - struct lifreq *lifr; - mblk_t *mp1; - - /* Existence verified in ip_wput_nondata */ - mp1 = mp->b_cont->b_cont; - lifr = (struct lifreq *)mp1->b_rptr; - ill = ipif->ipif_ill; - phyi = ill->ill_phyint; - - lifr->lifr_groupname[0] = '\0'; /* - * ill_group may be null if all the interfaces - * are down. But still, the phyint should always - * hold the name. - */ - if (phyi->phyint_groupname_len != 0) { - bcopy(phyi->phyint_groupname, lifr->lifr_groupname, - phyi->phyint_groupname_len); - } - - return (0); -} - - -typedef struct conn_move_s { - ill_t *cm_from_ill; - ill_t *cm_to_ill; - int cm_ifindex; -} conn_move_t; - -/* - * ipcl_walk function for moving conn_multicast_ill for a given ill. 
- */ -static void -conn_move(conn_t *connp, caddr_t arg) -{ - conn_move_t *connm; - int ifindex; - int i; - ill_t *from_ill; - ill_t *to_ill; - ilg_t *ilg; - ilm_t *ret_ilm; - - connm = (conn_move_t *)arg; - ifindex = connm->cm_ifindex; - from_ill = connm->cm_from_ill; - to_ill = connm->cm_to_ill; - - /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. */ - - /* All multicast fields protected by conn_lock */ - mutex_enter(&connp->conn_lock); - ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); - if ((connp->conn_outgoing_ill == from_ill) && - (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) { - connp->conn_outgoing_ill = to_ill; - connp->conn_incoming_ill = to_ill; - } - - /* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */ - - if ((connp->conn_multicast_ill == from_ill) && - (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) { - connp->conn_multicast_ill = connm->cm_to_ill; - } - - /* - * Change the ilg_ill to point to the new one. This assumes - * ilm_move_v6 has moved the ilms to new_ill and the driver - * has been told to receive packets on this interface. - * ilm_move_v6 FAILBACKS all the ilms successfully always. - * But when doing a FAILOVER, it might fail with ENOMEM and so - * some ilms may not have moved. We check to see whether - * the ilms have moved to to_ill. We can't check on from_ill - * as in the process of moving, we could have split an ilm - * in to two - which has the same orig_ifindex and v6group. + * Handle requests to rename an IPMP meta-interface. * - * For IPv4, ilg_ipif moves implicitly. The code below really - * does not do anything for IPv4 as ilg_ill is NULL for IPv4. 
- */ - for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { - ilg = &connp->conn_ilg[i]; - if ((ilg->ilg_ill == from_ill) && - (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) { - /* ifindex != 0 indicates failback */ - if (ifindex != 0) { - connp->conn_ilg[i].ilg_ill = to_ill; - continue; - } - - mutex_enter(&to_ill->ill_lock); - ret_ilm = ilm_lookup_ill_index_v6(to_ill, - &ilg->ilg_v6group, ilg->ilg_orig_ifindex, - connp->conn_zoneid); - mutex_exit(&to_ill->ill_lock); - - if (ret_ilm != NULL) - connp->conn_ilg[i].ilg_ill = to_ill; - } + * Note that creation of the IPMP meta-interface is handled in + * userland through the standard plumbing sequence. As part of the + * plumbing the IPMP meta-interface, its initial groupname is set to + * the name of the interface (see ipif_set_values_tail()). + */ + if (IS_IPMP(ill)) { + err = ipmp_grp_rename(grp, lifr->lifr_groupname); + goto unlock; } - mutex_exit(&connp->conn_lock); -} - -static void -conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex) -{ - conn_move_t connm; - ip_stack_t *ipst = from_ill->ill_ipst; - - connm.cm_from_ill = from_ill; - connm.cm_to_ill = to_ill; - connm.cm_ifindex = ifindex; - - ipcl_walk(conn_move, (caddr_t)&connm, ipst); -} - -/* - * ilm has been moved from from_ill to to_ill. - * Send DL_DISABMULTI_REQ to ill and DL_ENABMULTI_REQ on to_ill. - * appropriately. - * - * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because - * the code there de-references ipif_ill to get the ill to - * send multicast requests. It does not work as ipif is on its - * move and already moved when this function is called. - * Thus, we need to use from_ill and to_ill send down multicast - * requests. - */ -static void -ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill) -{ - ipif_t *ipif; - ilm_t *ilm; /* - * See whether we need to send down DL_ENABMULTI_REQ on - * to_ill as ilm has just been added. + * Handle requests to add or remove an IP interface from a group. 
*/ - ASSERT(IAM_WRITER_ILL(to_ill)); - ASSERT(IAM_WRITER_ILL(from_ill)); - - ILM_WALKER_HOLD(to_ill); - for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - - if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED)) - continue; - /* - * no locks held, ill/ipif cannot dissappear as long - * as we are writer. - */ - ipif = to_ill->ill_ipif; + if (lifr->lifr_groupname[0] != '\0') { /* add */ /* - * No need to hold any lock as we are the writer and this - * can only be changed by a writer. + * Moves are handled by first removing the interface from + * its existing group, and then adding it to another group. + * So, fail if it's already in a group. */ - ilm->ilm_is_new = B_FALSE; - - if (to_ill->ill_net_type != IRE_IF_RESOLVER || - ipif->ipif_flags & IPIF_POINTOPOINT) { - ip1dbg(("ilm_send_multicast_reqs: to_ill not " - "resolver\n")); - continue; /* Must be IRE_IF_NORESOLVER */ - } - - if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { - ip1dbg(("ilm_send_multicast_reqs: " - "to_ill MULTI_BCAST\n")); - goto from; + if (IS_UNDER_IPMP(ill)) { + err = EALREADY; + goto unlock; } - if (to_ill->ill_isv6) - mld_joingroup(ilm); - else - igmp_joingroup(ilm); - - if (to_ill->ill_ipif_up_count == 0) { - /* - * Nobody there. All multicast addresses will be - * re-joined when we get the DL_BIND_ACK bringing the - * interface up. - */ - ilm->ilm_notify_driver = B_FALSE; - ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n")); - goto from; + grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst); + if (grp == NULL) { + err = ENOENT; + goto unlock; } /* - * For allmulti address, we want to join on only one interface. - * Checking for ilm_numentries_v6 is not correct as you may - * find an ilm with zero address on to_ill, but we may not - * have nominated to_ill for receiving. 
Thus, if we have - * nominated from_ill (ill_join_allmulti is set), nominate - * only if to_ill is not already nominated (to_ill normally - * should not have been nominated if "from_ill" has already - * been nominated. As we don't prevent failovers from happening - * across groups, we don't assert). + * Check if the phyint and its ills are suitable for + * inclusion into the group. */ - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - /* - * There is no need to hold ill locks as we are - * writer on both ills and when ill_join_allmulti() - * is called the thread is always a writer. - */ - if (from_ill->ill_join_allmulti && - !to_ill->ill_join_allmulti) { - (void) ill_join_allmulti(to_ill); - } - } else if (ilm->ilm_notify_driver) { - - /* - * This is a newly moved ilm so we need to tell the - * driver about the new group. There can be more than - * one ilm's for the same group in the list each with a - * different orig_ifindex. We have to inform the driver - * once. In ilm_move_v[4,6] we only set the flag - * ilm_notify_driver for the first ilm. - */ - - (void) ip_ll_send_enabmulti_req(to_ill, - &ilm->ilm_v6addr); - } - - ilm->ilm_notify_driver = B_FALSE; + if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0) + goto unlock; /* - * See whether we need to send down DL_DISABMULTI_REQ on - * from_ill as ilm has just been removed. + * Checks pass; join the group, and enqueue the remaining + * illgrp joins for when we've become part of the group xop + * and are exclusive across its IPSQs. Since qwriter_ip() + * requires an mblk_t to scribble on, and since `mp' will be + * freed as part of completing the ioctl, allocate another. 
*/ -from: - ipif = from_ill->ill_ipif; - if (from_ill->ill_net_type != IRE_IF_RESOLVER || - ipif->ipif_flags & IPIF_POINTOPOINT) { - ip1dbg(("ilm_send_multicast_reqs: " - "from_ill not resolver\n")); - continue; /* Must be IRE_IF_NORESOLVER */ - } - - if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { - ip1dbg(("ilm_send_multicast_reqs: " - "from_ill MULTI_BCAST\n")); - continue; - } - - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - if (from_ill->ill_join_allmulti) - ill_leave_allmulti(from_ill); - } else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) { - (void) ip_ll_send_disabmulti_req(from_ill, - &ilm->ilm_v6addr); + if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) { + err = ENOMEM; + goto unlock; } - } - ILM_WALKER_RELE(to_ill); -} - -/* - * This function is called when all multicast memberships needs - * to be moved from "from_ill" to "to_ill" for IPv6. This function is - * called only once unlike the IPv4 counterpart where it is called after - * every logical interface is moved. The reason is due to multicast - * memberships are joined using an interface address in IPv4 while in - * IPv6, interface index is used. - */ -static void -ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex) -{ - ilm_t *ilm; - ilm_t *ilm_next; - ilm_t *new_ilm; - ilm_t **ilmp; - int count; - char buf[INET6_ADDRSTRLEN]; - in6_addr_t ipv6_snm = ipv6_solicited_node_mcast; - ip_stack_t *ipst = from_ill->ill_ipst; - - ASSERT(MUTEX_HELD(&to_ill->ill_lock)); - ASSERT(MUTEX_HELD(&from_ill->ill_lock)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - if (ifindex == 0) { /* - * Form the solicited node mcast address which is used later. + * Before we drop ipmp_lock, bump gr_pend* to ensure that the + * IPMP meta-interface ills needed by `phyi' cannot go away + * before ip_join_illgrps() is called back. See the comments + * in ip_sioctl_plink_ipmp() for more. 
*/ - ipif_t *ipif; - - ipif = from_ill->ill_ipif; - ASSERT(ipif->ipif_id == 0); - - ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; - } - - ilmp = &from_ill->ill_ilm; - for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { - ilm_next = ilm->ilm_next; - - if (ilm->ilm_flags & ILM_DELETED) { - ilmp = &ilm->ilm_next; - continue; - } + if (phyi->phyint_illv4 != NULL) + grp->gr_pendv4++; + if (phyi->phyint_illv6 != NULL) + grp->gr_pendv6++; - new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr, - ilm->ilm_orig_ifindex, ilm->ilm_zoneid); - ASSERT(ilm->ilm_orig_ifindex != 0); - if (ilm->ilm_orig_ifindex == ifindex) { - /* - * We are failing back multicast memberships. - * If the same ilm exists in to_ill, it means somebody - * has joined the same group there e.g. ff02::1 - * is joined within the kernel when the interfaces - * came UP. - */ - ASSERT(ilm->ilm_ipif == NULL); - if (new_ilm != NULL) { - new_ilm->ilm_refcnt += ilm->ilm_refcnt; - if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || - !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { - new_ilm->ilm_is_new = B_TRUE; - } - } else { - /* - * check if we can just move the ilm - */ - if (from_ill->ill_ilm_walker_cnt != 0) { - /* - * We have walkers we cannot move - * the ilm, so allocate a new ilm, - * this (old) ilm will be marked - * ILM_DELETED at the end of the loop - * and will be freed when the - * last walker exits. - */ - new_ilm = (ilm_t *)mi_zalloc - (sizeof (ilm_t)); - if (new_ilm == NULL) { - ip0dbg(("ilm_move_v6: " - "FAILBACK of IPv6" - " multicast address %s : " - "from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)), - from_ill->ill_name, - to_ill->ill_name)); - - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - /* - * we don't want new_ilm linked to - * ilm's filter list. - */ - new_ilm->ilm_filter = NULL; - } else { - /* - * No walkers we can move the ilm. - * lets take it out of the list. 
- */ - *ilmp = ilm->ilm_next; - ilm->ilm_next = NULL; - DTRACE_PROBE3(ill__decr__cnt, - (ill_t *), from_ill, - (char *), "ilm", (void *), ilm); - ASSERT(from_ill->ill_ilm_cnt > 0); - from_ill->ill_ilm_cnt--; - - new_ilm = ilm; - } + rw_exit(&ipst->ips_ipmp_lock); - /* - * if this is the first ilm for the group - * set ilm_notify_driver so that we notify the - * driver in ilm_send_multicast_reqs. - */ - if (ilm_lookup_ill_v6(to_ill, - &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill, - (char *), "ilm", (void *), new_ilm); - new_ilm->ilm_ill = to_ill; - to_ill->ill_ilm_cnt++; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - /* - * set the flag so that mld_joingroup is - * called in ilm_send_multicast_reqs(). - */ - new_ilm->ilm_is_new = B_TRUE; - } - goto bottom; - } else if (ifindex != 0) { - /* - * If this is FAILBACK (ifindex != 0) and the ifindex - * has not matched above, look at the next ilm. - */ - ilmp = &ilm->ilm_next; - continue; - } - /* - * If we are here, it means ifindex is 0. Failover - * everything. - * - * We need to handle solicited node mcast address - * and all_nodes mcast address differently as they - * are joined witin the kenrel (ipif_multicast_up) - * and potentially from the userland. We are called - * after the ipifs of from_ill has been moved. - * If we still find ilms on ill with solicited node - * mcast address or all_nodes mcast address, it must - * belong to the UP interface that has not moved e.g. - * ipif_id 0 with the link local prefix does not move. - * We join this on the new ill accounting for all the - * userland memberships so that applications don't - * see any failure. - * - * We need to make sure that we account only for the - * solicited node and all node multicast addresses - * that was brought UP on these. 
In the case of - * a failover from A to B, we might have ilms belonging - * to A (ilm_orig_ifindex pointing at A) on B accounting - * for the membership from the userland. If we are failing - * over from B to C now, we will find the ones belonging - * to A on B. These don't account for the ill_ipif_up_count. - * They just move from B to C. The check below on - * ilm_orig_ifindex ensures that. - */ - if ((ilm->ilm_orig_ifindex == - from_ill->ill_phyint->phyint_ifindex) && - (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) || - IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, - &ilm->ilm_v6addr))) { - ASSERT(ilm->ilm_refcnt > 0); - count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count; - /* - * For indentation reasons, we are not using a - * "else" here. - */ - if (count == 0) { - ilmp = &ilm->ilm_next; - continue; - } - ilm->ilm_refcnt -= count; - if (new_ilm != NULL) { - /* - * Can find one with the same - * ilm_orig_ifindex, if we are failing - * over to a STANDBY. This happens - * when somebody wants to join a group - * on a STANDBY interface and we - * internally join on a different one. - * If we had joined on from_ill then, a - * failover now will find a new ilm - * with this index. 
- */ - ip1dbg(("ilm_move_v6: FAILOVER, found" - " new ilm on %s, group address %s\n", - to_ill->ill_name, - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)))); - new_ilm->ilm_refcnt += count; - if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || - !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { - new_ilm->ilm_is_new = B_TRUE; - } - } else { - new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); - if (new_ilm == NULL) { - ip0dbg(("ilm_move_v6: FAILOVER of IPv6" - " multicast address %s : from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)), from_ill->ill_name, - to_ill->ill_name)); - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - new_ilm->ilm_filter = NULL; - new_ilm->ilm_refcnt = count; - new_ilm->ilm_timer = INFINITY; - new_ilm->ilm_rtx.rtx_timer = INFINITY; - new_ilm->ilm_is_new = B_TRUE; - /* - * If the to_ill has not joined this - * group we need to tell the driver in - * ill_send_multicast_reqs. - */ - if (ilm_lookup_ill_v6(to_ill, - &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - new_ilm->ilm_ill = to_ill; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill, - (char *), "ilm", (void *), new_ilm); - to_ill->ill_ilm_cnt++; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - ASSERT(new_ilm->ilm_ipif == NULL); - } - if (ilm->ilm_refcnt == 0) { - goto bottom; - } else { - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); - ilmp = &ilm->ilm_next; - } - continue; - } else { - /* - * ifindex = 0 means, move everything pointing at - * from_ill. We are doing this becuase ill has - * either FAILED or became INACTIVE. - * - * As we would like to move things later back to - * from_ill, we want to retain the identity of this - * ilm. Thus, we don't blindly increment the reference - * count on the ilms matching the address alone. We - * need to match on the ilm_orig_index also. 
new_ilm - * was obtained by matching ilm_orig_index also. - */ - if (new_ilm != NULL) { - /* - * This is possible only if a previous restore - * was incomplete i.e restore to - * ilm_orig_ifindex left some ilms because - * of some failures. Thus when we are failing - * again, we might find our old friends there. - */ - ip1dbg(("ilm_move_v6: FAILOVER, found new ilm" - " on %s, group address %s\n", - to_ill->ill_name, - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)))); - new_ilm->ilm_refcnt += ilm->ilm_refcnt; - if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || - !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { - new_ilm->ilm_is_new = B_TRUE; - } - } else { - if (from_ill->ill_ilm_walker_cnt != 0) { - new_ilm = (ilm_t *) - mi_zalloc(sizeof (ilm_t)); - if (new_ilm == NULL) { - ip0dbg(("ilm_move_v6: " - "FAILOVER of IPv6" - " multicast address %s : " - "from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)), - from_ill->ill_name, - to_ill->ill_name)); - - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - new_ilm->ilm_filter = NULL; - } else { - *ilmp = ilm->ilm_next; - DTRACE_PROBE3(ill__decr__cnt, - (ill_t *), from_ill, - (char *), "ilm", (void *), ilm); - ASSERT(from_ill->ill_ilm_cnt > 0); - from_ill->ill_ilm_cnt--; - - new_ilm = ilm; - } - /* - * If the to_ill has not joined this - * group we need to tell the driver in - * ill_send_multicast_reqs. - */ - if (ilm_lookup_ill_v6(to_ill, - &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - ASSERT(ilm->ilm_ipif == NULL); - new_ilm->ilm_ill = to_ill; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill, - (char *), "ilm", (void *), new_ilm); - to_ill->ill_ilm_cnt++; - new_ilm->ilm_is_new = B_TRUE; - } - - } - -bottom: - /* - * Revert multicast filter state to (EXCLUDE, NULL). - * new_ilm->ilm_is_new should already be set if needed. 
- */ - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); + ipmp_phyint_join_grp(phyi, grp); + ill_refhold(ill); + qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps, + SWITCH_OP, B_FALSE); + return (0); + } else { /* - * We allocated/got a new ilm, free the old one. + * Request to remove the interface from a group. If the + * interface is not in a group, this trivially succeeds. */ - if (new_ilm != ilm) { - if (from_ill->ill_ilm_walker_cnt == 0) { - *ilmp = ilm->ilm_next; - - ASSERT(ilm->ilm_ipif == NULL); /* ipv6 */ - DTRACE_PROBE3(ill__decr__cnt, (ill_t *), - from_ill, (char *), "ilm", (void *), ilm); - ASSERT(from_ill->ill_ilm_cnt > 0); - from_ill->ill_ilm_cnt--; - - ilm_inactive(ilm); /* frees this ilm */ - - } else { - ilm->ilm_flags |= ILM_DELETED; - from_ill->ill_ilm_cleanup_reqd = 1; - ilmp = &ilm->ilm_next; - } - } + rw_exit(&ipst->ips_ipmp_lock); + if (IS_UNDER_IPMP(ill)) + ipmp_phyint_leave_grp(phyi); + return (0); } +unlock: + rw_exit(&ipst->ips_ipmp_lock); + return (err); } /* - * Move all the multicast memberships to to_ill. Called when - * an ipif moves from "from_ill" to "to_ill". This function is slightly - * different from IPv6 counterpart as multicast memberships are associated - * with ills in IPv6. This function is called after every ipif is moved - * unlike IPv6, where it is moved only once. + * Process an SIOCGLIFBINDING request. 
*/ -static void -ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif) -{ - ilm_t *ilm; - ilm_t *ilm_next; - ilm_t *new_ilm; - ilm_t **ilmp; - ip_stack_t *ipst = from_ill->ill_ipst; - - ASSERT(MUTEX_HELD(&to_ill->ill_lock)); - ASSERT(MUTEX_HELD(&from_ill->ill_lock)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - - ilmp = &from_ill->ill_ilm; - for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { - ilm_next = ilm->ilm_next; - - if (ilm->ilm_flags & ILM_DELETED) { - ilmp = &ilm->ilm_next; - continue; - } - - ASSERT(ilm->ilm_ipif != NULL); - - if (ilm->ilm_ipif != ipif) { - ilmp = &ilm->ilm_next; - continue; - } - - if (V4_PART_OF_V6(ilm->ilm_v6addr) == - htonl(INADDR_ALLHOSTS_GROUP)) { - new_ilm = ilm_lookup_ipif(ipif, - V4_PART_OF_V6(ilm->ilm_v6addr)); - if (new_ilm != NULL) { - new_ilm->ilm_refcnt += ilm->ilm_refcnt; - /* - * We still need to deal with the from_ill. - */ - new_ilm->ilm_is_new = B_TRUE; - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); - ASSERT(ilm->ilm_ipif == ipif); - ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0); - if (from_ill->ill_ilm_walker_cnt == 0) { - DTRACE_PROBE3(ill__decr__cnt, - (ill_t *), from_ill, - (char *), "ilm", (void *), ilm); - ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0); - } - goto delete_ilm; - } - /* - * If we could not find one e.g. ipif is - * still down on to_ill, we add this ilm - * on ill_new to preserve the reference - * count. - */ - } - /* - * When ipifs move, ilms always move with it - * to the NEW ill. Thus we should never be - * able to find ilm till we really move it here. 
- */ - ASSERT(ilm_lookup_ipif(ipif, - V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL); - - if (from_ill->ill_ilm_walker_cnt != 0) { - new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); - if (new_ilm == NULL) { - char buf[INET6_ADDRSTRLEN]; - ip0dbg(("ilm_move_v4: FAILBACK of IPv4" - " multicast address %s : " - "from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET, - &ilm->ilm_v6addr, buf, - sizeof (buf)), - from_ill->ill_name, - to_ill->ill_name)); - - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ipif, - (char *), "ilm", (void *), ilm); - new_ilm->ilm_ipif->ipif_ilm_cnt++; - /* We don't want new_ilm linked to ilm's filter list */ - new_ilm->ilm_filter = NULL; - } else { - /* Remove from the list */ - *ilmp = ilm->ilm_next; - new_ilm = ilm; - } - - /* - * If we have never joined this group on the to_ill - * make sure we tell the driver. - */ - if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr, - ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - new_ilm->ilm_is_new = B_TRUE; - - /* - * Revert multicast filter state to (EXCLUDE, NULL) - */ - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); - - /* - * Delete only if we have allocated a new ilm. 
- */ - if (new_ilm != ilm) { -delete_ilm: - if (from_ill->ill_ilm_walker_cnt == 0) { - /* Remove from the list */ - *ilmp = ilm->ilm_next; - ilm->ilm_next = NULL; - DTRACE_PROBE3(ipif__decr__cnt, - (ipif_t *), ilm->ilm_ipif, - (char *), "ilm", (void *), ilm); - ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0); - ilm->ilm_ipif->ipif_ilm_cnt--; - ilm_inactive(ilm); - } else { - ilm->ilm_flags |= ILM_DELETED; - from_ill->ill_ilm_cleanup_reqd = 1; - ilmp = &ilm->ilm_next; - } - } - } -} - -static uint_t -ipif_get_id(ill_t *ill, uint_t id) -{ - uint_t unit; - ipif_t *tipif; - boolean_t found = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * During failback, we want to go back to the same id - * instead of the smallest id so that the original - * configuration is maintained. id is non-zero in that - * case. - */ - if (id != 0) { - /* - * While failing back, if we still have an ipif with - * MAX_ADDRS_PER_IF, it means this will be replaced - * as soon as we return from this function. It was - * to set to MAX_ADDRS_PER_IF by the caller so that - * we can choose the smallest id. Thus we return zero - * in that case ignoring the hint. - */ - if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF) - return (0); - for (tipif = ill->ill_ipif; tipif != NULL; - tipif = tipif->ipif_next) { - if (tipif->ipif_id == id) { - found = B_TRUE; - break; - } - } - /* - * If somebody already plumbed another logical - * with the same id, we won't be able to find it. 
- */ - if (!found) - return (id); - } - for (unit = 0; unit <= ipst->ips_ip_addrs_per_if; unit++) { - found = B_FALSE; - for (tipif = ill->ill_ipif; tipif != NULL; - tipif = tipif->ipif_next) { - if (tipif->ipif_id == unit) { - found = B_TRUE; - break; - } - } - if (!found) - break; - } - return (unit); -} - /* ARGSUSED */ -static int -ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp, - ipif_t **rep_ipif_ptr) +int +ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, + ip_ioctl_cmd_t *ipip, void *ifreq) { - ill_t *from_ill; - ipif_t *rep_ipif; - uint_t unit; - int err = 0; - ipif_t *to_ipif; - struct iocblk *iocp; - boolean_t failback_cmd; - boolean_t remove_ipif; - int rc; - ip_stack_t *ipst; - - ASSERT(IAM_WRITER_ILL(to_ill)); - ASSERT(IAM_WRITER_IPIF(ipif)); - - iocp = (struct iocblk *)mp->b_rptr; - failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK); - remove_ipif = B_FALSE; - - from_ill = ipif->ipif_ill; - ipst = from_ill->ill_ipst; - - ASSERT(MUTEX_HELD(&to_ill->ill_lock)); - ASSERT(MUTEX_HELD(&from_ill->ill_lock)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - - /* - * Don't move LINK LOCAL addresses as they are tied to - * physical interface. - */ - if (from_ill->ill_isv6 && - IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) { - ipif->ipif_was_up = B_FALSE; - IPIF_UNMARK_MOVING(ipif); - return (0); - } - - /* - * We set the ipif_id to maximum so that the search for - * ipif_id will pick the lowest number i.e 0 in the - * following 2 cases : - * - * 1) We have a replacement ipif at the head of to_ill. - * We can't remove it yet as we can exceed ip_addrs_per_if - * on to_ill and hence the MOVE might fail. We want to - * remove it only if we could move the ipif. Thus, by - * setting it to the MAX value, we make the search in - * ipif_get_id return the zeroth id. - * - * 2) When DR pulls out the NIC and re-plumbs the interface, - * we might just have a zero address plumbed on the ipif - * with zero id in the case of IPv4. 
We remove that while - * doing the failback. We want to remove it only if we - * could move the ipif. Thus, by setting it to the MAX - * value, we make the search in ipif_get_id return the - * zeroth id. - * - * Both (1) and (2) are done only when when we are moving - * an ipif (either due to failover/failback) which originally - * belonged to this interface i.e the ipif_orig_ifindex is - * the same as to_ill's ifindex. This is needed so that - * FAILOVER from A -> B ( A failed) followed by FAILOVER - * from B -> A (B is being removed from the group) and - * FAILBACK from A -> B restores the original configuration. - * Without the check for orig_ifindex, the second FAILOVER - * could make the ipif belonging to B replace the A's zeroth - * ipif and the subsequent failback re-creating the replacement - * ipif again. - * - * NOTE : We created the replacement ipif when we did a - * FAILOVER (See below). We could check for FAILBACK and - * then look for replacement ipif to be removed. But we don't - * want to do that because we wan't to allow the possibility - * of a FAILOVER from A -> B (which creates the replacement ipif), - * followed by a *FAILOVER* from B -> A instead of a FAILBACK - * from B -> A. - */ - to_ipif = to_ill->ill_ipif; - if ((to_ill->ill_phyint->phyint_ifindex == - ipif->ipif_orig_ifindex) && - to_ipif->ipif_replace_zero) { - ASSERT(to_ipif->ipif_id == 0); - remove_ipif = B_TRUE; - to_ipif->ipif_id = MAX_ADDRS_PER_IF; - } - /* - * Find the lowest logical unit number on the to_ill. - * If we are failing back, try to get the original id - * rather than the lowest one so that the original - * configuration is maintained. - * - * XXX need a better scheme for this. 
- */ - if (failback_cmd) { - unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid); - } else { - unit = ipif_get_id(to_ill, 0); - } - - /* Reset back to zero in case we fail below */ - if (to_ipif->ipif_id == MAX_ADDRS_PER_IF) - to_ipif->ipif_id = 0; + ill_t *bound_ill; + struct lifreq *lifr = ifreq; - if (unit == ipst->ips_ip_addrs_per_if) { - ipif->ipif_was_up = B_FALSE; - IPIF_UNMARK_MOVING(ipif); + if (!IS_IPMP(ipif->ipif_ill)) return (EINVAL); - } - - /* - * ipif is ready to move from "from_ill" to "to_ill". - * - * 1) If we are moving ipif with id zero, create a - * replacement ipif for this ipif on from_ill. If this fails - * fail the MOVE operation. - * - * 2) Remove the replacement ipif on to_ill if any. - * We could remove the replacement ipif when we are moving - * the ipif with id zero. But what if somebody already - * unplumbed it ? Thus we always remove it if it is present. - * We want to do it only if we are sure we are going to - * move the ipif to to_ill which is why there are no - * returns due to error till ipif is linked to to_ill. - * Note that the first ipif that we failback will always - * be zero if it is present. - */ - if (ipif->ipif_id == 0) { - ipaddr_t inaddr_any = INADDR_ANY; - rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED); - if (rep_ipif == NULL) { - ipif->ipif_was_up = B_FALSE; - IPIF_UNMARK_MOVING(ipif); - return (ENOMEM); - } - *rep_ipif = ipif_zero; - /* - * Before we put the ipif on the list, store the addresses - * as mapped addresses as some of the ioctls e.g SIOCGIFADDR - * assumes so. This logic is not any different from what - * ipif_allocate does. 
- */ - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6lcl_addr); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6src_addr); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6subnet); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6net_mask); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6brd_addr); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6pp_dst_addr); - /* - * We mark IPIF_NOFAILOVER so that this can never - * move. - */ - rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER; - rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE; - rep_ipif->ipif_replace_zero = B_TRUE; - mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL, - MUTEX_DEFAULT, NULL); - rep_ipif->ipif_id = 0; - rep_ipif->ipif_ire_type = ipif->ipif_ire_type; - rep_ipif->ipif_ill = from_ill; - rep_ipif->ipif_orig_ifindex = - from_ill->ill_phyint->phyint_ifindex; - /* Insert at head */ - rep_ipif->ipif_next = from_ill->ill_ipif; - from_ill->ill_ipif = rep_ipif; - /* - * We don't really care to let apps know about - * this interface. - */ - } - - if (remove_ipif) { - /* - * We set to a max value above for this case to get - * id zero. ASSERT that we did get one. - */ - ASSERT((to_ipif->ipif_id == 0) && (unit == 0)); - rep_ipif = to_ipif; - to_ill->ill_ipif = rep_ipif->ipif_next; - rep_ipif->ipif_next = NULL; - /* - * If some apps scanned and find this interface, - * it is time to let them know, so that they can - * delete it. - */ - - *rep_ipif_ptr = rep_ipif; - } - - /* Get it out of the ILL interface list. */ - ipif_remove(ipif, B_FALSE); - - /* Assign the new ill */ - ipif->ipif_ill = to_ill; - ipif->ipif_id = unit; - /* id has already been checked */ - rc = ipif_insert(ipif, B_FALSE, B_FALSE); - ASSERT(rc == 0); - /* Let SCTP update its list */ - sctp_move_ipif(ipif, from_ill, to_ill); - /* - * Handle the failover and failback of ipif_t between - * ill_t that have differing maximum mtu values. 
- */ - if (ipif->ipif_mtu > to_ill->ill_max_mtu) { - if (ipif->ipif_saved_mtu == 0) { - /* - * As this ipif_t is moving to an ill_t - * that has a lower ill_max_mtu, its - * ipif_mtu needs to be saved so it can - * be restored during failback or during - * failover to an ill_t which has a - * higher ill_max_mtu. - */ - ipif->ipif_saved_mtu = ipif->ipif_mtu; - ipif->ipif_mtu = to_ill->ill_max_mtu; - } else { - /* - * The ipif_t is, once again, moving to - * an ill_t that has a lower maximum mtu - * value. - */ - ipif->ipif_mtu = to_ill->ill_max_mtu; - } - } else if (ipif->ipif_mtu < to_ill->ill_max_mtu && - ipif->ipif_saved_mtu != 0) { - /* - * The mtu of this ipif_t had to be reduced - * during an earlier failover; this is an - * opportunity for it to be increased (either as - * part of another failover or a failback). - */ - if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) { - ipif->ipif_mtu = ipif->ipif_saved_mtu; - ipif->ipif_saved_mtu = 0; - } else { - ipif->ipif_mtu = to_ill->ill_max_mtu; - } + if ((bound_ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { + lifr->lifr_binding[0] = '\0'; + return (0); } - /* - * We preserve all the other fields of the ipif including - * ipif_saved_ire_mp. The routes that are saved here will - * be recreated on the new interface and back on the old - * interface when we move back. - */ - ASSERT(ipif->ipif_arp_del_mp == NULL); - - return (err); -} - -static int -ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp, - int ifindex, ipif_t **rep_ipif_ptr) -{ - ipif_t *mipif; - ipif_t *ipif_next; - int err; - - /* - * We don't really try to MOVE back things if some of the - * operations fail. The daemon will take care of moving again - * later on. 
- */ - for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) { - ipif_next = mipif->ipif_next; - if (!(mipif->ipif_flags & IPIF_NOFAILOVER) && - (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) { - - err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr); - - /* - * When the MOVE fails, it is the job of the - * application to take care of this properly - * i.e try again if it is ENOMEM. - */ - if (mipif->ipif_ill != from_ill) { - /* - * ipif has moved. - * - * Move the multicast memberships associated - * with this ipif to the new ill. For IPv6, we - * do it once after all the ipifs are moved - * (in ill_move) as they are not associated - * with ipifs. - * - * We need to move the ilms as the ipif has - * already been moved to a new ill even - * in the case of errors. Neither - * ilm_free(ipif) will find the ilm - * when somebody unplumbs this ipif nor - * ilm_delete(ilm) will be able to find the - * ilm, if we don't move now. - */ - if (!from_ill->ill_isv6) - ilm_move_v4(from_ill, to_ill, mipif); - } - - if (err != 0) - return (err); - } - } + (void) strlcpy(lifr->lifr_binding, bound_ill->ill_name, LIFNAMSIZ); + ill_refrele(bound_ill); return (0); } -static int -ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp) -{ - int ifindex; - int err; - struct iocblk *iocp; - ipif_t *ipif; - ipif_t *rep_ipif_ptr = NULL; - ipif_t *from_ipif = NULL; - boolean_t check_rep_if = B_FALSE; - ip_stack_t *ipst = from_ill->ill_ipst; - - iocp = (struct iocblk *)mp->b_rptr; - if (iocp->ioc_cmd == SIOCLIFFAILOVER) { - /* - * Move everything pointing at from_ill to to_ill. - * We acheive this by passing in 0 as ifindex. - */ - ifindex = 0; - } else { - /* - * Move everything pointing at from_ill whose original - * ifindex of connp, ipif, ilm points at to_ill->ill_index. - * We acheive this by passing in ifindex rather than 0. - * Multicast vifs, ilgs move implicitly because ipifs move. 
- */ - ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK); - ifindex = to_ill->ill_phyint->phyint_ifindex; - } - - /* - * Determine if there is at least one ipif that would move from - * 'from_ill' to 'to_ill'. If so, it is possible that the replacement - * ipif (if it exists) on the to_ill would be consumed as a result of - * the move, in which case we need to quiesce the replacement ipif also. - */ - for (from_ipif = from_ill->ill_ipif; from_ipif != NULL; - from_ipif = from_ipif->ipif_next) { - if (((ifindex == 0) || - (ifindex == from_ipif->ipif_orig_ifindex)) && - !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) { - check_rep_if = B_TRUE; - break; - } - } - - ill_down_ipifs(from_ill, mp, ifindex, B_TRUE); - - GRAB_ILL_LOCKS(from_ill, to_ill); - if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) { - (void) ipsq_pending_mp_add(NULL, ipif, q, - mp, ILL_MOVE_OK); - RELEASE_ILL_LOCKS(from_ill, to_ill); - return (EINPROGRESS); - } - - /* Check if the replacement ipif is quiescent to delete */ - if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif, - (iocp->ioc_cmd == SIOCLIFFAILBACK))) { - to_ill->ill_ipif->ipif_state_flags |= - IPIF_MOVING | IPIF_CHANGING; - if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) { - (void) ipsq_pending_mp_add(NULL, ipif, q, - mp, ILL_MOVE_OK); - RELEASE_ILL_LOCKS(from_ill, to_ill); - return (EINPROGRESS); - } - } - RELEASE_ILL_LOCKS(from_ill, to_ill); - - ASSERT(!MUTEX_HELD(&to_ill->ill_lock)); - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - GRAB_ILL_LOCKS(from_ill, to_ill); - err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr); - - /* ilm_move is done inside ipif_move for IPv4 */ - if (err == 0 && from_ill->ill_isv6) - ilm_move_v6(from_ill, to_ill, ifindex); - - RELEASE_ILL_LOCKS(from_ill, to_ill); - rw_exit(&ipst->ips_ill_g_lock); - - /* - * send rts messages and multicast messages. 
- */ - if (rep_ipif_ptr != NULL) { - if (rep_ipif_ptr->ipif_recovery_id != 0) { - (void) untimeout(rep_ipif_ptr->ipif_recovery_id); - rep_ipif_ptr->ipif_recovery_id = 0; - } - ip_rts_ifmsg(rep_ipif_ptr); - ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr); -#ifdef DEBUG - ipif_trace_cleanup(rep_ipif_ptr); -#endif - mi_free(rep_ipif_ptr); - } - - conn_move_ill(from_ill, to_ill, ifindex); - - return (err); -} - /* - * Used to extract arguments for FAILOVER/FAILBACK ioctls. - * Also checks for the validity of the arguments. - * Note: We are already exclusive inside the from group. - * It is upto the caller to release refcnt on the to_ill's. + * Process an SIOCGLIFGROUPNAME request. */ -static int -ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4, - ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6) +/* ARGSUSED */ +int +ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, + ip_ioctl_cmd_t *ipip, void *ifreq) { - int dst_index; - ipif_t *ipif_v4, *ipif_v6; - struct lifreq *lifr; - mblk_t *mp1; - boolean_t exists; - sin_t *sin; - int err = 0; - ip_stack_t *ipst; + ipmp_grp_t *grp; + struct lifreq *lifr = ifreq; + ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - if (CONN_Q(q)) - ipst = CONNQ_TO_IPST(q); + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL) + lifr->lifr_groupname[0] = '\0'; else - ipst = ILLQ_TO_IPST(q); - - if ((mp1 = mp->b_cont) == NULL) - return (EPROTO); - - if ((mp1 = mp1->b_cont) == NULL) - return (EPROTO); - - lifr = (struct lifreq *)mp1->b_rptr; - sin = (sin_t *)&lifr->lifr_addr; - - /* - * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6 - * specific operations. - */ - if (sin->sin_family != AF_UNSPEC) - return (EINVAL); - - /* - * Get ipif with id 0. We are writer on the from ill. So we can pass - * NULLs for the last 4 args and we know the lookup won't fail - * with EINPROGRESS. 
- */ - ipif_v4 = ipif_lookup_on_name(lifr->lifr_name, - mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE, - ALL_ZONES, NULL, NULL, NULL, NULL, ipst); - ipif_v6 = ipif_lookup_on_name(lifr->lifr_name, - mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE, - ALL_ZONES, NULL, NULL, NULL, NULL, ipst); - - if (ipif_v4 == NULL && ipif_v6 == NULL) - return (ENXIO); - - if (ipif_v4 != NULL) { - ASSERT(ipif_v4->ipif_refcnt != 0); - if (ipif_v4->ipif_id != 0) { - err = EINVAL; - goto done; - } - - ASSERT(IAM_WRITER_IPIF(ipif_v4)); - *ill_from_v4 = ipif_v4->ipif_ill; - } - - if (ipif_v6 != NULL) { - ASSERT(ipif_v6->ipif_refcnt != 0); - if (ipif_v6->ipif_id != 0) { - err = EINVAL; - goto done; - } - - ASSERT(IAM_WRITER_IPIF(ipif_v6)); - *ill_from_v6 = ipif_v6->ipif_ill; - } - - err = 0; - dst_index = lifr->lifr_movetoindex; - *ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE, - q, mp, ip_process_ioctl, &err, ipst); - if (err != 0) { - /* - * A move may be in progress, EINPROGRESS looking up the "to" - * ill means changes already done to the "from" ipsq need to - * be undone to avoid potential deadlocks. - * - * ENXIO will usually be because there is only v6 on the ill, - * that's not treated as an error unless an ENXIO is also - * seen when looking up the v6 "to" ill. - * - * If EINPROGRESS, the mp has been enqueued and can not be - * used to look up the v6 "to" ill, but a preemptive clean - * up of changes to the v6 "from" ipsq is done. 
- */ - if (err == EINPROGRESS) { - if (*ill_from_v4 != NULL) { - ill_t *from_ill; - ipsq_t *from_ipsq; - - from_ill = ipif_v4->ipif_ill; - from_ipsq = from_ill->ill_phyint->phyint_ipsq; - - mutex_enter(&from_ipsq->ipsq_lock); - from_ipsq->ipsq_current_ipif = NULL; - mutex_exit(&from_ipsq->ipsq_lock); - } - if (*ill_from_v6 != NULL) { - ill_t *from_ill; - ipsq_t *from_ipsq; - - from_ill = ipif_v6->ipif_ill; - from_ipsq = from_ill->ill_phyint->phyint_ipsq; - - mutex_enter(&from_ipsq->ipsq_lock); - from_ipsq->ipsq_current_ipif = NULL; - mutex_exit(&from_ipsq->ipsq_lock); - } - goto done; - } - ASSERT(err == ENXIO); - err = 0; - } - - *ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE, - q, mp, ip_process_ioctl, &err, ipst); - if (err != 0) { - /* - * A move may be in progress, EINPROGRESS looking up the "to" - * ill means changes already done to the "from" ipsq need to - * be undone to avoid potential deadlocks. - */ - if (err == EINPROGRESS) { - if (*ill_from_v6 != NULL) { - ill_t *from_ill; - ipsq_t *from_ipsq; - - from_ill = ipif_v6->ipif_ill; - from_ipsq = from_ill->ill_phyint->phyint_ipsq; - - mutex_enter(&from_ipsq->ipsq_lock); - from_ipsq->ipsq_current_ipif = NULL; - mutex_exit(&from_ipsq->ipsq_lock); - } - goto done; - } - ASSERT(err == ENXIO); - - /* Both v4 and v6 lookup failed */ - if (*ill_to_v4 == NULL) { - err = ENXIO; - goto done; - } - err = 0; - } - - /* - * If we have something to MOVE i.e "from" not NULL, - * "to" should be non-NULL. - */ - if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) || - (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) { - err = EINVAL; - } - -done: - if (ipif_v4 != NULL) - ipif_refrele(ipif_v4); - if (ipif_v6 != NULL) - ipif_refrele(ipif_v6); - return (err); + (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ); + rw_exit(&ipst->ips_ipmp_lock); + return (0); } /* - * FAILOVER and FAILBACK are modelled as MOVE operations. 
- * - * We don't check whether the MOVE is within the same group or - * not, because this ioctl can be used as a generic mechanism - * to failover from interface A to B, though things will function - * only if they are really part of the same group. Moreover, - * all ipifs may be down and hence temporarily out of the group. - * - * ipif's that need to be moved are first brought down; V4 ipifs are brought - * down first and then V6. For each we wait for the ipif's to become quiescent. - * Bringing down the ipifs ensures that all ires pointing to these ipifs's - * have been deleted and there are no active references. Once quiescent the - * ipif's are moved and brought up on the new ill. - * - * Normally the source ill and destination ill belong to the same IPMP group - * and hence the same ipsq_t. In the event they don't belong to the same - * same group the two ipsq's are first merged into one ipsq - that of the - * to_ill. The multicast memberships on the source and destination ill cannot - * change during the move operation since multicast joins/leaves also have to - * execute on the same ipsq and are hence serialized. + * Process an SIOCGLIFGROUPINFO request. */ /* ARGSUSED */ int -ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *ifreq) +ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp, + ip_ioctl_cmd_t *ipip, void *dummy) { - ill_t *ill_to_v4 = NULL; - ill_t *ill_to_v6 = NULL; - ill_t *ill_from_v4 = NULL; - ill_t *ill_from_v6 = NULL; - int err = 0; - - /* - * setup from and to ill's, we can get EINPROGRESS only for - * to_ill's. - */ - err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6, - &ill_to_v4, &ill_to_v6); - - if (err != 0) { - ip0dbg(("ip_sioctl_move: extract args failed\n")); - goto done; - } - - /* - * nothing to do. - */ - if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) { - goto done; - } - - /* - * nothing to do. 
- */ - if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) { - goto done; - } - - /* - * Mark the ill as changing. - * ILL_CHANGING flag is cleared when the ipif's are brought up - * in ill_up_ipifs in case of error they are cleared below. - */ - - GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); - if (ill_from_v4 != NULL) - ill_from_v4->ill_state_flags |= ILL_CHANGING; - if (ill_from_v6 != NULL) - ill_from_v6->ill_state_flags |= ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); - - /* - * Make sure that both src and dst are - * in the same syncq group. If not make it happen. - * We are not holding any locks because we are the writer - * on the from_ipsq and we will hold locks in ill_merge_groups - * to protect to_ipsq against changing. - */ - if (ill_from_v4 != NULL) { - if (ill_from_v4->ill_phyint->phyint_ipsq != - ill_to_v4->ill_phyint->phyint_ipsq) { - err = ill_merge_groups(ill_from_v4, ill_to_v4, - NULL, mp, q); - goto err_ret; - - } - ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock)); - } else { - - if (ill_from_v6->ill_phyint->phyint_ipsq != - ill_to_v6->ill_phyint->phyint_ipsq) { - err = ill_merge_groups(ill_from_v6, ill_to_v6, - NULL, mp, q); - goto err_ret; - - } - ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock)); - } - - /* - * Now that the ipsq's have been merged and we are the writer - * lets mark to_ill as changing as well. - */ - - GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); - if (ill_to_v4 != NULL) - ill_to_v4->ill_state_flags |= ILL_CHANGING; - if (ill_to_v6 != NULL) - ill_to_v6->ill_state_flags |= ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); - - /* - * Its ok for us to proceed with the move even if - * ill_pending_mp is non null on one of the from ill's as the reply - * should not be looking at the ipif, it should only care about the - * ill itself. - */ - - /* - * lets move ipv4 first. 
- */ - if (ill_from_v4 != NULL) { - ASSERT(IAM_WRITER_ILL(ill_to_v4)); - ill_from_v4->ill_move_in_progress = B_TRUE; - ill_to_v4->ill_move_in_progress = B_TRUE; - ill_to_v4->ill_move_peer = ill_from_v4; - ill_from_v4->ill_move_peer = ill_to_v4; - err = ill_move(ill_from_v4, ill_to_v4, q, mp); - } - - /* - * Now lets move ipv6. - */ - if (err == 0 && ill_from_v6 != NULL) { - ASSERT(IAM_WRITER_ILL(ill_to_v6)); - ill_from_v6->ill_move_in_progress = B_TRUE; - ill_to_v6->ill_move_in_progress = B_TRUE; - ill_to_v6->ill_move_peer = ill_from_v6; - ill_from_v6->ill_move_peer = ill_to_v6; - err = ill_move(ill_from_v6, ill_to_v6, q, mp); - } - -err_ret: - /* - * EINPROGRESS means we are waiting for the ipif's that need to be - * moved to become quiescent. - */ - if (err == EINPROGRESS) { - goto done; - } - - /* - * if err is set ill_up_ipifs will not be called - * lets clear the flags. - */ - - GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); - GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); - /* - * Some of the clearing may be redundant. But it is simple - * not making any extra checks. - */ - if (ill_from_v6 != NULL) { - ill_from_v6->ill_move_in_progress = B_FALSE; - ill_from_v6->ill_move_peer = NULL; - ill_from_v6->ill_state_flags &= ~ILL_CHANGING; - } - if (ill_from_v4 != NULL) { - ill_from_v4->ill_move_in_progress = B_FALSE; - ill_from_v4->ill_move_peer = NULL; - ill_from_v4->ill_state_flags &= ~ILL_CHANGING; - } - if (ill_to_v6 != NULL) { - ill_to_v6->ill_move_in_progress = B_FALSE; - ill_to_v6->ill_move_peer = NULL; - ill_to_v6->ill_state_flags &= ~ILL_CHANGING; - } - if (ill_to_v4 != NULL) { - ill_to_v4->ill_move_in_progress = B_FALSE; - ill_to_v4->ill_move_peer = NULL; - ill_to_v4->ill_state_flags &= ~ILL_CHANGING; - } - - /* - * Check for setting INACTIVE, if STANDBY is set and FAILED is not set. - * Do this always to maintain proper state i.e even in case of errors. - * As phyint_inactive looks at both v4 and v6 interfaces, - * we need not call on both v4 and v6 interfaces. 
- */ - if (ill_from_v4 != NULL) { - if ((ill_from_v4->ill_phyint->phyint_flags & - (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { - phyint_inactive(ill_from_v4->ill_phyint); - } - } else if (ill_from_v6 != NULL) { - if ((ill_from_v6->ill_phyint->phyint_flags & - (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { - phyint_inactive(ill_from_v6->ill_phyint); - } - } - - if (ill_to_v4 != NULL) { - if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) { - ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; - } - } else if (ill_to_v6 != NULL) { - if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) { - ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; - } - } - - RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); - RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); - -no_err: - /* - * lets bring the interfaces up on the to_ill. - */ - if (err == 0) { - err = ill_up_ipifs(ill_to_v4 == NULL ? ill_to_v6:ill_to_v4, - q, mp); - } - - if (err == 0) { - if (ill_from_v4 != NULL && ill_to_v4 != NULL) - ilm_send_multicast_reqs(ill_from_v4, ill_to_v4); + lifgroupinfo_t *lifgr; + ipmp_grp_t *grp; + ip_stack_t *ipst = CONNQ_TO_IPST(q); - if (ill_from_v6 != NULL && ill_to_v6 != NULL) - ilm_send_multicast_reqs(ill_from_v6, ill_to_v6); - } -done: + /* ip_wput_nondata() verified mp->b_cont->b_cont */ + lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr; + lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0'; - if (ill_to_v4 != NULL) { - ill_refrele(ill_to_v4); - } - if (ill_to_v6 != NULL) { - ill_refrele(ill_to_v6); + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) { + rw_exit(&ipst->ips_ipmp_lock); + return (ENOENT); } - - return (err); + ipmp_grp_info(grp, lifgr); + rw_exit(&ipst->ips_ipmp_lock); + return (0); } static void @@ -18167,10 +14492,9 @@ ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) * we only wait for the ACK of the DL_UNBIND_REQ. 
*/ mutex_enter(&ill->ill_lock); - if (!(ill->ill_state_flags & ILL_CONDEMNED) || - (prim == DL_UNBIND_REQ)) { + if (!(ill->ill_state_flags & ILL_CONDEMNED) || (prim == DL_UNBIND_REQ)) ill->ill_dlpi_pending = prim; - } + mutex_exit(&ill->ill_lock); putnext(ill->ill_wq, mp); } @@ -18324,6 +14648,7 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim) { mblk_t *mp; ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; + ipxop_t *ipx = ipsq->ipsq_xop; ASSERT(IAM_WRITER_IPSQ(ipsq)); mutex_enter(&ill->ill_lock); @@ -18336,12 +14661,11 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim) if ((mp = ill->ill_dlpi_deferred) == NULL) { ill->ill_dlpi_pending = DL_PRIM_INVAL; - - mutex_enter(&ipsq->ipsq_lock); - if (ipsq->ipsq_current_done) - ipsq->ipsq_current_ipif = NULL; - mutex_exit(&ipsq->ipsq_lock); - + if (ipx->ipx_current_done) { + mutex_enter(&ipx->ipx_lock); + ipx->ipx_current_ipif = NULL; + mutex_exit(&ipx->ipx_lock); + } cv_signal(&ill->ill_cv); mutex_exit(&ill->ill_lock); return; @@ -18379,7 +14703,7 @@ conn_delete_ire(conn_t *connp, caddr_t arg) } /* - * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number + * Some operations (e.g., ipif_down()) conditionally delete a number * of IREs. Those IREs may have been previously cached in the conn structure. * This ipcl_walk() walker function releases all references to such IREs based * on the condemned flag. @@ -18403,7 +14727,6 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) /* * Take down a specific interface, but don't lose any information about it. - * Also delete interface from its interface group (ifgrp). * (Always called as writer.) * This function goes through the down sequence even if the interface is * already down. There are 2 reasons. @@ -18501,7 +14824,7 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) * For eg. bind, and route operations (Eg. route add / delete) cannot return * failure if the ipif is currently undergoing an exclusive operation, and * hence pass the flag. 
The mblk is then enqueued in the ipsq and the operation - * is restarted by ipsq_exit() when the currently exclusive ioctl completes. + * is restarted by ipsq_exit() when the current exclusive operation completes. * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't * change while the ill_lock is held. Before dropping the ill_lock we acquire @@ -18522,7 +14845,6 @@ int ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) { ill_t *ill = ipif->ipif_ill; - phyint_t *phyi; conn_t *connp; boolean_t success; boolean_t ipif_was_up = B_FALSE; @@ -18569,20 +14891,7 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) } /* - * Before we delete the ill from the group (if any), we need - * to make sure that we delete all the routes dependent on - * this and also any ipifs dependent on this ipif for - * source address. We need to do before we delete from - * the group because - * - * 1) ipif_down_delete_ire de-references ill->ill_group. - * - * 2) ipif_update_other_ipifs needs to walk the whole group - * for re-doing source address selection. Note that - * ipif_select_source[_v6] called from - * ipif_update_other_ipifs[_v6] will not pick this ipif - * because we have already marked down here i.e cleared - * IPIF_UP. + * Delete all IRE's pointing at this ipif or its source address. */ if (ipif->ipif_isv6) { ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, @@ -18592,6 +14901,17 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) ipst); } + if (ipif_was_up && ill->ill_ipif_up_count == 0) { + /* + * Since the interface is now down, it may have just become + * inactive. Note that this needs to be done even for a + * ill_logical_down(), or ARP entries will not get correctly + * restored when the interface comes back up. 
+ */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + } + /* * Cleaning up the conn_ire_cache or conns must be done only after the * ires have been deleted above. Otherwise a thread could end up @@ -18609,53 +14929,9 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) * entries for such ipifs. */ if (ipif->ipif_isv6) - ipif_update_other_ipifs_v6(ipif, ill->ill_group); + ipif_update_other_ipifs_v6(ipif); else - ipif_update_other_ipifs(ipif, ill->ill_group); - - if (ipif_was_up) { - /* - * Check whether it is last ipif to leave this group. - * If this is the last ipif to leave, we should remove - * this ill from the group as ipif_select_source will not - * be able to find any useful ipifs if this ill is selected - * for load balancing. - * - * For nameless groups, we should call ifgrp_delete if this - * belongs to some group. As this ipif is going down, we may - * need to reconstruct groups. - */ - phyi = ill->ill_phyint; - /* - * If the phyint_groupname_len is 0, it may or may not - * be in the nameless group. If the phyint_groupname_len is - * not 0, then this ill should be part of some group. - * As we always insert this ill in the group if - * phyint_groupname_len is not zero when the first ipif - * comes up (in ipif_up_done), it should be in a group - * when the namelen is not 0. - * - * NOTE : When we delete the ill from the group,it will - * blow away all the IRE_CACHES pointing either at this ipif or - * ill_wq (illgrp_cache_delete does this). Thus, no IRES - * should be pointing at this ill. - */ - ASSERT(phyi->phyint_groupname_len == 0 || - (phyi->phyint_groupname != NULL && ill->ill_group != NULL)); - - if (phyi->phyint_groupname_len != 0) { - if (ill->ill_ipif_up_count == 0) - illgrp_delete(ill); - } - - /* - * If we have deleted some of the broadcast ires associated - * with this ipif, we need to re-nominate somebody else if - * the ires that we deleted were the nominated ones. 
- */ - if (ill->ill_group != NULL && !ill->ill_isv6) - ipif_renominate_bcast(ipif); - } + ipif_update_other_ipifs(ipif); /* * neighbor-discovery or arp entries for this interface. @@ -18734,17 +15010,12 @@ ipif_down_tail(ipif_t *ipif) ill->ill_logical_down = 0; /* - * Have to be after removing the routes in ipif_down_delete_ire. + * Has to be after removing the routes in ipif_down_delete_ire. */ - if (ipif->ipif_isv6) { - if (ill->ill_flags & ILLF_XRESOLV) - ipif_arp_down(ipif); - } else { - ipif_arp_down(ipif); - } + ipif_resolver_down(ipif); - ip_rts_ifmsg(ipif); - ip_rts_newaddrmsg(RTM_DELETE, 0, ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); + ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT); } /* @@ -18804,39 +15075,11 @@ static void ipif_down_delete_ire(ire_t *ire, char *ipif_arg) { ipif_t *ipif = (ipif_t *)ipif_arg; - ill_t *ire_ill; - ill_t *ipif_ill; ASSERT(IAM_WRITER_IPIF(ipif)); if (ire->ire_ipif == NULL) return; - /* - * For IPv4, we derive source addresses for an IRE from ipif's - * belonging to the same IPMP group as the IRE's outgoing - * interface. If an IRE's outgoing interface isn't in the - * same IPMP group as a particular ipif, then that ipif - * couldn't have been used as a source address for this IRE. - * - * For IPv6, source addresses are only restricted to the IPMP group - * if the IRE is for a link-local address or a multicast address. - * Otherwise, source addresses for an IRE can be chosen from - * interfaces other than the the outgoing interface for that IRE. - * - * For source address selection details, see ipif_select_source() - * and ipif_select_source_v6(). - */ - if (ire->ire_ipversion == IPV4_VERSION || - IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) || - IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { - ire_ill = ire->ire_ipif->ipif_ill; - ipif_ill = ipif->ipif_ill; - - if (ire_ill->ill_group != ipif_ill->ill_group) { - return; - } - } - if (ire->ire_ipif != ipif) { /* * Look for a matching source address. 
@@ -18875,83 +15118,53 @@ void ill_ipif_cache_delete(ire_t *ire, char *ill_arg) { ill_t *ill = (ill_t *)ill_arg; - ill_t *ipif_ill; ASSERT(IAM_WRITER_ILL(ill)); - /* - * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. - * Hence this should be IRE_CACHE. - */ ASSERT(ire->ire_type == IRE_CACHE); /* - * We are called for IRE_CACHES whose ire_ipif matches ill. - * We are only interested in IRE_CACHES that has borrowed - * the source address from ill_arg e.g. ipif_up_done[_v6] - * for which we need to look at ire_ipif->ipif_ill match - * with ill. + * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches + * ill, but we only want to delete the IRE if ire_ipif matches. */ ASSERT(ire->ire_ipif != NULL); - ipif_ill = ire->ire_ipif->ipif_ill; - if (ipif_ill == ill || (ill->ill_group != NULL && - ipif_ill->ill_group == ill->ill_group)) { + if (ill == ire->ire_ipif->ipif_ill) ire_delete(ire); - } } /* - * Delete all the ire whose stq references ill_arg. + * Delete all the IREs whose ire_stq's reference `ill_arg'. IPMP uses this + * instead of ill_ipif_cache_delete() because ire_ipif->ipif_ill references + * the IPMP ill. */ -static void +void ill_stq_cache_delete(ire_t *ire, char *ill_arg) { ill_t *ill = (ill_t *)ill_arg; - ill_t *ire_ill; ASSERT(IAM_WRITER_ILL(ill)); - /* - * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. - * Hence this should be IRE_CACHE. - */ ASSERT(ire->ire_type == IRE_CACHE); /* - * We are called for IRE_CACHES whose ire_stq and ire_ipif - * matches ill. We are only interested in IRE_CACHES that - * has ire_stq->q_ptr pointing at ill_arg. Thus we do the - * filtering here. + * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches + * ill, but we only want to delete the IRE if ire_stq matches. */ - ire_ill = (ill_t *)ire->ire_stq->q_ptr; - - if (ire_ill == ill) + if (ire->ire_stq->q_ptr == ill_arg) ire_delete(ire); } /* - * This is called when an ill leaves the group. 
We want to delete - * all IRE_CACHES whose stq is pointing at ill_wq or ire_ipif is - * pointing at ill. + * Delete all broadcast IREs with a source address on `ill_arg'. */ static void -illgrp_cache_delete(ire_t *ire, char *ill_arg) +ill_broadcast_delete(ire_t *ire, char *ill_arg) { - ill_t *ill = (ill_t *)ill_arg; + ill_t *ill = (ill_t *)ill_arg; ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ill->ill_group == NULL); - /* - * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. - * Hence this should be IRE_CACHE. - */ - ASSERT(ire->ire_type == IRE_CACHE); - /* - * We are called for IRE_CACHES whose ire_stq and ire_ipif - * matches ill. We are interested in both. - */ - ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) || - (ire->ire_ipif->ipif_ill == ill)); + ASSERT(ire->ire_type == IRE_BROADCAST); - ire_delete(ire); + if (ire->ire_ipif->ipif_ill == ill) + ire_delete(ire); } /* @@ -18997,13 +15210,12 @@ ipif_free(ipif_t *ipif) rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); /* Remove pointers to this ill in the multicast routing tables */ reset_mrt_vif_ipif(ipif); + /* If necessary, clear the cached source ipif rotor. */ + if (ipif->ipif_ill->ill_src_ipif == ipif) + ipif->ipif_ill->ill_src_ipif = NULL; rw_exit(&ipst->ips_ill_g_lock); } -/* - * Warning: this is not the only function that calls mi_free on an ipif_t. See - * also ill_move(). - */ static void ipif_free_tail(ipif_t *ipif) { @@ -19036,7 +15248,7 @@ ipif_free_tail(ipif_t *ipif) sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); /* Get it out of the ILL interface list. 
*/ - ipif_remove(ipif, B_TRUE); + ipif_remove(ipif); rw_exit(&ipst->ips_ill_g_lock); mutex_destroy(&ipif->ipif_saved_ire_lock); @@ -19208,8 +15420,10 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); ill_refrele(ill); @@ -19244,7 +15458,7 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, ire_type = IRE_LOOPBACK; else ire_type = IRE_LOCAL; - ipif = ipif_allocate(ill, id, ire_type, B_TRUE); + ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE); if (ipif != NULL) ipif_refhold_locked(ipif); else if (error != NULL) @@ -19342,65 +15556,62 @@ ill_mtu_change(ire_t *ire, char *ill_arg) void ipif_multicast_up(ipif_t *ipif) { - int err, index; + int err; ill_t *ill; ASSERT(IAM_WRITER_IPIF(ipif)); ill = ipif->ipif_ill; - index = ill->ill_phyint->phyint_ifindex; ip1dbg(("ipif_multicast_up\n")); if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) return; if (ipif->ipif_isv6) { + in6_addr_t v6allmc = ipv6_all_hosts_mcast; + in6_addr_t v6solmc = ipv6_solicited_node_mcast; + + v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; + if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) return; - /* Join the all hosts multicast address */ ip1dbg(("ipif_multicast_up - addmulti\n")); + /* - * Passing B_TRUE means we have to join the multicast - * membership on this interface even though this is - * FAILED. If we join on a different one in the group, - * we will not be able to delete the membership later - * as we currently don't track where we join when we - * join within the kernel unlike applications where - * we have ilg/ilg_orig_index. See ip_addmulti_v6 - * for more on this. 
+ * Join the all hosts multicast address. We skip this for + * underlying IPMP interfaces since they should be invisible. */ - err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index, - ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); - if (err != 0) { - ip0dbg(("ipif_multicast_up: " - "all_hosts_mcast failed %d\n", - err)); - return; + if (!IS_UNDER_IPMP(ill)) { + err = ip_addmulti_v6(&v6allmc, ill, ipif->ipif_zoneid, + ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); + if (err != 0) { + ip0dbg(("ipif_multicast_up: " + "all_hosts_mcast failed %d\n", err)); + return; + } + ipif->ipif_joined_allhosts = 1; } + /* * Enable multicast for the solicited node multicast address */ if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { - in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; - - ipv6_multi.s6_addr32[3] |= - ipif->ipif_v6lcl_addr.s6_addr32[3]; - - err = ip_addmulti_v6(&ipv6_multi, ill, index, - ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, - NULL); + err = ip_addmulti_v6(&v6solmc, ill, ipif->ipif_zoneid, + ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); if (err != 0) { ip0dbg(("ipif_multicast_up: solicited MC" " failed %d\n", err)); - (void) ip_delmulti_v6(&ipv6_all_hosts_mcast, - ill, ill->ill_phyint->phyint_ifindex, - ipif->ipif_zoneid, B_TRUE, B_TRUE); + if (ipif->ipif_joined_allhosts) { + (void) ip_delmulti_v6(&v6allmc, ill, + ipif->ipif_zoneid, B_TRUE, B_TRUE); + ipif->ipif_joined_allhosts = 0; + } return; } } } else { - if (ipif->ipif_lcl_addr == INADDR_ANY) + if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) return; /* Join the all hosts multicast address */ @@ -19420,7 +15631,7 @@ ipif_multicast_up(ipif_t *ipif) * (Explicit memberships are blown away in ill_leave_multicast() when the * ill is brought down.) */ -static void +void ipif_multicast_down(ipif_t *ipif) { int err; @@ -19444,19 +15655,18 @@ ipif_multicast_down(ipif_t *ipif) } /* - * Leave the all hosts multicast address. 
Similar to ip_addmulti_v6, - * we should look for ilms on this ill rather than the ones that have - * been failed over here. They are here temporarily. As - * ipif_multicast_up has joined on this ill, we should delete only - * from this ill. + * Leave the all-hosts multicast address. */ - err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, - ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, - B_TRUE, B_TRUE); - if (err != 0) { - ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n", - err)); + if (ipif->ipif_joined_allhosts) { + err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, + ipif->ipif_zoneid, B_TRUE, B_TRUE); + if (err != 0) { + ip0dbg(("ipif_multicast_down: all_hosts_mcast " + "failed %d\n", err)); + } + ipif->ipif_joined_allhosts = 0; } + /* * Disable multicast for the solicited node multicast address */ @@ -19467,9 +15677,7 @@ ipif_multicast_down(ipif_t *ipif) ipif->ipif_v6lcl_addr.s6_addr32[3]; err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, - ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, B_TRUE, B_TRUE); - if (err != 0) { ip0dbg(("ipif_multicast_down: sol MC failed %d\n", err)); @@ -19683,9 +15891,8 @@ ipif_set_default(ipif_t *ipif) * Return 0 if this address can be used as local address without causing * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address * is already up on a different ill, and EADDRINUSE if it's up on the same ill. - * Special checks are needed to allow the same IPv6 link-local address - * on different ills. - * TODO: allowing the same site-local address on different ill's. + * Note that the same IPv6 link-local address is allowed as long as the ills + * are not on the same link. 
*/ int ip_addr_availability_check(ipif_t *new_ipif) @@ -19717,30 +15924,26 @@ ip_addr_availability_check(ipif_t *new_ipif) ipif = ipif->ipif_next) { if ((ipif == new_ipif) || !(ipif->ipif_flags & IPIF_UP) || - (ipif->ipif_flags & IPIF_UNNUMBERED)) + (ipif->ipif_flags & IPIF_UNNUMBERED) || + !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, + &our_v6addr)) continue; - if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, - &our_v6addr)) { - if (new_ipif->ipif_flags & IPIF_POINTOPOINT) - new_ipif->ipif_flags |= IPIF_UNNUMBERED; - else if (ipif->ipif_flags & IPIF_POINTOPOINT) - ipif->ipif_flags |= IPIF_UNNUMBERED; - else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) && - new_ipif->ipif_ill != ill) - continue; - else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) && - new_ipif->ipif_ill != ill) - continue; - else if (new_ipif->ipif_zoneid != - ipif->ipif_zoneid && - ipif->ipif_zoneid != ALL_ZONES && - IS_LOOPBACK(ill)) - continue; - else if (new_ipif->ipif_ill == ill) - return (EADDRINUSE); - else - return (EADDRNOTAVAIL); - } + + if (new_ipif->ipif_flags & IPIF_POINTOPOINT) + new_ipif->ipif_flags |= IPIF_UNNUMBERED; + else if (ipif->ipif_flags & IPIF_POINTOPOINT) + ipif->ipif_flags |= IPIF_UNNUMBERED; + else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) || + IN6_IS_ADDR_SITELOCAL(&our_v6addr)) && + !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill)) + continue; + else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid && + ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill)) + continue; + else if (new_ipif->ipif_ill == ill) + return (EADDRINUSE); + else + return (EADDRNOTAVAIL); } } @@ -19753,13 +15956,15 @@ ip_addr_availability_check(ipif_t *new_ipif) * When the routine returns EINPROGRESS then mp has been consumed and * the ioctl will be acked from ip_rput_dlpi. 
*/ -static int +int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) { - ill_t *ill = ipif->ipif_ill; - boolean_t isv6 = ipif->ipif_isv6; - int err = 0; - boolean_t success; + ill_t *ill = ipif->ipif_ill; + boolean_t isv6 = ipif->ipif_isv6; + int err = 0; + boolean_t success; + uint_t ipif_orig_id; + ip_stack_t *ipst = ill->ill_ipst; ASSERT(IAM_WRITER_IPIF(ipif)); @@ -19769,6 +15974,123 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) if (ipif->ipif_flags & IPIF_UP) return (EALREADY); + /* + * If this is a request to bring up a data address on an interface + * under IPMP, then move the address to its IPMP meta-interface and + * try to bring it up. One complication is that the zeroth ipif for + * an ill is special, in that every ill always has one, and that code + * throughout IP dereferences ill->ill_ipif without holding any locks. + */ + if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) && + (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) { + ipif_t *stubipif = NULL, *moveipif = NULL; + ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); + + /* + * The ipif being brought up should be quiesced. If it's not, + * something has gone amiss and we need to bail out. (If it's + * quiesced, we know it will remain so via IPIF_CHANGING.) + */ + mutex_enter(&ill->ill_lock); + if (!ipif_is_quiescent(ipif)) { + mutex_exit(&ill->ill_lock); + return (EINVAL); + } + mutex_exit(&ill->ill_lock); + + /* + * If we're going to need to allocate ipifs, do it prior + * to starting the move (and grabbing locks). + */ + if (ipif->ipif_id == 0) { + moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, + B_FALSE); + stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, + B_FALSE); + if (moveipif == NULL || stubipif == NULL) { + mi_free(moveipif); + mi_free(stubipif); + return (ENOMEM); + } + } + + /* + * Grab or transfer the ipif to move. During the move, keep + * ill_g_lock held to prevent any ill walker threads from + * seeing things in an inconsistent state. 
+ */ + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + if (ipif->ipif_id != 0) { + ipif_remove(ipif); + } else { + ipif_transfer(ipif, moveipif, stubipif); + ipif = moveipif; + } + + /* + * Place the ipif on the IPMP ill. If the zeroth ipif on + * the IPMP ill is a stub (0.0.0.0 down address) then we + * replace that one. Otherwise, pick the next available slot. + */ + ipif->ipif_ill = ipmp_ill; + ipif_orig_id = ipif->ipif_id; + + if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) { + ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL); + ipif = ipmp_ill->ill_ipif; + } else { + ipif->ipif_id = -1; + if (ipif_insert(ipif, B_FALSE) != 0) { + /* + * No more available ipif_id's -- put it back + * on the original ill and fail the operation. + * Since we're writer on the ill, we can be + * sure our old slot is still available. + */ + ipif->ipif_id = ipif_orig_id; + ipif->ipif_ill = ill; + if (ipif_orig_id == 0) { + ipif_transfer(ipif, ill->ill_ipif, + NULL); + } else { + VERIFY(ipif_insert(ipif, B_FALSE) == 0); + } + rw_exit(&ipst->ips_ill_g_lock); + return (ENOMEM); + } + } + rw_exit(&ipst->ips_ill_g_lock); + + /* + * Tell SCTP that the ipif has moved. Note that even if we + * had to allocate a new ipif, the original sequence id was + * preserved and therefore SCTP won't know. + */ + sctp_move_ipif(ipif, ill, ipmp_ill); + + /* + * If the ipif being brought up was on slot zero, then we + * first need to bring up the placeholder we stuck there. In + * ip_rput_dlpi_writer(), ip_arp_done(), or the recursive call + * to ipif_up() itself, if we successfully bring up the + * placeholder, we'll check ill_move_ipif and bring it up too. + */ + if (ipif_orig_id == 0) { + ASSERT(ill->ill_move_ipif == NULL); + ill->ill_move_ipif = ipif; + if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0) + ASSERT(ill->ill_move_ipif == NULL); + if (err != EINPROGRESS) + ill->ill_move_ipif = NULL; + return (err); + } + + /* + * Bring it up on the IPMP ill. 
+ */ + return (ipif_up(ipif, q, mp)); + } + /* Skip arp/ndp for any loopback interface. */ if (ill->ill_wq != NULL) { conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; @@ -19798,7 +16120,6 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) */ ASSERT(connp != NULL || !CONN_Q(q)); - ASSERT(ipsq->ipsq_pending_mp == NULL); if (connp != NULL) mutex_enter(&connp->conn_lock); mutex_enter(&ill->ill_lock); @@ -19810,27 +16131,25 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) return (EINTR); /* - * Crank up IPv6 neighbor discovery - * Unlike ARP, this should complete when - * ipif_ndp_up returns. However, for - * ILLF_XRESOLV interfaces we also send a - * AR_INTERFACE_UP to the external resolver. - * That ioctl will complete in ip_rput. + * Crank up the resolver. For IPv6, this cranks up the + * external resolver if one is configured, but even if an + * external resolver isn't configured, it must be called to + * reset DAD state. For IPv6, if an external resolver is not + * being used, ipif_resolver_up() will never return + * EINPROGRESS, so we can always call ipif_ndp_up() here. + * Note that if an external resolver is being used, there's no + * need to call ipif_ndp_up() since it will do nothing. */ - if (isv6) { - err = ipif_ndp_up(ipif); - if (err != 0) { - if (err != EINPROGRESS) - mp = ipsq_pending_mp_get(ipsq, &connp); - return (err); - } - } - /* Now, ARP */ err = ipif_resolver_up(ipif, Res_act_initial); if (err == EINPROGRESS) { - /* We will complete it in ip_arp_done */ + /* We will complete it in ip_arp_done() */ return (err); } + + if (isv6 && err == 0) + err = ipif_ndp_up(ipif, B_TRUE); + + ASSERT(err != EINPROGRESS); mp = ipsq_pending_mp_get(ipsq, &connp); ASSERT(mp != NULL); if (err != 0) @@ -19843,7 +16162,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); ipif->ipif_addr_ready = 1; } - return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); + + err = isv6 ? 
ipif_up_done_v6(ipif) : ipif_up_done(ipif); + if (err == 0 && ill->ill_move_ipif != NULL) { + ipif = ill->ill_move_ipif; + ill->ill_move_ipif = NULL; + return (ipif_up(ipif, q, mp)); + } + return (err); } /* @@ -19939,13 +16265,6 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) return (EINPROGRESS); bad: ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); - /* - * We don't have to check for possible removal from illgrp - * as we have not yet inserted in illgrp. For groups - * without names, this ipif is still not UP and hence - * this could not have possibly had any influence in forming - * groups. - */ freemsg(bind_mp); freemsg(unbind_mp); @@ -19974,12 +16293,10 @@ ipif_up_done(ipif_t *ipif) ipif_t *tmp_ipif; boolean_t flush_ire_cache = B_TRUE; int err = 0; - phyint_t *phyi; ire_t **ipif_saved_irep = NULL; int ipif_saved_ire_cnt; int cnt; boolean_t src_ipif_held = B_FALSE; - boolean_t ire_added = B_FALSE; boolean_t loopback = B_FALSE; ip_stack_t *ipst = ill->ill_ipst; @@ -20010,7 +16327,7 @@ ipif_up_done(ipif_t *ipif) break; } if (flush_ire_cache) - ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE, + ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); /* @@ -20044,7 +16361,9 @@ ipif_up_done(ipif_t *ipif) ipif->ipif_ire_type = IRE_LOCAL; } - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) { + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) || + ((ipif->ipif_flags & IPIF_DEPRECATED) && + !(ipif->ipif_flags & IPIF_NOFAILOVER))) { /* * Can't use our source address. 
Select a different * source address for the IRE_INTERFACE and IRE_LOCAL @@ -20189,11 +16508,9 @@ ipif_up_done(ipif_t *ipif) } /* - * Need to atomically check for ip_addr_availablity_check - * under ip_addr_avail_lock, and if it fails got bad, and remove - * from group also.The ill_g_lock is grabbed as reader - * just to make sure no new ills or new ipifs are being added - * to the system while we are checking the uniqueness of addresses. + * Need to atomically check for IP address availability under + * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new + * ills or new ipifs can be added while we are checking availability. */ rw_enter(&ipst->ips_ill_g_lock, RW_READER); mutex_enter(&ipst->ips_ip_addr_avail_lock); @@ -20227,13 +16544,6 @@ ipif_up_done(ipif_t *ipif) /* * Add in all newly created IREs. ire_create_bcast() has * already checked for duplicates of the IRE_BROADCAST type. - * We want to add before we call ifgrp_insert which wants - * to know whether IRE_IF_RESOLVER exists or not. - * - * NOTE : We refrele the ire though we may branch to "bad" - * later on where we do ire_delete. This is okay - * because nobody can delete it as we are running - * exclusively. */ for (irep1 = irep; irep1 > ire_array; ) { irep1--; @@ -20243,44 +16553,6 @@ ipif_up_done(ipif_t *ipif) */ (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); } - ire_added = B_TRUE; - /* - * Form groups if possible. - * - * If we are supposed to be in a ill_group with a name, insert it - * now as we know that at least one ipif is UP. Otherwise form - * nameless groups. - * - * If ip_enable_group_ifs is set and ipif address is not 0, insert - * this ipif into the appropriate interface group, or create a - * new one. If this is already in a nameless group, we try to form - * a bigger group looking at other ills potentially sharing this - * ipif's prefix. 
- */ - phyi = ill->ill_phyint; - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - if (ill->ill_ipif_up_count == 1) { - ASSERT(ill->ill_group == NULL); - err = illgrp_insert(&ipst->ips_illgrp_head_v4, ill, - phyi->phyint_groupname, NULL, B_TRUE); - if (err != 0) { - ip1dbg(("ipif_up_done: illgrp allocation " - "failed, error %d\n", err)); - goto bad; - } - } - ASSERT(ill->ill_group != NULL); - } - - /* - * When this is part of group, we need to make sure that - * any broadcast ires created because of this ipif coming - * UP gets marked/cleared with IRE_MARK_NORECV appropriately - * so that we don't receive duplicate broadcast packets. - */ - if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0) - ipif_renominate_bcast(ipif); /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; @@ -20331,19 +16603,30 @@ ipif_up_done(ipif_t *ipif) */ ill_recover_multicast(ill); } - /* Join the allhosts multicast address */ - ipif_multicast_up(ipif); - if (!loopback) { + if (ill->ill_ipif_up_count == 1) { + /* + * Since the interface is now up, it may now be active. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + /* - * See whether anybody else would benefit from the - * new ipif that we added. We call this always rather - * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST - * ipif is for the benefit of illgrp_insert (done above) - * which does not do source address selection as it does - * not want to re-create interface routes that we are - * having reference to it here. + * If this is an IPMP interface, we may now be able to + * establish ARP entries. */ + if (IS_IPMP(ill)) + ipmp_illgrp_refresh_arpent(ill->ill_grp); + } + + /* Join the allhosts multicast address */ + ipif_multicast_up(ipif); + + /* + * See if anybody else would benefit from our new ipif. 
+ */ + if (!loopback && + !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { ill_update_source_selection(ill); } @@ -20386,27 +16669,11 @@ ipif_up_done(ipif_t *ipif) bad: ip1dbg(("ipif_up_done: FAILED \n")); - /* - * We don't have to bother removing from ill groups because - * - * 1) For groups with names, we insert only when the first ipif - * comes up. In that case if it fails, it will not be in any - * group. So, we need not try to remove for that case. - * - * 2) For groups without names, either we tried to insert ipif_ill - * in a group as singleton or found some other group to become - * a bigger group. For the former, if it fails we don't have - * anything to do as ipif_ill is not in the group and for the - * latter, there are no failures in illgrp_insert/illgrp_delete - * (ENOMEM can't occur for this. Check ifgrp_insert). - */ + while (irep > ire_array) { irep--; - if (*irep != NULL) { + if (*irep != NULL) ire_delete(*irep); - if (ire_added) - ire_refrele(*irep); - } } (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); @@ -20417,7 +16684,7 @@ bad: if (src_ipif_held) ipif_refrele(src_ipif); - ipif_arp_down(ipif); + ipif_resolver_down(ipif); return (err); } @@ -20493,119 +16760,6 @@ ill_arp_on(ill_t *ill) } /* - * Called after either deleting ill from the group or when setting - * FAILED or STANDBY on the interface. - */ -static void -illgrp_reset_schednext(ill_t *ill) -{ - ill_group_t *illgrp; - ill_t *save_ill; - - ASSERT(IAM_WRITER_ILL(ill)); - /* - * When called from illgrp_delete, ill_group will be non-NULL. - * But when called from ip_sioctl_flags, it could be NULL if - * somebody is setting FAILED/INACTIVE on some interface which - * is not part of a group. - */ - illgrp = ill->ill_group; - if (illgrp == NULL) - return; - if (illgrp->illgrp_ill_schednext != ill) - return; - - illgrp->illgrp_ill_schednext = NULL; - save_ill = ill; - /* - * Choose a good ill to be the next one for - * outbound traffic. 
As the flags FAILED/STANDBY is - * not yet marked when called from ip_sioctl_flags, - * we check for ill separately. - */ - for (ill = illgrp->illgrp_ill; ill != NULL; - ill = ill->ill_group_next) { - if ((ill != save_ill) && - !(ill->ill_phyint->phyint_flags & - (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) { - illgrp->illgrp_ill_schednext = ill; - return; - } - } -} - -/* - * Given an ill, find the next ill in the group to be scheduled. - * (This should be called by ip_newroute() before ire_create().) - * The passed in ill may be pulled out of the group, after we have picked - * up a different outgoing ill from the same group. However ire add will - * atomically check this. - */ -ill_t * -illgrp_scheduler(ill_t *ill) -{ - ill_t *retill; - ill_group_t *illgrp; - int illcnt; - int i; - uint64_t flags; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * We don't use a lock to check for the ill_group. If this ill - * is currently being inserted we may end up just returning this - * ill itself. That is ok. - */ - if (ill->ill_group == NULL) { - ill_refhold(ill); - return (ill); - } - - /* - * Grab the ill_g_lock as reader to make sure we are dealing with - * a set of stable ills. No ill can be added or deleted or change - * group while we hold the reader lock. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if ((illgrp = ill->ill_group) == NULL) { - rw_exit(&ipst->ips_ill_g_lock); - ill_refhold(ill); - return (ill); - } - - illcnt = illgrp->illgrp_ill_count; - mutex_enter(&illgrp->illgrp_lock); - retill = illgrp->illgrp_ill_schednext; - - if (retill == NULL) - retill = illgrp->illgrp_ill; - - /* - * We do a circular search beginning at illgrp_ill_schednext - * or illgrp_ill. We don't check the flags against the ill lock - * since it can change anytime. The ire creation will be atomic - * and will fail if the ill is FAILED or OFFLINE. 
- */ - for (i = 0; i < illcnt; i++) { - flags = retill->ill_phyint->phyint_flags; - - if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && - ILL_CAN_LOOKUP(retill)) { - illgrp->illgrp_ill_schednext = retill->ill_group_next; - ill_refhold(retill); - break; - } - retill = retill->ill_group_next; - if (retill == NULL) - retill = illgrp->illgrp_ill; - } - mutex_exit(&illgrp->illgrp_lock); - rw_exit(&ipst->ips_ill_g_lock); - - return (i == illcnt ? NULL : retill); -} - -/* * Checks for availbility of a usable source address (if there is one) when the * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note * this selection is done regardless of the destination. @@ -20654,11 +16808,26 @@ ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) } /* - * Determine the best source address given a destination address and an ill. - * Prefers non-deprecated over deprecated but will return a deprecated - * address if there is no other choice. If there is a usable source address - * on the interface pointed to by ill_usesrc_ifindex then that is given - * first preference. + * IP source address type, sorted from worst to best. For a given type, + * always prefer IP addresses on the same subnet. All-zones addresses are + * suboptimal because they pose problems with unlabeled destinations. + */ +typedef enum { + IPIF_NONE, + IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ + IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ + IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ + IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ + IPIF_DIFFNET, /* normal and different subnet */ + IPIF_SAMENET /* normal and same subnet */ +} ipif_type_t; + +/* + * Pick the optimal ipif on `ill' for sending to destination `dst' from zone + * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t + * enumeration, and return the highest-rated ipif. 
If there's a tie, we pick + * the first one, unless IPMP is used in which case we round-robin among them; + * see below for more. * * Returns NULL if there is no suitable source address for the ill. * This only occurs when there is no valid source address for the ill. @@ -20666,17 +16835,13 @@ ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) ipif_t * ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) { - ipif_t *ipif; - ipif_t *ipif_dep = NULL; /* Fallback to deprecated */ - ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE]; - int index = 0; - boolean_t wrapped = B_FALSE; - boolean_t same_subnet_only = B_FALSE; - boolean_t ipif_same_found, ipif_other_found; - boolean_t specific_found; - ill_t *till, *usill = NULL; + ill_t *usill = NULL; + ill_t *ipmp_ill = NULL; + ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; + ipif_type_t type, best_type; tsol_tpc_t *src_rhtp, *dst_rhtp; - ip_stack_t *ipst = ill->ill_ipst; + ip_stack_t *ipst = ill->ill_ipst; + boolean_t samenet; if (ill->ill_usesrc_ifindex != 0) { usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, @@ -20688,6 +16853,17 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) } /* + * Test addresses should never be used for source address selection, + * so if we were passed one, switch to the IPMP meta-interface. + */ + if (IS_UNDER_IPMP(ill)) { + if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) + ill = ipmp_ill; /* Select source from IPMP ill */ + else + return (NULL); + } + + /* * If we're dealing with an unlabeled destination on a labeled system, * make sure that we ignore source addresses that are incompatible with * the destination's default label. That destination's default label @@ -20705,7 +16881,7 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) } /* - * Holds the ill_g_lock as reader. This makes sure that no ipif/ill + * Hold the ill_g_lock as reader. This makes sure that no ipif/ill * can be deleted. But an ipif/ill can get CONDEMNED any time. 
* After selecting the right ipif, under ill_lock make sure ipif is * not condemned, and increment refcnt. If ipif is CONDEMNED, @@ -20713,190 +16889,117 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) * but not under a lock. */ rw_enter(&ipst->ips_ill_g_lock, RW_READER); - retry: - till = ill; - ipif_arr[0] = NULL; + /* + * For source address selection, we treat the ipif list as circular + * and continue until we get back to where we started. This allows + * IPMP to vary source address selection (which improves inbound load + * spreading) by caching its last ending point and starting from + * there. NOTE: we don't have to worry about ill_src_ipif changing + * ills since that can't happen on the IPMP ill. + */ + start_ipif = ill->ill_ipif; + if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) + start_ipif = ill->ill_src_ipif; - if (till->ill_group != NULL) - till = till->ill_group->illgrp_ill; + ipif = start_ipif; + best_ipif = NULL; + best_type = IPIF_NONE; + do { + if ((next_ipif = ipif->ipif_next) == NULL) + next_ipif = ill->ill_ipif; - /* - * Choose one good source address from each ill across the group. - * If possible choose a source address in the same subnet as - * the destination address. - * - * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE - * This is okay because of the following. - * - * If PHYI_FAILED is set and we still have non-deprecated - * addresses, it means the addresses have not yet been - * failed over to a different interface. We potentially - * select them to create IRE_CACHES, which will be later - * flushed when the addresses move over. - * - * If PHYI_INACTIVE is set and we still have non-deprecated - * addresses, it means either the user has configured them - * or PHYI_INACTIVE has not been cleared after the addresses - * been moved over. For the former, in.mpathd does a failover - * when the interface becomes INACTIVE and hence we should - * not find them. 
Once INACTIVE is set, we don't allow them - * to create logical interfaces anymore. For the latter, a - * flush will happen when INACTIVE is cleared which will - * flush the IRE_CACHES. - * - * If PHYI_OFFLINE is set, all the addresses will be failed - * over soon. We potentially select them to create IRE_CACHEs, - * which will be later flushed when the addresses move over. - * - * NOTE : As ipif_select_source is called to borrow source address - * for an ipif that is part of a group, source address selection - * will be re-done whenever the group changes i.e either an - * insertion/deletion in the group. - * - * Fill ipif_arr[] with source addresses, using these rules: - * - * 1. At most one source address from a given ill ends up - * in ipif_arr[] -- that is, at most one of the ipif's - * associated with a given ill ends up in ipif_arr[]. - * - * 2. If there is at least one non-deprecated ipif in the - * IPMP group with a source address on the same subnet as - * our destination, then fill ipif_arr[] only with - * source addresses on the same subnet as our destination. - * Note that because of (1), only the first - * non-deprecated ipif found with a source address - * matching the destination ends up in ipif_arr[]. - * - * 3. Otherwise, fill ipif_arr[] with non-deprecated source - * addresses not in the same subnet as our destination. - * Again, because of (1), only the first off-subnet source - * address will be chosen. - * - * 4. If there are no non-deprecated ipifs, then just use - * the source address associated with the last deprecated - * one we find that happens to be on the same subnet, - * otherwise the first one not in the same subnet. 
- */ - specific_found = B_FALSE; - for (; till != NULL; till = till->ill_group_next) { - ipif_same_found = B_FALSE; - ipif_other_found = B_FALSE; - for (ipif = till->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) - continue; - /* Always skip NOLOCAL and ANYCAST interfaces */ - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) - continue; - if (!(ipif->ipif_flags & IPIF_UP) || - !ipif->ipif_addr_ready) - continue; - if (ipif->ipif_zoneid != zoneid && - ipif->ipif_zoneid != ALL_ZONES) - continue; - /* - * Interfaces with 0.0.0.0 address are allowed to be UP, - * but are not valid as source addresses. - */ - if (ipif->ipif_lcl_addr == INADDR_ANY) - continue; + if (!IPIF_CAN_LOOKUP(ipif)) + continue; + /* Always skip NOLOCAL and ANYCAST interfaces */ + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) + continue; + if (!(ipif->ipif_flags & IPIF_UP) || !ipif->ipif_addr_ready) + continue; + if (ipif->ipif_zoneid != zoneid && + ipif->ipif_zoneid != ALL_ZONES) + continue; - /* - * Check compatibility of local address for - * destination's default label if we're on a labeled - * system. Incompatible addresses can't be used at - * all. - */ - if (dst_rhtp != NULL) { - boolean_t incompat; + /* + * Interfaces with 0.0.0.0 address are allowed to be UP, but + * are not valid as source addresses. + */ + if (ipif->ipif_lcl_addr == INADDR_ANY) + continue; - src_rhtp = find_tpc(&ipif->ipif_lcl_addr, - IPV4_VERSION, B_FALSE); - if (src_rhtp == NULL) - continue; - incompat = - src_rhtp->tpc_tp.host_type != SUN_CIPSO || - src_rhtp->tpc_tp.tp_doi != - dst_rhtp->tpc_tp.tp_doi || - (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, - &src_rhtp->tpc_tp.tp_sl_range_cipso) && - !blinlset(&dst_rhtp->tpc_tp.tp_def_label, - src_rhtp->tpc_tp.tp_sl_set_cipso)); - TPC_RELE(src_rhtp); - if (incompat) - continue; - } + /* + * Check compatibility of local address for destination's + * default label if we're on a labeled system. 
Incompatible + * addresses can't be used at all. + */ + if (dst_rhtp != NULL) { + boolean_t incompat; - /* - * We prefer not to use all all-zones addresses, if we - * can avoid it, as they pose problems with unlabeled - * destinations. - */ - if (ipif->ipif_zoneid != ALL_ZONES) { - if (!specific_found && - (!same_subnet_only || - (ipif->ipif_net_mask & dst) == - ipif->ipif_subnet)) { - index = 0; - specific_found = B_TRUE; - ipif_other_found = B_FALSE; - } - } else { - if (specific_found) - continue; - } - if (ipif->ipif_flags & IPIF_DEPRECATED) { - if (ipif_dep == NULL || - (ipif->ipif_net_mask & dst) == - ipif->ipif_subnet) - ipif_dep = ipif; + src_rhtp = find_tpc(&ipif->ipif_lcl_addr, + IPV4_VERSION, B_FALSE); + if (src_rhtp == NULL) + continue; + incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO || + src_rhtp->tpc_tp.tp_doi != + dst_rhtp->tpc_tp.tp_doi || + (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, + &src_rhtp->tpc_tp.tp_sl_range_cipso) && + !blinlset(&dst_rhtp->tpc_tp.tp_def_label, + src_rhtp->tpc_tp.tp_sl_set_cipso)); + TPC_RELE(src_rhtp); + if (incompat) continue; - } - if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) { - /* found a source address in the same subnet */ - if (!same_subnet_only) { - same_subnet_only = B_TRUE; - index = 0; - } - ipif_same_found = B_TRUE; - } else { - if (same_subnet_only || ipif_other_found) - continue; - ipif_other_found = B_TRUE; - } - ipif_arr[index++] = ipif; - if (index == MAX_IPIF_SELECT_SOURCE) { - wrapped = B_TRUE; - index = 0; - } - if (ipif_same_found) - break; } - } - if (ipif_arr[0] == NULL) { - ipif = ipif_dep; - } else { - if (wrapped) - index = MAX_IPIF_SELECT_SOURCE; - ipif = ipif_arr[ipif_rand(ipst) % index]; - ASSERT(ipif != NULL); - } + samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet); - if (ipif != NULL) { + if (ipif->ipif_flags & IPIF_DEPRECATED) { + type = samenet ? IPIF_SAMENET_DEPRECATED : + IPIF_DIFFNET_DEPRECATED; + } else if (ipif->ipif_zoneid == ALL_ZONES) { + type = samenet ? 
IPIF_SAMENET_ALLZONES : + IPIF_DIFFNET_ALLZONES; + } else { + type = samenet ? IPIF_SAMENET : IPIF_DIFFNET; + } + + if (type > best_type) { + best_type = type; + best_ipif = ipif; + if (best_type == IPIF_SAMENET) + break; /* can't get better */ + } + } while ((ipif = next_ipif) != start_ipif); + + if ((ipif = best_ipif) != NULL) { mutex_enter(&ipif->ipif_ill->ill_lock); if (!IPIF_CAN_LOOKUP(ipif)) { mutex_exit(&ipif->ipif_ill->ill_lock); goto retry; } ipif_refhold_locked(ipif); + + /* + * For IPMP, update the source ipif rotor to the next ipif, + * provided we can look it up. (We must not use it if it's + * IPIF_CONDEMNED since we may have grabbed ill_g_lock after + * ipif_free() checked ill_src_ipif.) + */ + if (IS_IPMP(ill) && ipif != NULL) { + next_ipif = ipif->ipif_next; + if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif)) + ill->ill_src_ipif = next_ipif; + else + ill->ill_src_ipif = NULL; + } mutex_exit(&ipif->ipif_ill->ill_lock); } rw_exit(&ipst->ips_ill_g_lock); if (usill != NULL) ill_refrele(usill); + if (ipmp_ill != NULL) + ill_refrele(ipmp_ill); if (dst_rhtp != NULL) TPC_RELE(dst_rhtp); @@ -20929,8 +17032,7 @@ retry: * ipif_update_other_ipifs calls us. * * If old_ipif is NULL, just redo the source address selection - * if needed. This happens when illgrp_insert or ipif_up_done - * calls us. + * if needed. This happens when ipif_up_done calls us. */ static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) @@ -21064,49 +17166,31 @@ ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) /* * This old_ipif is going away. * - * Determine if any other ipif's is using our address as + * Determine if any other ipif's are using our address as * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or * IPIF_DEPRECATED). * Find the IRE_INTERFACE for such ipifs and recreate them * to use an different source address following the rules in * ipif_up_done. 
- * - * This function takes an illgrp as an argument so that illgrp_delete - * can call this to update source address even after deleting the - * old_ipif->ipif_ill from the ill group. */ static void -ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp) +ipif_update_other_ipifs(ipif_t *old_ipif) { - ipif_t *ipif; - ill_t *ill; + ipif_t *ipif; + ill_t *ill; char buf[INET6_ADDRSTRLEN]; ASSERT(IAM_WRITER_IPIF(old_ipif)); - ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif)); ill = old_ipif->ipif_ill; - ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", - ill->ill_name, - inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, - buf, sizeof (buf)))); - /* - * If this part of a group, look at all ills as ipif_select_source - * borrows source address across all the ills in the group. - */ - if (illgrp != NULL) - ill = illgrp->illgrp_ill; - - for (; ill != NULL; ill = ill->ill_group_next) { - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (ipif == old_ipif) - continue; + ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", ill->ill_name, + inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, buf, sizeof (buf)))); - ipif_recreate_interface_routes(old_ipif, ipif); - } + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if (ipif == old_ipif) + continue; + ipif_recreate_interface_routes(old_ipif, ipif); } } @@ -21117,8 +17201,7 @@ if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, { /* * ill_phyint_reinit merged the v4 and v6 into a single - * ipsq. Could also have become part of a ipmp group in the - * process, and we might not have been able to complete the + * ipsq. We might not have been able to complete the * operation in ipif_set_values, if we could not become * exclusive. If so restart it here. */ @@ -21171,6 +17254,48 @@ ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, } /* + * Refresh all IRE_BROADCAST entries associated with `ill' to ensure the + * minimum (but complete) set exist. 
This is necessary when adding or + * removing an interface to/from an IPMP group, since interfaces in an + * IPMP group use the IRE_BROADCAST entries for the IPMP group (whenever + * its test address subnets overlap with IPMP data addresses). It's also + * used to refresh the IRE_BROADCAST entries associated with the IPMP + * interface when the nominated broadcast interface changes. + */ +void +ill_refresh_bcast(ill_t *ill) +{ + ire_t *ire_array[12]; /* max ipif_create_bcast_ires() can create */ + ire_t **irep; + ipif_t *ipif; + + ASSERT(!ill->ill_isv6); + ASSERT(IAM_WRITER_ILL(ill)); + + /* + * Remove any old broadcast IREs. + */ + ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_BROADCAST, + ill_broadcast_delete, ill, ill); + + /* + * Create new ones for any ipifs that are up and broadcast-capable. + */ + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if ((ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST)) != + (IPIF_UP|IPIF_BROADCAST)) + continue; + + irep = ipif_create_bcast_ires(ipif, ire_array); + while (irep-- > ire_array) { + (void) ire_add(irep, NULL, NULL, NULL, B_FALSE); + if (*irep != NULL) + ire_refrele(*irep); + } + } +} + +/* * Create any IRE_BROADCAST entries for `ipif', and store those entries in * `irep'. Returns a pointer to the next free `irep' entry (just like * ire_check_and_create_bcast()). @@ -21433,10 +17558,33 @@ ipif_check_bcast_ires(ipif_t *test_ipif) /* * Walk through all the ipifs that will be affected by the dying IREs, - * and recreate the IREs as necessary. + * and recreate the IREs as necessary. Note that all interfaces in an + * IPMP illgrp share the same broadcast IREs, and thus the entire + * illgrp must be walked, starting with the IPMP meta-interface (so + * that broadcast IREs end up on it whenever possible). 
*/ + if (IS_UNDER_IPMP(ill)) + ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); + irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); + if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { + ipmp_illgrp_t *illg = ill->ill_grp; + + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { + for (i = 0; i < BCAST_COUNT; i++) { + if (bireinfo[i].bi_willdie && + !bireinfo[i].bi_haverep) + break; + } + if (i == BCAST_COUNT) + break; + + irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); + } + } + /* * Scan through the set of broadcast IREs and see if there are any * that we need to replace that have not yet been replaced. If so, @@ -21528,7 +17676,7 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* * If there's another ill already with the requested name, ensure - * that it's of the same type. Otherwise, ill_phyint_reinit() will + * that it's of the same type. Otherwise, ill_phyint_reinit() will * fuse together two unrelated ills, which will cause chaos. */ ipst = ill->ill_ipst; @@ -21620,8 +17768,7 @@ ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, { /* * ill_phyint_reinit merged the v4 and v6 into a single - * ipsq. Could also have become part of a ipmp group in the - * process, and we might not have been able to complete the + * ipsq. We might not have been able to complete the * slifname in ipif_set_values, if we could not become * exclusive. If so restart it here */ @@ -21665,85 +17812,6 @@ ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, return (ipif); } -typedef struct conn_change_s { - uint_t cc_old_ifindex; - uint_t cc_new_ifindex; -} conn_change_t; - -/* - * ipcl_walk function for changing interface index. 
- */ -static void -conn_change_ifindex(conn_t *connp, caddr_t arg) -{ - conn_change_t *connc; - uint_t old_ifindex; - uint_t new_ifindex; - int i; - ilg_t *ilg; - - connc = (conn_change_t *)arg; - old_ifindex = connc->cc_old_ifindex; - new_ifindex = connc->cc_new_ifindex; - - if (connp->conn_orig_bound_ifindex == old_ifindex) - connp->conn_orig_bound_ifindex = new_ifindex; - - if (connp->conn_orig_multicast_ifindex == old_ifindex) - connp->conn_orig_multicast_ifindex = new_ifindex; - - for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { - ilg = &connp->conn_ilg[i]; - if (ilg->ilg_orig_ifindex == old_ifindex) - ilg->ilg_orig_ifindex = new_ifindex; - } -} - -/* - * Walk all the ipifs and ilms on this ill and change the orig_ifindex - * to new_index if it matches the old_index. - * - * Failovers typically happen within a group of ills. But somebody - * can remove an ill from the group after a failover happened. If - * we are setting the ifindex after this, we potentially need to - * look at all the ills rather than just the ones in the group. - * We cut down the work by looking at matching ill_net_types - * and ill_types as we could not possibly grouped them together. 
- */ -static void -ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc) -{ - ill_t *ill; - ipif_t *ipif; - uint_t old_ifindex; - uint_t new_ifindex; - ilm_t *ilm; - ill_walk_context_t ctx; - ip_stack_t *ipst = ill_orig->ill_ipst; - - old_ifindex = connc->cc_old_ifindex; - new_ifindex = connc->cc_new_ifindex; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill = ILL_START_WALK_ALL(&ctx, ipst); - for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if ((ill_orig->ill_net_type != ill->ill_net_type) || - (ill_orig->ill_type != ill->ill_type)) { - continue; - } - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_orig_ifindex == old_ifindex) - ipif->ipif_orig_ifindex = new_ifindex; - } - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (ilm->ilm_orig_ifindex == old_ifindex) - ilm->ilm_orig_ifindex = new_ifindex; - } - } - rw_exit(&ipst->ips_ill_g_lock); -} - /* * We first need to ensure that the new index is unique, and * then carry the change across both v4 and v6 ill representation @@ -21755,13 +17823,10 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) { ill_t *ill; - ill_t *ill_other; phyint_t *phyi; - int old_index; - conn_change_t connc; struct ifreq *ifr = (struct ifreq *)ifreq; struct lifreq *lifr = (struct lifreq *)ifreq; - uint_t index; + uint_t old_index, index; ill_t *ill_v4; ill_t *ill_v6; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; @@ -21773,31 +17838,15 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* * Only allow on physical interface. Also, index zero is illegal. - * - * Need to check for PHYI_FAILED and PHYI_INACTIVE - * - * 1) If PHYI_FAILED is set, a failover could have happened which - * implies a possible failback might have to happen. As failback - * depends on the old index, we should fail setting the index. 
- * - * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that - * any addresses or multicast memberships are failed over to - * a non-STANDBY interface. As failback depends on the old - * index, we should fail setting the index for this case also. - * - * 3) If PHYI_OFFLINE is set, a possible failover has happened. - * Be consistent with PHYI_FAILED and fail the ioctl. */ ill = ipif->ipif_ill; phyi = ill->ill_phyint; - if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) || - ipif->ipif_id != 0 || index == 0) { + if (ipif->ipif_id != 0 || index == 0) { return (EINVAL); } - old_index = phyi->phyint_ifindex; /* If the index is not changing, no work to do */ - if (old_index == index) + if (phyi->phyint_ifindex == index) return (0); /* @@ -21816,31 +17865,17 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (EBUSY); } - /* - * The new index is unused. Set it in the phyint. - * Locate the other ill so that we can send a routing - * sockets message. - */ - if (ill->ill_isv6) { - ill_other = phyi->phyint_illv4; - } else { - ill_other = phyi->phyint_illv6; - } - + /* The new index is unused. Set it in the phyint. */ + old_index = phyi->phyint_ifindex; phyi->phyint_ifindex = index; /* Update SCTP's ILL list */ sctp_ill_reindex(ill, old_index); - connc.cc_old_ifindex = old_index; - connc.cc_new_ifindex = index; - ip_change_ifindex(ill, &connc); - ipcl_walk(conn_change_ifindex, (caddr_t)&connc, ipst); - /* Send the routing sockets message */ - ip_rts_ifmsg(ipif); - if (ill_other != NULL) - ip_rts_ifmsg(ill_other->ill_ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); + if (ILL_OTHER(ill)) + ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); return (0); } @@ -22038,6 +18073,45 @@ ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, B_TRUE)); } +/* + * Return the number of addresses on `ill' with one or more of the values + * in `set' set and all of the values in `clear' clear. 
+ */ +static uint_t +ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear) +{ + ipif_t *ipif; + uint_t cnt = 0; + + ASSERT(IAM_WRITER_ILL(ill)); + + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) + if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear)) + cnt++; + + return (cnt); +} + +/* + * Return the number of migratable addresses on `ill' that are under + * application control. + */ +uint_t +ill_appaddr_cnt(const ill_t *ill) +{ + return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF, + IPIF_NOFAILOVER)); +} + +/* + * Return the number of point-to-point addresses on `ill'. + */ +uint_t +ill_ptpaddr_cnt(const ill_t *ill) +{ + return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0)); +} + /* ARGSUSED */ int ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, @@ -22158,7 +18232,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; int err = 0, ret; uint_t ifindex; - phyint_t *us_phyint, *us_cli_phyint; ipsq_t *ipsq = NULL; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; @@ -22167,19 +18240,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ASSERT(CONN_Q(q)); isv6 = (Q_TO_CONN(q))->conn_af_isv6; - us_cli_phyint = usesrc_cli_ill->ill_phyint; - - ASSERT(us_cli_phyint != NULL); - - /* - * If the client ILL is being used for IPMP, abort. - * Note, this can be done before ipsq_try_enter since we are already - * exclusive on this ILL - */ - if ((us_cli_phyint->phyint_groupname != NULL) || - (us_cli_phyint->phyint_flags & PHYI_STANDBY)) { - return (EINVAL); - } ifindex = lifr->lifr_index; if (ifindex == 0) { @@ -22198,15 +18258,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (err); } - /* - * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP - * group nor can either of the interfaces be used for standy. 
So - * to guarantee mutual exclusion with ip_sioctl_flags (which sets - * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname) - * we need to be exclusive on the ipsq belonging to the usesrc_ill. - * We are already exlusive on this ipsq i.e ipsq corresponding to - * the usesrc_cli_ill - */ ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, NEW_OP, B_TRUE); if (ipsq == NULL) { @@ -22215,11 +18266,19 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, goto done; } - /* Check if the usesrc_ill is used for IPMP */ - us_phyint = usesrc_ill->ill_phyint; - if ((us_phyint->phyint_groupname != NULL) || - (us_phyint->phyint_flags & PHYI_STANDBY)) { - err = EINVAL; + /* USESRC isn't currently supported with IPMP */ + if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { + err = ENOTSUP; + goto done; + } + + /* + * USESRC isn't compatible with the STANDBY flag. (STANDBY is only + * used by IPMP underlying interfaces, but someone might think it's + * more general and try to use it independently with VNI.) + */ + if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { + err = ENOTSUP; goto done; } @@ -22372,79 +18431,45 @@ ill_phyint_compare_name(const void *name_ptr, const void *phyip) return (-1); return (0); } + /* - * This function is called from ill_delete when the ill is being - * unplumbed. We remove the reference from the phyint and we also - * free the phyint when there are no more references to it. + * This function is called on the unplumb path via ill_glist_delete() when + * there are no ills left on the phyint and thus the phyint can be freed. 
*/ static void -ill_phyint_free(ill_t *ill) +phyint_free(phyint_t *phyi) { - phyint_t *phyi; - phyint_t *next_phyint; - ipsq_t *cur_ipsq; - ip_stack_t *ipst = ill->ill_ipst; + ip_stack_t *ipst = PHYINT_TO_IPST(phyi); - ASSERT(ill->ill_phyint != NULL); + ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - phyi = ill->ill_phyint; - ill->ill_phyint = NULL; /* - * ill_init allocates a phyint always to store the copy - * of flags relevant to phyint. At that point in time, we could - * not assign the name and hence phyint_illv4/v6 could not be - * initialized. Later in ipif_set_values, we assign the name to - * the ill, at which point in time we assign phyint_illv4/v6. - * Thus we don't rely on phyint_illv6 to be initialized always. + * If this phyint was an IPMP meta-interface, blow away the group. + * This is safe to do because all of the illgrps have already been + * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us. + * If we're cleaning up as a result of failed initialization, + * phyint_grp may be NULL. */ - if (ill->ill_flags & ILLF_IPV6) { - phyi->phyint_illv6 = NULL; - } else { - phyi->phyint_illv4 = NULL; + if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + ipmp_grp_destroy(phyi->phyint_grp); + phyi->phyint_grp = NULL; + rw_exit(&ipst->ips_ipmp_lock); } - /* - * ipif_down removes it from the group when the last ipif goes - * down. - */ - ASSERT(ill->ill_group == NULL); - - if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) - return; /* - * Make sure this phyint was put in the list. + * If this interface was under IPMP, take it out of the group. 
*/ - if (phyi->phyint_ifindex > 0) { - avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi); - avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, - phyi); - } + if (phyi->phyint_grp != NULL) + ipmp_phyint_leave_grp(phyi); + /* - * remove phyint from the ipsq list. + * Delete the phyint and disassociate its ipsq. The ipsq itself + * will be freed in ipsq_exit(). */ - cur_ipsq = phyi->phyint_ipsq; - if (phyi == cur_ipsq->ipsq_phyint_list) { - cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next; - } else { - next_phyint = cur_ipsq->ipsq_phyint_list; - while (next_phyint != NULL) { - if (next_phyint->phyint_ipsq_next == phyi) { - next_phyint->phyint_ipsq_next = - phyi->phyint_ipsq_next; - break; - } - next_phyint = next_phyint->phyint_ipsq_next; - } - ASSERT(next_phyint != NULL); - } - IPSQ_DEC_REF(cur_ipsq, ipst); + phyi->phyint_ipsq->ipsq_phyint = NULL; + phyi->phyint_name[0] = '\0'; - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - mi_free(phyi->phyint_groupname); - } mi_free(phyi); } @@ -22464,7 +18489,6 @@ ill_phyint_reinit(ill_t *ill) phyint_t *phyi; avl_index_t where = 0; ill_t *ill_other = NULL; - ipsq_t *ipsq; ip_stack_t *ipst = ill->ill_ipst; ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); @@ -22476,6 +18500,11 @@ ill_phyint_reinit(ill_t *ill) phyi_old->phyint_illv4 == NULL)); ASSERT(phyi_old->phyint_ifindex == 0); + /* + * Now that our ill has a name, set it in the phyint. + */ + (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ); + phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, ill->ill_name, &where); @@ -22497,8 +18526,7 @@ ill_phyint_reinit(ill_t *ill) * we are initializing IPv4. */ if (phyi != NULL) { - ill_other = (isv6) ? phyi->phyint_illv4 : - phyi->phyint_illv6; + ill_other = (isv6) ? 
phyi->phyint_illv4 : phyi->phyint_illv6; ASSERT(ill_other->ill_phyint != NULL); ASSERT((isv6 && !ill_other->ill_isv6) || (!isv6 && ill_other->ill_isv6)); @@ -22517,26 +18545,15 @@ ill_phyint_reinit(ill_t *ill) ASSERT(phyi->phyint_illv4 == NULL); phyi->phyint_illv4 = ill; } - /* - * This is a new ill, currently undergoing SLIFNAME - * So we could not have joined an IPMP group until now. - */ - ASSERT(phyi_old->phyint_ipsq_next == NULL && - phyi_old->phyint_groupname == NULL); /* - * This phyi_old is going away. Decref ipsq_refs and - * assert it is zero. The ipsq itself will be freed in - * ipsq_exit + * Delete the old phyint and make its ipsq eligible + * to be freed in ipsq_exit(). */ - ipsq = phyi_old->phyint_ipsq; - IPSQ_DEC_REF(ipsq, ipst); - ASSERT(ipsq->ipsq_refs == 0); - /* Get the singleton phyint out of the ipsq list */ - ASSERT(phyi_old->phyint_ipsq_next == NULL); - ipsq->ipsq_phyint_list = NULL; phyi_old->phyint_illv4 = NULL; phyi_old->phyint_illv6 = NULL; + phyi_old->phyint_ipsq->ipsq_phyint = NULL; + phyi_old->phyint_name[0] = '\0'; mi_free(phyi_old); } else { mutex_enter(&ill->ill_lock); @@ -22551,9 +18568,6 @@ ill_phyint_reinit(ill_t *ill) if (!phyint_assign_ifindex(phyi, ipst)) cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); - /* No IPMP group yet, thus the hook uses the ifindex */ - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, (void *)phyi, where); @@ -22571,13 +18585,6 @@ ill_phyint_reinit(ill_t *ill) ill->ill_phyint = phyi; /* - * Keep the index on ipif_orig_index to be used by FAILOVER. - * We do this here as when the first ipif was allocated, - * ipif_allocate does not know the right interface index. 
- */ - - ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex; - /* * Now that the phyint's ifindex has been assigned, complete the * remaining */ @@ -22606,45 +18613,14 @@ ill_phyint_reinit(ill_t *ill) */ if (ill->ill_name_length <= 2 || ill->ill_name[0] != 'l' || ill->ill_name[1] != 'o') { - /* - * Generate nic plumb event for ill_name even if - * ipmp_hook_emulation is set. That avoids generating events - * for the ill_names should ipmp_hook_emulation be turned on - * later. - */ - ill_nic_event_plumb(ill, B_FALSE); + ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name, + ill->ill_name_length); } RELEASE_ILL_LOCKS(ill, ill_other); mutex_exit(&phyi->phyint_lock); } /* - * Allocate a NE_PLUMB nic info event and store in the ill. - * If 'group' is set we do it for the group name, otherwise the ill name. - * It will be sent when we leave the ipsq. - */ -void -ill_nic_event_plumb(ill_t *ill, boolean_t group) -{ - phyint_t *phyi = ill->ill_phyint; - char *name; - int namelen; - - ASSERT(MUTEX_HELD(&ill->ill_lock)); - - if (group) { - ASSERT(phyi->phyint_groupname_len != 0); - namelen = phyi->phyint_groupname_len; - name = phyi->phyint_groupname; - } else { - namelen = ill->ill_name_length; - name = ill->ill_name; - } - - ill_nic_event_dispatch(ill, 0, NE_PLUMB, name, namelen); -} - -/* * Notify any downstream modules of the name of this interface. * An M_IOCTL is used even though we don't expect a successful reply. * Any reply message from the driver (presumably an M_IOCNAK) will @@ -22686,8 +18662,9 @@ ip_ifname_notify(ill_t *ill, queue_t *q) static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) { - int err; + int err; ip_stack_t *ipst = ill->ill_ipst; + phyint_t *phyi = ill->ill_phyint; /* Set the obsolete NDD per-interface forwarding name. 
*/ err = ill_set_ndd_name(ill); @@ -22696,6 +18673,34 @@ ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) err); } + /* + * Now that ill_name is set, the configuration for the IPMP + * meta-interface can be performed. + */ + if (IS_IPMP(ill)) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + /* + * If phyi->phyint_grp is NULL, then this is the first IPMP + * meta-interface and we need to create the IPMP group. + */ + if (phyi->phyint_grp == NULL) { + /* + * If someone has renamed another IPMP group to have + * the same name as our interface, bail. + */ + if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) { + rw_exit(&ipst->ips_ipmp_lock); + return (EEXIST); + } + phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi); + if (phyi->phyint_grp == NULL) { + rw_exit(&ipst->ips_ipmp_lock); + return (ENOMEM); + } + } + rw_exit(&ipst->ips_ipmp_lock); + } + /* Tell downstream modules where they are. */ ip_ifname_notify(ill, q); @@ -22966,10 +18971,10 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) /* * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. */ - if (ipsq->ipsq_current_ipif == NULL) + if (ipsq->ipsq_xop->ipx_current_ipif == NULL) ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); else - ASSERT(ipsq->ipsq_current_ipif == ipif); + ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif); error = ipif_set_values_tail(ill, ipif, mp, q); ipsq_exit(ipsq); @@ -22986,18 +18991,8 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) void ipif_init(ip_stack_t *ipst) { - hrtime_t hrt; int i; - /* - * Can't call drv_getparm here as it is too early in the boot. - * As we use ipif_src_random just for picking a different - * source address everytime, this need not be really random. 
- */ - hrt = gethrtime(); - ipst->ips_ipif_src_random = - ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff); - for (i = 0; i < MAX_G_HEADS; i++) { ipst->ips_ill_g_heads[i].ill_g_list_head = (ill_if_t *)&ipst->ips_ill_g_heads[i]; @@ -23023,7 +19018,11 @@ ipif_init(ip_stack_t *ipst) * match is found to take care of such rare network configurations like - * le0: 129.146.1.1/16 * le1: 129.146.2.2/24 - * It is used only by SO_DONTROUTE at the moment. + * + * This is used by SO_DONTROUTE and IP_NEXTHOP. Since neither of those are + * supported on underlying interfaces in an IPMP group, underlying interfaces + * are ignored when looking up a match. (If we didn't ignore them, we'd + * risk using a test address as a source for outgoing traffic.) */ ipif_t * ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) @@ -23038,6 +19037,8 @@ ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill)) + continue; mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -23660,30 +19661,76 @@ ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa, * Knows about IEEE 802 and IEEE EUI-64 mappings. 
*/ static boolean_t -ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) +ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr) { char *addr; - if (phys_length != ETHERADDRL) + if (ill->ill_phys_addr_length != ETHERADDRL) return (B_FALSE); /* Form EUI-64 like address */ addr = (char *)&v6addr->s6_addr32[2]; - bcopy((char *)phys_addr, addr, 3); + bcopy(ill->ill_phys_addr, addr, 3); addr[0] ^= 0x2; /* Toggle Universal/Local bit */ addr[3] = (char)0xff; addr[4] = (char)0xfe; - bcopy((char *)phys_addr + 3, addr + 5, 3); + bcopy(ill->ill_phys_addr + 3, addr + 5, 3); return (B_TRUE); } /* ARGSUSED */ static boolean_t -ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) +ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr) { return (B_FALSE); } +typedef struct ipmp_ifcookie { + uint32_t ic_hostid; + char ic_ifname[LIFNAMSIZ]; + char ic_zonename[ZONENAME_MAX]; +} ipmp_ifcookie_t; + +/* + * Construct a pseudo-random interface ID for the IPMP interface that's both + * predictable and (almost) guaranteed to be unique. + */ +static boolean_t +ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) +{ + zone_t *zp; + uint8_t *addr; + uchar_t hash[16]; + ulong_t hostid; + MD5_CTX ctx; + ipmp_ifcookie_t ic = { 0 }; + + ASSERT(IS_IPMP(ill)); + + (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); + ic.ic_hostid = htonl((uint32_t)hostid); + + (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ); + + if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) { + (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX); + zone_rele(zp); + } + + MD5Init(&ctx); + MD5Update(&ctx, &ic, sizeof (ic)); + MD5Final(hash, &ctx); + + /* + * Map the hash to an interface ID per the basic approach in RFC3041. 
+ */ + addr = &v6addr->s6_addr8[8]; + bcopy(hash + 8, addr, sizeof (uint64_t)); + addr[0] &= ~0x2; /* set local bit */ + + return (B_TRUE); +} + /* ARGSUSED */ static boolean_t ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, @@ -23739,14 +19786,14 @@ ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, * Derive IPoIB interface id from the link layer address. */ static boolean_t -ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) +ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) { char *addr; - if (phys_length != 20) + if (ill->ill_phys_addr_length != 20) return (B_FALSE); addr = (char *)&v6addr->s6_addr32[2]; - bcopy(phys_addr + 12, addr, 8); + bcopy(ill->ill_phys_addr + 12, addr, 8); /* * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit * in the globally assigned EUI-64 GUID to 1, in violation of IEEE @@ -23863,6 +19910,7 @@ ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) *ipifp = NULL; return (B_FALSE); } + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (!IPIF_CAN_LOOKUP(ipif)) continue; @@ -23897,71 +19945,9 @@ ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) } /* - * Same as ipif_lookup_zoneid() but looks at all the ills in the same group. - */ -boolean_t -ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) -{ - ill_t *illg; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * We look at the passed-in ill first without grabbing ill_g_lock. - */ - if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) { - return (B_TRUE); - } - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if (ill->ill_group == NULL) { - /* ill not in a group */ - rw_exit(&ipst->ips_ill_g_lock); - return (B_FALSE); - } - - /* - * There's no ipif in the zone on ill, however ill is part of an IPMP - * group. We need to look for an ipif in the zone on all the ills in the - * group. 
- */ - illg = ill->ill_group->illgrp_ill; - do { - /* - * We don't call ipif_lookup_zoneid() on ill as we already know - * that it's not there. - */ - if (illg != ill && - ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) { - break; - } - } while ((illg = illg->ill_group_next) != NULL); - rw_exit(&ipst->ips_ill_g_lock); - return (illg != NULL); -} - -/* - * Check if this ill is only being used to send ICMP probes for IPMP - */ -boolean_t -ill_is_probeonly(ill_t *ill) -{ - /* - * Check if the interface is FAILED, or INACTIVE - */ - if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE)) - return (B_TRUE); - - return (B_FALSE); -} - -/* * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id) * If a pointer to an ipif_t is returned then the caller will need to do * an ill_refrele(). - * - * If there is no real interface which matches the ifindex, then it looks - * for a group that has a matching index. In the case of a group match the - * lifidx must be zero. We don't need emulate the logical interfaces - * since IP Filter's use of netinfo doesn't use that. */ ipif_t * ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, @@ -23972,18 +19958,8 @@ ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, ipst); - - if (ill == NULL) { - /* Fallback to group names only if hook_emulation set */ - if (!ipst->ips_ipmp_hook_emulation) - return (NULL); - - if (lifidx != 0) - return (NULL); - ill = ill_group_lookup_on_ifindex(ifindex, isv6, ipst); - if (ill == NULL) - return (NULL); - } + if (ill == NULL) + return (NULL); mutex_enter(&ill->ill_lock); if (ill->ill_state_flags & ILL_CONDEMNED) { @@ -24059,7 +20035,7 @@ ill_set_phys_addr(ill_t *ill, mblk_t *mp) * If we can quiesce the ill, then set the address. If not, then * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 
*/ - ill_down_ipifs(ill, NULL, 0, B_FALSE); + ill_down_ipifs(ill); mutex_enter(&ill->ill_lock); if (!ill_is_quiescent(ill)) { /* call cannot fail since `conn_t *' argument is NULL */ @@ -24283,10 +20259,7 @@ ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event, if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL) goto fail; - if (event == NE_UNPLUMB) - info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex; - else - info->hnei_event.hne_nic = ill->ill_phyint->phyint_hook_ifindex; + info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex; info->hnei_event.hne_lif = lif; info->hnei_event.hne_event = event; info->hnei_event.hne_protocol = ill->ill_isv6 ? @@ -24323,8 +20296,8 @@ fail: void ipif_up_notify(ipif_t *ipif) { - ip_rts_ifmsg(ipif); - ip_rts_newaddrmsg(RTM_ADD, 0, ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); + ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT); sctp_update_ipif(ipif, SCTP_IPIF_UP); ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id), NE_LIF_UP, NULL, 0); diff --git a/usr/src/uts/common/inet/ip/ip_ire.c b/usr/src/uts/common/inet/ip/ip_ire.c index 405cb653d5..52a7e74806 100644 --- a/usr/src/uts/common/inet/ip/ip_ire.c +++ b/usr/src/uts/common/inet/ip/ip_ire.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -31,6 +31,7 @@ #include <sys/types.h> #include <sys/stream.h> #include <sys/stropts.h> +#include <sys/strsun.h> #include <sys/ddi.h> #include <sys/cmn_err.h> #include <sys/policy.h> @@ -61,7 +62,6 @@ #include <net/pfkeyv2.h> #include <inet/ipsec_info.h> #include <inet/sadb.h> -#include <sys/kmem.h> #include <inet/tcp.h> #include <inet/ipclassifier.h> #include <sys/zone.h> @@ -220,11 +220,6 @@ struct kmem_cache *rt_entry_cache; * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is * to be ignored when walking the ires using ire_next. * - * IRE_MARK_HIDDEN signifies that the ire is a special ire typically for the - * benefit of in.mpathd which needs to probe interfaces for failures. Normal - * applications should not be seeing this ire and hence this ire is ignored - * in most cases in the search using ire_next. - * * Zones note: * Walking IREs within a given zone also walks certain ires in other * zones. This is done intentionally. IRE walks with a specified @@ -1235,10 +1230,9 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) { irb_t *irb; boolean_t drop = B_FALSE; - /* LINTED : set but not used in function */ boolean_t mctl_present; mblk_t *first_mp = NULL; - mblk_t *save_mp = NULL; + mblk_t *data_mp = NULL; ire_t *dst_ire; ipha_t *ipha; ip6_t *ip6h; @@ -1258,27 +1252,16 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) * we resolve an IPv6 address with an IPv4 ire * or vice versa. */ + EXTRACT_PKT_MP(mp, first_mp, mctl_present); + data_mp = mp; + mp = first_mp; if (ire->ire_ipversion == IPV4_VERSION) { - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - ipha = (ipha_t *)mp->b_rptr; - save_mp = mp; - mp = first_mp; - + ipha = (ipha_t *)data_mp->b_rptr; dst_ire = ire_cache_lookup(ipha->ipha_dst, ire->ire_zoneid, MBLK_GETLABEL(mp), ipst); } else { ASSERT(ire->ire_ipversion == IPV6_VERSION); - /* - * Get a pointer to the beginning of the IPv6 header. - * Ignore leading IPsec control mblks. 
- */ - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - } - ip6h = (ip6_t *)mp->b_rptr; - save_mp = mp; - mp = first_mp; + ip6h = (ip6_t *)data_mp->b_rptr; dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst, ire->ire_zoneid, MBLK_GETLABEL(mp), ipst); } @@ -1330,10 +1313,8 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) * is over: we just drop the packet. */ if (ire->ire_flags & RTF_MULTIRT) { - if (save_mp) { - save_mp->b_prev = NULL; - save_mp->b_next = NULL; - } + data_mp->b_prev = NULL; + data_mp->b_next = NULL; MULTIRT_DEBUG_UNTAG(mp); freemsg(mp); } else { @@ -1355,9 +1336,31 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) (CONN_Q(q) ? Q_TO_CONN(q) : NULL), ire->ire_zoneid, ipst); } else { + int minlen = sizeof (ip6i_t) + IPV6_HDR_LEN; + ASSERT(ire->ire_ipversion == IPV6_VERSION); - ip_newroute_v6(q, mp, &ip6h->ip6_dst, NULL, - NULL, ire->ire_zoneid, ipst); + + /* + * If necessary, skip over the ip6i_t to find + * the header with the actual source address. 
+ */ + if (ip6h->ip6_nxt == IPPROTO_RAW) { + if (MBLKL(data_mp) < minlen && + pullupmsg(data_mp, -1) == 0) { + ip1dbg(("ire_add_then_send: " + "cannot pullupmsg ip6i\n")); + if (mctl_present) + freeb(first_mp); + ire_refrele(ire); + return; + } + ASSERT(MBLKL(data_mp) >= IPV6_HDR_LEN); + ip6h = (ip6_t *)(data_mp->b_rptr + + sizeof (ip6i_t)); + } + ip_newroute_v6(q, mp, &ip6h->ip6_dst, + &ip6h->ip6_src, NULL, ire->ire_zoneid, + ipst); } } @@ -1680,7 +1683,9 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, { ire_t *ire; uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + boolean_t prefer; + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; /* * No broadcast IREs for the LOOPBACK interface @@ -1690,21 +1695,26 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, (ipif->ipif_flags & IPIF_NOXMIT)) return (irep); - /* If this would be a duplicate, don't bother. */ + /* + * If this new IRE would be a duplicate, only prefer it if one of + * the following is true: + * + * 1. The existing one has IPIF_DEPRECATED|IPIF_LOCAL|IPIF_ANYCAST + * set and the new one has all of those clear. + * + * 2. The existing one corresponds to an underlying ILL in an IPMP + * group and the new one corresponds to an IPMP group interface. + */ if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif, ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) { - /* - * We look for non-deprecated (and non-anycast, non-nolocal) - * ipifs as the best choice. ipifs with check_flags matching - * (deprecated, etc) are used only if non-deprecated ipifs - * are not available. 
if the existing ire's ipif is deprecated - * and the new ipif is non-deprecated, switch to the new ipif - */ - if ((!(ire->ire_ipif->ipif_flags & check_flags)) || - (ipif->ipif_flags & check_flags)) { + prefer = ((ire->ire_ipif->ipif_flags & check_flags) && + !(ipif->ipif_flags & check_flags)) || + (IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && IS_IPMP(ill)); + if (!prefer) { ire_refrele(ire); return (irep); } + /* * Bcast ires exist in pairs. Both have to be deleted, * Since we are exclusive we can make the above assertion. @@ -1716,10 +1726,7 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, ire_delete(ire); ire_refrele(ire); } - - irep = ire_create_bcast(ipif, addr, irep); - - return (irep); + return (ire_create_bcast(ipif, addr, irep)); } uint_t ip_loopback_mtu = IP_LOOPBACK_MTU; @@ -1733,6 +1740,22 @@ ire_t ** ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) { ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + ill_t *ill = ipif->ipif_ill; + + ASSERT(IAM_WRITER_IPIF(ipif)); + + if (IS_IPMP(ill)) { + /* + * Broadcast IREs for the IPMP meta-interface use the + * nominated broadcast interface to send and receive packets. + * If there's no nominated interface, send the packets down to + * the IPMP stub driver, which will discard them. If the + * nominated broadcast interface changes, ill_refresh_bcast() + * will refresh the broadcast IREs. 
+ */ + if ((ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + ill = ipif->ipif_ill; + } *irep++ = ire_create( (uchar_t *)&addr, /* dest addr */ @@ -1741,8 +1764,8 @@ ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) NULL, /* no gateway */ &ipif->ipif_mtu, /* max frag */ NULL, /* no src nce */ - ipif->ipif_rq, /* recv-from queue */ - ipif->ipif_wq, /* send-to queue */ + ill->ill_rq, /* recv-from queue */ + ill->ill_wq, /* send-to queue */ IRE_BROADCAST, ipif, 0, @@ -1761,7 +1784,7 @@ ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) NULL, /* no gateway */ &ip_loopback_mtu, /* max frag size */ NULL, /* no src_nce */ - ipif->ipif_rq, /* recv-from queue */ + ill->ill_rq, /* recv-from queue */ NULL, /* no send-to queue */ IRE_BROADCAST, /* Needed for fanout in wput */ ipif, @@ -2049,32 +2072,23 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, { ill_t *ire_stq_ill = NULL; ill_t *ire_ipif_ill = NULL; - ill_group_t *ire_ill_group = NULL; ASSERT(match_flags != 0 || zoneid != ALL_ZONES); /* - * MATCH_IRE_ILL/MATCH_IRE_ILL_GROUP : We match both on ill - * pointed by ire_stq and ire_ipif. Only in the case of - * IRE_CACHEs can ire_stq and ire_ipif be pointing to - * different ills. But we want to keep this function generic - * enough for future use. So, we always try to match on both. - * The only caller of this function ire_walk_ill_tables, will - * call "func" after we return from this function. We expect - * "func" to do the right filtering of ires in this case. - * - * NOTE : In the case of MATCH_IRE_ILL_GROUP, groups - * pointed by ire_stq and ire_ipif should always be the same. - * So, we just match on only one of them. + * MATCH_IRE_ILL: We match both on ill pointed by ire_stq and + * ire_ipif. Only in the case of IRE_CACHEs can ire_stq and + * ire_ipif be pointing to different ills. But we want to keep + * this function generic enough for future use. So, we always + * try to match on both. 
The only caller of this function + * ire_walk_ill_tables, will call "func" after we return from + * this function. We expect "func" to do the right filtering + * of ires in this case. */ - if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) { + if (match_flags & MATCH_IRE_ILL) { if (ire->ire_stq != NULL) - ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr; + ire_stq_ill = ire->ire_stq->q_ptr; if (ire->ire_ipif != NULL) ire_ipif_ill = ire->ire_ipif->ipif_ill; - if (ire_stq_ill != NULL) - ire_ill_group = ire_stq_ill->ill_group; - if ((ire_ill_group == NULL) && (ire_ipif_ill != NULL)) - ire_ill_group = ire_ipif_ill->ill_group; } if (zoneid != ALL_ZONES) { @@ -2115,7 +2129,7 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, ipif_t *src_ipif; src_ipif = ipif_select_source_v6(ire_stq_ill, - &ire->ire_addr_v6, RESTRICT_TO_NONE, + &ire->ire_addr_v6, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); if (src_ipif != NULL) { @@ -2143,9 +2157,9 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, ire_t *rire; ire_match_flags |= MATCH_IRE_TYPE; - if (ire->ire_ipif != NULL) { - ire_match_flags |= MATCH_IRE_ILL_GROUP; - } + if (ire->ire_ipif != NULL) + ire_match_flags |= MATCH_IRE_ILL; + if (ire->ire_ipversion == IPV4_VERSION) { rire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, IRE_INTERFACE, ire->ire_ipif, NULL, @@ -2169,11 +2183,8 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, if (((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & ire_type)) && ((!(match_flags & MATCH_IRE_ILL)) || - (ire_stq_ill == ill || ire_ipif_ill == ill)) && - ((!(match_flags & MATCH_IRE_ILL_GROUP)) || - (ire_stq_ill == ill) || (ire_ipif_ill == ill) || - (ire_ill_group != NULL && - ire_ill_group == ill->ill_group))) { + (ire_stq_ill == ill || ire_ipif_ill == ill || + ire_ipif_ill != NULL && IS_IN_SAME_ILLGRP(ire_ipif_ill, ill)))) { return (B_TRUE); } return (B_FALSE); @@ -2221,8 +2232,7 @@ ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, 
pfv_t func, boolean_t ret; struct rtfuncarg rtfarg; - ASSERT((!(match_flags & (MATCH_IRE_ILL | - MATCH_IRE_ILL_GROUP))) || (ill != NULL)); + ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL)); ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0)); /* * Optimize by not looking at the forwarding table if there @@ -2399,32 +2409,26 @@ ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp, } /* - * IPMP flag settings happen without taking the exclusive route - * in ip_sioctl_flags. So we need to make an atomic check here - * for FAILED/OFFLINE/INACTIVE flags or if it has hit the - * FAILBACK=no case. + * Don't allow IRE's to be created on changing ill's. Also, since + * IPMP flags can be set on an ill without quiescing it, if we're not + * a writer on stq_ill, check that the flags still allow IRE creation. */ if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) { if (stq_ill->ill_state_flags & ILL_CHANGING) { ill = stq_ill; error = EAGAIN; - } else if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) || - (ill_is_probeonly(stq_ill) && - !(ire->ire_marks & IRE_MARK_HIDDEN))) { - error = EINVAL; + } else if (IS_UNDER_IPMP(stq_ill)) { + mutex_enter(&stq_ill->ill_phyint->phyint_lock); + if (!ipmp_ill_is_active(stq_ill) && + !(ire->ire_marks & IRE_MARK_TESTHIDDEN)) { + error = EINVAL; + } + mutex_exit(&stq_ill->ill_phyint->phyint_lock); } - goto done; + if (error != 0) + goto done; } - /* - * We don't check for OFFLINE/FAILED in this case because - * the source address selection logic (ipif_select_source) - * may still select a source address from such an ill. The - * assumption is that these addresses will be moved by in.mpathd - * soon. (i.e. this is a race). However link local addresses - * will not move and hence ipif_select_source_v6 tries to avoid - * FAILED ills. 
Please see ipif_select_source_v6 for more info - */ if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) && (ipif_ill->ill_state_flags & ILL_CHANGING)) { ill = ipif_ill; @@ -2444,8 +2448,10 @@ done: if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) { ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); ire_atomic_end(irb_ptr, ire); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); error = EINPROGRESS; } else if (error != 0) { @@ -2502,39 +2508,7 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, ire = ire1; } if (ire->ire_stq != NULL) - stq_ill = (ill_t *)ire->ire_stq->q_ptr; - - if (ire->ire_type == IRE_CACHE) { - /* - * If this interface is FAILED, or INACTIVE or has hit - * the FAILBACK=no case, we create IRE_CACHES marked - * HIDDEN for some special cases e.g. bind to - * IPIF_NOFAILOVER address etc. So, if this interface - * is FAILED/INACTIVE/hit FAILBACK=no case, and we are - * not creating hidden ires, we should not allow that. - * This happens because the state of the interface - * changed while we were waiting in ARP. If this is the - * daemon sending probes, the next probe will create - * HIDDEN ires and we will create an ire then. This - * cannot happen with NDP currently because IRE is - * never queued in NDP. But it can happen in the - * future when we have external resolvers with IPv6. - * If the interface gets marked with OFFLINE while we - * are waiting in ARP, don't add the ire. - */ - if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) || - (ill_is_probeonly(stq_ill) && - !(ire->ire_marks & IRE_MARK_HIDDEN))) { - /* - * We don't know whether it is a valid ipif or not. - * unless we do the check below. So, set it to NULL. 
- */ - ire->ire_ipif = NULL; - ire_delete(ire); - *irep = NULL; - return (EINVAL); - } - } + stq_ill = ire->ire_stq->q_ptr; if (stq_ill != NULL && ire->ire_type == IRE_CACHE && stq_ill->ill_net_type == IRE_IF_RESOLVER) { @@ -2573,12 +2547,12 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, rw_exit(&ipst->ips_ill_g_lock); if (ipif == NULL || (ipif->ipif_isv6 && + !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) && !IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, &ipif->ipif_v6src_addr)) || (!ipif->ipif_isv6 && ire->ire_src_addr != ipif->ipif_src_addr) || ire->ire_zoneid != ipif->ipif_zoneid) { - if (ipif != NULL) ipif_refrele(ipif); ire->ire_ipif = NULL; @@ -2587,20 +2561,7 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, return (EINVAL); } - ASSERT(ill != NULL); - /* - * If this group was dismantled while this packets was - * queued in ARP, don't add it here. - */ - if (ire->ire_ipif->ipif_ill->ill_group != ill->ill_group) { - /* We don't want ire_inactive bump stats for this */ - ipif_refrele(ipif); - ire->ire_ipif = NULL; - ire_delete(ire); - *irep = NULL; - return (EINVAL); - } /* * Since we didn't attach label security attributes to the @@ -2677,6 +2638,16 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, boolean_t need_refrele = B_FALSE; nce_t *nce; ip_stack_t *ipst = ire->ire_ipst; + uint_t marks = 0; + + /* + * IREs with source addresses hosted on interfaces that are under IPMP + * should be hidden so that applications don't accidentally end up + * sending packets with test addresses as their source addresses, or + * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here. 
+ */ + if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill)) + marks |= IRE_MARK_TESTHIDDEN; if (ire->ire_ipif != NULL) ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock)); @@ -2691,10 +2662,15 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, case IRE_HOST: ire->ire_mask = IP_HOST_MASK; ire->ire_masklen = IP_ABITS; + ire->ire_marks |= marks; if ((ire->ire_flags & RTF_SETSRC) == 0) ire->ire_src_addr = 0; break; case IRE_CACHE: + ire->ire_mask = IP_HOST_MASK; + ire->ire_masklen = IP_ABITS; + ire->ire_marks |= marks; + break; case IRE_BROADCAST: case IRE_LOCAL: case IRE_LOOPBACK: @@ -2702,15 +2678,14 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, ire->ire_masklen = IP_ABITS; break; case IRE_PREFIX: - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr = 0; - break; case IRE_DEFAULT: + ire->ire_marks |= marks; if ((ire->ire_flags & RTF_SETSRC) == 0) ire->ire_src_addr = 0; break; case IRE_IF_RESOLVER: case IRE_IF_NORESOLVER: + ire->ire_marks |= marks; break; default: ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n", @@ -2796,19 +2771,13 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, */ flags |= MATCH_IRE_IPIF; /* - * If we are creating hidden ires, make sure we search on - * this ill (MATCH_IRE_ILL) and a hidden ire, - * while we are searching for duplicates below. Otherwise we - * could potentially find an IRE on some other interface - * and it may not be a IRE marked with IRE_MARK_HIDDEN. We - * shouldn't do this as this will lead to an infinite loop - * (if we get to ip_wput again) eventually we need an hidden - * ire for this packet to go out. MATCH_IRE_ILL is explicitly - * done below. + * If we are creating a hidden IRE, make sure we search for + * hidden IREs when searching for duplicates below. + * Otherwise, we might find an IRE on some other interface + * that's not marked hidden. 
*/ - if (ire->ire_type == IRE_CACHE && - (ire->ire_marks & IRE_MARK_HIDDEN)) - flags |= (MATCH_IRE_MARK_HIDDEN); + if (ire->ire_marks & IRE_MARK_TESTHIDDEN) + flags |= MATCH_IRE_MARK_TESTHIDDEN; } if ((ire->ire_type & IRE_CACHETABLE) == 0) { irb_ptr = ire_get_bucket(ire); @@ -2927,7 +2896,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, * avoid a lookup in the caller again. If the callers * don't want to use it, they need to do a REFRELE. */ - ip1dbg(("found dup ire existing %p new %p", + ip1dbg(("found dup ire existing %p new %p\n", (void *)ire1, (void *)ire)); IRE_REFHOLD(ire1); ire_atomic_end(irb_ptr, ire); @@ -2948,6 +2917,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, return (0); } } + if (ire->ire_type & IRE_CACHE) { ASSERT(ire->ire_stq != NULL); nce = ndp_lookup_v4(ire_to_ill(ire), @@ -2999,17 +2969,9 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, } /* * Make it easy for ip_wput_ire() to hit multiple broadcast ires by - * grouping identical addresses together on the hash chain. We also - * don't want to send multiple copies out if there are two ills part - * of the same group. Thus we group the ires with same addr and same - * ill group together so that ip_wput_ire can easily skip all the - * ires with same addr and same group after sending the first copy. - * We do this only for IRE_BROADCASTs as ip_wput_ire is currently - * interested in such groupings only for broadcasts. - * - * NOTE : If the interfaces are brought up first and then grouped, - * illgrp_insert will handle it. We come here when the interfaces - * are already in group and we are bringing them UP. + * grouping identical addresses together on the hash chain. We do + * this only for IRE_BROADCASTs as ip_wput_ire is currently interested + * in such groupings only for broadcasts. * * Find the first entry that matches ire_addr. *irep will be null * if no match. 
@@ -3023,29 +2985,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, if (ire->ire_type == IRE_BROADCAST && *irep != NULL) { /* * We found some ire (i.e *irep) with a matching addr. We - * want to group ires with same addr and same ill group - * together. - * - * First get to the entry that matches our address and - * ill group i.e stop as soon as we find the first ire - * matching the ill group and address. If there is only - * an address match, we should walk and look for some - * group match. These are some of the possible scenarios : - * - * 1) There are no groups at all i.e all ire's ill_group - * are NULL. In that case we will essentially group - * all the ires with the same addr together. Same as - * the "else" block of this "if". - * - * 2) There are some groups and this ire's ill_group is - * NULL. In this case, we will first find the group - * that matches the address and a NULL group. Then - * we will insert the ire at the end of that group. - * - * 3) There are some groups and this ires's ill_group is - * non-NULL. In this case we will first find the group - * that matches the address and the ill_group. Then - * we will insert the ire at the end of that group. + * want to group ires with same addr. */ for (;;) { ire1 = *irep; @@ -3053,8 +2993,8 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, (ire1->ire_next->ire_addr != ire->ire_addr) || (ire1->ire_type != IRE_BROADCAST) || (ire1->ire_flags & RTF_MULTIRT) || - (ire1->ire_ipif->ipif_ill->ill_group == - ire->ire_ipif->ipif_ill->ill_group)) + (ire1->ire_ipif->ipif_ill->ill_grp == + ire->ire_ipif->ipif_ill->ill_grp)) break; irep = &ire1->ire_next; } @@ -3071,18 +3011,14 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, /* * Either we have hit the end of the list or the address - * did not match or the group *matched*. If we found - * a match on the group, skip to the end of the group. + * did not match. 
*/ while (*irep != NULL) { ire1 = *irep; if ((ire1->ire_addr != ire->ire_addr) || - (ire1->ire_type != IRE_BROADCAST) || - (ire1->ire_ipif->ipif_ill->ill_group != - ire->ire_ipif->ipif_ill->ill_group)) + (ire1->ire_type != IRE_BROADCAST)) break; - if (ire1->ire_ipif->ipif_ill->ill_group == NULL && - ire1->ire_ipif == ire->ire_ipif) { + if (ire1->ire_ipif == ire->ire_ipif) { irep = &ire1->ire_next; break; } @@ -3611,15 +3547,14 @@ ire_inactive(ire_t *ire) * The ipif that is associated with an ire is ire->ire_ipif and * hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call * ipif_ill_refrele_tail. Usually stq_ill is null or the same as - * ire->ire_ipif->ipif_ill. So nothing more needs to be done. Only - * in the case of IRE_CACHES when IPMP is used, stq_ill can be - * different. If this is different from ire->ire_ipif->ipif_ill and - * if the ill_ire_cnt on the stq_ill also has dropped to zero, we call + * ire->ire_ipif->ipif_ill. So nothing more needs to be done. + * However, for VNI or IPMP IRE entries, stq_ill can be different. + * If this is different from ire->ire_ipif->ipif_ill and if the + * ill_ire_cnt on the stq_ill also has dropped to zero, we call * ipif_ill_refrele_tail on the stq_ill. 
*/ - if (ire->ire_stq != NULL) - stq_ill = (ill_t *)ire->ire_stq->q_ptr; + stq_ill = ire->ire_stq->q_ptr; if (stq_ill == NULL || stq_ill == ill) { /* Optimize the most common case */ @@ -3881,26 +3816,27 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, { ill_t *ire_ill = NULL, *dst_ill; ill_t *ipif_ill = NULL; - ill_group_t *ire_ill_group = NULL; - ill_group_t *ipif_ill_group = NULL; ASSERT(ire->ire_ipversion == IPV4_VERSION); ASSERT((ire->ire_addr & ~ire->ire_mask) == 0); - ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) || + ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ipif != NULL && !ipif->ipif_isv6)); ASSERT(!(match_flags & MATCH_IRE_WQ) || wq != NULL); /* - * HIDDEN cache entries have to be looked up specifically with - * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set - * when the interface is FAILED or INACTIVE. In that case, - * any IRE_CACHES that exists should be marked with - * IRE_MARK_HIDDEN. So, we don't really need to match below - * for IRE_MARK_HIDDEN. But we do so for consistency. + * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it + * is in fact hidden, to ensure the caller gets the right one. One + * exception: if the caller passed MATCH_IRE_IHANDLE, then they + * already know the identity of the given IRE_INTERFACE entry and + * there's no point trying to hide it from them. 
*/ - if (!(match_flags & MATCH_IRE_MARK_HIDDEN) && - (ire->ire_marks & IRE_MARK_HIDDEN)) - return (B_FALSE); + if (ire->ire_marks & IRE_MARK_TESTHIDDEN) { + if (match_flags & MATCH_IRE_IHANDLE) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + + if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) + return (B_FALSE); + } /* * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option @@ -3994,19 +3930,18 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, } /* - * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that - * somebody wants to send out on a particular interface which - * is given by ire_stq and hence use ire_stq to derive the ill - * value. ire_ipif for IRE_CACHES is just the means of getting - * a source address i.e ire_src_addr = ire->ire_ipif->ipif_src_addr. - * ire_to_ill does the right thing for this. + * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to + * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means + * of getting a source address -- i.e., ire_src_addr == + * ire->ire_ipif->ipif_src_addr). ire_to_ill() handles this. + * + * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group. + * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for + * IPMP test traffic), then the ill must match exactly. 
*/ - if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) { + if (match_flags & MATCH_IRE_ILL) { ire_ill = ire_to_ill(ire); - if (ire_ill != NULL) - ire_ill_group = ire_ill->ill_group; ipif_ill = ipif->ipif_ill; - ipif_ill_group = ipif_ill->ill_group; } if ((ire->ire_addr == (addr & mask)) && @@ -4018,24 +3953,21 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, (ire->ire_src_addr == ipif->ipif_src_addr)) && ((!(match_flags & MATCH_IRE_IPIF)) || (ire->ire_ipif == ipif)) && - ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) || - (ire->ire_type != IRE_CACHE || - ire->ire_marks & IRE_MARK_HIDDEN)) && + ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) || + (ire->ire_marks & IRE_MARK_TESTHIDDEN)) && ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) || (ire->ire_type != IRE_CACHE || ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) && - ((!(match_flags & MATCH_IRE_ILL)) || - (ire_ill == ipif_ill)) && ((!(match_flags & MATCH_IRE_WQ)) || (ire->ire_stq == wq)) && + ((!(match_flags & MATCH_IRE_ILL)) || + (ire_ill == ipif_ill || + (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) && + ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) && ((!(match_flags & MATCH_IRE_IHANDLE)) || (ire->ire_ihandle == ihandle)) && ((!(match_flags & MATCH_IRE_MASK)) || (ire->ire_mask == mask)) && - ((!(match_flags & MATCH_IRE_ILL_GROUP)) || - (ire_ill == ipif_ill) || - (ire_ill_group != NULL && - ire_ill_group == ipif_ill_group)) && ((!(match_flags & MATCH_IRE_SECATTR)) || (!is_system_labeled()) || (tsol_ire_match_gwattr(ire, tsl) == 0))) { @@ -4060,8 +3992,7 @@ ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, * ire_match_args() will dereference ipif MATCH_IRE_SRC or * MATCH_IRE_ILL is set. 
*/ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && - (ipif == NULL)) + if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) return (NULL); /* @@ -4142,14 +4073,15 @@ ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif, /* * Check whether the IRE_LOCAL and the IRE potentially used to transmit - * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are part of - * the same ill group. + * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are identical + * or part of the same illgrp. (In the IPMP case, usually the two IREs + * will both belong to the IPMP ill, but exceptions are possible -- e.g. + * if IPMP test addresses are on their own subnet.) */ boolean_t -ire_local_same_ill_group(ire_t *ire_local, ire_t *xmit_ire) +ire_local_same_lan(ire_t *ire_local, ire_t *xmit_ire) { - ill_t *recv_ill, *xmit_ill; - ill_group_t *recv_group, *xmit_group; + ill_t *recv_ill, *xmit_ill; ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK)); ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE)); @@ -4160,20 +4092,11 @@ ire_local_same_ill_group(ire_t *ire_local, ire_t *xmit_ire) ASSERT(recv_ill != NULL); ASSERT(xmit_ill != NULL); - if (recv_ill == xmit_ill) - return (B_TRUE); - - recv_group = recv_ill->ill_group; - xmit_group = xmit_ill->ill_group; - - if (recv_group != NULL && recv_group == xmit_group) - return (B_TRUE); - - return (B_FALSE); + return (IS_ON_SAME_LAN(recv_ill, xmit_ill)); } /* - * Check if the IRE_LOCAL uses the same ill (group) as another route would use. + * Check if the IRE_LOCAL uses the same ill as another route would use. * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE, * then we don't allow this IRE_LOCAL to be used. 
*/ @@ -4183,17 +4106,16 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, { ire_t *alt_ire; boolean_t rval; + int flags; + + flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE; if (ire_local->ire_ipversion == IPV4_VERSION) { alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL, - NULL, zoneid, 0, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE, ipst); + NULL, zoneid, 0, tsl, flags, ipst); } else { - alt_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL, - 0, NULL, NULL, zoneid, 0, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE, ipst); + alt_ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, + NULL, zoneid, 0, tsl, flags, ipst); } if (alt_ire == NULL) @@ -4203,16 +4125,14 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, ire_refrele(alt_ire); return (B_FALSE); } - rval = ire_local_same_ill_group(ire_local, alt_ire); + rval = ire_local_same_lan(ire_local, alt_ire); ire_refrele(alt_ire); return (rval); } /* - * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers - * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get - * to the hidden ones. + * Lookup cache * * In general the zoneid has to match (where ALL_ZONES match all of them). * But for IRE_LOCAL we also need to handle the case where L2 should @@ -4220,8 +4140,7 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, * Ethernet drivers nor Ethernet hardware loops back packets sent to their * own MAC address. This loopback is needed when the normal * routes (ignoring IREs with different zoneids) would send out the packet on - * the same ill (or ill group) as the ill with which this IRE_LOCAL is - * associated. + * the same ill as the ill with which this IRE_LOCAL is associated. * * Earlier versions of this code always matched an IRE_LOCAL independently of * the zoneid. 
We preserve that earlier behavior when @@ -4239,7 +4158,7 @@ ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl, rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { if (ire->ire_marks & (IRE_MARK_CONDEMNED | - IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) { + IRE_MARK_TESTHIDDEN | IRE_MARK_PRIVATE_ADDR)) { continue; } if (ire->ire_addr == addr) { @@ -4284,7 +4203,7 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) ire_t *ire; /* - * Lets look for an ire in the cachetable whose + * Look for an ire in the cachetable whose * ire_addr matches the destination. * Since we are being called by forwarding fastpath * no need to check for Trusted Solaris label. @@ -4293,8 +4212,8 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) dst, ipst->ips_ip_cache_table_size)]; rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & (IRE_MARK_CONDEMNED | - IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) { + if (ire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN | + IRE_MARK_PRIVATE_ADDR)) { continue; } if (ire->ire_addr == dst) { @@ -4307,7 +4226,6 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) return (NULL); } - /* * Locate the interface ire that is tied to the cache ire 'cire' via * cire->ire_ihandle. @@ -4333,13 +4251,8 @@ ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire) * because the ihandle refers to an ipif which can be in only one zone. */ match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - /* - * ip_newroute calls ire_ftable_lookup with MATCH_IRE_ILL only - * for on-link hosts. We should never be here for onlink. - * Thus, use MATCH_IRE_ILL_GROUP. - */ if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; /* * We know that the mask of the interface ire equals cire->ire_cmask. 
* (When ip_newroute() created 'cire' for the gateway it set its @@ -4376,7 +4289,7 @@ ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire) */ match_flags = MATCH_IRE_TYPE; if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET, pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); if (ire == NULL) @@ -4411,7 +4324,16 @@ ire_t * ipif_to_ire(const ipif_t *ipif) { ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK; + + /* + * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN + * so that they aren't accidentally returned. However, if the + * caller's ipif is on an ill under IPMP, there's no need to hide 'em. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; ASSERT(!ipif->ipif_isv6); if (ipif->ipif_ire_type == IRE_LOOPBACK) { @@ -4421,13 +4343,12 @@ ipif_to_ire(const ipif_t *ipif) } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { /* In this case we need to lookup destination address. 
*/ ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK), ipst); + IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, match_flags, + ipst); } else { ire = ire_ftable_lookup(ipif->ipif_subnet, ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL, - ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF | - MATCH_IRE_MASK), ipst); + ALL_ZONES, 0, NULL, match_flags, ipst); } return (ire); } @@ -4811,7 +4732,7 @@ ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst) continue; if (cire->ire_addr != dst) continue; - if (cire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) + if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) continue; unres_cnt--; } @@ -4983,7 +4904,7 @@ ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, continue; if (cire->ire_marks & (IRE_MARK_CONDEMNED | - IRE_MARK_HIDDEN)) + IRE_MARK_TESTHIDDEN)) continue; if (cire->ire_gw_secattr != NULL && @@ -5186,7 +5107,7 @@ ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, continue; if (cire->ire_marks & (IRE_MARK_CONDEMNED | - IRE_MARK_HIDDEN)) + IRE_MARK_TESTHIDDEN)) continue; if (cire->ire_gw_secattr != NULL && @@ -5401,7 +5322,7 @@ ire_trace_cleanup(const ire_t *ire) * invoked when the mblk containing fake_ire is freed. 
*/ void -ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) +ire_arpresolve(ire_t *in_ire) { areq_t *areq; ipaddr_t *addrp; @@ -5409,8 +5330,13 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) ire_t *ire, *buf; size_t bufsize; frtn_t *frtnp; - ill_t *ill; - ip_stack_t *ipst = dst_ill->ill_ipst; + ill_t *dst_ill; + ip_stack_t *ipst; + + ASSERT(in_ire->ire_nce != NULL); + + dst_ill = ire_to_ill(in_ire); + ipst = dst_ill->ill_ipst; /* * Construct message chain for the resolver @@ -5431,16 +5357,16 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) */ /* - * We use esballoc to allocate the second part(the ire_t size mblk) - * of the message chain depicted above. THis mblk will be freed - * by arp when there is a timeout, and otherwise passed to IP - * and IP will * free it after processing the ARP response. + * We use esballoc to allocate the second part (IRE_MBLK) + * of the message chain depicted above. This mblk will be freed + * by arp when there is a timeout, and otherwise passed to IP + * and IP will free it after processing the ARP response. 
*/ bufsize = sizeof (ire_t) + sizeof (frtn_t); buf = kmem_alloc(bufsize, KM_NOSLEEP); if (buf == NULL) { - ip1dbg(("ire_arpresolver:alloc buffer failed\n ")); + ip1dbg(("ire_arpresolve: alloc buffer failed\n")); return; } frtnp = (frtn_t *)(buf + 1); @@ -5448,16 +5374,15 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) frtnp->free_func = ire_freemblk; ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp); - if (ire_mp == NULL) { ip1dbg(("ire_arpresolve: esballoc failed\n")); kmem_free(buf, bufsize); return; } - ASSERT(in_ire->ire_nce != NULL); + areq_mp = copyb(dst_ill->ill_resolver_mp); if (areq_mp == NULL) { - kmem_free(buf, bufsize); + freemsg(ire_mp); return; } @@ -5473,9 +5398,8 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) ire->ire_ipif_seqid = in_ire->ire_ipif_seqid; ire->ire_ipif_ifindex = in_ire->ire_ipif_ifindex; ire->ire_ipif = in_ire->ire_ipif; - ire->ire_stq = in_ire->ire_stq; - ill = ire_to_ill(ire); - ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; + ire->ire_stq = dst_ill->ill_wq; + ire->ire_stq_ifindex = dst_ill->ill_phyint->phyint_ifindex; ire->ire_zoneid = in_ire->ire_zoneid; ire->ire_stackid = ipst->ips_netstack->netstack_stackid; ire->ire_ipst = ipst; @@ -5528,7 +5452,6 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) * Note that the ARP/IP merge should replace the functioanlity by providing * direct function calls to clean up unresolved entries in ire/nce lists. */ - void ire_freemblk(ire_t *ire_mp) { @@ -5738,9 +5661,8 @@ retry_nce: * is marked as ND_REACHABLE at this point. * This nce does not undergo any further state changes, * and exists as long as the interface is plumbed. - * Note: we do the ire_nce assignment here for IRE_BROADCAST - * because some functions like ill_mark_bcast() inline the - * ire_add functionality. + * Note: the assignment of ire_nce here is a historical + * artifact of old code that used to inline ire_add(). 
*/ ire->ire_nce = nce; /* @@ -5772,8 +5694,7 @@ ip4_ctable_lookup_impl(ire_ctable_args_t *margs) ire_t *ire; ip_stack_t *ipst = margs->ict_ipst; - if ((margs->ict_flags & - (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && + if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (margs->ict_ipif == NULL)) { return (NULL); } @@ -5802,10 +5723,7 @@ ip4_ctable_lookup_impl(ire_ctable_args_t *margs) /* * This function locates IRE_CACHE entries which were added by the * ire_forward() path. We can fully specify the IRE we are looking for by - * providing the ipif_t AND the ire_stq. This is different to MATCH_IRE_ILL - * which uses the ipif_ill. This is inadequate with IPMP groups where - * illgrp_scheduler() may have been used to select an ill from the group for - * the outgoing interface. + * providing the ipif (MATCH_IRE_IPIF) *and* the stq (MATCH_IRE_WQ). */ ire_t * ire_arpresolve_lookup(ipaddr_t addr, ipaddr_t gw, ipif_t *ipif, diff --git a/usr/src/uts/common/inet/ip/ip_mroute.c b/usr/src/uts/common/inet/ip/ip_mroute.c index ac14adf00d..1a3df02418 100644 --- a/usr/src/uts/common/inet/ip/ip_mroute.c +++ b/usr/src/uts/common/inet/ip/ip_mroute.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -2037,6 +2037,7 @@ static int ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, struct mfc *rt) { + ill_t *vill; vifi_t vifi; struct vif *vifp; ipaddr_t dst = ipha->ipha_dst; @@ -2102,25 +2103,21 @@ ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, } /* * Don't forward if it didn't arrive from the parent vif for its - * origin. But do match on the groups as we nominate only one - * ill in the group for receiving allmulti packets. + * origin. 
*/ - if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill && - (ill->ill_group == NULL || - ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_group != - ill->ill_group)) || + vill = ipst->ips_vifs[vifi].v_ipif->ipif_ill; + if ((vill != ill && !IS_IN_SAME_ILLGRP(vill, ill)) || (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) { /* Came in the wrong interface */ ip1dbg(("ip_mdq: arrived wrong if, vifi %d " "numvifs %d ill %s viftable ill %s\n", (int)vifi, (int)ipst->ips_numvifs, ill->ill_name, - ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name)); + vill->ill_name)); if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "ip_mdq: arrived wrong if, vifi %d ill " "%s viftable ill %s\n", - (int)vifi, ill->ill_name, - ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name); + (int)vifi, ill->ill_name, vill->ill_name); } ipst->ips_mrtstat->mrts_wrong_if++; rt->mfc_wrong_if++; @@ -3047,7 +3044,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp) dst = ipha->ipha_dst; ipif = vifp->v_ipif; - mutex_enter(&ipif->ipif_ill->ill_lock); if (ilm_lookup_ipif(ipif, dst) != NULL) { /* * The packet is not yet reassembled, thus we need to @@ -3057,7 +3053,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp) mblk_t *mp_loop; ire_t *ire; - mutex_exit(&ipif->ipif_ill->ill_lock); if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, @@ -3082,8 +3077,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp) } if (ire != NULL) ire_refrele(ire); - } else { - mutex_exit(&ipif->ipif_ill->ill_lock); } if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, diff --git a/usr/src/uts/common/inet/ip/ip_multi.c b/usr/src/uts/common/inet/ip/ip_multi.c index f3c95ae362..cbea9be165 100644 --- a/usr/src/uts/common/inet/ip/ip_multi.c +++ b/usr/src/uts/common/inet/ip/ip_multi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -68,12 +68,10 @@ static void ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, static ilm_t *ilm_add_v6(ipif_t *ipif, const in6_addr_t *group, ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist, - int orig_ifindex, zoneid_t zoneid); + zoneid_t zoneid); static void ilm_delete(ilm_t *ilm); static int ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *group); static int ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *group); -static ilg_t *ilg_lookup_ill_index_v6(conn_t *connp, - const in6_addr_t *v6group, int index); static ilg_t *ilg_lookup_ipif(conn_t *connp, ipaddr_t group, ipif_t *ipif); static int ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, @@ -91,25 +89,21 @@ static int ip_opt_delete_group_excl(conn_t *connp, ipaddr_t group, static int ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src); +static void ill_ilm_walker_hold(ill_t *ill); +static void ill_ilm_walker_rele(ill_t *ill); /* * MT notes: * * Multicast joins operate on both the ilg and ilm structures. Multiple * threads operating on an conn (socket) trying to do multicast joins - * need to synchronize when operating on the ilg. Multiple threads + * need to synchronize when operating on the ilg. Multiple threads * potentially operating on different conn (socket endpoints) trying to * do multicast joins could eventually end up trying to manipulate the - * ilm simulatenously and need to synchronize on the access to the ilm. - * Both are amenable to standard Solaris MT techniques, but it would be - * complex to handle a failover or failback which needs to manipulate - * ilg/ilms if an applications can also simultaenously join/leave - * multicast groups. Hence multicast join/leave also go through the ipsq_t + * ilm simultaneously and need to synchronize access to the ilm. 
Currently, + * this is done by synchronizing join/leave via per-phyint ipsq_t * serialization. * - * Multicast joins and leaves are single-threaded per phyint/IPMP group - * using the ipsq serialization mechanism. - * * An ilm is an IP data structure used to track multicast join/leave. * An ilm is associated with a <multicast group, ipif> tuple in IPv4 and * with just <multicast group> in IPv6. ilm_refcnt is the number of ilg's @@ -211,12 +205,13 @@ conn_ilg_reap(conn_t *connp) * Returns a pointer to the next available ilg in conn_ilg. Allocs more * buffers in size of ILG_ALLOC_CHUNK ilgs when needed, and updates conn's * ilg tracking fields appropriately (conn_ilg_inuse reflects usage of the - * returned ilg). Returns NULL on failure (ENOMEM). + * returned ilg). Returns NULL on failure, in which case `*errp' will be + * filled in with the reason. * * Assumes connp->conn_lock is held. */ static ilg_t * -conn_ilg_alloc(conn_t *connp) +conn_ilg_alloc(conn_t *connp, int *errp) { ilg_t *new, *ret; int curcnt; @@ -224,10 +219,21 @@ conn_ilg_alloc(conn_t *connp) ASSERT(MUTEX_HELD(&connp->conn_lock)); ASSERT(connp->conn_ilg_inuse <= connp->conn_ilg_allocated); + /* + * If CONN_CLOSING is set, conn_ilg cleanup has begun and we must not + * create any ilgs. + */ + if (connp->conn_state_flags & CONN_CLOSING) { + *errp = EINVAL; + return (NULL); + } + if (connp->conn_ilg == NULL) { connp->conn_ilg = GETSTRUCT(ilg_t, ILG_ALLOC_CHUNK); - if (connp->conn_ilg == NULL) + if (connp->conn_ilg == NULL) { + *errp = ENOMEM; return (NULL); + } connp->conn_ilg_allocated = ILG_ALLOC_CHUNK; connp->conn_ilg_inuse = 0; } @@ -241,12 +247,15 @@ conn_ilg_alloc(conn_t *connp) * ilg_delete_all() will have to be changed when * this logic is changed. 
*/ + *errp = EBUSY; return (NULL); } curcnt = connp->conn_ilg_allocated; new = GETSTRUCT(ilg_t, curcnt + ILG_ALLOC_CHUNK); - if (new == NULL) + if (new == NULL) { + *errp = ENOMEM; return (NULL); + } bcopy(connp->conn_ilg, new, sizeof (ilg_t) * curcnt); mi_free((char *)connp->conn_ilg); connp->conn_ilg = new; @@ -378,42 +387,6 @@ ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, slist_t *flist) } } -/* - * If the given interface has failed, choose a new one to join on so - * that we continue to receive packets. ilg_orig_ifindex remembers - * what the application used to join on so that we know the ilg to - * delete even though we change the ill here. Callers will store the - * ilg returned from this function in ilg_ill. Thus when we receive - * a packet on ilg_ill, conn_wantpacket_v6 will deliver the packets. - * - * This function must be called as writer so we can walk the group - * list and examine flags without holding a lock. - */ -ill_t * -ip_choose_multi_ill(ill_t *ill, const in6_addr_t *grp) -{ - ill_t *till; - ill_group_t *illgrp = ill->ill_group; - - ASSERT(IAM_WRITER_ILL(ill)); - - if (IN6_IS_ADDR_UNSPECIFIED(grp) || illgrp == NULL) - return (ill); - - if ((ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE)) == 0) - return (ill); - - till = illgrp->illgrp_ill; - while (till != NULL && - (till->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE))) { - till = till->ill_group_next; - } - if (till != NULL) - return (till); - - return (ill); -} - static int ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist, boolean_t isv6) @@ -560,8 +533,7 @@ ilm_update_del(ilm_t *ilm, boolean_t isv6) } /* - * INADDR_ANY means all multicast addresses. This is only used - * by the multicast router. + * INADDR_ANY means all multicast addresses. * INADDR_ANY is stored as IPv6 unspecified addr. 
*/ int @@ -578,40 +550,31 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat, if (!CLASSD(group) && group != INADDR_ANY) return (EINVAL); + if (IS_UNDER_IPMP(ill)) + return (EINVAL); + /* - * INADDR_ANY is represented as the IPv6 unspecifed addr. + * INADDR_ANY is represented as the IPv6 unspecified addr. */ if (group == INADDR_ANY) v6group = ipv6_all_zeros; else IN6_IPADDR_TO_V4MAPPED(group, &v6group); - mutex_enter(&ill->ill_lock); ilm = ilm_lookup_ipif(ipif, group); - mutex_exit(&ill->ill_lock); /* * Since we are writer, we know the ilm_flags itself cannot * change at this point, and ilm_lookup_ipif would not have * returned a DELETED ilm. However, the data path can free - * ilm->next via ilm_walker_cleanup() so we can safely + * ilm->ilm_next via ilm_walker_cleanup() so we can safely * access anything in ilm except ilm_next (for safe access to - * ilm_next we'd have to take the ill_lock). + * ilm_next we'd have to take the ill_lock). */ if (ilm != NULL) return (ilm_update_add(ilm, ilgstat, ilg_flist, B_FALSE)); - /* - * ilms are associated with ipifs in IPv4. It moves with the - * ipif if the ipif moves to a new ill when the interface - * fails. Thus we really don't check whether the ipif_ill - * has failed like in IPv6. If it has FAILED the ipif - * will move (daemon will move it) and hence the ilm, if the - * ipif is not IPIF_NOFAILOVER. For the IPIF_NOFAILOVER ipifs, - * we continue to receive in the same place even if the - * interface fails. 
- */ ilm = ilm_add_v6(ipif, &v6group, ilgstat, ilg_fmode, ilg_flist, - ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid); + ipif->ipif_zoneid); if (ilm == NULL) return (ENOMEM); @@ -623,10 +586,7 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat, */ if (ilm_numentries_v6(ill, &v6group) > 1) return (0); - if (ill->ill_group == NULL) - ret = ill_join_allmulti(ill); - else - ret = ill_nominate_mcast_rcv(ill->ill_group); + ret = ill_join_allmulti(ill); if (ret != 0) ilm_delete(ilm); return (ret); @@ -646,12 +606,8 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat, /* * The unspecified address means all multicast addresses. - * This is only used by the multicast router. * - * ill identifies the interface to join on; it may not match the - * interface requested by the application of a failover has taken - * place. orig_ifindex always identifies the interface requested - * by the app. + * ill identifies the interface to join on. * * ilgstat tells us if there's an ilg associated with this join, * and if so, if it's a new ilg or a change to an existing one. @@ -659,9 +615,8 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat, * the ilg (and will be EXCLUDE {NULL} in the case of no ilg). */ int -ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, - zoneid_t zoneid, ilg_stat_t ilgstat, mcast_record_t ilg_fmode, - slist_t *ilg_flist) +ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, + ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist) { ilm_t *ilm; int ret; @@ -673,37 +628,20 @@ ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, return (EINVAL); } + if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_MC_SOLICITEDNODE(v6group)) + return (EINVAL); + /* - * An ilm is uniquely identified by the tuple of (group, ill, - * orig_ill). 
group is the multicast group address, ill is - * the interface on which it is currently joined, and orig_ill - * is the interface on which the application requested the - * join. orig_ill and ill are the same unless orig_ill has - * failed over. - * - * Both orig_ill and ill are required, which means we may have - * 2 ilms on an ill for the same group, but with different - * orig_ills. These must be kept separate, so that when failback - * occurs, the appropriate ilms are moved back to their orig_ill - * without disrupting memberships on the ill to which they had - * been moved. - * - * In order to track orig_ill, we store orig_ifindex in the - * ilm and ilg. + * An ilm is uniquely identified by the tuple of (group, ill) where + * `group' is the multicast group address, and `ill' is the interface + * on which it is currently joined. */ - mutex_enter(&ill->ill_lock); - ilm = ilm_lookup_ill_index_v6(ill, v6group, orig_ifindex, zoneid); - mutex_exit(&ill->ill_lock); + ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid); if (ilm != NULL) return (ilm_update_add(ilm, ilgstat, ilg_flist, B_TRUE)); - /* - * We need to remember where the application really wanted - * to join. This will be used later if we want to failback - * to the original interface. 
- */ ilm = ilm_add_v6(ill->ill_ipif, v6group, ilgstat, ilg_fmode, - ilg_flist, orig_ifindex, zoneid); + ilg_flist, zoneid); if (ilm == NULL) return (ENOMEM); @@ -715,11 +653,7 @@ ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, */ if (ilm_numentries_v6(ill, v6group) > 1) return (0); - if (ill->ill_group == NULL) - ret = ill_join_allmulti(ill); - else - ret = ill_nominate_mcast_rcv(ill->ill_group); - + ret = ill_join_allmulti(ill); if (ret != 0) ilm_delete(ilm); return (ret); @@ -756,6 +690,14 @@ ip_ll_send_enabmulti_req(ill_t *ill, const in6_addr_t *v6groupp) ASSERT(IAM_WRITER_ILL(ill)); /* + * If we're on the IPMP ill, use the nominated multicast interface to + * send and receive DLPI messages, if one exists. (If none exists, + * there are no usable interfaces and thus nothing to do.) + */ + if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + return (0); + + /* * Create a AR_ENTRY_SQUERY message with a dl_enabmulti_req tacked * on. */ @@ -842,9 +784,8 @@ ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *v6groupp) } /* - * INADDR_ANY means all multicast addresses. This is only used - * by the multicast router. - * INADDR_ANY is stored as the IPv6 unspecifed addr. + * INADDR_ANY means all multicast addresses. + * INADDR_ANY is stored as the IPv6 unspecified addr. */ int ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) @@ -859,7 +800,7 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) return (EINVAL); /* - * INADDR_ANY is represented as the IPv6 unspecifed addr. + * INADDR_ANY is represented as the IPv6 unspecified addr. */ if (group == INADDR_ANY) v6group = ipv6_all_zeros; @@ -870,9 +811,7 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) * Look for a match on the ipif. * (IP_DROP_MEMBERSHIP specifies an ipif using an IP address). 
*/ - mutex_enter(&ill->ill_lock); ilm = ilm_lookup_ipif(ipif, group); - mutex_exit(&ill->ill_lock); if (ilm == NULL) return (ENOENT); @@ -897,11 +836,9 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) return (0); /* If we never joined, then don't leave. */ - if (ill->ill_join_allmulti) { + if (ill->ill_join_allmulti) ill_leave_allmulti(ill); - if (ill->ill_group != NULL) - (void) ill_nominate_mcast_rcv(ill->ill_group); - } + return (0); } @@ -921,11 +858,10 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) /* * The unspecified address means all multicast addresses. - * This is only used by the multicast router. */ int -ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, - zoneid_t zoneid, boolean_t no_ilg, boolean_t leaving) +ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, + boolean_t no_ilg, boolean_t leaving) { ipif_t *ipif; ilm_t *ilm; @@ -938,25 +874,8 @@ ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, /* * Look for a match on the ill. - * (IPV6_LEAVE_GROUP specifies an ill using an ifindex). - * - * Similar to ip_addmulti_v6, we should always look using - * the orig_ifindex. - * - * 1) If orig_ifindex is different from ill's ifindex - * we should have an ilm with orig_ifindex created in - * ip_addmulti_v6. We should delete that here. - * - * 2) If orig_ifindex is same as ill's ifindex, we should - * not delete the ilm that is temporarily here because of - * a FAILOVER. Those ilms will have a ilm_orig_ifindex - * different from ill's ifindex. - * - * Thus, always lookup using orig_ifindex. 
*/ - mutex_enter(&ill->ill_lock); - ilm = ilm_lookup_ill_index_v6(ill, v6group, orig_ifindex, zoneid); - mutex_exit(&ill->ill_lock); + ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid); if (ilm == NULL) return (ENOENT); @@ -985,11 +904,9 @@ ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, return (0); /* If we never joined, then don't leave. */ - if (ill->ill_join_allmulti) { + if (ill->ill_join_allmulti) ill_leave_allmulti(ill); - if (ill->ill_group != NULL) - (void) ill_nominate_mcast_rcv(ill->ill_group); - } + return (0); } @@ -1020,6 +937,13 @@ ip_ll_send_disabmulti_req(ill_t *ill, const in6_addr_t *v6groupp) uint32_t addrlen, addroff; ASSERT(IAM_WRITER_ILL(ill)); + + /* + * See comment in ip_ll_send_enabmulti_req(). + */ + if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + return (0); + /* * Create a AR_ENTRY_SQUERY message with a dl_disabmulti_req tacked * on. @@ -1099,16 +1023,16 @@ ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *v6group) } /* - * Make the driver pass up all multicast packets - * - * With ill groups, the caller makes sure that there is only - * one ill joining the allmulti group. + * Make the driver pass up all multicast packets. NOTE: to keep callers + * IPMP-unaware, if an IPMP ill is passed in, the ill_join_allmulti flag is + * set on it (rather than the cast ill). */ int ill_join_allmulti(ill_t *ill) { mblk_t *promiscon_mp, *promiscoff_mp; uint32_t addrlen, addroff; + ill_t *join_ill = ill; ASSERT(IAM_WRITER_ILL(ill)); @@ -1120,7 +1044,13 @@ ill_join_allmulti(ill_t *ill) return (0); } - ASSERT(!ill->ill_join_allmulti); + /* + * See comment in ip_ll_send_enabmulti_req(). 
+ */ + if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + return (0); + + ASSERT(!join_ill->ill_join_allmulti); /* * Create a DL_PROMISCON_REQ message and send it directly to the DLPI @@ -1144,20 +1074,18 @@ ill_join_allmulti(ill_t *ill) ill_dlpi_send(ill, promiscon_mp); } - ill->ill_join_allmulti = B_TRUE; + join_ill->ill_join_allmulti = B_TRUE; return (0); } /* * Make the driver stop passing up all multicast packets - * - * With ill groups, we need to nominate some other ill as - * this ipif->ipif_ill is leaving the group. */ void ill_leave_allmulti(ill_t *ill) { - mblk_t *promiscoff_mp = ill->ill_promiscoff_mp; + mblk_t *promiscoff_mp; + ill_t *leave_ill = ill; ASSERT(IAM_WRITER_ILL(ill)); @@ -1169,7 +1097,13 @@ ill_leave_allmulti(ill_t *ill) return; } - ASSERT(ill->ill_join_allmulti); + /* + * See comment in ip_ll_send_enabmulti_req(). + */ + if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + return; + + ASSERT(leave_ill->ill_join_allmulti); /* * Create a DL_PROMISCOFF_REQ message and send it directly to @@ -1179,12 +1113,13 @@ ill_leave_allmulti(ill_t *ill) */ if ((ill->ill_net_type == IRE_IF_RESOLVER) && !(ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST)) { + promiscoff_mp = ill->ill_promiscoff_mp; ASSERT(promiscoff_mp != NULL); ill->ill_promiscoff_mp = NULL; ill_dlpi_send(ill, promiscoff_mp); } - ill->ill_join_allmulti = B_FALSE; + leave_ill->ill_join_allmulti = B_FALSE; } static ill_t * @@ -1213,22 +1148,35 @@ int ip_join_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) { ill_t *ill; - int ret; + int ret = 0; if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL) return (ENODEV); + + /* + * The ip_addmulti*() functions won't allow IPMP underlying interfaces + * to join allmulti since only the nominated underlying interface in + * the group should receive multicast. 
We silently succeed to avoid + * having to teach IPobs (currently the only caller of this routine) + * to ignore failures in this case. + */ + if (IS_UNDER_IPMP(ill)) + goto out; + if (isv6) { - ret = ip_addmulti_v6(&ipv6_all_zeros, ill, ifindex, - ill->ill_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); + ret = ip_addmulti_v6(&ipv6_all_zeros, ill, ill->ill_zoneid, + ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); } else { ret = ip_addmulti(INADDR_ANY, ill->ill_ipif, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); } ill->ill_ipallmulti_cnt++; +out: ipsq_exit(ill->ill_phyint->phyint_ipsq); return (ret); } + int ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) { @@ -1236,14 +1184,17 @@ ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL) return (ENODEV); - ASSERT(ill->ill_ipallmulti_cnt != 0); - if (isv6) { - (void) ip_delmulti_v6(&ipv6_all_zeros, ill, ifindex, - ill->ill_zoneid, B_TRUE, B_TRUE); - } else { - (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, B_TRUE); + + if (ill->ill_ipallmulti_cnt > 0) { + if (isv6) { + (void) ip_delmulti_v6(&ipv6_all_zeros, ill, + ill->ill_zoneid, B_TRUE, B_TRUE); + } else { + (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, + B_TRUE); + } + ill->ill_ipallmulti_cnt--; } - ill->ill_ipallmulti_cnt--; ipsq_exit(ill->ill_phyint->phyint_ipsq); return (0); } @@ -1260,8 +1211,7 @@ ip_purge_allmulti(ill_t *ill) for (; ill->ill_ipallmulti_cnt > 0; ill->ill_ipallmulti_cnt--) { if (ill->ill_isv6) { (void) ip_delmulti_v6(&ipv6_all_zeros, ill, - ill->ill_phyint->phyint_ifindex, ill->ill_zoneid, - B_TRUE, B_TRUE); + ill->ill_zoneid, B_TRUE, B_TRUE); } else { (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, B_TRUE); @@ -1539,13 +1489,14 @@ void ill_recover_multicast(ill_t *ill) { ilm_t *ilm; + ipif_t *ipif = ill->ill_ipif; char addrbuf[INET6_ADDRSTRLEN]; ASSERT(IAM_WRITER_ILL(ill)); ill->ill_need_recover_multicast = 0; - ILM_WALKER_HOLD(ill); + 
ill_ilm_walker_hold(ill); for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { /* * Check how many ipif's that have members in this group - @@ -1553,47 +1504,45 @@ ill_recover_multicast(ill_t *ill) * in the list. */ if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 && - ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm) + ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE, + ALL_ZONES) != ilm) { continue; - ip1dbg(("ill_recover_multicast: %s\n", - inet_ntop(AF_INET6, &ilm->ilm_v6addr, addrbuf, - sizeof (addrbuf)))); + } + + ip1dbg(("ill_recover_multicast: %s\n", inet_ntop(AF_INET6, + &ilm->ilm_v6addr, addrbuf, sizeof (addrbuf)))); + if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - if (ill->ill_group == NULL) { - (void) ill_join_allmulti(ill); - } else { - /* - * We don't want to join on this ill, - * if somebody else in the group has - * already been nominated. - */ - (void) ill_nominate_mcast_rcv(ill->ill_group); - } + (void) ill_join_allmulti(ill); } else { - (void) ip_ll_addmulti_v6(ill->ill_ipif, - &ilm->ilm_v6addr); + if (ill->ill_isv6) + mld_joingroup(ilm); + else + igmp_joingroup(ilm); + + (void) ip_ll_addmulti_v6(ipif, &ilm->ilm_v6addr); } } - ILM_WALKER_RELE(ill); + ill_ilm_walker_rele(ill); + } /* * The opposite of ill_recover_multicast() -- leaves all multicast groups - * that were explicitly joined. Note that both these functions could be - * disposed of if we enhanced ARP to allow us to handle DL_DISABMULTI_REQ - * and DL_ENABMULTI_REQ messages when an interface is down. + * that were explicitly joined. */ void ill_leave_multicast(ill_t *ill) { ilm_t *ilm; + ipif_t *ipif = ill->ill_ipif; char addrbuf[INET6_ADDRSTRLEN]; ASSERT(IAM_WRITER_ILL(ill)); ill->ill_need_recover_multicast = 1; - ILM_WALKER_HOLD(ill); + ill_ilm_walker_hold(ill); for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { /* * Check how many ipif's that have members in this group - @@ -1601,25 +1550,26 @@ ill_leave_multicast(ill_t *ill) * in the list. 
*/ if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 && - ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm) + ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE, + ALL_ZONES) != ilm) { continue; - ip1dbg(("ill_leave_multicast: %s\n", - inet_ntop(AF_INET6, &ilm->ilm_v6addr, addrbuf, - sizeof (addrbuf)))); + } + + ip1dbg(("ill_leave_multicast: %s\n", inet_ntop(AF_INET6, + &ilm->ilm_v6addr, addrbuf, sizeof (addrbuf)))); + if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { ill_leave_allmulti(ill); - /* - * If we were part of an IPMP group, then - * ill_handoff_responsibility() has already - * nominated a new member (so we don't). - */ - ASSERT(ill->ill_group == NULL); } else { - (void) ip_ll_delmulti_v6(ill->ill_ipif, - &ilm->ilm_v6addr); + if (ill->ill_isv6) + mld_leavegroup(ilm); + else + igmp_leavegroup(ilm); + + (void) ip_ll_delmulti_v6(ipif, &ilm->ilm_v6addr); } } - ILM_WALKER_RELE(ill); + ill_ilm_walker_rele(ill); } /* Find an ilm for matching the ill */ @@ -1628,91 +1578,79 @@ ilm_lookup_ill(ill_t *ill, ipaddr_t group, zoneid_t zoneid) { in6_addr_t v6group; - ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock)); /* - * INADDR_ANY is represented as the IPv6 unspecifed addr. + * INADDR_ANY is represented as the IPv6 unspecified addr. */ if (group == INADDR_ANY) v6group = ipv6_all_zeros; else IN6_IPADDR_TO_V4MAPPED(group, &v6group); - return (ilm_lookup_ill_v6(ill, &v6group, zoneid)); + return (ilm_lookup_ill_v6(ill, &v6group, B_TRUE, zoneid)); } /* - * Find an ilm for matching the ill. All the ilm lookup functions - * ignore ILM_DELETED ilms. These have been logically deleted, and - * igmp and linklayer disable multicast have been done. Only mi_free - * yet to be done. Still there in the list due to ilm_walkers. The - * last walker will release it. + * Find an ilm for address `v6group' on `ill' and zone `zoneid' (which may be + * ALL_ZONES). In general, if `ill' is in an IPMP group, we will match + * against any ill in the group. 
However, if `restrict_solicited' is set, + * then specifically for IPv6 solicited-node multicast, the match will be + * restricted to the specified `ill'. */ ilm_t * -ilm_lookup_ill_v6(ill_t *ill, const in6_addr_t *v6group, zoneid_t zoneid) +ilm_lookup_ill_v6(ill_t *ill, const in6_addr_t *v6group, + boolean_t restrict_solicited, zoneid_t zoneid) { ilm_t *ilm; + ilm_walker_t ilw; + boolean_t restrict_ill = B_FALSE; - ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock)); + /* + * In general, underlying interfaces cannot have multicast memberships + * and thus lookups always match across the illgrp. However, we must + * allow IPv6 solicited-node multicast memberships on underlying + * interfaces, and thus an IPMP meta-interface and one of its + * underlying ills may have the same solicited-node multicast address. + * In that case, we need to restrict the lookup to the requested ill. + * However, we may receive packets on an underlying interface that + * are for the corresponding IPMP interface's solicited-node multicast + * address, and thus in that case we need to match across the group -- + * hence the unfortunate `restrict_solicited' argument. 
+ */ + if (IN6_IS_ADDR_MC_SOLICITEDNODE(v6group) && restrict_solicited) + restrict_ill = (IS_IPMP(ill) || IS_UNDER_IPMP(ill)); - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group)) continue; - if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) && - (zoneid == ALL_ZONES || zoneid == ilm->ilm_zoneid)) - return (ilm); - } - return (NULL); -} - -ilm_t * -ilm_lookup_ill_index_v6(ill_t *ill, const in6_addr_t *v6group, int index, - zoneid_t zoneid) -{ - ilm_t *ilm; - - ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock)); - - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) + if (zoneid != ALL_ZONES && zoneid != ilm->ilm_zoneid) continue; - if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) && - (zoneid == ALL_ZONES || zoneid == ilm->ilm_zoneid) && - ilm->ilm_orig_ifindex == index) { - return (ilm); + if (!restrict_ill || ill == (ill->ill_isv6 ? + ilm->ilm_ill : ilm->ilm_ipif->ipif_ill)) { + break; } } - return (NULL); + ilm_walker_finish(&ilw); + return (ilm); } - /* - * Found an ilm for the ipif. Only needed for IPv4 which does + * Find an ilm for the ipif. Only needed for IPv4 which does * ipif specific socket options. */ ilm_t * ilm_lookup_ipif(ipif_t *ipif, ipaddr_t group) { - ill_t *ill = ipif->ipif_ill; - ilm_t *ilm; - in6_addr_t v6group; - - ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock)); - /* - * INADDR_ANY is represented as the IPv6 unspecifed addr. 
- */ - if (group == INADDR_ANY) - v6group = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(group, &v6group); + ilm_t *ilm; + ilm_walker_t ilw; - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) - continue; - if (ilm->ilm_ipif == ipif && - IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, &v6group)) - return (ilm); + ilm = ilm_walker_start(&ilw, ipif->ipif_ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (ilm->ilm_ipif == ipif && ilm->ilm_addr == group) + break; } - return (NULL); + ilm_walker_finish(&ilw); + return (ilm); } /* @@ -1739,8 +1677,7 @@ ilm_numentries_v6(ill_t *ill, const in6_addr_t *v6group) /* Caller guarantees that the group is not already on the list */ static ilm_t * ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat, - mcast_record_t ilg_fmode, slist_t *ilg_flist, int orig_ifindex, - zoneid_t zoneid) + mcast_record_t ilg_fmode, slist_t *ilg_flist, zoneid_t zoneid) { ill_t *ill = ipif->ipif_ill; ilm_t *ilm; @@ -1783,19 +1720,10 @@ ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat, (char *), "ilm", (void *), ilm); ipif->ipif_ilm_cnt++; } + ASSERT(ill->ill_ipst); ilm->ilm_ipst = ill->ill_ipst; /* No netstack_hold */ - /* - * After this if ilm moves to a new ill, we don't change - * the ilm_orig_ifindex. Thus, if ill_index != ilm_orig_ifindex, - * it has been moved. Indexes don't match even when the application - * wants to join on a FAILED/INACTIVE interface because we choose - * a new interface to join in. This is considered as an implicit - * move. 
- */ - ilm->ilm_orig_ifindex = orig_ifindex; - ASSERT(!(ipif->ipif_state_flags & IPIF_CONDEMNED)); ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED)); @@ -1969,6 +1897,108 @@ ilm_delete(ilm_t *ilm) } } +/* Increment the ILM walker count for `ill' */ +static void +ill_ilm_walker_hold(ill_t *ill) +{ + mutex_enter(&ill->ill_lock); + ill->ill_ilm_walker_cnt++; + mutex_exit(&ill->ill_lock); +} + +/* Decrement the ILM walker count for `ill' */ +static void +ill_ilm_walker_rele(ill_t *ill) +{ + mutex_enter(&ill->ill_lock); + ill->ill_ilm_walker_cnt--; + if (ill->ill_ilm_walker_cnt == 0 && ill->ill_ilm_cleanup_reqd) + ilm_walker_cleanup(ill); /* drops ill_lock */ + else + mutex_exit(&ill->ill_lock); +} + +/* + * Start walking the ILMs associated with `ill'; the first ILM in the walk + * (if any) is returned. State associated with the walk is stored in `ilw'. + * Note that walks associated with interfaces under IPMP also walk the ILMs + * on the associated IPMP interface; this is handled transparently to callers + * via ilm_walker_step(). (Usually with IPMP all ILMs will be on the IPMP + * interface; the only exception is to support IPv6 test addresses, which + * require ILMs for their associated solicited-node multicast addresses.) + */ +ilm_t * +ilm_walker_start(ilm_walker_t *ilw, ill_t *ill) +{ + ilw->ilw_ill = ill; + if (IS_UNDER_IPMP(ill)) + ilw->ilw_ipmp_ill = ipmp_ill_hold_ipmp_ill(ill); + else + ilw->ilw_ipmp_ill = NULL; + + ill_ilm_walker_hold(ill); + if (ilw->ilw_ipmp_ill != NULL) + ill_ilm_walker_hold(ilw->ilw_ipmp_ill); + + if (ilw->ilw_ipmp_ill != NULL && ilw->ilw_ipmp_ill->ill_ilm != NULL) + ilw->ilw_walk_ill = ilw->ilw_ipmp_ill; + else + ilw->ilw_walk_ill = ilw->ilw_ill; + + return (ilm_walker_step(ilw, NULL)); +} + +/* + * Helper function for ilm_walker_step() that returns the next ILM + * associated with `ilw', regardless of whether it's deleted. 
+ */ +static ilm_t * +ilm_walker_step_all(ilm_walker_t *ilw, ilm_t *ilm) +{ + if (ilm == NULL) + return (ilw->ilw_walk_ill->ill_ilm); + + if (ilm->ilm_next != NULL) + return (ilm->ilm_next); + + if (ilw->ilw_ipmp_ill != NULL && IS_IPMP(ilw->ilw_walk_ill)) { + ilw->ilw_walk_ill = ilw->ilw_ill; + /* + * It's possible that ilw_ill left the group during our walk, + * so we can't ASSERT() that it's under IPMP. Callers that + * care will be writer on the IPSQ anyway. + */ + return (ilw->ilw_walk_ill->ill_ilm); + } + return (NULL); +} + +/* + * Step to the next ILM associated with `ilw'. + */ +ilm_t * +ilm_walker_step(ilm_walker_t *ilw, ilm_t *ilm) +{ + while ((ilm = ilm_walker_step_all(ilw, ilm)) != NULL) { + if (!(ilm->ilm_flags & ILM_DELETED)) + break; + } + return (ilm); +} + +/* + * Finish the ILM walk associated with `ilw'. + */ +void +ilm_walker_finish(ilm_walker_t *ilw) +{ + ill_ilm_walker_rele(ilw->ilw_ill); + if (ilw->ilw_ipmp_ill != NULL) { + ill_ilm_walker_rele(ilw->ilw_ipmp_ill); + ill_refrele(ilw->ilw_ipmp_ill); + } + bzero(&ilw, sizeof (ilw)); +} /* * Looks up the appropriate ipif given a v4 multicast group and interface @@ -2256,16 +2286,15 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, * didn't find an ilg, there's nothing to do. */ if (!leave_grp) - ilg = conn_ilg_alloc(connp); + ilg = conn_ilg_alloc(connp, &err); if (leave_grp || ilg == NULL) { mutex_exit(&connp->conn_lock); - return (leave_grp ? 0 : ENOMEM); + return (leave_grp ? 
0 : err); } ilgstat = ILGSTAT_NEW; IN6_IPADDR_TO_V4MAPPED(grp, &ilg->ilg_v6group); ilg->ilg_ipif = ipif; ilg->ilg_ill = NULL; - ilg->ilg_orig_ifindex = 0; } else if (leave_grp) { ilg_delete(connp, ilg, NULL); mutex_exit(&connp->conn_lock); @@ -2389,7 +2418,7 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, const struct in6_addr *grp, ill_t *ill) { ilg_t *ilg; - int i, orig_ifindex, orig_fmode, new_fmode, err; + int i, orig_fmode, new_fmode, err; slist_t *orig_filter = NULL; slist_t *new_filter = NULL; struct sockaddr_storage *sl; @@ -2409,65 +2438,31 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, ASSERT(IAM_WRITER_ILL(ill)); - /* - * Use the ifindex to do the lookup. We can't use the ill - * directly because ilg_ill could point to a different ill - * if things have moved. - */ - orig_ifindex = ill->ill_phyint->phyint_ifindex; - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_index_v6(connp, grp, orig_ifindex); + ilg = ilg_lookup_ill_v6(connp, grp, ill); if (ilg == NULL) { /* * if the request was actually to leave, and we * didn't find an ilg, there's nothing to do. */ if (!leave_grp) - ilg = conn_ilg_alloc(connp); + ilg = conn_ilg_alloc(connp, &err); if (leave_grp || ilg == NULL) { mutex_exit(&connp->conn_lock); - return (leave_grp ? 0 : ENOMEM); + return (leave_grp ? 0 : err); } ilgstat = ILGSTAT_NEW; ilg->ilg_v6group = *grp; ilg->ilg_ipif = NULL; - /* - * Choose our target ill to join on. This might be - * different from the ill we've been given if it's - * currently down and part of a group. - * - * new ill is not refheld; we are writer. - */ - ill = ip_choose_multi_ill(ill, grp); - ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED)); ilg->ilg_ill = ill; - /* - * Remember the index that we joined on, so that we can - * successfully delete them later on and also search for - * duplicates if the application wants to join again. 
- */ - ilg->ilg_orig_ifindex = orig_ifindex; } else if (leave_grp) { - /* - * Use the ilg's current ill for the deletion, - * we might have failed over. - */ - ill = ilg->ilg_ill; ilg_delete(connp, ilg, NULL); mutex_exit(&connp->conn_lock); - (void) ip_delmulti_v6(grp, ill, orig_ifindex, - connp->conn_zoneid, B_FALSE, B_TRUE); + (void) ip_delmulti_v6(grp, ill, connp->conn_zoneid, B_FALSE, + B_TRUE); return (0); } else { ilgstat = ILGSTAT_CHANGE; - /* - * The current ill might be different from the one we were - * asked to join on (if failover has occurred); we should - * join on the ill stored in the ilg. The original ill - * is noted in ilg_orig_ifindex, which matched our request. - */ - ill = ilg->ilg_ill; /* preserve existing state in case ip_addmulti() fails */ orig_fmode = ilg->ilg_fmode; if (ilg->ilg_filter == NULL) { @@ -2531,8 +2526,8 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, mutex_exit(&connp->conn_lock); - err = ip_addmulti_v6(grp, ill, orig_ifindex, connp->conn_zoneid, - ilgstat, new_fmode, new_filter); + err = ip_addmulti_v6(grp, ill, connp->conn_zoneid, ilgstat, new_fmode, + new_filter); if (err != 0) { /* * Restore the original filter state, or delete the @@ -2541,7 +2536,7 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, * conn_lock. */ mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_index_v6(connp, grp, orig_ifindex); + ilg = ilg_lookup_ill_v6(connp, grp, ill); ASSERT(ilg != NULL); if (ilgstat == ILGSTAT_NEW) { ilg_delete(connp, ilg, NULL); @@ -3043,20 +3038,12 @@ ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src) { ilg_t *ilg; - ill_t *ilg_ill; - uint_t ilg_orig_ifindex; boolean_t leaving = B_TRUE; ASSERT(IAM_WRITER_ILL(ill)); - /* - * Use the index that we originally used to join. We can't - * use the ill directly because ilg_ill could point to - * a new ill if things have moved. 
- */ mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_index_v6(connp, v6group, - ill->ill_phyint->phyint_ifindex); + ilg = ilg_lookup_ill_v6(connp, v6group, ill); if ((ilg == NULL) || (ilg->ilg_flags & ILG_DELETED)) { mutex_exit(&connp->conn_lock); return (EADDRNOTAVAIL); @@ -3087,12 +3074,10 @@ ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group, leaving = B_FALSE; } - ilg_ill = ilg->ilg_ill; - ilg_orig_ifindex = ilg->ilg_orig_ifindex; ilg_delete(connp, ilg, v6src); mutex_exit(&connp->conn_lock); - (void) ip_delmulti_v6(v6group, ilg_ill, ilg_orig_ifindex, - connp->conn_zoneid, B_FALSE, leaving); + (void) ip_delmulti_v6(v6group, ill, connp->conn_zoneid, B_FALSE, + leaving); return (0); } @@ -3345,10 +3330,10 @@ ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, mcast_record_t fmode, if (ilg == NULL) { ilgstat = ILGSTAT_NEW; - if ((ilg = conn_ilg_alloc(connp)) == NULL) { + if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) { mutex_exit(&connp->conn_lock); l_free(new_filter); - return (ENOMEM); + return (error); } if (src != INADDR_ANY) { ilg->ilg_filter = l_alloc(); @@ -3369,7 +3354,6 @@ ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, mcast_record_t fmode, } ilg->ilg_ipif = ipif; ilg->ilg_ill = NULL; - ilg->ilg_orig_ifindex = 0; ilg->ilg_fmode = fmode; } else { int index; @@ -3437,7 +3421,6 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src) { int error = 0; - int orig_ifindex; ilg_t *ilg; ilg_stat_t ilgstat; slist_t *new_filter = NULL; @@ -3456,13 +3439,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, */ mutex_enter(&connp->conn_lock); - /* - * Use the ifindex to do the lookup. We can't use the ill - * directly because ilg_ill could point to a different ill if - * things have moved. 
- */ - orig_ifindex = ill->ill_phyint->phyint_ifindex; - ilg = ilg_lookup_ill_index_v6(connp, v6group, orig_ifindex); + ilg = ilg_lookup_ill_v6(connp, v6group, ill); /* * Depending on the option we're handling, may or may not be okay @@ -3501,10 +3478,10 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, } if (ilg == NULL) { - if ((ilg = conn_ilg_alloc(connp)) == NULL) { + if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) { mutex_exit(&connp->conn_lock); l_free(new_filter); - return (ENOMEM); + return (error); } if (!IN6_IS_ADDR_UNSPECIFIED(v6src)) { ilg->ilg_filter = l_alloc(); @@ -3521,22 +3498,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, ilg->ilg_v6group = *v6group; ilg->ilg_fmode = fmode; ilg->ilg_ipif = NULL; - /* - * Choose our target ill to join on. This might be different - * from the ill we've been given if it's currently down and - * part of a group. - * - * new ill is not refheld; we are writer. - */ - ill = ip_choose_multi_ill(ill, v6group); - ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED)); ilg->ilg_ill = ill; - /* - * Remember the orig_ifindex that we joined on, so that we - * can successfully delete them later on and also search - * for duplicates if the application wants to join again. - */ - ilg->ilg_orig_ifindex = orig_ifindex; } else { int index; if (ilg->ilg_fmode != fmode || IN6_IS_ADDR_UNSPECIFIED(v6src)) { @@ -3560,13 +3522,6 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, ilgstat = ILGSTAT_CHANGE; index = ilg->ilg_filter->sl_numsrc++; ilg->ilg_filter->sl_addr[index] = *v6src; - /* - * The current ill might be different from the one we were - * asked to join on (if failover has occurred); we should - * join on the ill stored in the ilg. The original ill - * is noted in ilg_orig_ifindex, which matched our request. 
- */ - ill = ilg->ilg_ill; } /* @@ -3584,8 +3539,8 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, * info for the ill, which involves looking at the status of * all the ilgs associated with this group/interface pair. */ - error = ip_addmulti_v6(v6group, ill, orig_ifindex, connp->conn_zoneid, - ilgstat, new_fmode, new_filter); + error = ip_addmulti_v6(v6group, ill, connp->conn_zoneid, ilgstat, + new_fmode, new_filter); if (error != 0) { /* * But because we waited, we have to undo the ilg update @@ -3595,7 +3550,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, in6_addr_t delsrc = (ilgstat == ILGSTAT_NEW) ? ipv6_all_zeros : *v6src; mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_index_v6(connp, v6group, orig_ifindex); + ilg = ilg_lookup_ill_v6(connp, v6group, ill); ASSERT(ilg != NULL); ilg_delete(connp, ilg, &delsrc); mutex_exit(&connp->conn_lock); @@ -3639,7 +3594,7 @@ ilg_lookup_ill_withsrc(conn_t *connp, ipaddr_t group, ipaddr_t src, ill_t *ill) ASSERT(ilg->ilg_ill == NULL); ilg_ill = ipif->ipif_ill; ASSERT(!ilg_ill->ill_isv6); - if (ilg_ill == ill && + if (IS_ON_SAME_LAN(ilg_ill, ill) && IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, &v6group)) { if (SLIST_IS_EMPTY(ilg->ilg_filter)) { /* no source filter, so this is a match */ @@ -3692,7 +3647,7 @@ ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group, continue; ASSERT(ilg->ilg_ipif == NULL); ASSERT(ilg_ill->ill_isv6); - if (ilg_ill == ill && + if (IS_ON_SAME_LAN(ilg_ill, ill) && IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) { if (SLIST_IS_EMPTY(ilg->ilg_filter)) { /* no source filter, so this is a match */ @@ -3724,35 +3679,6 @@ ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group, } /* - * Get the ilg whose ilg_orig_ifindex is associated with ifindex. - * This is useful when the interface fails and we have moved - * to a new ill, but still would like to locate using the index - * that we originally used to join. Used only for IPv6 currently. 
- */ -static ilg_t * -ilg_lookup_ill_index_v6(conn_t *connp, const in6_addr_t *v6group, int ifindex) -{ - ilg_t *ilg; - int i; - - ASSERT(MUTEX_HELD(&connp->conn_lock)); - for (i = 0; i < connp->conn_ilg_inuse; i++) { - ilg = &connp->conn_ilg[i]; - if (ilg->ilg_ill == NULL || - (ilg->ilg_flags & ILG_DELETED) != 0) - continue; - /* ilg_ipif is NULL for V6 */ - ASSERT(ilg->ilg_ipif == NULL); - ASSERT(ilg->ilg_orig_ifindex != 0); - if (IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group) && - ilg->ilg_orig_ifindex == ifindex) { - return (ilg); - } - } - return (NULL); -} - -/* * Find an IPv6 ilg matching group and ill */ ilg_t * @@ -3863,32 +3789,28 @@ ilg_delete_all(conn_t *connp) in6_addr_t v6group; boolean_t success; ipsq_t *ipsq; - int orig_ifindex; mutex_enter(&connp->conn_lock); retry: ILG_WALKER_HOLD(connp); - for (i = connp->conn_ilg_inuse - 1; i >= 0; ) { + for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { ilg = &connp->conn_ilg[i]; /* * Since this walk is not atomic (we drop the * conn_lock and wait in ipsq_enter) we need * to check for the ILG_DELETED flag. */ - if (ilg->ilg_flags & ILG_DELETED) { - /* Go to the next ilg */ - i--; + if (ilg->ilg_flags & ILG_DELETED) continue; - } - v6group = ilg->ilg_v6group; - if (IN6_IS_ADDR_V4MAPPED(&v6group)) { + if (IN6_IS_ADDR_V4MAPPED(&ilg->ilg_v6group)) { ipif = ilg->ilg_ipif; ill = ipif->ipif_ill; } else { ipif = NULL; ill = ilg->ilg_ill; } + /* * We may not be able to refhold the ill if the ill/ipif * is changing. But we need to make sure that the ill will @@ -3897,11 +3819,9 @@ retry: * in which case the unplumb thread will handle the cleanup, * and we move on to the next ilg. 
*/ - if (!ill_waiter_inc(ill)) { - /* Go to the next ilg */ - i--; + if (!ill_waiter_inc(ill)) continue; - } + mutex_exit(&connp->conn_lock); /* * To prevent deadlock between ill close which waits inside @@ -3916,51 +3836,31 @@ retry: ipsq = ill->ill_phyint->phyint_ipsq; ill_waiter_dcr(ill); mutex_enter(&connp->conn_lock); - if (!success) { - /* Go to the next ilg */ - i--; + if (!success) continue; - } /* - * Make sure that nothing has changed under. For eg. - * a failover/failback can change ilg_ill while we were - * waiting to become exclusive above + * Move on if the ilg was deleted while conn_lock was dropped. */ - if (IN6_IS_ADDR_V4MAPPED(&v6group)) { - ipif = ilg->ilg_ipif; - ill = ipif->ipif_ill; - } else { - ipif = NULL; - ill = ilg->ilg_ill; - } - if (!IAM_WRITER_ILL(ill) || (ilg->ilg_flags & ILG_DELETED)) { - /* - * The ilg has changed under us probably due - * to a failover or unplumb. Retry on the same ilg. - */ + if (ilg->ilg_flags & ILG_DELETED) { mutex_exit(&connp->conn_lock); ipsq_exit(ipsq); mutex_enter(&connp->conn_lock); continue; } v6group = ilg->ilg_v6group; - orig_ifindex = ilg->ilg_orig_ifindex; ilg_delete(connp, ilg, NULL); mutex_exit(&connp->conn_lock); - if (ipif != NULL) + if (ipif != NULL) { (void) ip_delmulti(V4_PART_OF_V6(v6group), ipif, B_FALSE, B_TRUE); - - else - (void) ip_delmulti_v6(&v6group, ill, orig_ifindex, + } else { + (void) ip_delmulti_v6(&v6group, ill, connp->conn_zoneid, B_FALSE, B_TRUE); - + } ipsq_exit(ipsq); mutex_enter(&connp->conn_lock); - /* Go to the next ilg */ - i--; } ILG_WALKER_RELE(connp); @@ -4063,7 +3963,6 @@ conn_delete_ill(conn_t *connp, caddr_t arg) int i; char group_buf[INET6_ADDRSTRLEN]; in6_addr_t v6group; - int orig_ifindex; ilg_t *ilg; /* @@ -4097,11 +3996,10 @@ conn_delete_ill(conn_t *connp, caddr_t arg) ill->ill_name)); v6group = ilg->ilg_v6group; - orig_ifindex = ilg->ilg_orig_ifindex; ilg_delete(connp, ilg, NULL); mutex_exit(&connp->conn_lock); - (void) ip_delmulti_v6(&v6group, ill, 
orig_ifindex, + (void) ip_delmulti_v6(&v6group, ill, connp->conn_zoneid, B_FALSE, B_TRUE); mutex_enter(&connp->conn_lock); } @@ -4115,7 +4013,6 @@ conn_delete_ill(conn_t *connp, caddr_t arg) if (connp->conn_multicast_ill == ill) { /* Revert to late binding */ connp->conn_multicast_ill = NULL; - connp->conn_orig_multicast_ifindex = 0; } mutex_exit(&connp->conn_lock); } diff --git a/usr/src/uts/common/inet/ip/ip_ndp.c b/usr/src/uts/common/inet/ip/ip_ndp.c index b53897cefe..895cc74bd2 100644 --- a/usr/src/uts/common/inet/ip/ip_ndp.c +++ b/usr/src/uts/common/inet/ip/ip_ndp.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -83,8 +83,9 @@ static boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr, static void nce_ire_delete(nce_t *nce); static void nce_ire_delete1(ire_t *ire, char *nce_arg); static void nce_set_ll(nce_t *nce, uchar_t *ll_addr); -static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *); -static nce_t *nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr); +static nce_t *nce_lookup_addr(ill_t *, boolean_t, const in6_addr_t *, + nce_t *); +static nce_t *nce_lookup_mapping(ill_t *, const in6_addr_t *); static void nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr); static int nce_set_multicast(ill_t *ill, const in6_addr_t *addr); @@ -93,11 +94,16 @@ static mblk_t *nce_udreq_alloc(ill_t *ill); static void nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr); static uint32_t nce_solicit(nce_t *nce, mblk_t *mp); -static boolean_t nce_xmit(ill_t *ill, uint32_t operation, - ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender, +static boolean_t nce_xmit(ill_t *ill, uint8_t type, + boolean_t use_lla_addr, const in6_addr_t *sender, const in6_addr_t *target, int flag); +static boolean_t nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, + 
const in6_addr_t *target, uint_t flags); +static boolean_t nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, + const in6_addr_t *src, uint_t flags); static int ndp_add_v4(ill_t *, const in_addr_t *, uint16_t, nce_t **, nce_t *); +static ipif_t *ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill); #ifdef DEBUG static void nce_trace_cleanup(const nce_t *); @@ -110,22 +116,6 @@ static void nce_trace_cleanup(const nce_t *); (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \ NCE_TABLE_SIZE)])) -/* - * Compute default flags to use for an advertisement of this nce's address. - */ -static int -nce_advert_flags(const nce_t *nce) -{ - int flag = 0; - - if (nce->nce_flags & NCE_F_ISROUTER) - flag |= NDP_ISROUTER; - if (!(nce->nce_flags & NCE_F_ANYCAST)) - flag |= NDP_ORIDE; - - return (flag); -} - /* Non-tunable probe interval, based on link capabilities */ #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500) @@ -262,8 +252,7 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, mutex_exit(&ipst->ips_ndp6->ndp_g_lock); nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; mutex_exit(&nce->nce_lock); - dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE, - &ipv6_all_zeros, addr, NDP_PROBE); + dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE); if (dropped) { mutex_enter(&nce->nce_lock); nce->nce_pcnt++; @@ -282,23 +271,20 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, mutex_exit(&ipst->ips_ndp6->ndp_g_lock); nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1; mutex_exit(&nce->nce_lock); - dropped = nce_xmit(ill, - ND_NEIGHBOR_ADVERT, - ill, /* ill to be used for extracting ill_nd_lla */ - B_TRUE, /* use ill_nd_lla */ - addr, /* Source and target of the advertisement pkt */ - &ipv6_all_hosts_mcast, /* Destination of the packet */ - nce_advert_flags(nce)); + dropped = nce_xmit_advert(nce, B_TRUE, &ipv6_all_hosts_mcast, + 0); mutex_enter(&nce->nce_lock); if (dropped) nce->nce_unsolicit_count++; if 
(nce->nce_unsolicit_count != 0) { + ASSERT(nce->nce_timeout_id == 0); nce->nce_timeout_id = timeout(ndp_timer, nce, MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval)); } mutex_exit(&nce->nce_lock); mutex_enter(&ipst->ips_ndp6->ndp_g_lock); } + /* * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then * we call nce_fastpath as soon as the nce is resolved in ndp_process. @@ -311,10 +297,10 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, } int -ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, - const in6_addr_t *mask, const in6_addr_t *extract_mask, - uint32_t hw_extract_start, uint16_t flags, uint16_t state, - nce_t **newnce) +ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr, + const in6_addr_t *addr, const in6_addr_t *mask, + const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags, + uint16_t state, nce_t **newnce) { int err = 0; nce_t *nce; @@ -325,7 +311,7 @@ ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, /* Get head of v6 hash table */ nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - nce = nce_lookup_addr(ill, addr, nce); + nce = nce_lookup_addr(ill, match_illgrp, addr, nce); if (nce == NULL) { err = ndp_add_v6(ill, hw_addr, @@ -562,13 +548,11 @@ nce_ire_delete_list(nce_t *nce) if (nce->nce_ipversion == IPV4_VERSION) { ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, nce_ire_delete1, - (char *)nce, nce->nce_ill); + IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill); } else { ASSERT(nce->nce_ipversion == IPV6_VERSION); ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, nce_ire_delete1, - (char *)nce, nce->nce_ill); + IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill); } NCE_REFRELE_NOTR(nce); nce = nce_next; @@ -628,8 +612,7 @@ ndp_restart_dad(nce_t *nce) nce->nce_state = ND_PROBE; nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1; mutex_exit(&nce->nce_lock); - dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL, - 
B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE); + dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE); if (dropped) { mutex_enter(&nce->nce_lock); nce->nce_pcnt++; @@ -649,22 +632,19 @@ ndp_restart_dad(nce_t *nce) * If one is found, the refcnt on the nce will be incremented. */ nce_t * -ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock) +ndp_lookup_v6(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr, + boolean_t caller_holds_lock) { nce_t *nce; - ip_stack_t *ipst; - - ASSERT(ill != NULL); - ipst = ill->ill_ipst; + ip_stack_t *ipst = ill->ill_ipst; - ASSERT(ill != NULL && ill->ill_isv6); - if (!caller_holds_lock) { + ASSERT(ill->ill_isv6); + if (!caller_holds_lock) mutex_enter(&ipst->ips_ndp6->ndp_g_lock); - } /* Get head of v6 hash table */ nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - nce = nce_lookup_addr(ill, addr, nce); + nce = nce_lookup_addr(ill, match_illgrp, addr, nce); if (nce == NULL) nce = nce_lookup_mapping(ill, addr); if (!caller_holds_lock) @@ -685,14 +665,17 @@ ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock) in6_addr_t addr6; ip_stack_t *ipst = ill->ill_ipst; - if (!caller_holds_lock) { + if (!caller_holds_lock) mutex_enter(&ipst->ips_ndp4->ndp_g_lock); - } /* Get head of v4 hash table */ nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); - nce = nce_lookup_addr(ill, &addr6, nce); + /* + * NOTE: IPv4 never matches across the illgrp since the NCE's we're + * looking up have fastpath headers that are inherently per-ill. + */ + nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce); if (!caller_holds_lock) mutex_exit(&ipst->ips_ndp4->ndp_g_lock); return (nce); @@ -706,7 +689,8 @@ ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock) * lock (ndp_g_lock). 
*/ static nce_t * -nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce) +nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr, + nce_t *nce) { ndp_g_t *ndp; ip_stack_t *ipst = ill->ill_ipst; @@ -716,12 +700,12 @@ nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce) else ndp = ipst->ips_ndp4; - ASSERT(ill != NULL); ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); if (IN6_IS_ADDR_UNSPECIFIED(addr)) return (NULL); for (; nce != NULL; nce = nce->nce_next) { - if (nce->nce_ill == ill) { + if (nce->nce_ill == ill || + match_illgrp && IS_IN_SAME_ILLGRP(ill, nce->nce_ill)) { if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) && IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) { @@ -771,8 +755,8 @@ nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr) * Process passed in parameters either from an incoming packet or via * user ioctl. */ -void -ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) +static void +nce_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) { ill_t *ill = nce->nce_ill; uint32_t hw_addr_len = ill->ill_nd_lla_len; @@ -852,7 +836,7 @@ ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) } else { /* * Send locally originated packets back - * into * ip_wput_v6. + * into ip_wput_v6. */ put(ill->ill_wq, mp); } @@ -918,6 +902,65 @@ ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) } /* + * Walker state structure used by ndp_process() / ndp_process_entry(). + */ +typedef struct ndp_process_data { + ill_t *np_ill; /* ill/illgrp to match against */ + const in6_addr_t *np_addr; /* IPv6 address to match */ + uchar_t *np_hw_addr; /* passed to nce_process() */ + uint32_t np_flag; /* passed to nce_process() */ + boolean_t np_is_adv; /* passed to nce_process() */ +} ndp_process_data_t; + +/* + * Walker callback used by ndp_process() for IPMP groups: calls nce_process() + * for each NCE with a matching address that's in the same IPMP group. 
+ */ +static void +ndp_process_entry(nce_t *nce, void *arg) +{ + ndp_process_data_t *npp = arg; + + if (IS_IN_SAME_ILLGRP(nce->nce_ill, npp->np_ill) && + IN6_ARE_ADDR_EQUAL(&nce->nce_addr, npp->np_addr) && + IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) { + nce_process(nce, npp->np_hw_addr, npp->np_flag, npp->np_is_adv); + } +} + +/* + * Wrapper around nce_process() that handles IPMP. In particular, for IPMP, + * NCEs are per-underlying-ill (because of nce_fp_mp) and thus we may have + * more than one NCE for a given IPv6 address to tend to. In that case, we + * need to walk all NCEs and callback nce_process() for each one. Since this + * is expensive, in the non-IPMP case we just directly call nce_process(). + * Ultimately, nce_fp_mp needs to be moved out of the nce_t so that all IP + * interfaces in an IPMP group share the same NCEs -- at which point this + * function can be removed entirely. + */ +void +ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) +{ + ill_t *ill = nce->nce_ill; + struct ndp_g_s *ndp = ill->ill_ipst->ips_ndp6; + ndp_process_data_t np; + + if (ill->ill_grp == NULL) { + nce_process(nce, hw_addr, flag, is_adv); + return; + } + + /* IPMP case: walk all NCEs */ + np.np_ill = ill; + np.np_addr = &nce->nce_addr; + np.np_flag = flag; + np.np_is_adv = is_adv; + np.np_hw_addr = hw_addr; + + ndp_walk_common(ndp, NULL, (pfi_t)ndp_process_entry, &np, ALL_ZONES); +} + +/* * Pass arg1 to the pfi supplied, along with each nce in existence. * ndp_walk() places a REFHOLD on the nce and drops the lock when * walking the hash list. 
@@ -926,7 +969,6 @@ void ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, boolean_t trace) { - nce_t *nce; nce_t *nce1; nce_t **ncep; @@ -1021,27 +1063,58 @@ ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst) int ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid) { - nce_t *nce; - int err = 0; + nce_t *nce, *hw_nce = NULL; + int err; + ill_t *ipmp_ill; + uint16_t nce_flags; uint32_t ms; mblk_t *mp_nce = NULL; ip_stack_t *ipst = ill->ill_ipst; + uchar_t *hwaddr = NULL; ASSERT(ill->ill_isv6); - if (IN6_IS_ADDR_MULTICAST(dst)) { - err = nce_set_multicast(ill, dst); - return (err); + + if (IN6_IS_ADDR_MULTICAST(dst)) + return (nce_set_multicast(ill, dst)); + + nce_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0; + + /* + * If `ill' is under IPMP, then first check to see if there's an NCE + * for `dst' on the IPMP meta-interface (e.g., because an application + * explicitly did an SIOCLIFSETND to tie a hardware address to `dst'). + * If so, we use that hardware address when creating the NCE below. + * Note that we don't yet have a mechanism to remove these NCEs if the + * NCE for `dst' on the IPMP meta-interface is subsequently removed -- + * but rather than build such a beast, we should fix NCEs so that they + * can be properly shared across an IPMP group. + */ + if (IS_UNDER_IPMP(ill)) { + if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { + hw_nce = ndp_lookup_v6(ipmp_ill, B_FALSE, dst, B_FALSE); + if (hw_nce != NULL && hw_nce->nce_res_mp != NULL) { + hwaddr = hw_nce->nce_res_mp->b_rptr + + NCE_LL_ADDR_OFFSET(ipmp_ill); + nce_flags |= hw_nce->nce_flags; + } + ill_refrele(ipmp_ill); + } } + err = ndp_lookup_then_add_v6(ill, - NULL, /* No hardware address */ + B_FALSE, /* NCE fastpath is per ill; don't match across group */ + hwaddr, dst, &ipv6_all_ones, &ipv6_all_zeros, 0, - (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0, - ND_INCOMPLETE, + nce_flags, + hwaddr != NULL ? 
ND_REACHABLE : ND_INCOMPLETE, &nce); + if (hw_nce != NULL) + NCE_REFRELE(hw_nce); + switch (err) { case 0: /* @@ -1057,11 +1130,10 @@ ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid) NCE_REFRELE(nce); return (0); } - rw_enter(&ipst->ips_ill_g_lock, RW_READER); + mutex_enter(&nce->nce_lock); if (nce->nce_state != ND_INCOMPLETE) { mutex_exit(&nce->nce_lock); - rw_exit(&ipst->ips_ill_g_lock); NCE_REFRELE(nce); return (0); } @@ -1069,14 +1141,11 @@ ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid) if (mp_nce == NULL) { /* The caller will free mp */ mutex_exit(&nce->nce_lock); - rw_exit(&ipst->ips_ill_g_lock); ndp_delete(nce); NCE_REFRELE(nce); return (ENOMEM); } - ms = nce_solicit(nce, mp_nce); - rw_exit(&ipst->ips_ill_g_lock); - if (ms == 0) { + if ((ms = nce_solicit(nce, mp_nce)) == 0) { /* The caller will free mp */ if (mp_nce != mp) freeb(mp_nce); @@ -1143,6 +1212,7 @@ ndp_noresolver(ill_t *ill, const in6_addr_t *dst) } err = ndp_lookup_then_add_v6(ill, + B_FALSE, /* NCE fastpath is per ill; don't match across group */ NULL, /* hardware address */ dst, &ipv6_all_ones, @@ -1191,7 +1261,7 @@ nce_set_multicast(ill_t *ill, const in6_addr_t *dst) mutex_enter(&ipst->ips_ndp6->ndp_g_lock); nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst)); - nce = nce_lookup_addr(ill, dst, nce); + nce = nce_lookup_addr(ill, B_FALSE, dst, nce); if (nce != NULL) { mutex_exit(&ipst->ips_ndp6->ndp_g_lock); NCE_REFRELE(nce); @@ -1259,7 +1329,13 @@ ndp_query(ill_t *ill, struct lif_nd_req *lnr) sin6 = (sin6_t *)&lnr->lnr_addr; addr = &sin6->sin6_addr; - nce = ndp_lookup_v6(ill, addr, B_FALSE); + /* + * NOTE: if the ill is an IPMP interface, then match against the whole + * illgrp. This e.g. allows in.ndpd to retrieve the link layer + * addresses for the data addresses on an IPMP interface even though + * ipif_ndp_up() created them with an nce_ill of ipif_bound_ill. 
+ */ + nce = ndp_lookup_v6(ill, IS_IPMP(ill), addr, B_FALSE); if (nce == NULL) return (ESRCH); /* If in INCOMPLETE state, no link layer address is available yet */ @@ -1347,24 +1423,14 @@ ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len, uint32_t nce_solicit(nce_t *nce, mblk_t *mp) { - ill_t *ill; - ill_t *src_ill; ip6_t *ip6h; - in6_addr_t src; - in6_addr_t dst; - ipif_t *ipif; - ip6i_t *ip6i; - boolean_t dropped = B_FALSE; - ip_stack_t *ipst = nce->nce_ill->ill_ipst; + in6_addr_t sender; + boolean_t dropped; - ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); ASSERT(MUTEX_HELD(&nce->nce_lock)); - ill = nce->nce_ill; - ASSERT(ill != NULL); - if (nce->nce_rcnt == 0) { + if (nce->nce_rcnt == 0) return (0); - } if (mp == NULL) { ASSERT(nce->nce_qd_mp != NULL); @@ -1385,60 +1451,22 @@ nce_solicit(nce_t *nce, mblk_t *mp) * could be from the nce_qd_mp which could have b_next/b_prev * non-NULL. */ - ip6i = (ip6i_t *)ip6h; - ASSERT((mp->b_wptr - (uchar_t *)ip6i) >= - sizeof (ip6i_t) + IPV6_HDR_LEN); + ASSERT(MBLKL(mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN); ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); } - src = ip6h->ip6_src; - /* - * If the src of outgoing packet is one of the assigned interface - * addresses use it, otherwise we will pick the source address below. - */ - src_ill = ill; - if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { - if (ill->ill_group != NULL) - src_ill = ill->ill_group->illgrp_ill; - for (; src_ill != NULL; src_ill = src_ill->ill_group_next) { - for (ipif = src_ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (IN6_ARE_ADDR_EQUAL(&src, - &ipif->ipif_v6lcl_addr)) { - break; - } - } - if (ipif != NULL) - break; - } - /* - * If no relevant ipif can be found, then it's not one of our - * addresses. Reset to :: and let nce_xmit. If an ipif can be - * found, but it's not yet done with DAD verification, then - * just postpone this transmission until later. 
- */ - if (src_ill == NULL) - src = ipv6_all_zeros; - else if (!ipif->ipif_addr_ready) - return (ill->ill_reachable_retrans_time); - } - dst = nce->nce_addr; + /* - * If source address is unspecified, nce_xmit will choose - * one for us and initialize the hardware address also - * appropriately. + * Need to copy the sender address into a local since `mp' can + * go away once we drop nce_lock. */ - if (IN6_IS_ADDR_UNSPECIFIED(&src)) - src_ill = NULL; + sender = ip6h->ip6_src; nce->nce_rcnt--; mutex_exit(&nce->nce_lock); - rw_exit(&ipst->ips_ill_g_lock); - dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src, - &dst, 0); - rw_enter(&ipst->ips_ill_g_lock, RW_READER); + dropped = nce_xmit_solicit(nce, B_TRUE, &sender, 0); mutex_enter(&nce->nce_lock); if (dropped) nce->nce_rcnt++; - return (ill->ill_reachable_retrans_time); + return (nce->nce_ill->ill_reachable_retrans_time); } /* @@ -1475,7 +1503,7 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) */ mutex_enter(&ill->ill_lock); if (!(ipif->ipif_flags & IPIF_DUPLICATE) || - (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) { + (ipif->ipif_state_flags & IPIF_CONDEMNED)) { mutex_exit(&ill->ill_lock); continue; } @@ -1485,8 +1513,8 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) mutex_exit(&ill->ill_lock); ipif->ipif_was_dup = B_TRUE; - if (ipif_ndp_up(ipif) != EINPROGRESS) - (void) ipif_up_done_v6(ipif); + VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS); + (void) ipif_up_done_v6(ipif); } freeb(mp); } @@ -1515,7 +1543,7 @@ ipif6_dup_recovery(void *arg) /* * No lock, because this is just an optimization. 
*/ - if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED)) + if (ipif->ipif_state_flags & IPIF_CONDEMNED) return; /* If the link is down, we'll retry this later */ @@ -1542,13 +1570,20 @@ ndp_do_recovery(ipif_t *ipif) if (mp == NULL) { mutex_enter(&ill->ill_lock); if (ipif->ipif_recovery_id == 0 && - !(ipif->ipif_state_flags & (IPIF_MOVING | - IPIF_CONDEMNED))) { + !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); } mutex_exit(&ill->ill_lock); } else { + /* + * A recovery timer may still be running if we got here from + * ill_restart_dad(); cancel that timer. + */ + if (ipif->ipif_recovery_id != 0) + (void) untimeout(ipif->ipif_recovery_id); + ipif->ipif_recovery_id = 0; + bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, sizeof (ipif->ipif_v6lcl_addr)); ill_refhold(ill); @@ -1558,41 +1593,51 @@ ndp_do_recovery(ipif_t *ipif) } /* - * Find the solicitation in the given message, and extract printable details - * (MAC and IP addresses) from it. + * Find the MAC and IP addresses in an NA/NS message. */ -static nd_neighbor_solicit_t * -ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf, - size_t hlen, char *sbuf, size_t slen, uchar_t **haddr) +static void +ip_ndp_find_addresses(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, in6_addr_t *targp, + uchar_t **haddr, uint_t *haddrlenp) { - nd_neighbor_solicit_t *ns; - ip6_t *ip6h; + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); + nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; + nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; uchar_t *addr; - int alen; + int alen = 0; - alen = 0; - ip6h = (ip6_t *)mp->b_rptr; if (dl_mp == NULL) { nd_opt_hdr_t *opt; - int nslen; + int len; /* * If it's from the fast-path, then it can't be a probe - * message, and thus must include the source linkaddr option. + * message, and thus must include a linkaddr option. 
* Extract that here. */ - ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN); - nslen = mp->b_wptr - (uchar_t *)ns; - if ((nslen -= sizeof (*ns)) > 0) { - opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen, - ND_OPT_SOURCE_LINKADDR); - if (opt != NULL && - opt->nd_opt_len * 8 - sizeof (*opt) >= - ill->ill_nd_lla_len) { - addr = (uchar_t *)(opt + 1); - alen = ill->ill_nd_lla_len; + switch (icmp6->icmp6_type) { + case ND_NEIGHBOR_SOLICIT: + len = mp->b_wptr - (uchar_t *)ns; + if ((len -= sizeof (*ns)) > 0) { + opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), + len, ND_OPT_SOURCE_LINKADDR); } + break; + case ND_NEIGHBOR_ADVERT: + len = mp->b_wptr - (uchar_t *)na; + if ((len -= sizeof (*na)) > 0) { + opt = ndp_get_option((nd_opt_hdr_t *)(na + 1), + len, ND_OPT_TARGET_LINKADDR); + } + break; + } + + if (opt != NULL && opt->nd_opt_len * 8 - sizeof (*opt) >= + ill->ill_nd_lla_len) { + addr = (uchar_t *)(opt + 1); + alen = ill->ill_nd_lla_len; } + /* * We cheat a bit here for the sake of printing usable log * messages in the rare case where the reply we got was unicast @@ -1624,16 +1669,17 @@ ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf, } } } + if (alen > 0) { *haddr = addr; - (void) mac_colon_addr(addr, alen, hbuf, hlen); + *haddrlenp = alen; } else { *haddr = NULL; - (void) strcpy(hbuf, "?"); + *haddrlenp = 0; } - ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN); - (void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen); - return (ns); + + /* nd_ns_target and nd_na_target are at the same offset, so we cheat */ + *targp = ns->nd_ns_target; } /* @@ -1646,68 +1692,80 @@ ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) { ill_t *ill = rq->q_ptr; ipif_t *ipif; - char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */ - char hbuf[MAC_STR_LEN]; - char sbuf[INET6_ADDRSTRLEN]; - nd_neighbor_solicit_t *ns; - mblk_t *dl_mp = NULL; - uchar_t *haddr; + mblk_t *dl_mp = NULL; + uchar_t *haddr; + uint_t 
haddrlen; ip_stack_t *ipst = ill->ill_ipst; + in6_addr_t targ; if (DB_TYPE(mp) != M_DATA) { dl_mp = mp; mp = mp->b_cont; } - ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf, - sizeof (sbuf), &haddr); - if (haddr != NULL && - bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { + + ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen); + if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) { /* - * Ignore conflicts generated by misbehaving switches that just - * reflect our own messages back to us. + * Ignore conflicts generated by misbehaving switches that + * just reflect our own messages back to us. For IPMP, we may + * see reflections across any ill in the illgrp. */ - goto ignore_conflict; + if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || + IS_UNDER_IPMP(ill) && + ipmp_illgrp_find_ill(ill->ill_grp, haddr, haddrlen) != NULL) + goto ignore_conflict; } - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + /* + * Look up the appropriate ipif. + */ + ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, NULL, NULL, NULL, + NULL, ipst); + if (ipif == NULL) + goto ignore_conflict; - if ((ipif->ipif_flags & IPIF_POINTOPOINT) || - !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, - &ns->nd_ns_target)) { - continue; - } + /* Reload the ill to match the ipif */ + ill = ipif->ipif_ill; - /* If it's already marked, then don't do anything. */ - if (ipif->ipif_flags & IPIF_DUPLICATE) - continue; + /* If it's already duplicate or ineligible, then don't do anything. */ + if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) { + ipif_refrele(ipif); + goto ignore_conflict; + } - /* - * If this is a failure during duplicate recovery, then don't - * complain. It may take a long time to recover. 
- */ - if (!ipif->ipif_was_dup) { - ipif_get_name(ipif, ibuf, sizeof (ibuf)); - cmn_err(CE_WARN, "%s has duplicate address %s (in " - "use by %s); disabled", ibuf, sbuf, hbuf); - } - mutex_enter(&ill->ill_lock); - ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); - ipif->ipif_flags |= IPIF_DUPLICATE; - ill->ill_ipif_dup_count++; - mutex_exit(&ill->ill_lock); - (void) ipif_down(ipif, NULL, NULL); - ipif_down_tail(ipif); - mutex_enter(&ill->ill_lock); - if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && - ill->ill_net_type == IRE_IF_RESOLVER && - !(ipif->ipif_state_flags & (IPIF_MOVING | - IPIF_CONDEMNED)) && - ipst->ips_ip_dup_recovery > 0) { - ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, - ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); - } - mutex_exit(&ill->ill_lock); + /* + * If this is a failure during duplicate recovery, then don't + * complain. It may take a long time to recover. + */ + if (!ipif->ipif_was_dup) { + char ibuf[LIFNAMSIZ]; + char hbuf[MAC_STR_LEN]; + char sbuf[INET6_ADDRSTRLEN]; + + ipif_get_name(ipif, ibuf, sizeof (ibuf)); + cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);" + " disabled", ibuf, + inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), + mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf))); } + mutex_enter(&ill->ill_lock); + ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); + ipif->ipif_flags |= IPIF_DUPLICATE; + ill->ill_ipif_dup_count++; + mutex_exit(&ill->ill_lock); + (void) ipif_down(ipif, NULL, NULL); + ipif_down_tail(ipif); + mutex_enter(&ill->ill_lock); + if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && + ill->ill_net_type == IRE_IF_RESOLVER && + !(ipif->ipif_state_flags & IPIF_CONDEMNED) && + ipst->ips_ip_dup_recovery > 0) { + ASSERT(ipif->ipif_recovery_id == 0); + ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, + ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); + } + mutex_exit(&ill->ill_lock); + ipif_refrele(ipif); ignore_conflict: if (dl_mp != NULL) freeb(dl_mp); @@ -1721,7 +1779,7 
@@ ignore_conflict: * we start a timer on the ipif. */ static void -ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) +ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) { if ((mp = copymsg(mp)) != NULL) { if (dl_mp == NULL) @@ -1736,7 +1794,6 @@ ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) B_FALSE); } } - ndp_delete(nce); } /* @@ -1757,6 +1814,7 @@ ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) NULL, NULL, ipst); if (ipif == NULL) return; + /* * First, figure out if this address is disposable. */ @@ -1786,19 +1844,21 @@ ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) * sending out an unsolicited Neighbor Advertisement. */ if (defs >= maxdefense) { - ip_ndp_failure(ill, mp, dl_mp, nce); + ip_ndp_failure(ill, mp, dl_mp); } else { char hbuf[MAC_STR_LEN]; char sbuf[INET6_ADDRSTRLEN]; uchar_t *haddr; + uint_t haddrlen; + in6_addr_t targ; - (void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, - sizeof (hbuf), sbuf, sizeof (sbuf), &haddr); + ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen); cmn_err(CE_WARN, "node %s is using our IP address %s on %s", - hbuf, sbuf, ill->ill_name); - (void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE, - &nce->nce_addr, &ipv6_all_hosts_mcast, - nce_advert_flags(nce)); + mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)), + inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), + ill->ill_name); + + (void) nce_xmit_advert(nce, B_FALSE, &ipv6_all_hosts_mcast, 0); } } @@ -1843,6 +1903,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) bad_solicit = B_TRUE; goto done; } + } if (IN6_IS_ADDR_UNSPECIFIED(&src)) { /* Check to see if this is a valid DAD solicitation */ @@ -1859,7 +1920,13 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) } } - our_nce = ndp_lookup_v6(ill, &target, B_FALSE); + /* + * NOTE: with IPMP, it's possible the nominated multicast ill (which + * received this packet if it's multicast) is not the ill tied to + * 
e.g. the IPMP ill's data link-local. So we match across the illgrp + * to ensure we find the associated NCE. + */ + our_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE); /* * If this is a valid Solicitation, a permanent * entry should exist in the cache @@ -1883,7 +1950,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) haddr = (uchar_t *)&opt[1]; if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || hlen == 0) { - ip1dbg(("ndp_input_advert: bad SLLA\n")); + ip1dbg(("ndp_input_solicit: bad SLLA\n")); bad_solicit = B_TRUE; goto done; } @@ -1934,6 +2001,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) goto no_source; err = ndp_lookup_then_add_v6(ill, + B_FALSE, haddr, &src, /* Soliciting nodes address */ &ipv6_all_ones, @@ -1949,8 +2017,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) break; case EEXIST: /* - * B_FALSE indicates this is not an - * an advertisement. + * B_FALSE indicates this is not an advertisement. */ ndp_process(nnce, haddr, 0, B_FALSE); NCE_REFRELE(nnce); @@ -1985,7 +2052,7 @@ no_source: * If someone else is probing our address, then * we've crossed wires. Declare failure. 
*/ - ip_ndp_failure(ill, mp, dl_mp, our_nce); + ip_ndp_failure(ill, mp, dl_mp); } goto done; } @@ -1995,15 +2062,8 @@ no_source: */ src = ipv6_all_hosts_mcast; } - flag |= nce_advert_flags(our_nce); /* Response to a solicitation */ - (void) nce_xmit(ill, - ND_NEIGHBOR_ADVERT, - ill, /* ill to be used for extracting ill_nd_lla */ - B_TRUE, /* use ill_nd_lla */ - &target, /* Source and target of the advertisement pkt */ - &src, /* IP Destination (source of original pkt) */ - flag); + (void) nce_xmit_advert(our_nce, B_TRUE, &src, flag); done: if (bad_solicit) BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations); @@ -2023,8 +2083,8 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) in6_addr_t target; nd_opt_hdr_t *opt = NULL; int len; - mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; ip_stack_t *ipst = ill->ill_ipst; + mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; ip6h = (ip6_t *)mp->b_rptr; icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); @@ -2067,66 +2127,62 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) } /* - * If this interface is part of the group look at all the + * NOTE: we match across the illgrp since we need to do DAD for all of + * our local addresses, and those are spread across all the active * ills in the group. 
*/ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if (ill->ill_group != NULL) - ill = ill->ill_group->illgrp_ill; + if ((dst_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE)) == NULL) + return; - for (; ill != NULL; ill = ill->ill_group_next) { - mutex_enter(&ill->ill_lock); - if (!ILL_CAN_LOOKUP(ill)) { - mutex_exit(&ill->ill_lock); - continue; - } - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - dst_nce = ndp_lookup_v6(ill, &target, B_FALSE); - /* We have to drop the lock since ndp_process calls put* */ - rw_exit(&ipst->ips_ill_g_lock); - if (dst_nce != NULL) { - if ((dst_nce->nce_flags & NCE_F_PERMANENT) && - dst_nce->nce_state == ND_PROBE) { - /* - * Someone else sent an advertisement for an - * address that we're trying to configure. - * Tear it down. Note that dl_mp might be NULL - * if we're getting a unicast reply. This - * isn't typically done (multicast is the norm - * in response to a probe), but ip_ndp_failure - * will handle the dl_mp == NULL case as well. - */ - ip_ndp_failure(ill, mp, dl_mp, dst_nce); - } else if (dst_nce->nce_flags & NCE_F_PERMANENT) { - /* - * Someone just announced one of our local - * addresses. If it wasn't us, then this is a - * conflict. Defend the address or shut it - * down. - */ - if (dl_mp != NULL && - (haddr == NULL || - nce_cmp_ll_addr(dst_nce, haddr, - ill->ill_nd_lla_len))) { - ip_ndp_conflict(ill, mp, dl_mp, - dst_nce); - } - } else { - if (na->nd_na_flags_reserved & - ND_NA_FLAG_ROUTER) { - dst_nce->nce_flags |= NCE_F_ISROUTER; + if (dst_nce->nce_flags & NCE_F_PERMANENT) { + /* + * Someone just advertised one of our local addresses. First, + * check if it was us -- if so, we can safely ignore it. + */ + if (haddr != NULL) { + if (!nce_cmp_ll_addr(dst_nce, haddr, hlen)) + goto out; /* from us -- no conflict */ + + /* + * If we're in an IPMP group, check if this is an echo + * from another ill in the group. Use the double- + * checked locking pattern to avoid grabbing + * ill_g_lock in the non-IPMP case. 
+ */ + if (IS_UNDER_IPMP(ill)) { + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill( + ill->ill_grp, haddr, hlen) != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + goto out; } - /* B_TRUE indicates this an advertisement */ - ndp_process(dst_nce, haddr, - na->nd_na_flags_reserved, B_TRUE); + rw_exit(&ipst->ips_ill_g_lock); } - NCE_REFRELE(dst_nce); } - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill_refrele(ill); + + /* + * This appears to be a real conflict. If we're trying to + * configure this NCE (ND_PROBE), then shut it down. + * Otherwise, handle the discovered conflict. + * + * Note that dl_mp might be NULL if we're getting a unicast + * reply. This isn't typically done (multicast is the norm in + * response to a probe), but we can handle the dl_mp == NULL + * case as well. + */ + if (dst_nce->nce_state == ND_PROBE) + ip_ndp_failure(ill, mp, dl_mp); + else + ip_ndp_conflict(ill, mp, dl_mp, dst_nce); + } else { + if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER) + dst_nce->nce_flags |= NCE_F_ISROUTER; + + /* B_TRUE indicates this an advertisement */ + ndp_process(dst_nce, haddr, na->nd_na_flags_reserved, B_TRUE); } - rw_exit(&ipst->ips_ill_g_lock); +out: + NCE_REFRELE(dst_nce); } /* @@ -2194,6 +2250,40 @@ done: } /* + * Utility routine to send an advertisement. Assumes that the NCE cannot + * go away (e.g., because it's refheld). + */ +static boolean_t +nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *target, + uint_t flags) +{ + ASSERT((flags & NDP_PROBE) == 0); + + if (nce->nce_flags & NCE_F_ISROUTER) + flags |= NDP_ISROUTER; + if (!(nce->nce_flags & NCE_F_ANYCAST)) + flags |= NDP_ORIDE; + + return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_ADVERT, use_nd_lla, + &nce->nce_addr, target, flags)); +} + +/* + * Utility routine to send a solicitation. Assumes that the NCE cannot + * go away (e.g., because it's refheld). 
+ */ +static boolean_t +nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender, + uint_t flags) +{ + if (flags & NDP_PROBE) + sender = &ipv6_all_zeros; + + return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, use_nd_lla, + sender, &nce->nce_addr, flags)); +} + +/* * nce_xmit is called to form and transmit a ND solicitation or * advertisement ICMP packet. * @@ -2207,88 +2297,79 @@ done: * corresponding ill's ill_wq otherwise returns B_TRUE. */ static boolean_t -nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, - boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target, - int flag) +nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla, + const in6_addr_t *sender, const in6_addr_t *target, int flag) { + ill_t *hwaddr_ill; uint32_t len; icmp6_t *icmp6; mblk_t *mp; ip6_t *ip6h; nd_opt_hdr_t *opt; - uint_t plen; + uint_t plen, maxplen; ip6i_t *ip6i; ipif_t *src_ipif = NULL; uint8_t *hw_addr; zoneid_t zoneid = GLOBAL_ZONEID; + char buf[INET6_ADDRSTRLEN]; + + ASSERT(!IS_IPMP(ill)); /* - * If we have a unspecified source(sender) address, select a - * proper source address for the solicitation here itself so - * that we can initialize the h/w address correctly. This is - * needed for interface groups as source address can come from - * the whole group and the h/w address initialized from ill will - * be wrong if the source address comes from a different ill. - * - * If the sender is specified then we use this address in order - * to lookup the zoneid before calling ip_output_v6(). This is to - * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly - * by IP (we cannot guarantee that the global zone has an interface - * route to the destination). - * - * Note that the NA never comes here with the unspecified source - * address. The following asserts that whenever the source - * address is specified, the haddr also should be specified. 
+ * Check that the sender is actually a usable address on `ill', and if + * so, track that as the src_ipif. If not, for solicitations, set the + * sender to :: so that a new one will be picked below; for adverts, + * drop the packet since we expect nce_xmit_advert() to always provide + * a valid sender. */ - ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL)); + if (!IN6_IS_ADDR_UNSPECIFIED(sender)) { + if ((src_ipif = ip_ndp_lookup_addr_v6(sender, ill)) == NULL || + !src_ipif->ipif_addr_ready) { + if (src_ipif != NULL) { + ipif_refrele(src_ipif); + src_ipif = NULL; + } + if (type == ND_NEIGHBOR_ADVERT) { + ip1dbg(("nce_xmit: No source ipif for src %s\n", + inet_ntop(AF_INET6, sender, buf, + sizeof (buf)))); + return (B_TRUE); + } + sender = &ipv6_all_zeros; + } + } + /* + * If we still have an unspecified source (sender) address and this + * isn't a probe, select a source address from `ill'. + */ if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) { - ASSERT(operation != ND_NEIGHBOR_ADVERT); + ASSERT(type != ND_NEIGHBOR_ADVERT); /* - * Pick a source address for this solicitation, but - * restrict the selection to addresses assigned to the - * output interface (or interface group). We do this - * because the destination will create a neighbor cache - * entry for the source address of this packet, so the - * source address had better be a valid neighbor. + * Pick a source address for this solicitation, but restrict + * the selection to addresses assigned to the output + * interface. We do this because the destination will create + * a neighbor cache entry for the source address of this + * packet, so the source address needs to be a valid neighbor. 
*/ - src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL, + src_ipif = ipif_select_source_v6(ill, target, B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES); if (src_ipif == NULL) { - char buf[INET6_ADDRSTRLEN]; - ip1dbg(("nce_xmit: No source ipif for dst %s\n", - inet_ntop(AF_INET6, (char *)target, buf, - sizeof (buf)))); + inet_ntop(AF_INET6, target, buf, sizeof (buf)))); return (B_TRUE); } sender = &src_ipif->ipif_v6src_addr; - hwaddr_ill = src_ipif->ipif_ill; - } else if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) { - zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ill->ill_ipst); - /* - * It's possible for ipif_lookup_addr_zoneid_v6() to return - * ALL_ZONES if it cannot find a matching ipif for the address - * we are trying to use. In this case we err on the side of - * trying to send the packet by defaulting to the GLOBAL_ZONEID. - */ - if (zoneid == ALL_ZONES) - zoneid = GLOBAL_ZONEID; } /* - * Always make sure that the NS/NA packets don't get load - * spread. This is needed so that the probe packets sent - * by the in.mpathd daemon can really go out on the desired - * interface. Probe packets are made to go out on a desired - * interface by including a ip6i with ATTACH_IF flag. As these - * packets indirectly end up sending/receiving NS/NA packets - * (neighbor doing NUD), we have to make sure that NA - * also go out on the same interface. + * We're either sending a probe or we have a source address. 
*/ - plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8; + ASSERT((flag & NDP_PROBE) || src_ipif != NULL); + + maxplen = roundup(sizeof (nd_opt_hdr_t) + ND_MAX_HDW_LEN, 8); len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) + - plen * 8; + maxplen; mp = allocb(len, BPRI_LO); if (mp == NULL) { if (src_ipif != NULL) @@ -2301,28 +2382,27 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, ip6i = (ip6i_t *)mp->b_rptr; ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; ip6i->ip6i_nxt = IPPROTO_RAW; - ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT; + ip6i->ip6i_flags = IP6I_HOPLIMIT; if (flag & NDP_PROBE) ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); ip6h->ip6_nxt = IPPROTO_ICMPV6; ip6h->ip6_hops = IPV6_MAX_HOPS; + ip6h->ip6_src = *sender; ip6h->ip6_dst = *target; icmp6 = (icmp6_t *)&ip6h[1]; opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); - if (operation == ND_NEIGHBOR_SOLICIT) { + if (type == ND_NEIGHBOR_SOLICIT) { nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; if (!(flag & NDP_PROBE)) opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; - ip6h->ip6_src = *sender; ns->nd_ns_target = *target; if (!(flag & NDP_UNICAST)) { /* Form multicast address of the target */ @@ -2335,7 +2415,6 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, ASSERT(!(flag & NDP_PROBE)); opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; - ip6h->ip6_src = *sender; na->nd_na_target = *sender; if (flag & NDP_ISROUTER) na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER; @@ -2347,22 +2426,48 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, hw_addr = NULL; if (!(flag & NDP_PROBE)) { + /* + * Use our source address to find the hardware address to put + * in the packet, so that the hardware address and IP address + * 
will match up -- even if that hardware address doesn't + * match the ill we actually transmit the packet through. + */ + if (IS_IPMP(src_ipif->ipif_ill)) { + hwaddr_ill = ipmp_ipif_hold_bound_ill(src_ipif); + if (hwaddr_ill == NULL) { + ip1dbg(("nce_xmit: no bound ill!\n")); + ipif_refrele(src_ipif); + freemsg(mp); + return (B_TRUE); + } + } else { + hwaddr_ill = src_ipif->ipif_ill; + ill_refhold(hwaddr_ill); /* for symmetry */ + } + + plen = roundup(sizeof (nd_opt_hdr_t) + + hwaddr_ill->ill_nd_lla_len, 8); + hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla : hwaddr_ill->ill_phys_addr; if (hw_addr != NULL) { /* Fill in link layer address and option len */ - opt->nd_opt_len = (uint8_t)plen; + opt->nd_opt_len = (uint8_t)(plen / 8); bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len); } + + ill_refrele(hwaddr_ill); } - if (hw_addr == NULL) { - /* If there's no link layer address option, then strip it. */ - len -= plen * 8; - mp->b_wptr = mp->b_rptr + len; - ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); - } - icmp6->icmp6_type = (uint8_t)operation; + if (hw_addr == NULL) + plen = 0; + + /* Fix up the length of the packet now that plen is known */ + len -= (maxplen - plen); + mp->b_wptr = mp->b_rptr + len; + ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); + + icmp6->icmp6_type = type; icmp6->icmp6_code = 0; /* * Prepare for checksum by putting icmp length in the icmp @@ -2370,8 +2475,17 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, */ icmp6->icmp6_cksum = ip6h->ip6_plen; - if (src_ipif != NULL) + /* + * Before we toss the src_ipif, look up the zoneid to pass to + * ip_output_v6(). This is to ensure unicast ND_NEIGHBOR_ADVERT + * packets to be routed correctly by IP (we cannot guarantee that the + * global zone has an interface route to the destination). 
+ */ + if (src_ipif != NULL) { + if ((zoneid = src_ipif->ipif_zoneid) == ALL_ZONES) + zoneid = GLOBAL_ZONEID; ipif_refrele(src_ipif); + } ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT); return (B_FALSE); @@ -2448,7 +2562,6 @@ ndp_timer(void *arg) ill_t *ill = nce->nce_ill; uint32_t ms; char addrbuf[INET6_ADDRSTRLEN]; - mblk_t *mp; boolean_t dropped = B_FALSE; ip_stack_t *ipst = ill->ill_ipst; @@ -2460,11 +2573,6 @@ ndp_timer(void *arg) */ ASSERT(nce != NULL); - /* - * Grab the ill_g_lock now itself to avoid lock order problems. - * nce_solicit needs ill_g_lock to be able to traverse ills - */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); mutex_enter(&nce->nce_lock); NCE_REFHOLD_LOCKED(nce); nce->nce_timeout_id = 0; @@ -2474,11 +2582,10 @@ ndp_timer(void *arg) */ switch (nce->nce_state) { case ND_DELAY: - rw_exit(&ipst->ips_ill_g_lock); nce->nce_state = ND_PROBE; mutex_exit(&nce->nce_lock); - (void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE, - &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST); + (void) nce_xmit_solicit(nce, B_FALSE, &ipv6_all_zeros, + NDP_UNICAST); if (ip_debug > 3) { /* ip2dbg */ pr_addr_dbg("ndp_timer: state for %s changed " @@ -2489,7 +2596,6 @@ ndp_timer(void *arg) return; case ND_PROBE: /* must be retransmit timer */ - rw_exit(&ipst->ips_ill_g_lock); nce->nce_pcnt--; ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT && nce->nce_pcnt >= -1); @@ -2504,8 +2610,8 @@ ndp_timer(void *arg) nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr, addrbuf, sizeof (addrbuf)))); mutex_exit(&nce->nce_lock); - dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, - B_FALSE, &ipv6_all_zeros, &nce->nce_addr, + dropped = nce_xmit_solicit(nce, B_FALSE, + &ipv6_all_zeros, (nce->nce_flags & NCE_F_PERMANENT) ? 
NDP_PROBE : NDP_UNICAST); if (dropped) { @@ -2542,8 +2648,8 @@ ndp_timer(void *arg) */ nce->nce_state = ND_REACHABLE; mutex_exit(&nce->nce_lock); - ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, - ALL_ZONES, NULL, NULL, NULL, NULL, ipst); + ipif = ip_ndp_lookup_addr_v6(&nce->nce_addr, + nce->nce_ill); if (ipif != NULL) { if (ipif->ipif_was_dup) { char ibuf[LIFNAMSIZ + 10]; @@ -2566,9 +2672,8 @@ ndp_timer(void *arg) } /* Begin defending our new address */ nce->nce_unsolicit_count = 0; - dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, - B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast, - nce_advert_flags(nce)); + dropped = nce_xmit_advert(nce, B_FALSE, + &ipv6_all_hosts_mcast, 0); if (dropped) { nce->nce_unsolicit_count = 1; NDP_RESTART_TIMER(nce, @@ -2589,51 +2694,40 @@ ndp_timer(void *arg) } NCE_REFRELE(nce); return; - case ND_INCOMPLETE: + case ND_INCOMPLETE: { + ip6_t *ip6h; + ip6i_t *ip6i; + mblk_t *mp, *datamp, *nextmp, **prevmpp; + /* - * Must be resolvers retransmit timer. + * Per case (2) in the nce_queue_mp() comments, scan nce_qd_mp + * for any IPMP probe packets, and toss 'em. IPMP probe + * packets will always be at the head of nce_qd_mp and always + * have an ip6i_t header, so we can stop at the first queued + * ND packet without an ip6i_t. */ - for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) { - ip6i_t *ip6i; - ip6_t *ip6h; - mblk_t *data_mp; - - /* - * Walk the list of packets queued, and see if there - * are any multipathing probe packets. Such packets - * are always queued at the head. Since this is a - * retransmit timer firing, mark such packets as - * delayed in ND resolution. This info will be used - * in ip_wput_v6(). Multipathing probe packets will - * always have an ip6i_t. Once we hit a packet without - * it, we can break out of this loop. 
- */ - if (mp->b_datap->db_type == M_CTL) - data_mp = mp->b_cont; - else - data_mp = mp; - - ip6h = (ip6_t *)data_mp->b_rptr; + prevmpp = &nce->nce_qd_mp; + for (mp = nce->nce_qd_mp; mp != NULL; mp = nextmp) { + nextmp = mp->b_next; + datamp = (DB_TYPE(mp) == M_CTL) ? mp->b_cont : mp; + ip6h = (ip6_t *)datamp->b_rptr; if (ip6h->ip6_nxt != IPPROTO_RAW) break; - /* - * This message should have been pulled up already in - * ip_wput_v6. We can't do pullups here because the - * b_next/b_prev is non-NULL. - */ ip6i = (ip6i_t *)ip6h; - ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >= - sizeof (ip6i_t) + IPV6_HDR_LEN); - - /* Mark this packet as delayed due to ND resolution */ - if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) - ip6i->ip6i_flags |= IP6I_ND_DELAYED; + if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) { + inet_freemsg(mp); + *prevmpp = nextmp; + } else { + prevmpp = &mp->b_next; + } } + + /* + * Must be resolver's retransmit timer. + */ if (nce->nce_qd_mp != NULL) { - ms = nce_solicit(nce, NULL); - rw_exit(&ipst->ips_ill_g_lock); - if (ms == 0) { + if ((ms = nce_solicit(nce, NULL)) == 0) { if (nce->nce_state != ND_REACHABLE) { mutex_exit(&nce->nce_lock); nce_resolv_failed(nce); @@ -2649,11 +2743,10 @@ ndp_timer(void *arg) return; } mutex_exit(&nce->nce_lock); - rw_exit(&ipst->ips_ill_g_lock); NCE_REFRELE(nce); break; - case ND_REACHABLE : - rw_exit(&ipst->ips_ill_g_lock); + } + case ND_REACHABLE: if (((nce->nce_flags & NCE_F_UNSOL_ADV) && nce->nce_unsolicit_count != 0) || ((nce->nce_flags & NCE_F_PERMANENT) && @@ -2661,13 +2754,8 @@ ndp_timer(void *arg) if (nce->nce_unsolicit_count > 0) nce->nce_unsolicit_count--; mutex_exit(&nce->nce_lock); - dropped = nce_xmit(ill, - ND_NEIGHBOR_ADVERT, - ill, /* ill to be used for hw addr */ - B_FALSE, /* use ill_phys_addr */ - &nce->nce_addr, - &ipv6_all_hosts_mcast, - nce_advert_flags(nce)); + dropped = nce_xmit_advert(nce, B_FALSE, + &ipv6_all_hosts_mcast, 0); if (dropped) { mutex_enter(&nce->nce_lock); nce->nce_unsolicit_count++; @@ 
-2686,7 +2774,6 @@ ndp_timer(void *arg) NCE_REFRELE(nce); break; default: - rw_exit(&ipst->ips_ill_g_lock); mutex_exit(&nce->nce_lock); NCE_REFRELE(nce); break; @@ -2819,23 +2906,20 @@ void nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert) { uint_t count = 0; - mblk_t **mpp; + mblk_t **mpp, *tmp; ASSERT(MUTEX_HELD(&nce->nce_lock)); - for (mpp = &nce->nce_qd_mp; *mpp != NULL; - mpp = &(*mpp)->b_next) { - if (++count > - nce->nce_ill->ill_max_buf) { - mblk_t *tmp = nce->nce_qd_mp->b_next; - + for (mpp = &nce->nce_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { + if (++count > nce->nce_ill->ill_max_buf) { + tmp = nce->nce_qd_mp->b_next; nce->nce_qd_mp->b_next = NULL; nce->nce_qd_mp->b_prev = NULL; freemsg(nce->nce_qd_mp); nce->nce_qd_mp = tmp; } } - /* put this on the list */ + if (head_insert) { mp->b_next = nce->nce_qd_mp; nce->nce_qd_mp = mp; @@ -2849,8 +2933,8 @@ nce_queue_mp(nce_t *nce, mblk_t *mp) { boolean_t head_insert = B_FALSE; ip6_t *ip6h; - ip6i_t *ip6i; - mblk_t *data_mp; + ip6i_t *ip6i; + mblk_t *data_mp; ASSERT(MUTEX_HELD(&nce->nce_lock)); @@ -2867,43 +2951,28 @@ nce_queue_mp(nce_t *nce, mblk_t *mp) * non-NULL. */ ip6i = (ip6i_t *)ip6h; - ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >= - sizeof (ip6i_t) + IPV6_HDR_LEN); + ASSERT(MBLKL(data_mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN); + /* - * Multipathing probe packets have IP6I_DROP_IFDELAYED set. - * This has 2 aspects mentioned below. - * 1. Perform head insertion in the nce_qd_mp for these packets. - * This ensures that next retransmit of ND solicitation - * will use the interface specified by the probe packet, - * for both NS and NA. This corresponds to the src address - * in the IPv6 packet. If we insert at tail, we will be - * depending on the packet at the head for successful - * ND resolution. 
This is not reliable, because the interface - * on which the NA arrives could be different from the interface - * on which the NS was sent, and if the receiving interface is - * failed, it will appear that the sending interface is also - * failed, causing in.mpathd to misdiagnose this as link - * failure. - * 2. Drop the original packet, if the ND resolution did not - * succeed in the first attempt. However we will create the - * nce and the ire, as soon as the ND resolution succeeds. - * We don't gain anything by queueing multiple probe packets - * and sending them back-to-back once resolution succeeds. - * It is sufficient to send just 1 packet after ND resolution - * succeeds. Since mpathd is sending down probe packets at a - * constant rate, we don't need to send the queued packet. We - * need to queue it only for NDP resolution. The benefit of - * dropping the probe packets that were delayed in ND - * resolution, is that in.mpathd will not see inflated - * RTT. If the ND resolution does not succeed within - * in.mpathd's failure detection time, mpathd may detect - * a failure, and it does not matter whether the packet - * was queued or dropped. + * If this packet is marked IP6I_IPMP_PROBE, then we need to: + * + * 1. Insert it at the head of the nce_qd_mp list. Consider + * the normal (non-probe) load-spreading case where the + * source address of the ND packet is not tied to nce_ill. + * If the ill bound to the source address cannot receive, + * the response to the ND packet will not be received. + * However, if ND packets for nce_ill's probes are queued + * behind that ND packet, those probes will also fail to + * be sent, and thus in.mpathd will erroneously conclude + * that nce_ill has also failed. + * + * 2. Drop the probe packet in ndp_timer() if the ND did + * not succeed on the first attempt. This ensures that + * ND problems do not manifest as probe RTT spikes. 
*/ - if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) + if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) head_insert = B_TRUE; } - nce_queue_mp_common(nce, mp, head_insert); } @@ -2988,13 +3057,17 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) (lnr->lnr_state_create != ND_STALE)) return (EINVAL); + if (lnr->lnr_hdw_len > ND_MAX_HDW_LEN) + return (EINVAL); + sin6 = (sin6_t *)&lnr->lnr_addr; addr = &sin6->sin6_addr; mutex_enter(&ipst->ips_ndp6->ndp_g_lock); /* We know it can not be mapping so just look in the hash table */ nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - nce = nce_lookup_addr(ill, addr, nce); + /* See comment in ndp_query() regarding IS_IPMP(ill) usage */ + nce = nce_lookup_addr(ill, IS_IPMP(ill), addr, nce); if (nce != NULL) new_flags = nce->nce_flags; @@ -3065,7 +3138,7 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) * the link layer address passed in to determine the state * much like incoming packets. */ - ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); + nce_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); NCE_REFRELE(nce); return (0); } @@ -3463,7 +3536,11 @@ ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, mutex_enter(&ipst->ips_ndp4->ndp_g_lock); nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); - nce = nce_lookup_addr(ill, &addr6, nce); + /* + * NOTE: IPv4 never matches across the illgrp since the NCE's we're + * looking up have fastpath headers that are inherently per-ill. + */ + nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce); if (nce == NULL) { err = ndp_add_v4(ill, addr, flags, newnce, src_nce); } else { @@ -3718,3 +3795,26 @@ ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns) mutex_exit(&ipst->ips_ndp4->ndp_g_lock); return (nce != NULL); } + +/* + * Wrapper around ipif_lookup_addr_exact_v6() that allows ND to work properly + * with IPMP. 
Specifically, since neighbor discovery is always done on + * underlying interfaces (even for addresses owned by an IPMP interface), we + * need to check for `v6addrp' on both `ill' and on the IPMP meta-interface + * associated with `ill' (if it exists). + */ +static ipif_t * +ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill) +{ + ipif_t *ipif; + ip_stack_t *ipst = ill->ill_ipst; + + ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst); + if (ipif == NULL && IS_UNDER_IPMP(ill)) { + if ((ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { + ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst); + ill_refrele(ill); + } + } + return (ipif); +} diff --git a/usr/src/uts/common/inet/ip/ip_netinfo.c b/usr/src/uts/common/inet/ip/ip_netinfo.c index 53665593be..e81c7a0e1f 100644 --- a/usr/src/uts/common/inet/ip/ip_netinfo.c +++ b/usr/src/uts/common/inet/ip/ip_netinfo.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -568,33 +568,17 @@ ip_getifname_impl(phy_if_t phy_ifdata, char *buffer, const size_t buflen, boolean_t isv6, ip_stack_t *ipst) { ill_t *ill; - char *name; ASSERT(buffer != NULL); ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, NULL, NULL, NULL, NULL, ipst); - if (ill != NULL) { - name = ill->ill_name; - } else { - /* Fallback to group names only if hook_emulation is set */ - if (ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_ifindex((uint_t)phy_ifdata, - isv6, ipst); - } - if (ill == NULL) - return (1); - name = ill->ill_phyint->phyint_groupname; - } - if (name != NULL) { - (void) strlcpy(buffer, name, buflen); - ill_refrele(ill); - return (0); - } else { - ill_refrele(ill); + if (ill == NULL) return (1); - } + (void) strlcpy(buffer, ill->ill_name, buflen); + ill_refrele(ill); + return (0); } /* @@ -625,9 +609,6 @@ ipv6_getmtu(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata) /* * Shared implementation to determine the MTU of a network interface - * - * Note: this does not handle a non-zero ifdata when ipmp_hook_emulation is set. - * But IP Filter only uses a zero ifdata. */ /* ARGSUSED */ static int @@ -653,16 +634,7 @@ ip_getmtu_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6, if ((ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, NULL, NULL, NULL, NULL, ipst)) == NULL) { - /* - * Fallback to group names only if hook_emulation - * is set - */ - if (ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_ifindex( - (uint_t)phy_ifdata, isv6, ipst); - } - if (ill == NULL) - return (0); + return (0); } mtu = ill->ill_max_frag; ill_refrele(ill); @@ -686,9 +658,6 @@ ip_getpmtuenabled(net_handle_t neti) /* * Get next interface from the current list of IPv4 physical network interfaces - * - * Note: this does not handle the case when ipmp_hook_emulation is set. - * But IP Filter does not use this function. 
*/ static phy_if_t ip_phygetnext(net_handle_t neti, phy_if_t phy_ifdata) @@ -752,15 +721,10 @@ ip_phylookup_impl(const char *name, boolean_t isv6, ip_stack_t *ipst) ill = ill_lookup_on_name((char *)name, B_FALSE, isv6, NULL, NULL, NULL, NULL, NULL, ipst); - - /* Fallback to group names only if hook_emulation is set */ - if (ill == NULL && ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_name((char *)name, isv6, ipst); - } if (ill == NULL) return (0); - phy = ill->ill_phyint->phyint_hook_ifindex; + phy = ill->ill_phyint->phyint_ifindex; ill_refrele(ill); @@ -798,9 +762,6 @@ ipv6_lifgetnext(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata) /* * Shared implementation to get next interface from the current list of * logical network interfaces - * - * Note: this does not handle the case when ipmp_hook_emulation is set. - * But IP Filter does not use this function. */ static lif_if_t ip_lifgetnext_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6, @@ -834,7 +795,7 @@ ip_lifgetnext_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6, /* * It's safe to iterate the ill_ipif list when holding an ill_lock. * And it's also safe to access ipif_id without ipif refhold. - * See ipif_get_id(). + * See the field access rules in ip.h. 
*/ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (!IPIF_CAN_LOOKUP(ipif)) @@ -1013,8 +974,8 @@ ip_inject_impl(inject_t style, net_inject_t *packet, boolean_t isv6, if (ire->ire_nce == NULL || ire->ire_nce->nce_fp_mp == NULL && ire->ire_nce->nce_res_mp == NULL) { - ip_newroute_v6(ire->ire_stq, mp, - &sin6->sin6_addr, NULL, NULL, ALL_ZONES, ipst); + ip_newroute_v6(ire->ire_stq, mp, &sin6->sin6_addr, + &ip6h->ip6_src, NULL, ALL_ZONES, ipst); ire_refrele(ire); return (0); @@ -1170,7 +1131,7 @@ ip_routeto_impl(struct sockaddr *address, struct sockaddr *nexthop, } ASSERT(ill != NULL); - phy_if = (phy_if_t)ill->ill_phyint->phyint_hook_ifindex; + phy_if = (phy_if_t)ill->ill_phyint->phyint_ifindex; if (sire != NULL) ire_refrele(sire); ire_refrele(ire); @@ -1305,9 +1266,6 @@ ipv6_getlifaddr(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata, /* * Shared implementation to determine the network addresses for an interface - * - * Note: this does not handle a non-zero ifdata when ipmp_hook_emulation is set. - * But IP Filter only uses a zero ifdata. */ /* ARGSUSED */ static int @@ -1531,12 +1489,6 @@ ip_ni_queue_func_impl(injection_t *inject, boolean_t out) ill = ill_lookup_on_ifindex((uint_t)packet->ni_physical, B_FALSE, NULL, NULL, NULL, NULL, ipst); - - /* Fallback to group names only if hook_emulation is set */ - if (ill == NULL && ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_ifindex((uint_t)packet->ni_physical, - B_FALSE, ipst); - } if (ill == NULL) { kmem_free(inject, sizeof (*inject)); return; @@ -1613,65 +1565,3 @@ done: kmem_free(info->hnei_event.hne_data, info->hnei_event.hne_datalen); kmem_free(arg, sizeof (hook_nic_event_int_t)); } - -/* - * Temporary function to support IPMP emulation for IP Filter. - * Lookup an ill based on the ifindex assigned to the group. - * Skips unusable ones i.e. 
where any of these flags are set: - * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE) - */ -ill_t * -ill_group_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) -{ - ill_t *ill; - phyint_t *phyi; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - phyi = phyint_lookup_group_ifindex(index, ipst); - if (phyi != NULL) { - ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4; - if (ill != NULL) { - mutex_enter(&ill->ill_lock); - if (ILL_CAN_LOOKUP(ill)) { - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (ill); - } - mutex_exit(&ill->ill_lock); - } - } - rw_exit(&ipst->ips_ill_g_lock); - return (NULL); -} - -/* - * Temporary function to support IPMP emulation for IP Filter. - * Lookup an ill based on the group name. - * Skips unusable ones i.e. where any of these flags are set: - * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE) - */ -ill_t * -ill_group_lookup_on_name(char *name, boolean_t isv6, ip_stack_t *ipst) -{ - ill_t *ill; - phyint_t *phyi; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - phyi = phyint_lookup_group(name, B_TRUE, ipst); - if (phyi != NULL) { - ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4; - if (ill != NULL) { - mutex_enter(&ill->ill_lock); - if (ILL_CAN_LOOKUP(ill)) { - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (ill); - } - mutex_exit(&ill->ill_lock); - } - } - rw_exit(&ipst->ips_ill_g_lock); - return (NULL); -} diff --git a/usr/src/uts/common/inet/ip/ip_opt_data.c b/usr/src/uts/common/inet/ip/ip_opt_data.c index bb6e98a99e..1c91ea667f 100644 --- a/usr/src/uts/common/inet/ip/ip_opt_data.c +++ b/usr/src/uts/common/inet/ip/ip_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -119,9 +119,6 @@ opdes_t ip_opt_arr[] = { { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, - sizeof (struct in_addr), 0 /* not initialized */ }, - { IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, @@ -199,12 +196,6 @@ opdes_t ip_opt_arr[] = { { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 /* no ifindex */ }, - -{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 /* no ifindex */ }, - { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, diff --git a/usr/src/uts/common/inet/ip/ip_rts.c b/usr/src/uts/common/inet/ip/ip_rts.c index 3324d1d833..77ab2cc220 100644 --- a/usr/src/uts/common/inet/ip/ip_rts.c +++ b/usr/src/uts/common/inet/ip/ip_rts.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -93,34 +93,52 @@ static void rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics); static void ip_rts_request_retry(ipsq_t *, queue_t *q, mblk_t *mp, void *); /* - * Send the ack to all the routing queues. In case of the originating queue, - * send it only if the loopback is set. - * - * Messages are sent upstream only on routing sockets that did not specify an - * address family when they were created or when the address family matches the - * one specified by the caller. + * Send `mp' to all eligible routing queues. A queue is ineligible if: * + * 1. SO_USELOOPBACK is off and it is not the originating queue. + * 2. RTAW_UNDER_IPMP is on and RTSQ_UNDER_IPMP is clear in `flags'. + * 3. RTAW_UNDER_IPMP is off and RTSQ_NORMAL is clear in `flags'. + * 4. It is not the same address family as `af', and `af' isn't AF_UNSPEC. 
*/ void -rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst) +rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, + ip_stack_t *ipst) { mblk_t *mp1; conn_t *connp, *next_connp; + /* + * Since we don't have an ill_t here, RTSQ_DEFAULT must already be + * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP by now. + */ + ASSERT(!(flags & RTSQ_DEFAULT)); + mutex_enter(&ipst->ips_rts_clients->connf_lock); connp = ipst->ips_rts_clients->connf_head; - while (connp != NULL) { + for (; connp != NULL; connp = next_connp) { + next_connp = connp->conn_next; + /* * If there was a family specified when this routing socket was * created and it doesn't match the family of the message to * copy, then continue. */ if ((connp->conn_proto != AF_UNSPEC) && - (connp->conn_proto != af)) { - connp = connp->conn_next; + (connp->conn_proto != af)) continue; + + /* + * Queue the message only if the conn_t and flags match. + */ + if (connp->conn_rtaware & RTAW_UNDER_IPMP) { + if (!(flags & RTSQ_UNDER_IPMP)) + continue; + } else { + if (!(flags & RTSQ_NORMAL)) + continue; } + /* * For the originating queue, we only copy the message upstream * if loopback is set. For others reading on the routing @@ -128,8 +146,8 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst) * message. */ if ((o_connp == connp) && connp->conn_loopback == 0) { - connp = connp->conn_next; - continue; + connp = connp->conn_next; + continue; } CONN_INC_REF(connp); mutex_exit(&ipst->ips_rts_clients->connf_lock); @@ -145,10 +163,9 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst) } mutex_enter(&ipst->ips_rts_clients->connf_lock); - /* Follow the next pointer before releasing the conn. 
*/ + /* reload next_connp since conn_next may have changed */ next_connp = connp->conn_next; CONN_DEC_REF(connp); - connp = next_connp; } mutex_exit(&ipst->ips_rts_clients->connf_lock); freemsg(mp); @@ -209,7 +226,7 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) rtm->rtm_errno = error; else rtm->rtm_flags |= RTF_DONE; - rts_queue_input(mp, NULL, af, ipst); + rts_queue_input(mp, NULL, af, RTSQ_ALL, ipst); } /* ARGSUSED */ @@ -430,7 +447,7 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) if (index != 0) { ill_t *ill; - +lookup: /* * IPC must be refheld somewhere in ip_wput_nondata or * ip_wput_ioctl etc... and cleaned up if ioctl is killed. @@ -445,16 +462,33 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) goto done; } - ipif = ipif_get_next_ipif(NULL, ill); - ill_refrele(ill); /* - * If this is replacement ipif, prevent a route from - * being added. + * Since all interfaces in an IPMP group must be equivalent, + * we prevent changes to a specific underlying interface's + * routing configuration. However, for backward compatibility, + * we interpret a request to add a route on an underlying + * interface as a request to add a route on its IPMP interface. 
*/ - if (ipif != NULL && ipif->ipif_replace_zero) { - error = ENETDOWN; - goto done; + if (IS_UNDER_IPMP(ill)) { + switch (rtm->rtm_type) { + case RTM_CHANGE: + case RTM_DELETE: + ill_refrele(ill); + error = EINVAL; + goto done; + case RTM_ADD: + index = ipmp_ill_get_ipmp_ifindex(ill); + ill_refrele(ill); + if (index == 0) { + error = EINVAL; + goto done; + } + goto lookup; + } } + + ipif = ipif_get_next_ipif(NULL, ill); + ill_refrele(ill); match_flags |= MATCH_IRE_ILL; } @@ -1037,7 +1071,7 @@ done: /* OK ACK already set up by caller except this */ ip2dbg(("ip_rts_request: OK ACK\n")); } - rts_queue_input(mp, connp, af, ipst); + rts_queue_input(mp, connp, af, RTSQ_ALL, ipst); } iocp->ioc_error = error; @@ -1724,7 +1758,7 @@ ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask, rtm->rtm_errno = error; rtm->rtm_flags |= RTF_DONE; rtm->rtm_addrs = rtm_addrs; - rts_queue_input(mp, NULL, AF_INET, ipst); + rts_queue_input(mp, NULL, AF_INET, RTSQ_ALL, ipst); } /* @@ -1733,7 +1767,13 @@ ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask, * Message type generated RTM_IFINFO. */ void -ip_rts_ifmsg(const ipif_t *ipif) +ip_rts_ifmsg(const ipif_t *ipif, uint_t flags) +{ + ip_rts_xifmsg(ipif, 0, 0, flags); +} + +void +ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags) { if_msghdr_t *ifm; mblk_t *mp; @@ -1741,12 +1781,12 @@ ip_rts_ifmsg(const ipif_t *ipif) ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; /* - * This message should be generated only - * when the physical device is changing - * state. + * This message should be generated only when the physical interface + * is changing state. 
*/ if (ipif->ipif_id != 0) return; + if (ipif->ipif_isv6) { af = AF_INET6; mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0); @@ -1765,11 +1805,22 @@ ip_rts_ifmsg(const ipif_t *ipif) } ifm = (if_msghdr_t *)mp->b_rptr; ifm->ifm_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; - ifm->ifm_flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags | - ipif->ipif_ill->ill_phyint->phyint_flags; + ifm->ifm_flags = (ipif->ipif_flags | ipif->ipif_ill->ill_flags | + ipif->ipif_ill->ill_phyint->phyint_flags | set) & ~clear; rts_getifdata(&ifm->ifm_data, ipif); ifm->ifm_addrs = RTA_IFP; - rts_queue_input(mp, NULL, af, ipst); + + if (flags & RTSQ_DEFAULT) { + flags = RTSQ_ALL; + /* + * If this message is for an underlying interface, prevent + * "normal" (IPMP-unaware) routing sockets from seeing it. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + flags &= ~RTSQ_NORMAL; + } + + rts_queue_input(mp, NULL, af, flags, ipst); } /* @@ -1778,7 +1829,7 @@ ip_rts_ifmsg(const ipif_t *ipif) * The structure of the code is based on the 4.4BSD-Lite2 <net/rtsock.c>. */ void -ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif) +ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags) { int pass; int ncmd; @@ -1793,6 +1844,17 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif) af = AF_INET6; else af = AF_INET; + + if (flags & RTSQ_DEFAULT) { + flags = RTSQ_ALL; + /* + * If this message is for an underlying interface, prevent + * "normal" (IPMP-unaware) routing sockets from seeing it. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + flags &= ~RTSQ_NORMAL; + } + /* * If the request is DELETE, send RTM_DELETE and RTM_DELADDR. * if the request is ADD, send RTM_NEWADDR and RTM_ADD. @@ -1827,7 +1889,7 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif) ifam->ifam_metric = ipif->ipif_metric; ifam->ifam_flags = ((cmd == RTM_ADD) ? 
RTF_UP : 0); ifam->ifam_addrs = rtm_addrs; - rts_queue_input(mp, NULL, af, ipst); + rts_queue_input(mp, NULL, af, flags, ipst); } if ((cmd == RTM_ADD && pass == 2) || (cmd == RTM_DELETE && pass == 1)) { @@ -1857,7 +1919,7 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif) if (error == 0) rtm->rtm_flags |= RTF_DONE; rtm->rtm_addrs = rtm_addrs; - rts_queue_input(mp, NULL, af, ipst); + rts_queue_input(mp, NULL, af, flags, ipst); } } } diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index 59ddb7461f..5afa70160d 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -2322,11 +2322,8 @@ ipcl_conn_cleanup(conn_t *connp) * We should replace these pointers with ifindex/ipaddr_t to * make the code less complex. */ - ASSERT(connp->conn_xmit_if_ill == NULL); - ASSERT(connp->conn_nofailover_ill == NULL); ASSERT(connp->conn_outgoing_ill == NULL); ASSERT(connp->conn_incoming_ill == NULL); - ASSERT(connp->conn_outgoing_pill == NULL); ASSERT(connp->conn_multicast_ipif == NULL); ASSERT(connp->conn_multicast_ill == NULL); #endif diff --git a/usr/src/uts/common/inet/ip/ipmp.c b/usr/src/uts/common/inet/ip/ipmp.c new file mode 100644 index 0000000000..b8f3768834 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ipmp.c @@ -0,0 +1,2201 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <inet/arp.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_multi.h> +#include <inet/ip_rts.h> +#include <inet/mi.h> +#include <net/if_types.h> +#include <sys/dlpi.h> +#include <sys/kmem.h> +#include <sys/modhash.h> +#include <sys/sdt.h> +#include <sys/strsun.h> +#include <sys/sunddi.h> +#include <sys/types.h> + +/* + * Convenience macros for getting the ip_stack_t associated with an + * ipmp_illgrp_t or ipmp_grp_t. + */ +#define IPMP_GRP_TO_IPST(grp) PHYINT_TO_IPST((grp)->gr_phyint) +#define IPMP_ILLGRP_TO_IPST(illg) ((illg)->ig_ipmp_ill->ill_ipst) + +/* + * Assorted constants that aren't important enough to be tunable. + */ +#define IPMP_GRP_HASH_SIZE 64 +#define IPMP_ILL_REFRESH_TIMEOUT 120 /* seconds */ + +/* + * Templates for IPMP ARP messages. + */ +static const arie_t ipmp_aract_template = { + AR_IPMP_ACTIVATE, + sizeof (arie_t), /* Name offset */ + sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ +}; + +static const arie_t ipmp_ardeact_template = { + AR_IPMP_DEACTIVATE, + sizeof (arie_t), /* Name offset */ + sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ +}; + +/* + * IPMP meta-interface kstats (based on those in PSARC/1997/198). 
+ */ +static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = { + { "obytes", KSTAT_DATA_UINT32 }, + { "obytes64", KSTAT_DATA_UINT64 }, + { "rbytes", KSTAT_DATA_UINT32 }, + { "rbytes64", KSTAT_DATA_UINT64 }, + { "opackets", KSTAT_DATA_UINT32 }, + { "opackets64", KSTAT_DATA_UINT64 }, + { "oerrors", KSTAT_DATA_UINT32 }, + { "ipackets", KSTAT_DATA_UINT32 }, + { "ipackets64", KSTAT_DATA_UINT64 }, + { "ierrors", KSTAT_DATA_UINT32 }, + { "multircv", KSTAT_DATA_UINT32 }, + { "multixmt", KSTAT_DATA_UINT32 }, + { "brdcstrcv", KSTAT_DATA_UINT32 }, + { "brdcstxmt", KSTAT_DATA_UINT32 }, + { "link_up", KSTAT_DATA_UINT32 } +}; + +static void ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t); +static int ipmp_grp_create_kstats(ipmp_grp_t *); +static int ipmp_grp_update_kstats(kstat_t *, int); +static void ipmp_grp_destroy_kstats(ipmp_grp_t *); +static ill_t *ipmp_illgrp_min_ill(ipmp_illgrp_t *); +static ill_t *ipmp_illgrp_max_ill(ipmp_illgrp_t *); +static void ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *); +static void ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t); +static boolean_t ipmp_ill_activate(ill_t *); +static void ipmp_ill_deactivate(ill_t *); +static void ipmp_ill_ire_mark_testhidden(ire_t *, char *); +static void ipmp_ill_ire_clear_testhidden(ire_t *, char *); +static void ipmp_ill_refresh_active_timer_start(ill_t *); +static void ipmp_ill_rtsaddrmsg(ill_t *, int); +static void ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action); +static ipif_t *ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t); +static void ipmp_phyint_get_kstats(phyint_t *, uint64_t *); +static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *); + +/* + * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init(). 
+ */ +void +ipmp_init(ip_stack_t *ipst) +{ + ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash", + IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, + mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); + rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0); +} + +/* + * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini(). + */ +void +ipmp_destroy(ip_stack_t *ipst) +{ + mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash); + rw_destroy(&ipst->ips_ipmp_lock); +} + +/* + * Create an IPMP group named `grname', associate it with IPMP phyint `phyi', + * and add it to the hash. On success, return a pointer to the created group. + * Caller must ensure `grname' is not yet in the hash. Assumes that the IPMP + * meta-interface associated with the group also has the same name (but they + * may differ later via ipmp_grp_rename()). + */ +ipmp_grp_t * +ipmp_grp_create(const char *grname, phyint_t *phyi) +{ + ipmp_grp_t *grp; + ip_stack_t *ipst = PHYINT_TO_IPST(phyi); + mod_hash_hndl_t mh; + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL) + return (NULL); + + (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); + (void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname)); + + /* + * Cache the group's phyint. This is safe since a phyint_t will + * outlive its ipmp_grp_t. + */ + grp->gr_phyint = phyi; + + /* + * Create IPMP group kstats. + */ + if (ipmp_grp_create_kstats(grp) != 0) { + kmem_free(grp, sizeof (ipmp_grp_t)); + return (NULL); + } + + /* + * Insert the group into the hash. + */ + if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) { + ipmp_grp_destroy_kstats(grp); + kmem_free(grp, sizeof (ipmp_grp_t)); + return (NULL); + } + ipmp_grp_insert(grp, mh); + + return (grp); +} + +/* + * Create IPMP kstat structures for `grp'. Return an errno upon failure. 
+ */ +static int +ipmp_grp_create_kstats(ipmp_grp_t *grp) +{ + kstat_t *ksp; + netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; + + ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net", + KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id); + if (ksp == NULL) + return (ENOMEM); + + ksp->ks_update = ipmp_grp_update_kstats; + ksp->ks_private = grp; + bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats)); + + kstat_install(ksp); + grp->gr_ksp = ksp; + return (0); +} + +/* + * Update the IPMP kstats tracked by `ksp'; called by the kstats framework. + */ +static int +ipmp_grp_update_kstats(kstat_t *ksp, int rw) +{ + uint_t i; + kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); + ipmp_grp_t *grp = ksp->ks_private; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + ipsq_t *ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq; + phyint_t *phyi; + uint64_t phyi_kstats[IPMP_KSTAT_MAX]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Start with the group's baseline values. + */ + for (i = 0; i < IPMP_KSTAT_MAX; i++) { + if (kn[i].data_type == KSTAT_DATA_UINT32) { + kn[i].value.ui32 = grp->gr_kstats0[i]; + } else { + ASSERT(kn[i].data_type == KSTAT_DATA_UINT64); + kn[i].value.ui64 = grp->gr_kstats0[i]; + } + } + + /* + * Add in the stats of each phyint currently in the group. Since we + * don't directly track the phyints in a group, we cheat by walking + * the IPSQ set under ill_g_lock. (The IPSQ list cannot change while + * ill_g_lock is held.) + */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + ipsq = grp_ipsq->ipsq_next; + for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) { + phyi = ipsq->ipsq_phyint; + + /* + * If a phyint in a group is being unplumbed, it's possible + * that ill_glist_delete() -> phyint_free() already freed the + * phyint (and set ipsq_phyint to NULL), but the unplumb + * operation has yet to complete (and thus ipsq_dq() has yet + * to remove the phyint's IPSQ from the group IPSQ's phyint + * list). 
We skip those phyints here (note that their kstats + * have already been added to gr_kstats0[]). + */ + if (phyi == NULL) + continue; + + ipmp_phyint_get_kstats(phyi, phyi_kstats); + + for (i = 0; i < IPMP_KSTAT_MAX; i++) { + phyi_kstats[i] -= phyi->phyint_kstats0[i]; + if (kn[i].data_type == KSTAT_DATA_UINT32) + kn[i].value.ui32 += phyi_kstats[i]; + else + kn[i].value.ui64 += phyi_kstats[i]; + } + } + + kn[IPMP_KSTAT_LINK_UP].value.ui32 = + (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0; + + rw_exit(&ipst->ips_ill_g_lock); + return (0); +} + +/* + * Destroy IPMP kstat structures for `grp'. + */ +static void +ipmp_grp_destroy_kstats(ipmp_grp_t *grp) +{ + netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; + + kstat_delete_netstack(grp->gr_ksp, id); + bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0)); + grp->gr_ksp = NULL; +} + +/* + * Look up an IPMP group named `grname' on IP stack `ipst'. Return NULL if it + * does not exist. + */ +ipmp_grp_t * +ipmp_grp_lookup(const char *grname, ip_stack_t *ipst) +{ + ipmp_grp_t *grp; + + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, + (mod_hash_val_t *)&grp) == 0) + return (grp); + + return (NULL); +} + +/* + * Place information about group `grp' into `lifgr'. + */ +void +ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr) +{ + ill_t *ill; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + lifgr->gi_v4 = (grp->gr_v4 != NULL); + lifgr->gi_v6 = (grp->gr_v6 != NULL); + lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4; + lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6; + lifgr->gi_mactype = grp->gr_nif > 0 ? 
grp->gr_mactype : SUNW_DL_IPMP; + (void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ); + lifgr->gi_m4ifname[0] = '\0'; + lifgr->gi_m6ifname[0] = '\0'; + lifgr->gi_bcifname[0] = '\0'; + + if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) { + (void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ); + (void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ); + } + + if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL) + (void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ); +} + +/* + * Insert `grp' into the hash using the reserved hash entry `mh'. + * Caller must ensure `grp' is not yet in the hash. + */ +static void +ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh) +{ + int err; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + /* + * Since grp->gr_name will exist at least as long as `grp' is in the + * hash, we use it directly as the key. + */ + err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash, + (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh); + if (err != 0) { + /* + * This should never happen since `mh' was preallocated. + */ + panic("cannot insert IPMP group \"%s\" (err %d)", + grp->gr_name, err); + } +} + +/* + * Remove `grp' from the hash. Caller must ensure `grp' is in it. + */ +static void +ipmp_grp_remove(ipmp_grp_t *grp) +{ + int err; + mod_hash_val_t val; + mod_hash_key_t key = (mod_hash_key_t)grp->gr_name; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val); + if (err != 0 || val != grp) { + panic("cannot remove IPMP group \"%s\" (err %d)", + grp->gr_name, err); + } +} + +/* + * Attempt to rename `grp' to new name `grname'. Return an errno if the new + * group name already exists or is invalid, or if there isn't enough memory. 
+ */ +int +ipmp_grp_rename(ipmp_grp_t *grp, const char *grname) +{ + mod_hash_hndl_t mh; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + if (grname[0] == '\0') + return (EINVAL); + + if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, + (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND) + return (EEXIST); + + /* + * Before we remove the group from the hash, ensure we'll be able to + * re-insert it by reserving space. + */ + if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) + return (ENOMEM); + + ipmp_grp_remove(grp); + (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); + ipmp_grp_insert(grp, mh); + + return (0); +} + +/* + * Destroy `grp' and remove it from the hash. Caller must ensure `grp' is in + * the hash, and that there are no interfaces on it. + */ +void +ipmp_grp_destroy(ipmp_grp_t *grp) +{ + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + /* + * If there are still interfaces using this group, panic before things + * go really off the rails. + */ + if (grp->gr_nif != 0) + panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name); + + ipmp_grp_remove(grp); + ipmp_grp_destroy_kstats(grp); + + ASSERT(grp->gr_v4 == NULL); + ASSERT(grp->gr_v6 == NULL); + ASSERT(grp->gr_nv4 == 0); + ASSERT(grp->gr_nv6 == 0); + ASSERT(grp->gr_nactif == 0); + ASSERT(grp->gr_linkdownmp == NULL); + grp->gr_phyint = NULL; + + kmem_free(grp, sizeof (ipmp_grp_t)); +} + +/* + * Check whether `ill' is suitable for inclusion into `grp', and return an + * errno describing the problem (if any). NOTE: many of these errno values + * are interpreted by ifconfig, which will take corrective action and retry + * the SIOCSLIFGROUPNAME, so please exercise care when changing them. 
+ */ +static int +ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill) +{ + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + /* + * To sidestep complicated address migration logic in the kernel and + * to force the kernel's all-hosts multicast memberships to be blown + * away, all addresses that had been brought up must be brought back + * down prior to adding an interface to a group. (This includes + * addresses currently down due to DAD.) Once the interface has been + * added to the group, its addresses can then be brought back up, at + * which point they will be moved to the IPMP meta-interface. + * NOTE: we do this before ill_appaddr_cnt() since bringing down the + * link-local causes in.ndpd to remove its ADDRCONF'd addresses. + */ + if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) + return (EADDRINUSE); + + /* + * To avoid confusing applications by changing addresses that are + * under their control, all such control must be removed prior to + * adding an interface into a group. + */ + if (ill_appaddr_cnt(ill) != 0) + return (EADDRNOTAVAIL); + + /* + * Since PTP addresses do not share the same broadcast domain, they + * are not allowed to be in an IPMP group. + */ + if (ill_ptpaddr_cnt(ill) != 0) + return (EINVAL); + + /* + * An ill must support multicast to be allowed into a group. + */ + if (!(ill->ill_flags & ILLF_MULTICAST)) + return (ENOTSUP); + + /* + * An ill must strictly be using ARP and/or ND for address + * resolution for it to be allowed into a group. + */ + if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP | ILLF_XRESOLV)) + return (ENOTSUP); + + /* + * An ill cannot also be using usesrc groups. (Although usesrc uses + * ill_g_usesrc_lock, we don't need to grab it since usesrc also does + * all its modifications as writer.) + */ + if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill)) + return (ENOTSUP); + + /* + * All ills in a group must be the same mactype. 
+ */ + if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype) + return (EINVAL); + + return (0); +} + +/* + * Check whether `phyi' is suitable for inclusion into `grp', and return an + * errno describing the problem (if any). See comment above ipmp_grp_vet_ill() + * regarding errno values. + */ +int +ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi) +{ + int err = 0; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq)); + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + /* + * An interface cannot have address families plumbed that are not + * configured in the group. + */ + if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL || + phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL) + return (EAFNOSUPPORT); + + if (phyi->phyint_illv4 != NULL) + err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4); + if (err == 0 && phyi->phyint_illv6 != NULL) + err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6); + + return (err); +} + +/* + * Create a new illgrp on IPMP meta-interface `ill'. + */ +ipmp_illgrp_t * +ipmp_illgrp_create(ill_t *ill) +{ + uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; + ipmp_illgrp_t *illg; + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(IS_IPMP(ill)); + ASSERT(ill->ill_grp == NULL); + + if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL) + return (NULL); + + list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode)); + list_create(&illg->ig_actif, sizeof (ill_t), + offsetof(ill_t, ill_actnode)); + list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t), + offsetof(ipmp_arpent_t, ia_node)); + + illg->ig_ipmp_ill = ill; + ill->ill_grp = illg; + ipmp_illgrp_set_mtu(illg, mtu); + + return (illg); +} + +/* + * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface. + */ +void +ipmp_illgrp_destroy(ipmp_illgrp_t *illg) +{ + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + ASSERT(IS_IPMP(illg->ig_ipmp_ill)); + + /* + * Verify `illg' is empty. 
+ */ + ASSERT(illg->ig_next_ill == NULL); + ASSERT(illg->ig_cast_ill == NULL); + ASSERT(list_is_empty(&illg->ig_arpent)); + ASSERT(list_is_empty(&illg->ig_if)); + ASSERT(list_is_empty(&illg->ig_actif)); + ASSERT(illg->ig_nactif == 0); + + /* + * Destroy `illg'. + */ + illg->ig_ipmp_ill->ill_grp = NULL; + illg->ig_ipmp_ill = NULL; + list_destroy(&illg->ig_if); + list_destroy(&illg->ig_actif); + list_destroy(&illg->ig_arpent); + kmem_free(illg, sizeof (ipmp_illgrp_t)); +} + +/* + * Add `ipif' to the pool of usable data addresses on `illg' and attempt to + * bind it to an underlying ill, while keeping an even address distribution. + * If the bind is successful, return a pointer to the bound ill. + */ +ill_t * +ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) +{ + ill_t *minill; + ipmp_arpent_t *entp; + + ASSERT(IAM_WRITER_IPIF(ipif)); + ASSERT(ipmp_ipif_is_dataaddr(ipif)); + + /* + * IPMP data address mappings are internally managed by IP itself, so + * delete any existing ARP entries associated with the address. + */ + if (!ipif->ipif_isv6) { + entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr); + if (entp != NULL) + ipmp_illgrp_destroy_arpent(illg, entp); + } + + if ((minill = ipmp_illgrp_min_ill(illg)) != NULL) + ipmp_ill_bind_ipif(minill, ipif, Res_act_none); + + return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL); +} + +/* + * Delete `ipif' from the pool of usable data addresses on `illg'. If it's + * bound, unbind it from the underlying ill while keeping an even address + * distribution. 
+ */ +void +ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) +{ + ill_t *maxill, *boundill = ipif->ipif_bound_ill; + + ASSERT(IAM_WRITER_IPIF(ipif)); + + if (boundill != NULL) { + (void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE); + + maxill = ipmp_illgrp_max_ill(illg); + if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) { + ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); + ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind); + } + } +} + +/* + * Return the active ill with the greatest number of data addresses in `illg'. + */ +static ill_t * +ipmp_illgrp_max_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill, *bestill = NULL; + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + ill = list_head(&illg->ig_actif); + for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { + if (bestill == NULL || + ill->ill_bound_cnt > bestill->ill_bound_cnt) { + bestill = ill; + } + } + return (bestill); +} + +/* + * Return the active ill with the fewest number of data addresses in `illg'. + */ +static ill_t * +ipmp_illgrp_min_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill, *bestill = NULL; + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + ill = list_head(&illg->ig_actif); + for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { + if (bestill == NULL || + ill->ill_bound_cnt < bestill->ill_bound_cnt) { + if (ill->ill_bound_cnt == 0) + return (ill); /* can't get better */ + bestill = ill; + } + } + return (bestill); +} + +/* + * Return a pointer to IPMP meta-interface for `illg' (which must exist). + * Since ig_ipmp_ill never changes for a given illg, no locks are needed. + */ +ill_t * +ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg) +{ + return (illg->ig_ipmp_ill); +} + +/* + * Return a pointer to the next available underlying ill in `illg', or NULL if + * one doesn't exist. Caller must be inside the IPSQ. 
+ */ +ill_t * +ipmp_illgrp_next_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + if ((ill = illg->ig_next_ill) != NULL) { + illg->ig_next_ill = list_next(&illg->ig_actif, ill); + if (illg->ig_next_ill == NULL) + illg->ig_next_ill = list_head(&illg->ig_actif); + } + rw_exit(&ipst->ips_ipmp_lock); + + return (ill); +} + +/* + * Return a held pointer to the next available underlying ill in `illg', or + * NULL if one doesn't exist. Caller need not be inside the IPSQ. + */ +ill_t * +ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill; + uint_t i; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + for (i = 0; i < illg->ig_nactif; i++) { + ill = illg->ig_next_ill; + illg->ig_next_ill = list_next(&illg->ig_actif, ill); + if (illg->ig_next_ill == NULL) + illg->ig_next_ill = list_head(&illg->ig_actif); + + if (ILL_CAN_LOOKUP(ill)) { + ill_refhold(ill); + rw_exit(&ipst->ips_ipmp_lock); + return (ill); + } + } + rw_exit(&ipst->ips_ipmp_lock); + + return (NULL); +} + +/* + * Return a pointer to the nominated multicast ill in `illg', or NULL if one + * doesn't exist. Caller must be inside the IPSQ. + */ +ill_t * +ipmp_illgrp_cast_ill(ipmp_illgrp_t *illg) +{ + /* + * Since an IPMP ill's ill_grp gets cleared during I_PUNLINK but + * this function can get called after that point, handle NULL. + */ + if (illg == NULL) + return (NULL); + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + return (illg->ig_cast_ill); +} + +/* + * Return a held pointer to the nominated multicast ill in `illg', or NULL if + * one doesn't exist. Caller need not be inside the IPSQ. 
+ */ +ill_t * +ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg) +{ + ill_t *castill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + castill = illg->ig_cast_ill; + if (castill != NULL && ILL_CAN_LOOKUP(castill)) { + ill_refhold(castill); + rw_exit(&ipst->ips_ipmp_lock); + return (castill); + } + rw_exit(&ipst->ips_ipmp_lock); + return (NULL); +} + +/* + * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL, + * any existing nomination is removed. Caller must be inside the IPSQ. + */ +static void +ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) +{ + ill_t *ocastill = illg->ig_cast_ill; + ill_t *ipmp_ill = illg->ig_ipmp_ill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IAM_WRITER_ILL(ipmp_ill)); + + /* + * Disable old nominated ill (if any). + */ + if (ocastill != NULL) { + DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *, + illg, ill_t *, ocastill); + ASSERT(ocastill->ill_nom_cast); + ocastill->ill_nom_cast = B_FALSE; + /* + * If the IPMP meta-interface is down, we never did the join, + * so we must not try to leave. + */ + if (ipmp_ill->ill_dl_up) + ill_leave_multicast(ipmp_ill); + } + + /* + * Set new nomination. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + illg->ig_cast_ill = castill; + rw_exit(&ipst->ips_ipmp_lock); + + if (ocastill != NULL) { + /* + * Delete any IREs tied to the old nomination. We must do + * this after the new castill is set and has reached global + * visibility since the datapath has not been quiesced. + */ + ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, + ill_stq_cache_delete, ocastill, ocastill); + } + + /* + * Enable new nominated ill (if any). 
+ */ + if (castill != NULL) { + DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *, + illg, ill_t *, castill); + ASSERT(!castill->ill_nom_cast); + castill->ill_nom_cast = B_TRUE; + /* + * If the IPMP meta-interface is down, the attempt to recover + * will silently fail but ill_need_recover_multicast will be + * erroneously cleared -- so check first. + */ + if (ipmp_ill->ill_dl_up) + ill_recover_multicast(ipmp_ill); + } + + /* + * For IPv4, refresh our broadcast IREs. This needs to be done even + * if there's no new nomination since ill_refresh_bcast() still must + * update the IPMP meta-interface's broadcast IREs to point back at + * the IPMP meta-interface itself. + */ + if (!ipmp_ill->ill_isv6) + ill_refresh_bcast(ipmp_ill); +} + +/* + * Create an IPMP ARP entry and add it to the set tracked on `illg'. If an + * entry for the same IP address already exists, destroy it first. Return the + * created IPMP ARP entry, or NULL on failure. + */ +ipmp_arpent_t * +ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, mblk_t *mp, boolean_t proxyarp) +{ + uchar_t *addrp; + area_t *area = (area_t *)mp->b_rptr; + ipmp_arpent_t *entp, *oentp; + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + ASSERT(area->area_proto_addr_length == sizeof (ipaddr_t)); + + if ((entp = kmem_zalloc(sizeof (ipmp_arpent_t), KM_NOSLEEP)) == NULL) + return (NULL); + + if ((mp = copyb(mp)) == NULL) { + kmem_free(entp, sizeof (ipmp_arpent_t)); + return (NULL); + } + + DB_TYPE(mp) = M_PROTO; + entp->ia_area_mp = mp; + entp->ia_proxyarp = proxyarp; + addrp = mi_offset_paramc(mp, area->area_proto_addr_offset, + sizeof (ipaddr_t)); + bcopy(addrp, &entp->ia_ipaddr, sizeof (ipaddr_t)); + + if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL) + ipmp_illgrp_destroy_arpent(illg, oentp); + + list_insert_head(&illg->ig_arpent, entp); + return (entp); +} + +/* + * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it. 
+ */ +void +ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) +{ + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + list_remove(&illg->ig_arpent, entp); + freeb(entp->ia_area_mp); + kmem_free(entp, sizeof (ipmp_arpent_t)); +} + +/* + * Mark that ARP has been notified about the IP address on `entp'; `illg' is + * taken as a debugging aid for DTrace FBT probes. + */ +/* ARGSUSED */ +void +ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) +{ + entp->ia_notified = B_TRUE; +} + +/* + * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is + * NULL, any IPMP ARP entry is requested. Return NULL if it does not exist. + */ +ipmp_arpent_t * +ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp) +{ + ipmp_arpent_t *entp = list_head(&illg->ig_arpent); + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + if (addrp == NULL) + return (entp); + + for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) + if (entp->ia_ipaddr == *addrp) + break; + return (entp); +} + +/* + * Refresh ARP entries on `illg' to be distributed across its active + * interfaces. Entries that cannot be refreshed (e.g., because there are no + * active interfaces) are marked so that subsequent calls can try again. 
+ */ +void +ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg) +{ + ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill; + uint_t paddrlen = ipmp_ill->ill_phys_addr_length; + area_t *area; + mblk_t *area_mp; + uchar_t *physaddr; + ipmp_arpent_t *entp; + + ASSERT(IAM_WRITER_ILL(ipmp_ill)); + ASSERT(!ipmp_ill->ill_isv6); + + ill = list_head(&illg->ig_actif); + entp = list_head(&illg->ig_arpent); + for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) { + if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) { + entp->ia_notified = B_FALSE; + continue; + } + + area = (area_t *)entp->ia_area_mp->b_rptr; + ASSERT(paddrlen == ill->ill_phys_addr_length); + ASSERT(paddrlen == area->area_hw_addr_length); + physaddr = mi_offset_paramc(entp->ia_area_mp, + area->area_hw_addr_offset, paddrlen); + + /* + * If this is a proxy ARP entry, we can skip notifying ARP if + * the entry is already up-to-date. If it has changed, we + * update the entry's hardware address before notifying ARP. + */ + if (entp->ia_proxyarp) { + if (bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0 && + entp->ia_notified) + continue; + bcopy(ill->ill_phys_addr, physaddr, paddrlen); + } + + if ((area_mp = copyb(entp->ia_area_mp)) == NULL) { + entp->ia_notified = B_FALSE; + continue; + } + + putnext(ipmp_ill->ill_rq, area_mp); + ipmp_illgrp_mark_arpent(illg, entp); + + if ((ill = list_next(&illg->ig_actif, ill)) == NULL) + ill = list_head(&illg->ig_actif); + } +} + +/* + * Return an interface in `illg' with the specified `physaddr', or NULL if one + * doesn't exist. Caller must hold ill_g_lock if it's not inside the IPSQ. 
+ */ +ill_t * +ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen) +{ + ill_t *ill; + ill_t *ipmp_ill = illg->ig_ipmp_ill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock)); + + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { + if (ill->ill_phys_addr_length == paddrlen && + bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0) + return (ill); + } + return (NULL); +} + +/* + * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND. + * Caller must be inside the IPSQ unless this is initialization. + */ +static void +ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu) +{ + ill_t *ill = illg->ig_ipmp_ill; + mblk_t *mp; + + ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill)); + + /* + * If allocation fails, we have bigger problems than MTU. + */ + if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) { + illg->ig_mtu = mtu; + put(ill->ill_rq, mp); + } +} + +/* + * Recalculate the IPMP group MTU for `illg', and update its associated IPMP + * ill MTU if necessary. + */ +void +ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg) +{ + ill_t *ill; + ill_t *ipmp_ill = illg->ig_ipmp_ill; + uint_t mtu = 0; + + ASSERT(IAM_WRITER_ILL(ipmp_ill)); + + /* + * Since ill_max_mtu can only change under ill_lock, we hold ill_lock + * for each ill as we iterate through the list. Any changes to the + * ill_max_mtu will also trigger an update, so even if we missed it + * this time around, the update will catch it. + */ + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { + mutex_enter(&ill->ill_lock); + if (mtu == 0 || ill->ill_max_mtu < mtu) + mtu = ill->ill_max_mtu; + mutex_exit(&ill->ill_lock); + } + + /* + * MTU must be at least the minimum MTU. + */ + mtu = MAX(mtu, ipmp_ill->ill_isv6 ? 
IPV6_MIN_MTU : IP_MIN_MTU); + + if (illg->ig_mtu != mtu) + ipmp_illgrp_set_mtu(illg, mtu); +} + +/* + * Link illgrp `illg' to IPMP group `grp'. To simplify the caller, silently + * allow the same link to be established more than once. + */ +void +ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp) +{ + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + if (illg->ig_ipmp_ill->ill_isv6) { + ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg); + grp->gr_v6 = illg; + } else { + ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg); + grp->gr_v4 = illg; + } +} + +/* + * Unlink illgrp `illg' from its IPMP group. Return an errno if the illgrp + * cannot be unlinked (e.g., because there are still interfaces using it). + */ +int +ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg) +{ + ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + if (illg->ig_ipmp_ill->ill_isv6) { + if (grp->gr_nv6 + grp->gr_pendv6 != 0) + return (EBUSY); + grp->gr_v6 = NULL; + } else { + if (grp->gr_nv4 + grp->gr_pendv4 != 0) + return (EBUSY); + grp->gr_v4 = NULL; + } + return (0); +} + +/* + * Place `ill' into `illg', and rebalance the data addresses on `illg' + * to be spread evenly across the ills now in it. Also, adjust the IPMP + * ill as necessary to account for `ill' (e.g., MTU). + */ +void +ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) +{ + ill_t *ipmp_ill; + ipif_t *ipif; + ip_stack_t *ipst = ill->ill_ipst; + + /* IS_UNDER_IPMP() requires ill_grp to be non-NULL */ + ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL); + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(ill->ill_grp == NULL); + + ipmp_ill = illg->ig_ipmp_ill; + + /* + * Account for `ill' joining the illgrp. 
+ */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + if (ill->ill_isv6) + ill->ill_phyint->phyint_grp->gr_nv6++; + else + ill->ill_phyint->phyint_grp->gr_nv4++; + rw_exit(&ipst->ips_ipmp_lock); + + /* + * Ensure the ILLF_ROUTER flag remains consistent across the group. + */ + mutex_enter(&ill->ill_lock); + if (ipmp_ill->ill_flags & ILLF_ROUTER) + ill->ill_flags |= ILLF_ROUTER; + else + ill->ill_flags &= ~ILLF_ROUTER; + mutex_exit(&ill->ill_lock); + + /* + * Blow away all multicast memberships that currently exist on `ill'. + * This may seem odd, but it's consistent with the application view + * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()). + */ + if (ill->ill_isv6) { + reset_conn_ill(ill); + reset_mrt_ill(ill); + } else { + ipif = ill->ill_ipif; + for (; ipif != NULL; ipif = ipif->ipif_next) { + reset_conn_ipif(ipif); + reset_mrt_vif_ipif(ipif); + } + } + ip_purge_allmulti(ill); + + /* + * Borrow the first ill's ill_phys_addr_length value for the illgrp's + * physical address length. All other ills must have the same value, + * since they are required to all be the same mactype. Also update + * the IPMP ill's MTU and CoS marking, if necessary. + */ + if (list_is_empty(&illg->ig_if)) { + ASSERT(ipmp_ill->ill_phys_addr_length == 0); + /* + * NOTE: we leave ill_phys_addr NULL since the IPMP group + * doesn't have a physical address. This means that code must + * not assume that ill_phys_addr is non-NULL just because + * ill_phys_addr_length is non-zero. Likewise for ill_nd_lla. 
+ */ + ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length; + ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length; + ipmp_ill->ill_type = ill->ill_type; + + if (ill->ill_flags & ILLF_COS_ENABLED) { + mutex_enter(&ipmp_ill->ill_lock); + ipmp_ill->ill_flags |= ILLF_COS_ENABLED; + mutex_exit(&ipmp_ill->ill_lock); + } + ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu); + } else { + ASSERT(ipmp_ill->ill_phys_addr_length == + ill->ill_phys_addr_length); + ASSERT(ipmp_ill->ill_type == ill->ill_type); + + if (!(ill->ill_flags & ILLF_COS_ENABLED)) { + mutex_enter(&ipmp_ill->ill_lock); + ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED; + mutex_exit(&ipmp_ill->ill_lock); + } + if (illg->ig_mtu > ill->ill_max_mtu) + ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu); + } + + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + list_insert_tail(&illg->ig_if, ill); + ill->ill_grp = illg; + rw_exit(&ipst->ips_ill_g_lock); + + /* + * Hide the IREs on `ill' so that we don't accidentally find them when + * sending data traffic. + */ + ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill); + + /* + * Merge any broadcast IREs, if need be. + */ + if (!ill->ill_isv6) + ill_refresh_bcast(ill); + + ipmp_ill_refresh_active(ill); +} + +/* + * Remove `ill' from its illgrp, and rebalance the data addresses in that + * illgrp to be spread evenly across the remaining ills. Also, adjust the + * IPMP ill as necessary now that `ill' is removed (e.g., MTU). + */ +void +ipmp_ill_leave_illgrp(ill_t *ill) +{ + ill_t *ipmp_ill; + ipif_t *ipif; + ipmp_arpent_t *entp; + ipmp_illgrp_t *illg = ill->ill_grp; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IS_UNDER_IPMP(ill)); + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(illg != NULL); + + ipmp_ill = illg->ig_ipmp_ill; + + /* + * Cancel IPMP-specific ill timeouts. + */ + (void) untimeout(ill->ill_refresh_tid); + + /* + * Expose any previously-hidden IREs on `ill'. 
+ */ + ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill); + + /* + * Ensure the multicast state for each ipif on `ill' is down so that + * our ipif_multicast_up() (once `ill' leaves the group) will rejoin + * all eligible groups. + */ + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) + if (ipif->ipif_flags & IPIF_UP) + ipif_multicast_down(ipif); + + /* + * Account for `ill' leaving the illgrp. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + if (ill->ill_isv6) + ill->ill_phyint->phyint_grp->gr_nv6--; + else + ill->ill_phyint->phyint_grp->gr_nv4--; + rw_exit(&ipst->ips_ipmp_lock); + + /* + * Pull `ill' out of the interface lists. + */ + if (list_link_active(&ill->ill_actnode)) + ipmp_ill_deactivate(ill); + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + list_remove(&illg->ig_if, ill); + ill->ill_grp = NULL; + rw_exit(&ipst->ips_ill_g_lock); + + /* + * Recreate any broadcast IREs that had been shared, if need be. + */ + if (!ill->ill_isv6) + ill_refresh_bcast(ill); + + /* + * Re-establish multicast memberships that were previously being + * handled by the IPMP meta-interface. + */ + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) + if (ipif->ipif_flags & IPIF_UP) + ipif_multicast_up(ipif); + + /* + * Refresh the group MTU based on the new interface list. + */ + ipmp_illgrp_refresh_mtu(illg); + + if (list_is_empty(&illg->ig_if)) { + /* + * No ills left in the illgrp; we no longer have a physical + * address length, nor can we support ARP, CoS, or anything + * else that depends on knowing the link layer type. + */ + while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL) + ipmp_illgrp_destroy_arpent(illg, entp); + + ipmp_ill->ill_phys_addr_length = 0; + ipmp_ill->ill_nd_lla_len = 0; + ipmp_ill->ill_type = IFT_OTHER; + mutex_enter(&ipmp_ill->ill_lock); + ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED; + mutex_exit(&ipmp_ill->ill_lock); + } else { + /* + * If `ill' didn't support CoS, see if it can now be enabled. 
+ */ + if (!(ill->ill_flags & ILLF_COS_ENABLED)) { + ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED)); + + ill = list_head(&illg->ig_if); + do { + if (!(ill->ill_flags & ILLF_COS_ENABLED)) + break; + } while ((ill = list_next(&illg->ig_if, ill)) != NULL); + + if (ill == NULL) { + mutex_enter(&ipmp_ill->ill_lock); + ipmp_ill->ill_flags |= ILLF_COS_ENABLED; + mutex_exit(&ipmp_ill->ill_lock); + } + } + } +} + +/* + * Check if `ill' should be active, and activate or deactivate if need be. + * Return B_FALSE if a refresh was necessary but could not be performed. + */ +static boolean_t +ipmp_ill_try_refresh_active(ill_t *ill) +{ + boolean_t refreshed = B_TRUE; + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(IS_UNDER_IPMP(ill)); + + if (ipmp_ill_is_active(ill)) { + if (!list_link_active(&ill->ill_actnode)) + refreshed = ipmp_ill_activate(ill); + } else { + if (list_link_active(&ill->ill_actnode)) + ipmp_ill_deactivate(ill); + } + + return (refreshed); +} + +/* + * Check if `ill' should be active, and activate or deactivate if need be. + * If the refresh fails, schedule a timer to try again later. + */ +void +ipmp_ill_refresh_active(ill_t *ill) +{ + if (!ipmp_ill_try_refresh_active(ill)) + ipmp_ill_refresh_active_timer_start(ill); +} + +/* + * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'. + */ +static void +ipmp_ill_refresh_active_timer(void *ill_arg) +{ + ill_t *ill = ill_arg; + boolean_t refreshed = B_FALSE; + + /* + * Clear ill_refresh_tid to indicate that no timeout is pending + * (another thread could schedule a new timeout while we're still + * running, but that's harmless). If the ill is going away, bail. 
+ */ + mutex_enter(&ill->ill_lock); + ill->ill_refresh_tid = 0; + if (ill->ill_state_flags & ILL_CONDEMNED) { + mutex_exit(&ill->ill_lock); + return; + } + mutex_exit(&ill->ill_lock); + + if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) { + refreshed = ipmp_ill_try_refresh_active(ill); + ipsq_exit(ill->ill_phyint->phyint_ipsq); + } + + /* + * If the refresh failed, schedule another attempt. + */ + if (!refreshed) + ipmp_ill_refresh_active_timer_start(ill); +} + +/* + * Retry an ipmp_ill_try_refresh_active() on the ill named by `arg'. + */ +static void +ipmp_ill_refresh_active_timer_start(ill_t *ill) +{ + mutex_enter(&ill->ill_lock); + + /* + * If the ill is going away or a refresh is already scheduled, bail. + */ + if (ill->ill_refresh_tid != 0 || + (ill->ill_state_flags & ILL_CONDEMNED)) { + mutex_exit(&ill->ill_lock); + return; + } + + ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill, + SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT)); + + mutex_exit(&ill->ill_lock); +} + +/* + * Activate `ill' so it will be used to send and receive data traffic. Return + * B_FALSE if `ill' cannot be activated. Note that we allocate any messages + * needed to deactivate `ill' here as well so that deactivation cannot fail. + */ +static boolean_t +ipmp_ill_activate(ill_t *ill) +{ + ipif_t *ipif; + mblk_t *actmp = NULL, *deactmp = NULL; + mblk_t *linkupmp = NULL, *linkdownmp = NULL; + ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; + const char *grifname = grp->gr_ifname; + ipmp_illgrp_t *illg = ill->ill_grp; + ill_t *maxill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(IS_UNDER_IPMP(ill)); + + /* + * If this will be the first active interface in the group, allocate + * the link-up and link-down messages. 
+ */ + if (grp->gr_nactif == 0) { + linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0); + linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0); + if (linkupmp == NULL || linkdownmp == NULL) + goto fail; + } + + /* + * For IPv4, allocate the activate/deactivate messages, and tell ARP. + */ + if (!ill->ill_isv6) { + actmp = ill_arie_alloc(ill, grifname, &ipmp_aract_template); + deactmp = ill_arie_alloc(ill, grifname, &ipmp_ardeact_template); + if (actmp == NULL || deactmp == NULL) + goto fail; + + ASSERT(ill->ill_ardeact_mp == NULL); + ill->ill_ardeact_mp = deactmp; + putnext(illg->ig_ipmp_ill->ill_rq, actmp); + } + + if (list_is_empty(&illg->ig_actif)) { + /* + * Now that we have an active ill, nominate it for multicast + * and broadcast duties. Do this before ipmp_ill_bind_ipif() + * since that may need to send multicast packets (e.g., IPv6 + * neighbor discovery probes). + */ + ipmp_illgrp_set_cast(illg, ill); + + /* + * This is the first active ill in the illgrp -- add 'em all. + * We can access/walk ig_ipmp_ill's ipif list since we're + * writer on its IPSQ as well. + */ + ipif = illg->ig_ipmp_ill->ill_ipif; + for (; ipif != NULL; ipif = ipif->ipif_next) + if (ipmp_ipif_is_up_dataaddr(ipif)) + ipmp_ill_bind_ipif(ill, ipif, Res_act_initial); + } else { + /* + * Redistribute the addresses by moving them from the ill with + * the most addresses until the ill being activated is at the + * same level as the rest of the ills. + */ + for (;;) { + maxill = ipmp_illgrp_max_ill(illg); + ASSERT(maxill != NULL); + if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt) + break; + ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); + ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind); + } + + /* + * TODO: explore whether it's advantageous to flush IRE_CACHE + * bindings to force existing connections to be redistributed + * to the new ill. + */ + } + + /* + * Put the interface in the active list. 
+ */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + list_insert_tail(&illg->ig_actif, ill); + illg->ig_nactif++; + illg->ig_next_ill = ill; + rw_exit(&ipst->ips_ipmp_lock); + + /* + * Refresh ARP entries to use `ill', if need be. + */ + if (!ill->ill_isv6) + ipmp_illgrp_refresh_arpent(illg); + + /* + * Finally, mark the group link up, if necessary. + */ + if (grp->gr_nactif++ == 0) { + ASSERT(grp->gr_linkdownmp == NULL); + grp->gr_linkdownmp = linkdownmp; + put(illg->ig_ipmp_ill->ill_rq, linkupmp); + } + return (B_TRUE); +fail: + freemsg(actmp); + freemsg(deactmp); + freemsg(linkupmp); + freemsg(linkdownmp); + return (B_FALSE); +} + +/* + * Deactivate `ill' so it will not be used to send or receive data traffic. + */ +static void +ipmp_ill_deactivate(ill_t *ill) +{ + ill_t *minill; + ipif_t *ipif, *ubnextipif, *ubheadipif = NULL; + mblk_t *mp; + ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; + ipmp_illgrp_t *illg = ill->ill_grp; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(IS_UNDER_IPMP(ill)); + + /* + * Delete IRE_CACHE entries tied to this ill before they become stale. + */ + ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, + ill_stq_cache_delete, ill, ill); + + /* + * Pull the interface out of the active list. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + list_remove(&illg->ig_actif, ill); + illg->ig_nactif--; + illg->ig_next_ill = list_head(&illg->ig_actif); + rw_exit(&ipst->ips_ipmp_lock); + + /* + * If the ill that's being deactivated had been nominated for + * multicast/broadcast, nominate a new one. + */ + if (ill == illg->ig_cast_ill) + ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif)); + + /* + * Unbind all of the ipifs bound to this ill, and save 'em in a list; + * we'll rebind them after we tell the resolver the ill is no longer + * active. 
We must do things in this order or the resolver could + * accidentally rebind to the ill we're trying to remove if multiple + * ills in the group have the same hardware address (which is + * unsupported, but shouldn't lead to a wedged machine). + */ + while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) { + ipif->ipif_bound_next = ubheadipif; + ubheadipif = ipif; + } + + if (!ill->ill_isv6) { + /* + * Tell ARP `ill' is no longer active in the group. + */ + mp = ill->ill_ardeact_mp; + ill->ill_ardeact_mp = NULL; + ASSERT(mp != NULL); + putnext(illg->ig_ipmp_ill->ill_rq, mp); + + /* + * Refresh any ARP entries that had been using `ill'. + */ + ipmp_illgrp_refresh_arpent(illg); + } + + /* + * Rebind each ipif from the deactivated ill to the active ill with + * the fewest ipifs. If there are no active ills, the ipifs will + * remain unbound. + */ + for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) { + ubnextipif = ipif->ipif_bound_next; + ipif->ipif_bound_next = NULL; + + if ((minill = ipmp_illgrp_min_ill(illg)) != NULL) + ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind); + } + + /* + * Finally, mark the group link down, if necessary. + */ + if (--grp->gr_nactif == 0) { + mp = grp->gr_linkdownmp; + grp->gr_linkdownmp = NULL; + ASSERT(mp != NULL); + put(illg->ig_ipmp_ill->ill_rq, mp); + } +} + +/* + * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD) + * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners. + */ +static void +ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd) +{ + ipif_t *ipif; + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE); + + /* + * If `ill' is truly down, there are no messages to generate since: + * + * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface + * and its addresses by bringing them down. But that's already + * true, so there's nothing to hide. + * + * 2. 
If cmd == RTM_ADD, then we're supposed to generate messages + * indicating that any previously-hidden up addresses are again + * back up (along with the interface). But they aren't, so + * there's nothing to expose. + */ + if (ill->ill_ipif_up_count == 0) + return; + + if (cmd == RTM_ADD) + ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL); + + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) + if (ipif->ipif_flags & IPIF_UP) + ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL); + + if (cmd == RTM_DELETE) + ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL); +} + +/* + * Bind the address named by `ipif' to the underlying ill named by `ill'. + * If `act' is Res_act_none, don't notify the resolver. Otherwise, `act' + * will indicate to the resolver whether this is an initial bringup of + * `ipif', or just a rebind to another ill. + */ +static void +ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act) +{ + int err = 0; + ip_stack_t *ipst = ill->ill_ipst; + + ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif)); + ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill)); + ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif)); + ASSERT(ipif->ipif_bound_ill == NULL); + ASSERT(ipif->ipif_bound_next == NULL); + + ipif->ipif_bound_next = ill->ill_bound_ipif; + ill->ill_bound_ipif = ipif; + ill->ill_bound_cnt++; + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + ipif->ipif_bound_ill = ill; + rw_exit(&ipst->ips_ipmp_lock); + + /* + * If necessary, tell ARP/NDP about the new mapping. Note that + * ipif_resolver_up() cannot fail for non-XRESOLV IPv6 ills. 
+ */ + if (act != Res_act_none) { + if (ill->ill_isv6) { + VERIFY(ipif_resolver_up(ipif, act) == 0); + err = ipif_ndp_up(ipif, act == Res_act_initial); + } else { + err = ipif_resolver_up(ipif, act); + } + + /* + * Since ipif_ndp_up() never returns EINPROGRESS and + * ipif_resolver_up() only returns EINPROGRESS when the + * associated ill is not up, we should never be here with + * EINPROGRESS. We rely on this to simplify the design. + */ + ASSERT(err != EINPROGRESS); + } + /* TODO: retry binding on failure? when? */ + ipif->ipif_bound = (err == 0); +} + +/* + * Unbind the address named by `ipif' from the underlying ill named by `ill'. + * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned. + * If no ipifs are bound to `ill', NULL is returned. If `notifyres' is + * B_TRUE, notify the resolver about the change. + */ +static ipif_t * +ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres) +{ + ill_t *ipmp_ill; + ipif_t *previpif; + ip_stack_t *ipst = ill->ill_ipst; + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(IS_UNDER_IPMP(ill)); + + ipmp_ill = ill->ill_grp->ig_ipmp_ill; + + /* + * If necessary, find an ipif to unbind. + */ + if (ipif == NULL) { + if ((ipif = ill->ill_bound_ipif) == NULL) { + ASSERT(ill->ill_bound_cnt == 0); + return (NULL); + } + } + + ASSERT(IAM_WRITER_IPIF(ipif)); + ASSERT(IS_IPMP(ipif->ipif_ill)); + ASSERT(ipif->ipif_bound_ill == ill); + ASSERT(ill->ill_bound_cnt > 0); + + /* + * Unbind it. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + ipif->ipif_bound_ill = NULL; + rw_exit(&ipst->ips_ipmp_lock); + ill->ill_bound_cnt--; + + if (ill->ill_bound_ipif == ipif) { + ill->ill_bound_ipif = ipif->ipif_bound_next; + } else { + previpif = ill->ill_bound_ipif; + while (previpif->ipif_bound_next != ipif) + previpif = previpif->ipif_bound_next; + + previpif->ipif_bound_next = ipif->ipif_bound_next; + } + ipif->ipif_bound_next = NULL; + + /* + * If requested, notify the resolvers (provided we're bound). 
+ */ + if (notifyres && ipif->ipif_bound) { + if (ill->ill_isv6) { + ipif_ndp_down(ipif); + } else { + ASSERT(ipif->ipif_arp_del_mp != NULL); + putnext(ipmp_ill->ill_rq, ipif->ipif_arp_del_mp); + ipif->ipif_arp_del_mp = NULL; + } + } + ipif->ipif_bound = B_FALSE; + + return (ipif); +} + +/* + * Check if `ill' is active. Caller must hold ill_lock and phyint_lock if + * it's not inside the IPSQ. Since ipmp_ill_try_refresh_active() calls this + * to determine whether an ill should be considered active, other consumers + * may race and learn about an ill that should be deactivated/activated before + * IPMP has performed the activation/deactivation. This should be safe though + * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that + * would've been cleaned up by ipmp_ill_deactivate(). + */ +boolean_t +ipmp_ill_is_active(ill_t *ill) +{ + phyint_t *phyi = ill->ill_phyint; + + ASSERT(IS_UNDER_IPMP(ill)); + ASSERT(IAM_WRITER_ILL(ill) || + (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock))); + + /* + * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to + * set PHYI_FAILED whenever PHYI_RUNNING is cleared. This allows the + * link flapping logic to be just in in.mpathd and allows us to ignore + * changes to PHYI_RUNNING. + */ + return (!(ill->ill_ipif_up_count == 0 || + (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED)))); +} + +/* + * IRE walker callback: set IRE_MARK_TESTHIDDEN on cache/interface/offsubnet + * IREs with a source address on `ill_arg'. 
+ */
+static void
+ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
+{
+	ill_t *ill = (ill_t *)ill_arg;
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(!IS_IPMP(ill));
+
+	/* Only IREs whose ipif lives on `ill' are candidates. */
+	if (ire->ire_ipif->ipif_ill != ill)
+		return;
+
+	switch (ire->ire_type) {
+	case IRE_HOST:
+	case IRE_PREFIX:
+	case IRE_DEFAULT:
+	case IRE_CACHE:
+	case IRE_IF_RESOLVER:
+	case IRE_IF_NORESOLVER:
+		DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
+		ire->ire_marks |= IRE_MARK_TESTHIDDEN;
+		break;
+	default:
+		/* Every other IRE type is left unmarked. */
+		break;
+	}
+}
+
+/*
+ * IRE walker callback: clear IRE_MARK_TESTHIDDEN if the IRE has a source
+ * address on `ill_arg'.  Unlike the marking walker, no filtering on the
+ * IRE type is done here.
+ */
+static void
+ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
+{
+	ill_t *ill = (ill_t *)ill_arg;
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(!IS_IPMP(ill));
+
+	if (ire->ire_ipif->ipif_ill == ill) {
+		DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
+		ire->ire_marks &= ~IRE_MARK_TESTHIDDEN;
+	}
+}
+
+/*
+ * Return a held pointer to the IPMP ill for underlying interface `ill', or
+ * NULL if one doesn't exist.  (Unfortunately, this function needs to take an
+ * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
+ * ill_grp pointer may become stale when not under an IPSQ and not holding
+ * ipmp_lock.)  Caller need not be inside the IPSQ.
+ *
+ * On success the returned ill has been ill_refhold()'ed here; how the hold
+ * is released is up to the caller.
+ */
+ill_t *
+ipmp_ill_hold_ipmp_ill(ill_t *ill)
+{
+	ip_stack_t *ipst = ill->ill_ipst;
+	ipmp_illgrp_t *illg;
+	ill_t *ipmp_ill = NULL;
+
+	ASSERT(!IS_IPMP(ill));
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+	illg = ill->ill_grp;
+	if (illg != NULL && ILL_CAN_LOOKUP(illg->ig_ipmp_ill)) {
+		/*
+		 * Capture the IPMP ill while ipmp_lock is still held: once
+		 * the lock is dropped `illg' must not be dereferenced again,
+		 * since `ill' may leave the illgrp at any time.
+		 */
+		ipmp_ill = illg->ig_ipmp_ill;
+		ill_refhold(ipmp_ill);
+	}
+	/*
+	 * If nothing was held, assume `ill' was removed from the illgrp in
+	 * the meantime; NULL is returned in that case.
+	 */
+	rw_exit(&ipst->ips_ipmp_lock);
+	return (ipmp_ill);
+}
+
+/*
+ * Return the interface index for the IPMP ill tied to underlying interface
+ * `ill', or zero if one doesn't exist. Caller need not be inside the IPSQ.
+ */
+uint_t
+ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
+{
+	ip_stack_t *ipst = ill->ill_ipst;
+	ipmp_grp_t *grp;
+	uint_t index;
+
+	ASSERT(!IS_IPMP(ill));
+
+	/* phyint_grp is only stable while ipmp_lock is held. */
+	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+	grp = ill->ill_phyint->phyint_grp;
+	index = (grp == NULL) ? 0 : grp->gr_phyint->phyint_ifindex;
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	return (index);
+}
+
+/*
+ * Make phyint `phyi' a member of IPMP group `grp'.  The caller must be
+ * writer on the phyint's IPSQ, and the phyint must have at least one ill.
+ */
+void
+ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
+{
+	ill_t *illv4 = phyi->phyint_illv4;
+	ill_t *illv6 = phyi->phyint_illv6;
+	ill_t *mac_ill;
+	ipsq_t *own_ipsq = phyi->phyint_ipsq;
+	ipsq_t *group_ipsq = grp->gr_phyint->phyint_ipsq;
+	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
+
+	ASSERT(IAM_WRITER_IPSQ(own_ipsq));
+	ASSERT(illv4 != NULL || illv6 != NULL);
+
+	/*
+	 * Tell routing socket listeners that the phyint's ills and ipifs
+	 * have gone away (IPv4 first, then IPv6).
+	 */
+	if (illv4 != NULL)
+		ipmp_ill_rtsaddrmsg(illv4, RTM_DELETE);
+	if (illv6 != NULL)
+		ipmp_ill_rtsaddrmsg(illv6, RTM_DELETE);
+
+	/*
+	 * The ill consulted for the MAC type below is the IPv6 one when it
+	 * exists, otherwise the IPv4 one.
+	 */
+	mac_ill = (illv6 != NULL) ? illv6 : illv4;
+
+	/*
+	 * Record the phyint's current kstats; they serve as the baseline
+	 * for its time in the group.
+	 */
+	ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+
+	phyi->phyint_grp = grp;
+	grp->gr_nif++;
+	if (grp->gr_nif == 1) {
+		/* First member: it defines the group's MAC type. */
+		grp->gr_mactype = mac_ill->ill_mactype;
+	} else {
+		ASSERT(grp->gr_mactype == mac_ill->ill_mactype);
+	}
+
+	/*
+	 * We are now part of the group, so ask for a switch to the group's
+	 * xop at ipsq_exit() time.  From then on, all operations are
+	 * exclusive on the group xop until ipmp_phyint_leave_grp() is called.
+	 */
+	ASSERT(own_ipsq->ipsq_swxop == NULL);
+	ASSERT(group_ipsq->ipsq_xop == &group_ipsq->ipsq_ownxop);
+	own_ipsq->ipsq_swxop = &group_ipsq->ipsq_ownxop;
+
+	rw_exit(&ipst->ips_ipmp_lock);
+}
+
+/*
+ * Remove phyint `phyi' from its current IPMP group.
+ */
+void
+ipmp_phyint_leave_grp(phyint_t *phyi)
+{
+	uint_t stat;
+	ill_t *ill;
+	ipmp_grp_t *grp;
+	ipsq_t *own_ipsq = phyi->phyint_ipsq;
+	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
+	uint64_t now_kstats[IPMP_KSTAT_MAX];
+
+	ASSERT(IAM_WRITER_IPSQ(own_ipsq));
+
+	/*
+	 * Evict any of the phyint's ills that still belong to an illgrp.
+	 */
+	if ((ill = phyi->phyint_illv4) != NULL && IS_UNDER_IPMP(ill))
+		ipmp_ill_leave_illgrp(ill);
+	if ((ill = phyi->phyint_illv6) != NULL && IS_UNDER_IPMP(ill))
+		ipmp_ill_leave_illgrp(ill);
+
+	/*
+	 * Tell routing socket listeners that the phyint's ills and ipifs
+	 * are back.
+	 */
+	if ((ill = phyi->phyint_illv4) != NULL)
+		ipmp_ill_rtsaddrmsg(ill, RTM_ADD);
+	if ((ill = phyi->phyint_illv6) != NULL)
+		ipmp_ill_rtsaddrmsg(ill, RTM_ADD);
+
+	/*
+	 * Work out how much each kstat grew while the phyint was in the
+	 * group and fold that delta into the group's baseline.
+	 */
+	ipmp_phyint_get_kstats(phyi, now_kstats);
+	grp = phyi->phyint_grp;
+	for (stat = 0; stat < IPMP_KSTAT_MAX; stat++) {
+		uint64_t delta = now_kstats[stat] - phyi->phyint_kstats0[stat];
+
+		atomic_add_64(&grp->gr_kstats0[stat], delta);
+	}
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+
+	grp->gr_nif--;
+	phyi->phyint_grp = NULL;
+
+	/*
+	 * Last step of leaving: ask for a switch back to this IPSQ's own
+	 * xop at ipsq_exit() time.
+	 */
+	ASSERT(own_ipsq->ipsq_swxop == NULL);
+	own_ipsq->ipsq_swxop = &own_ipsq->ipsq_ownxop;
+
+	rw_exit(&ipst->ips_ipmp_lock);
+}
+
+/*
+ * Fill `kstats' with the IPMP-related kstats for `phyi'.  `kstats' must
+ * have room for at least IPMP_KSTAT_MAX elements.  Any statistic that
+ * cannot be found (or the whole array, if the link kstat is unavailable)
+ * is reported as zero.
+ */
+static void
+ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
+{
+	uint_t stat, idx;
+	const char *wanted;
+	kstat_t *ksp;
+	kstat_named_t *named;
+
+	bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
+
+	/*
+	 * NOTE: ALL_ZONES here assumes that there's at most one link
+	 * with a given name on a given system (safe for now).
+	 */
+	ksp = kstat_hold_byname("link", 0, phyi->phyint_name, ALL_ZONES);
+	if (ksp == NULL)
+		return;
+
+	KSTAT_ENTER(ksp);
+
+	/* Only a populated named kstat can be harvested. */
+	if (ksp->ks_data == NULL || ksp->ks_type != KSTAT_TYPE_NAMED)
+		goto done;
+
+	/*
+	 * Refresh the kstat so that current values are recorded.
+	 */
+	(void) KSTAT_UPDATE(ksp, KSTAT_READ);
+
+	named = KSTAT_NAMED_PTR(ksp);
+	for (stat = 0; stat < IPMP_KSTAT_MAX; stat++) {
+		wanted = ipmp_kstats[stat].name;
+		kstats[stat] = 0;
+
+		/* Locate the first named entry matching `wanted'. */
+		for (idx = 0; idx < ksp->ks_ndata; idx++) {
+			if (strcmp(named[idx].name, wanted) == 0)
+				break;
+		}
+		if (idx == ksp->ks_ndata)
+			continue;
+
+		switch (named[idx].data_type) {
+		case KSTAT_DATA_INT32:
+		case KSTAT_DATA_UINT32:
+			kstats[stat] = named[idx].value.ui32;
+			break;
+#ifdef _LP64
+		case KSTAT_DATA_LONG:
+		case KSTAT_DATA_ULONG:
+			kstats[stat] = named[idx].value.ul;
+			break;
+#endif
+		case KSTAT_DATA_INT64:
+		case KSTAT_DATA_UINT64:
+			kstats[stat] = named[idx].value.ui64;
+			break;
+		default:
+			/* Unhandled data type: leave the value at zero. */
+			break;
+		}
+	}
+
+done:
+	KSTAT_EXIT(ksp);
+	kstat_rele(ksp);
+}
+
+/*
+ * Refresh the active state of each ill (IPv4, then IPv6) present on `phyi'.
+ */
+void
+ipmp_phyint_refresh_active(phyint_t *phyi)
+{
+	ill_t *ill;
+
+	if ((ill = phyi->phyint_illv4) != NULL)
+		ipmp_ill_refresh_active(ill);
+	if ((ill = phyi->phyint_illv6) != NULL)
+		ipmp_ill_refresh_active(ill);
+}
+
+/*
+ * Look up the underlying ill bound to `ipif' and return it with a hold
+ * placed on it, or return NULL if there is no bound ill that can be looked
+ * up.  Caller need not be inside the IPSQ.
+ */
+ill_t *
+ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
+{
+	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+	ill_t *candidate;
+	ill_t *held = NULL;
+
+	ASSERT(IS_IPMP(ipif->ipif_ill));
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+	candidate = ipif->ipif_bound_ill;
+	if (candidate != NULL && ILL_CAN_LOOKUP(candidate)) {
+		ill_refhold(candidate);
+		held = candidate;
+	}
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	return (held);
+}
+
+/*
+ * Return a pointer to the underlying ill bound to `ipif', or NULL if one
+ * doesn't exist. Caller must be inside the IPSQ.
+ */ +ill_t * +ipmp_ipif_bound_ill(const ipif_t *ipif) +{ + ASSERT(IAM_WRITER_ILL(ipif->ipif_ill)); + ASSERT(IS_IPMP(ipif->ipif_ill)); + + return (ipif->ipif_bound_ill); +} + +/* + * Check if `ipif' is a "stub" (placeholder address not being used). + */ +boolean_t +ipmp_ipif_is_stubaddr(const ipif_t *ipif) +{ + if (ipif->ipif_flags & IPIF_UP) + return (B_FALSE); + if (ipif->ipif_ill->ill_isv6) + return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)); + else + return (ipif->ipif_lcl_addr == INADDR_ANY); +} + +/* + * Check if `ipif' is an IPMP data address. + */ +boolean_t +ipmp_ipif_is_dataaddr(const ipif_t *ipif) +{ + if (ipif->ipif_flags & IPIF_NOFAILOVER) + return (B_FALSE); + if (ipif->ipif_ill->ill_isv6) + return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)); + else + return (ipif->ipif_lcl_addr != INADDR_ANY); +} + +/* + * Check if `ipif' is an IPIF_UP IPMP data address. + */ +static boolean_t +ipmp_ipif_is_up_dataaddr(const ipif_t *ipif) +{ + return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP)); +} diff --git a/usr/src/uts/common/inet/ip/rts.c b/usr/src/uts/common/inet/ip/rts.c index 4999f28d1e..2751b19993 100644 --- a/usr/src/uts/common/inet/ip/rts.c +++ b/usr/src/uts/common/inet/ip/rts.c @@ -561,7 +561,6 @@ rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) case SO_TYPE: *i1 = SOCK_RAW; break; - /* * The following three items are available here, * but are only meaningful to IP. 
@@ -597,6 +596,15 @@ rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) return (-1); } break; + case SOL_ROUTE: + switch (name) { + case RT_AWARE: + mutex_enter(&connp->conn_lock); + *i1 = connp->conn_rtaware; + mutex_exit(&connp->conn_lock); + break; + } + break; default: return (-1); } @@ -701,6 +709,20 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, return (EINVAL); } break; + case SOL_ROUTE: + switch (name) { + case RT_AWARE: + if (!checkonly) { + mutex_enter(&connp->conn_lock); + connp->conn_rtaware = *i1; + mutex_exit(&connp->conn_lock); + } + break; /* goto sizeof (int) option return */ + default: + *outlenp = 0; + return (EINVAL); + } + break; default: *outlenp = 0; return (EINVAL); diff --git a/usr/src/uts/common/inet/ip/rts_opt_data.c b/usr/src/uts/common/inet/ip/rts_opt_data.c index bac0eabdc4..7397b53b9e 100644 --- a/usr/src/uts/common/inet/ip/rts_opt_data.c +++ b/usr/src/uts/common/inet/ip/rts_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -60,6 +60,7 @@ opdes_t rts_opt_arr[] = { { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, { SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ RT_AWARE, SOL_ROUTE, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, }; /* diff --git a/usr/src/uts/common/inet/ip/spd.c b/usr/src/uts/common/inet/ip/spd.c index f785d8a3f6..8a3aa86d60 100644 --- a/usr/src/uts/common/inet/ip/spd.c +++ b/usr/src/uts/common/inet/ip/spd.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -3989,7 +3989,7 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h) ipsec_out_t *io; boolean_t v4; mblk_t *mp; - boolean_t secure, attach_if; + boolean_t secure; uint_t ifindex; ipsec_selector_t sel; ipsec_action_t *reflect_action = NULL; @@ -4012,7 +4012,6 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h) } else if (!ii->ipsec_in_loopback) reflect_action = ipsec_in_to_out_action(ii); secure = ii->ipsec_in_secure; - attach_if = ii->ipsec_in_attach_if; ifindex = ii->ipsec_in_ill_index; zoneid = ii->ipsec_in_zoneid; ASSERT(zoneid != ALL_ZONES); @@ -4057,7 +4056,6 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h) io->ipsec_out_proc_begin = B_FALSE; io->ipsec_out_secure = secure; io->ipsec_out_v4 = v4; - io->ipsec_out_attach_if = attach_if; io->ipsec_out_ill_index = ifindex; io->ipsec_out_zoneid = zoneid; io->ipsec_out_ns = ns; /* No netstack_hold */ @@ -4549,7 +4547,6 @@ ipsec_out_to_in(mblk_t *ipsec_mp) ii->ipsec_in_secure = B_TRUE; ii->ipsec_in_v4 = v4; ii->ipsec_in_icmp_loopback = icmp_loopback; - ii->ipsec_in_attach_if = B_FALSE; } /* diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h index d463c3f6ee..ad331d5706 100644 --- a/usr/src/uts/common/inet/ip6.h +++ b/usr/src/uts/common/inet/ip6.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -133,10 +133,8 @@ typedef struct ip6_info ip6i_t; #define IP6I_RAW_CHECKSUM 0x10 /* Compute checksum and stuff in ip6i_checksum_off */ #define IP6I_VERIFY_SRC 0x20 /* Verify ip6_src. Used when IPV6_PKTINFO */ -#define IP6I_ATTACH_IF 0x40 /* Bind to no failover address or BOUND_PIF. 
*/ -#define IP6I_DROP_IFDELAYED 0x80 - /* Drop the packet if delayed in ndp resolver */ -#define IP6I_ND_DELAYED 0x100 /* Packet was delayed in ndp resolver */ +#define IP6I_IPMP_PROBE 0x40 /* IPMP (in.mpathd) probe packet */ + /* 0x80 - 0x100 available */ #define IP6I_DONTFRAG 0x200 /* Don't fragment this packet */ #define IP6I_HOPLIMIT 0x400 /* hoplimit has been set by the sender */ @@ -340,7 +338,7 @@ extern void icmp_time_exceeded_v6(queue_t *, mblk_t *, uint8_t, extern void icmp_unreachable_v6(queue_t *, mblk_t *, uint8_t, boolean_t, boolean_t, zoneid_t, ip_stack_t *); extern void icmp_inbound_error_fanout_v6(queue_t *, mblk_t *, ip6_t *, - icmp6_t *, ill_t *, boolean_t, zoneid_t); + icmp6_t *, ill_t *, ill_t *, boolean_t, zoneid_t); extern boolean_t conn_wantpacket_v6(conn_t *, ill_t *, ip6_t *, int, zoneid_t); extern mblk_t *ip_add_info_v6(mblk_t *, ill_t *, const in6_addr_t *); extern in6addr_scope_t ip_addr_scope_v6(const in6_addr_t *); @@ -382,7 +380,7 @@ extern int ip_multirt_apply_membership_v6(int (*fn)(conn_t *, boolean_t, ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t, const in6_addr_t *, mblk_t *); extern void ip_newroute_ipif_v6(queue_t *, mblk_t *, ipif_t *, - in6_addr_t, int, zoneid_t); + const in6_addr_t *, const in6_addr_t *, int, zoneid_t); extern void ip_newroute_v6(queue_t *, mblk_t *, const in6_addr_t *, const in6_addr_t *, ill_t *, zoneid_t, ip_stack_t *); extern void *ip6_kstat_init(netstackid_t, ip6_stat_t *); diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h index c5982de059..094800197e 100644 --- a/usr/src/uts/common/inet/ip_if.h +++ b/usr/src/uts/common/inet/ip_if.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -80,7 +80,7 @@ extern "C" { */ #define IFF_PHYINT_FLAGS (IFF_LOOPBACK|IFF_RUNNING|IFF_PROMISC| \ IFF_ALLMULTI|IFF_INTELLIGENT|IFF_MULTI_BCAST|IFF_FAILED|IFF_STANDBY| \ - IFF_INACTIVE|IFF_OFFLINE|IFF_VIRTUAL) + IFF_INACTIVE|IFF_OFFLINE|IFF_VIRTUAL|IFF_IPMP) #define IFF_PHYINTINST_FLAGS (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP| \ IFF_MULTICAST|IFF_ROUTER|IFF_NONUD|IFF_NORTEXCH|IFF_IPV4|IFF_IPV6| \ @@ -91,11 +91,6 @@ extern "C" { IFF_DEPRECATED|IFF_ADDRCONF|IFF_ANYCAST|IFF_NOFAILOVER| \ IFF_PREFERRED|IFF_TEMPORARY|IFF_FIXEDMTU|IFF_DUPLICATE) -#define IPIF_REPL_CHECK(to_ipif, failback_cmd) \ - (((to_ipif)->ipif_replace_zero) || ((failback_cmd) && \ - !(to_ipif)->ipif_isv6 && !((to_ipif)->ipif_flags & IPIF_UP) && \ - (to_ipif)->ipif_lcl_addr == INADDR_ANY)) - #define PHYI_LOOPBACK IFF_LOOPBACK /* is a loopback net */ #define PHYI_RUNNING IFF_RUNNING /* resources allocated */ #define PHYI_PROMISC IFF_PROMISC /* receive all packets */ @@ -107,6 +102,7 @@ extern "C" { #define PHYI_INACTIVE IFF_INACTIVE /* Standby active or not ? 
*/ #define PHYI_OFFLINE IFF_OFFLINE /* NIC has been offlined */ #define PHYI_VIRTUAL IFF_VIRTUAL /* Will not send or recv pkts */ +#define PHYI_IPMP IFF_IPMP /* IPMP meta-interface */ #define ILLF_DEBUG IFF_DEBUG /* turn on debugging */ #define ILLF_NOTRAILERS IFF_NOTRAILERS /* avoid use of trailers */ @@ -137,11 +133,6 @@ extern "C" { #define IPIF_FIXEDMTU IFF_FIXEDMTU /* set with SIOCSLIFMTU */ #define IPIF_DUPLICATE IFF_DUPLICATE /* address is in use */ -/* Source selection values for ipif_select_source_v6 */ -#define RESTRICT_TO_NONE 0x0 /* No restriction in source selection */ -#define RESTRICT_TO_GROUP 0x1 /* Restrict to IPMP group */ -#define RESTRICT_TO_ILL 0x2 /* Restrict to ILL */ - #ifdef DEBUG #define ILL_MAC_PERIM_HELD(ill) ill_mac_perim_held(ill) #else @@ -151,24 +142,23 @@ extern "C" { /* for ipif_resolver_up */ enum ip_resolver_action { Res_act_initial, /* initial address establishment */ - Res_act_move, /* address move (IPMP, new DL addr) */ - Res_act_defend /* address defense */ + Res_act_rebind, /* IPMP address rebind (new hwaddr) */ + Res_act_defend, /* address defense */ + Res_act_none /* do nothing */ }; -extern ill_t *illgrp_scheduler(ill_t *); -extern mblk_t *ill_arp_alloc(ill_t *, uchar_t *, caddr_t); -extern mblk_t *ipif_area_alloc(ipif_t *); +extern mblk_t *ill_arp_alloc(ill_t *, const uchar_t *, caddr_t); +extern mblk_t *ipif_area_alloc(ipif_t *, uint_t); extern mblk_t *ipif_ared_alloc(ipif_t *); extern mblk_t *ill_ared_alloc(ill_t *, ipaddr_t); -extern void ill_dlpi_done(ill_t *, t_uscalar_t); +extern mblk_t *ill_arie_alloc(ill_t *, const char *, const void *); extern boolean_t ill_dlpi_pending(ill_t *, t_uscalar_t); +extern void ill_dlpi_done(ill_t *, t_uscalar_t); extern void ill_dlpi_send(ill_t *, mblk_t *); extern void ill_dlpi_send_deferred(ill_t *); extern void ill_capability_done(ill_t *); extern mblk_t *ill_dlur_gen(uchar_t *, uint_t, t_uscalar_t, t_scalar_t); -extern ill_t *ill_group_lookup_on_ifindex(uint_t, boolean_t, 
ip_stack_t *); -extern ill_t *ill_group_lookup_on_name(char *, boolean_t, ip_stack_t *); /* NOTE: Keep unmodified ill_lookup_on_ifindex for ipp for now */ extern ill_t *ill_lookup_on_ifindex_global_instance(uint_t, boolean_t, queue_t *, mblk_t *, ipsq_func_t, int *); @@ -180,6 +170,7 @@ extern ill_t *ill_lookup_on_name(char *, boolean_t, extern uint_t ill_get_next_ifindex(uint_t, boolean_t, ip_stack_t *); extern uint_t ill_get_ifindex_by_name(char *, ip_stack_t *); extern void ill_ipif_cache_delete(ire_t *, char *); +extern void ill_stq_cache_delete(ire_t *, char *); extern void ill_delete(ill_t *); extern void ill_delete_tail(ill_t *); extern int ill_dl_phys(ill_t *, ipif_t *, mblk_t *, queue_t *); @@ -193,9 +184,9 @@ extern void ill_frag_prune(ill_t *, uint_t); extern void ill_frag_free_pkts(ill_t *, ipfb_t *, ipf_t *, int); extern time_t ill_frag_timeout(ill_t *, time_t); extern int ill_init(queue_t *, ill_t *); -extern int ill_nominate_mcast_rcv(ill_group_t *); -extern boolean_t ill_setdefaulttoken(ill_t *); +extern void ill_refresh_bcast(ill_t *); extern void ill_restart_dad(ill_t *, boolean_t); +extern boolean_t ill_setdefaulttoken(ill_t *); extern int ill_set_phys_addr(ill_t *, mblk_t *); extern void ill_set_ndmp(ill_t *, mblk_t *, uint_t, uint_t); @@ -222,11 +213,9 @@ extern void ill_capability_reset(ill_t *, boolean_t); extern void ill_taskq_dispatch(ip_stack_t *); extern void ill_mtu_change(ire_t *, char *); -extern void ill_group_cleanup(ill_t *); -extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *); -extern boolean_t ill_is_probeonly(ill_t *); -extern boolean_t ill_hook_event_create(ill_t *, lif_if_t, nic_event_t, - nic_event_data_t, size_t); +extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *); +extern uint_t ill_appaddr_cnt(const ill_t *); +extern uint_t ill_ptpaddr_cnt(const ill_t *); extern void ip_loopback_cleanup(ip_stack_t *); extern void ipif_get_name(const ipif_t *, char *, int); @@ -239,6 +228,8 @@ extern ipif_t *ipif_lookup_addr_v6(const 
in6_addr_t *, ill_t *, zoneid_t, queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *); extern boolean_t ip_addr_exists_v6(const in6_addr_t *, zoneid_t, ip_stack_t *); +extern ipif_t *ipif_lookup_addr_exact_v6(const in6_addr_t *, ill_t *, + ip_stack_t *); extern zoneid_t ipif_lookup_addr_zoneid(ipaddr_t, ill_t *, ip_stack_t *); extern zoneid_t ipif_lookup_addr_zoneid_v6(const in6_addr_t *, ill_t *, ip_stack_t *); @@ -251,31 +242,30 @@ extern ipif_t *ipif_lookup_multicast(ip_stack_t *, zoneid_t, boolean_t); extern ipif_t *ipif_lookup_remote(ill_t *, ipaddr_t, zoneid_t); extern ipif_t *ipif_lookup_onlink_addr(ipaddr_t, zoneid_t, ip_stack_t *); extern ipif_t *ipif_lookup_seqid(ill_t *, uint_t); -extern boolean_t ipif_lookup_zoneid(ill_t *, zoneid_t, int, - ipif_t **); -extern boolean_t ipif_lookup_zoneid_group(ill_t *, zoneid_t, int, - ipif_t **); +extern boolean_t ipif_lookup_zoneid(ill_t *, zoneid_t, int, ipif_t **); extern ipif_t *ipif_select_source(ill_t *, ipaddr_t, zoneid_t); extern boolean_t ipif_usesrc_avail(ill_t *, zoneid_t); extern void ipif_refhold(ipif_t *); extern void ipif_refhold_locked(ipif_t *); -extern void ipif_refrele(ipif_t *); +extern void ipif_refrele(ipif_t *); extern void ipif_all_down_tail(ipsq_t *, queue_t *, mblk_t *, void *); +extern void ipif_resolver_down(ipif_t *); extern int ipif_resolver_up(ipif_t *, enum ip_resolver_action); extern int ipif_arp_setup_multicast(ipif_t *, mblk_t **); extern int ipif_down(ipif_t *, queue_t *, mblk_t *); extern void ipif_down_tail(ipif_t *); +extern void ipif_multicast_down(ipif_t *); extern void ipif_multicast_up(ipif_t *); extern void ipif_ndp_down(ipif_t *); -extern int ipif_ndp_up(ipif_t *); +extern int ipif_ndp_up(ipif_t *, boolean_t); extern int ipif_ndp_setup_multicast(ipif_t *, struct nce_s **); extern int ipif_up_done(ipif_t *); extern int ipif_up_done_v6(ipif_t *); extern void ipif_up_notify(ipif_t *); -extern void ipif_update_other_ipifs_v6(ipif_t *, ill_group_t *); +extern void 
ipif_update_other_ipifs_v6(ipif_t *); extern void ipif_recreate_interface_routes_v6(ipif_t *, ipif_t *); extern void ill_update_source_selection(ill_t *); -extern ipif_t *ipif_select_source_v6(ill_t *, const in6_addr_t *, uint_t, +extern ipif_t *ipif_select_source_v6(ill_t *, const in6_addr_t *, boolean_t, uint32_t, zoneid_t); extern boolean_t ipif_cant_setlinklocal(ipif_t *); extern int ipif_setlinklocal(ipif_t *); @@ -284,11 +274,8 @@ extern ipif_t *ipif_lookup_on_ifindex(uint_t, boolean_t, zoneid_t, queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *); extern ipif_t *ipif_get_next_ipif(ipif_t *curr, ill_t *ill); extern void ipif_ill_refrele_tail(ill_t *ill); -extern void ipif_arp_down(ipif_t *ipif); extern void ipif_mask_reply(ipif_t *); - -extern int illgrp_insert(ill_group_t **, ill_t *, char *, ill_group_t *, - boolean_t); +extern int ipif_up(ipif_t *, queue_t *, mblk_t *); extern void ipsq_current_start(ipsq_t *, ipif_t *, int); extern void ipsq_current_finish(ipsq_t *); @@ -451,13 +438,13 @@ extern int ip_sioctl_tmyaddr(ipif_t *, sin_t *, queue_t *, mblk_t *, extern int ip_sioctl_tunparam(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); +extern int ip_sioctl_get_binding(ipif_t *, sin_t *, queue_t *, + mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_sioctl_groupname(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_sioctl_get_groupname(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); -extern int ip_sioctl_slifoindex(ipif_t *, sin_t *, queue_t *, - mblk_t *, ip_ioctl_cmd_t *, void *); -extern int ip_sioctl_get_oindex(ipif_t *, sin_t *, queue_t *, +extern int ip_sioctl_groupinfo(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_sioctl_get_lifzone(ipif_t *, sin_t *, queue_t *, @@ -473,15 +460,11 @@ extern int ip_sioctl_slifusesrc(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_sioctl_get_lifsrcof(ipif_t *, sin_t *, queue_t *, 
mblk_t *, ip_ioctl_cmd_t *, void *); -extern int ip_sioctl_set_ipmpfailback(ipif_t *, sin_t *, queue_t *, - mblk_t *, ip_ioctl_cmd_t *, void *); extern void ip_sioctl_copyin_resume(ipsq_t *, queue_t *, mblk_t *, void *); extern void ip_sioctl_copyin_setup(queue_t *, mblk_t *); -extern void ip_sioctl_iocack(queue_t *, mblk_t *); +extern void ip_sioctl_iocack(ipsq_t *, queue_t *, mblk_t *, void *); extern ip_ioctl_cmd_t *ip_sioctl_lookup(int); -extern int ip_sioctl_move(ipif_t *, sin_t *, queue_t *, mblk_t *, - ip_ioctl_cmd_t *, void *); extern void conn_delete_ire(conn_t *, caddr_t); diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h index dae62ab499..369ba60005 100644 --- a/usr/src/uts/common/inet/ip_impl.h +++ b/usr/src/uts/common/inet/ip_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -394,11 +394,9 @@ typedef struct ip_lso_info_s { #define CONN_IS_LSO_MD_FASTPATH(connp) \ ((connp)->conn_dontroute == 0 && /* SO_DONTROUTE */ \ !((connp)->conn_nexthop_set) && /* IP_NEXTHOP */ \ - (connp)->conn_nofailover_ill == NULL && /* IPIF_NOFAILOVER */ \ - (connp)->conn_outgoing_pill == NULL && /* IP{V6}_BOUND_PIF */ \ (connp)->conn_outgoing_ill == NULL) /* IP{V6}_BOUND_IF */ -/* Definitons for fragmenting IP packets using MDT. */ +/* Definitions for fragmenting IP packets using MDT. */ /* * Smaller and private version of pdescinfo_t used specifically for IP, diff --git a/usr/src/uts/common/inet/ip_ire.h b/usr/src/uts/common/inet/ip_ire.h index 7accbbcfa3..0a9f8add85 100644 --- a/usr/src/uts/common/inet/ip_ire.h +++ b/usr/src/uts/common/inet/ip_ire.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* Copyright (c) 1990 Mentat Inc. */ @@ -86,31 +86,17 @@ extern "C" { /* return the ire. No recursive */ /* lookup should be done. */ #define MATCH_IRE_IHANDLE 0x0200 /* Match IRE on ihandle */ -#define MATCH_IRE_MARK_HIDDEN 0x0400 /* Match IRE ire_marks with */ - /* IRE_MARK_HIDDEN. */ +#define MATCH_IRE_MARK_TESTHIDDEN 0x0400 /* Match IRE_MARK_TESTHIDDEN IREs */ + /* - * MATCH_IRE_ILL is used whenever we want to specifically match an IRE - * whose ire_ipif->ipif_ill or (ill_t *)ire_stq->q_ptr matches a given - * ill. When MATCH_IRE_ILL is used to locate an IRE_CACHE, it implies - * that the packet will not be load balanced. This is normally used - * by in.mpathd to send out failure detection probes. - * - * MATCH_IRE_ILL_GROUP is used whenever we are not specific about which - * interface (ill) the packet should be sent out. This implies that the - * packets will be subjected to load balancing and it might go out on - * any interface in the group. When there is only interface in the group, - * MATCH_IRE_ILL_GROUP becomes MATCH_IRE_ILL. Most of the code uses - * MATCH_IRE_ILL_GROUP and MATCH_IRE_ILL is used in very few cases where - * we want to disable load balancing. - * * MATCH_IRE_PARENT is used whenever we unconditionally want to get the * parent IRE (sire) while recursively searching IREs for an offsubnet * destination. With this flag, even if no IRE_CACHETABLE or IRE_INTERFACE * is found to help resolving IRE_OFFSUBNET in lookup routines, the * IRE_OFFSUBNET sire, if any, is returned to the caller. */ -#define MATCH_IRE_ILL_GROUP 0x0800 /* Match IRE on ill or the ill_group. */ -#define MATCH_IRE_ILL 0x1000 /* Match IRE on the ill only */ +/* UNUSED 0x0800 */ +#define MATCH_IRE_ILL 0x1000 /* Match IRE on the ill */ #define MATCH_IRE_PARENT 0x2000 /* Match parent ire, if any, */ /* even if ire is not matched. 
*/ @@ -305,7 +291,7 @@ extern ire_t *ire_ihandle_lookup_onlink(ire_t *); extern ire_t *ire_ihandle_lookup_offlink(ire_t *, ire_t *); extern ire_t *ire_ihandle_lookup_offlink_v6(ire_t *, ire_t *); -extern boolean_t ire_local_same_ill_group(ire_t *, ire_t *); +extern boolean_t ire_local_same_lan(ire_t *, ire_t *); extern boolean_t ire_local_ok_across_zones(ire_t *, zoneid_t, void *, const struct ts_label_s *, ip_stack_t *); @@ -354,7 +340,7 @@ extern ire_t *ipif_lookup_multi_ire_v6(ipif_t *, const in6_addr_t *); extern ire_t *ire_get_next_bcast_ire(ire_t *, ire_t *); extern ire_t *ire_get_next_default_ire(ire_t *, ire_t *); -extern void ire_arpresolve(ire_t *, ill_t *); +extern void ire_arpresolve(ire_t *); extern void ire_freemblk(ire_t *); extern boolean_t ire_match_args(ire_t *, ipaddr_t, ipaddr_t, ipaddr_t, int, const ipif_t *, zoneid_t, uint32_t, const struct ts_label_s *, int, diff --git a/usr/src/uts/common/inet/ip_multi.h b/usr/src/uts/common/inet/ip_multi.h index a3f4282cc7..7dee133967 100644 --- a/usr/src/uts/common/inet/ip_multi.h +++ b/usr/src/uts/common/inet/ip_multi.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -49,6 +49,15 @@ typedef enum { } ilg_stat_t; /* + * Flags shared via ips_mrt_flags, used by mcast_restart_timers_thread(). 
+ */ +typedef enum { + IP_MRT_STOP = 0x1, /* request to stop thread */ + IP_MRT_DONE = 0x2, /* indication that thread is stopped */ + IP_MRT_RUN = 0x4 /* request to restart timers */ +} ip_mrt_flags_t; + +/* * Extern functions */ extern mblk_t *igmp_input(queue_t *, mblk_t *, ill_t *); @@ -78,9 +87,7 @@ extern int ip_get_dlpi_mbcast(ill_t *, mblk_t *); extern void ilm_free(ipif_t *); extern ilm_t *ilm_lookup_ill(ill_t *, ipaddr_t, zoneid_t); extern ilm_t *ilm_lookup_ill_v6(ill_t *, const in6_addr_t *, - zoneid_t); -extern ilm_t *ilm_lookup_ill_index_v6(ill_t *, const in6_addr_t *, - int, zoneid_t); + boolean_t, zoneid_t); extern ilm_t *ilm_lookup_ipif(ipif_t *, ipaddr_t); extern int ilm_numentries_v6(ill_t *, const in6_addr_t *); @@ -92,10 +99,10 @@ extern int ip_ll_send_enabmulti_req(ill_t *, const in6_addr_t *); extern int ip_addmulti(ipaddr_t, ipif_t *, ilg_stat_t, mcast_record_t, slist_t *); -extern int ip_addmulti_v6(const in6_addr_t *, ill_t *, int, +extern int ip_addmulti_v6(const in6_addr_t *, ill_t *, zoneid_t, ilg_stat_t, mcast_record_t, slist_t *); extern int ip_delmulti(ipaddr_t, ipif_t *, boolean_t, boolean_t); -extern int ip_delmulti_v6(const in6_addr_t *, ill_t *, int, +extern int ip_delmulti_v6(const in6_addr_t *, ill_t *, zoneid_t, boolean_t, boolean_t); extern int ill_join_allmulti(ill_t *); extern void ill_leave_allmulti(ill_t *); @@ -140,9 +147,11 @@ extern void reset_conn_ipif(ipif_t *); extern void reset_conn_ill(ill_t *); extern void reset_mrt_ill(ill_t *); extern void reset_mrt_vif_ipif(ipif_t *); -extern void igmp_start_timers(unsigned, ip_stack_t *); -extern void mld_start_timers(unsigned, ip_stack_t *); +extern void mcast_restart_timers_thread(ip_stack_t *); extern void ilm_inactive(ilm_t *); +extern ilm_t *ilm_walker_start(ilm_walker_t *, ill_t *); +extern ilm_t *ilm_walker_step(ilm_walker_t *, ilm_t *); +extern void ilm_walker_finish(ilm_walker_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_ndp.h 
b/usr/src/uts/common/inet/ip_ndp.h index 4dbb56a884..5eda155c0e 100644 --- a/usr/src/uts/common/inet/ip_ndp.h +++ b/usr/src/uts/common/inet/ip_ndp.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _INET_IP_NDP_H #define _INET_IP_NDP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/mutex.h> #include <sys/stream.h> #include <netinet/in.h> @@ -318,7 +316,8 @@ extern nd_opt_hdr_t *ndp_get_option(nd_opt_hdr_t *, int, int); extern void ndp_inactive(nce_t *); extern void ndp_input(ill_t *, mblk_t *, mblk_t *); extern boolean_t ndp_lookup_ipaddr(in_addr_t, netstack_t *); -extern nce_t *ndp_lookup_v6(ill_t *, const in6_addr_t *, boolean_t); +extern nce_t *ndp_lookup_v6(ill_t *, boolean_t, const in6_addr_t *, + boolean_t); extern nce_t *ndp_lookup_v4(ill_t *, const in_addr_t *, boolean_t); extern int ndp_mcastreq(ill_t *, const in6_addr_t *, uint32_t, uint32_t, mblk_t *); @@ -346,7 +345,7 @@ extern void nce_fastpath(nce_t *); extern int ndp_add_v6(ill_t *, uchar_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t, nce_t **); -extern int ndp_lookup_then_add_v6(ill_t *, uchar_t *, +extern int ndp_lookup_then_add_v6(ill_t *, boolean_t, uchar_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t, nce_t **); extern int ndp_lookup_then_add_v4(ill_t *, diff --git a/usr/src/uts/common/inet/ip_rts.h b/usr/src/uts/common/inet/ip_rts.h index 70b33e0278..61bc451995 100644 --- a/usr/src/uts/common/inet/ip_rts.h +++ b/usr/src/uts/common/inet/ip_rts.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -37,19 +37,28 @@ extern "C" { */ #define TSOL_RTSA_REQUEST_MAX 1 /* one per route destination */ +/* + * Flags for RTS queuing operations. + */ +#define RTSQ_UNDER_IPMP 0x01 /* send only on RTAW_UNDER_IPMP queues */ +#define RTSQ_NORMAL 0x02 /* send only on normal queues */ +#define RTSQ_ALL (RTSQ_UNDER_IPMP|RTSQ_NORMAL) /* send on all queues */ +#define RTSQ_DEFAULT 0x04 /* use standard filtering */ + #ifdef _KERNEL extern void ip_rts_change(int, ipaddr_t, ipaddr_t, - ipaddr_t, ipaddr_t, ipaddr_t, int, int, - int, ip_stack_t *); + ipaddr_t, ipaddr_t, ipaddr_t, int, int, int, ip_stack_t *); extern void ip_rts_change_v6(int, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, int, int, int, ip_stack_t *); -extern void ip_rts_ifmsg(const ipif_t *); +extern void ip_rts_ifmsg(const ipif_t *, uint_t); -extern void ip_rts_newaddrmsg(int, int, const ipif_t *); +extern void ip_rts_xifmsg(const ipif_t *, uint64_t, uint64_t, uint_t); + +extern void ip_rts_newaddrmsg(int, int, const ipif_t *, uint_t); extern int ip_rts_request(queue_t *, mblk_t *, cred_t *); @@ -70,9 +79,11 @@ extern void rts_fill_msg_v6(int, int, const in6_addr_t *, extern size_t rts_header_msg_size(int); -extern void rts_queue_input(mblk_t *, conn_t *, sa_family_t, ip_stack_t *); +extern void rts_queue_input(mblk_t *, conn_t *, sa_family_t, uint_t, + ip_stack_t *); extern int ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *, cred_t *); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h index 3c53e1a3d3..750378f587 100644 --- a/usr/src/uts/common/inet/ip_stack.h +++ b/usr/src/uts/common/inet/ip_stack.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -33,6 +33,7 @@ extern "C" { #include <sys/netstack.h> #include <netinet/igmp_var.h> +#include <sys/modhash.h> #ifdef _KERNEL #include <sys/list.h> @@ -172,9 +173,6 @@ struct ip_stack { krwlock_t ips_ill_g_usesrc_lock; - struct ill_group *ips_illgrp_head_v4; /* Head of IPv4 ill groups */ - struct ill_group *ips_illgrp_head_v6; /* Head of IPv6 ill groups */ - /* Taskq dispatcher for capability operations */ kmutex_t ips_capab_taskq_lock; kcondvar_t ips_capab_taskq_cv; @@ -204,7 +202,6 @@ struct ip_stack { int ips_igmp_timer_scheduled_last; int ips_igmp_deferred_next; timeout_id_t ips_igmp_timeout_id; - kthread_t *ips_igmp_timer_thread; boolean_t ips_igmp_timer_setter_active; /* Following protected by mld_timer_lock */ @@ -212,7 +209,6 @@ struct ip_stack { int ips_mld_timer_scheduled_last; int ips_mld_deferred_next; timeout_id_t ips_mld_timeout_id; - kthread_t *ips_mld_timer_thread; boolean_t ips_mld_timer_setter_active; /* Protected by igmp_slowtimeout_lock */ @@ -269,8 +265,6 @@ struct ip_stack { int ips_ip_g_forward; int ips_ipv6_forward; - int ips_ipmp_hook_emulation; /* ndd variable */ - time_t ips_ip_g_frag_timeout; clock_t ips_ip_g_frag_timo_ms; @@ -280,8 +274,6 @@ struct ip_stack { clock_t ips_icmp_pkt_err_last; /* Number of packets sent in burst */ uint_t ips_icmp_pkt_err_sent; - /* Used by icmp_send_redirect_v6 for picking random src. */ - uint_t ips_icmp_redirect_v6_src_index; /* Protected by ip_mi_lock */ void *ips_ip_g_head; /* Instance Data List Head */ @@ -356,8 +348,6 @@ struct ip_stack { kstat_t *ips_loopback_ksp; - uint_t ips_ipif_src_random; - struct idl_s *ips_conn_drain_list; /* Array of conn drain lists */ uint_t ips_conn_drain_list_cnt; /* Count of conn_drain_list */ int ips_conn_drain_list_index; /* Next drain_list */ @@ -375,15 +365,6 @@ struct ip_stack { uint64_t ips_ipif_g_seqid; union phyint_list_u *ips_phyint_g_list; /* start of phyint list */ - /* - * Reflects value of FAILBACK variable in IPMP config file - * /etc/default/mpathd. 
Default value is B_TRUE. - * Set to B_FALSE if user disabled failback by configuring - * "FAILBACK=no" in.mpathd uses SIOCSIPMPFAILBACK ioctl to pass this - * information to kernel. - */ - boolean_t ips_ipmp_enable_failback; - /* ip_neti.c */ hook_family_t ips_ipv4root; hook_family_t ips_ipv6root; @@ -427,12 +408,25 @@ struct ip_stack { kcondvar_t ips_ipobs_cb_cv; struct __ldi_ident *ips_ldi_ident; + +/* ipmp.c */ + krwlock_t ips_ipmp_lock; + mod_hash_t *ips_ipmp_grp_hash; + +/* igmp.c */ + /* multicast restart timers thread logic */ + kmutex_t ips_mrt_lock; + uint_t ips_mrt_flags; + kcondvar_t ips_mrt_cv; + kcondvar_t ips_mrt_done_cv; + kthread_t *ips_mrt_thread; }; typedef struct ip_stack ip_stack_t; /* Finding an ip_stack_t */ #define CONNQ_TO_IPST(_q) (Q_TO_CONN(_q)->conn_netstack->netstack_ip) #define ILLQ_TO_IPST(_q) (((ill_t *)(_q)->q_ptr)->ill_ipst) +#define PHYINT_TO_IPST(phyi) ((phyi)->phyint_ipsq->ipsq_ipst) #else /* _KERNEL */ typedef int ip_stack_t; diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index 5fb86a5262..d80123a977 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -249,7 +249,6 @@ struct conn_s { squeue_t *conn_initial_sqp; /* Squeue at open time */ squeue_t *conn_final_sqp; /* Squeue after connect */ - ill_t *conn_nofailover_ill; /* Failover ill */ ill_t *conn_dhcpinit_ill; /* IP_DHCPINIT_IF */ ipsec_latch_t *conn_latch; /* latched state */ ill_t *conn_outgoing_ill; /* IP{,V6}_BOUND_IF */ @@ -295,7 +294,6 @@ struct conn_s { uint_t conn_proto; /* SO_PROTOTYPE state */ ill_t *conn_incoming_ill; /* IP{,V6}_BOUND_IF */ - ill_t *conn_outgoing_pill; /* IP{,V6}_BOUND_PIF */ ill_t *conn_oper_pending_ill; /* pending shared ioctl */ ilg_t *conn_ilg; /* Group memberships */ @@ -307,9 +305,6 @@ struct conn_s { struct ipif_s *conn_multicast_ipif; /* IP_MULTICAST_IF */ ill_t *conn_multicast_ill; /* IPV6_MULTICAST_IF */ - int conn_orig_bound_ifindex; /* BOUND_IF 
before MOVE */ - int conn_orig_multicast_ifindex; - /* IPv6 MC IF before MOVE */ struct conn_s *conn_drain_next; /* Next conn in drain list */ struct conn_s *conn_drain_prev; /* Prev conn in drain list */ idl_t *conn_idl; /* Ptr to the drain list head */ @@ -322,7 +317,7 @@ struct conn_s { uchar_t conn_broadcast_ttl; /* IP_BROADCAST_TTL */ #define conn_nexthop_v4 V4_PART_OF_V6(conn_nexthop_v6) cred_t *conn_peercred; /* Peer credentials, if any */ - + int conn_rtaware; /* RT_AWARE sockopt value */ kcondvar_t conn_sq_cv; /* For non-STREAMS socket IO */ kthread_t *conn_sq_caller; /* Caller of squeue sync ops */ sock_upcalls_t *conn_upcalls; /* Upcalls to sockfs */ diff --git a/usr/src/uts/common/inet/ipnet/ipnet.c b/usr/src/uts/common/inet/ipnet/ipnet.c index 577205f25a..e94af50424 100644 --- a/usr/src/uts/common/inet/ipnet/ipnet.c +++ b/usr/src/uts/common/inet/ipnet/ipnet.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -229,16 +229,19 @@ ipnet_if_init(void) int _init(void) { - int ret; + int ret; + boolean_t netstack_registered = B_FALSE; if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1) return (ENODEV); ipnet_minor_space = id_space_create("ipnet_minor_space", IPNET_MINOR_MIN, MAXMIN32); - netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini); + /* * We call ddi_taskq_create() with nthread == 1 to ensure in-order - * delivery of packets to clients. + * delivery of packets to clients. Note that we need to create the + * taskqs before calling netstack_register() since ipnet_stack_init() + * registers callbacks that use 'em. 
*/ ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0); ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue", @@ -247,6 +250,10 @@ _init(void) ret = ENOMEM; goto done; } + + netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini); + netstack_registered = B_TRUE; + if ((ret = ipnet_if_init()) == 0) ret = mod_install(&modlinkage); done: @@ -255,7 +262,8 @@ done: ddi_taskq_destroy(ipnet_taskq); if (ipnet_nicevent_taskq != NULL) ddi_taskq_destroy(ipnet_nicevent_taskq); - netstack_unregister(NS_IPNET); + if (netstack_registered) + netstack_unregister(NS_IPNET); id_space_destroy(ipnet_minor_space); } return (ret); @@ -268,9 +276,10 @@ _fini(void) if ((err = mod_remove(&modlinkage)) != 0) return (err); + + netstack_unregister(NS_IPNET); ddi_taskq_destroy(ipnet_nicevent_taskq); ddi_taskq_destroy(ipnet_taskq); - netstack_unregister(NS_IPNET); id_space_destroy(ipnet_minor_space); return (0); } @@ -987,6 +996,7 @@ static boolean_t ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src, ipnet_addrp_t *dst) { + boolean_t obsif; uint64_t ifindex = ipnet->ipnet_if->if_index; ipnet_addrtype_t srctype, dsttype; @@ -994,6 +1004,13 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src, dsttype = ipnet_get_addrtype(ipnet, dst); /* + * If the packet's ifindex matches ours, or the packet's group ifindex + * matches ours, it's on the interface we're observing. (Thus, + * observing on the group ifindex matches all ifindexes in the group.) + */ + obsif = (ihd->ihd_ifindex == ifindex || ihd->ihd_grifindex == ifindex); + + /* * Do not allow an ipnet stream to see packets that are not from or to * its zone. The exception is when zones are using the shared stack * model. In this case, streams in the global zone have visibility @@ -1025,7 +1042,7 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src, * have our source address (this allows us to see packets we send). 
*/ if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) { - if (ihd->ihd_ifindex == ifindex || srctype == IPNETADDR_MYADDR) + if (srctype == IPNETADDR_MYADDR || obsif) return (B_TRUE); } @@ -1033,7 +1050,7 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src, * We accept multicast and broadcast packets transmitted or received * on the interface we're observing. */ - if (dsttype == IPNETADDR_MBCAST && ihd->ihd_ifindex == ifindex) + if (dsttype == IPNETADDR_MBCAST && obsif) return (B_TRUE); return (B_FALSE); diff --git a/usr/src/uts/common/inet/ipsec_info.h b/usr/src/uts/common/inet/ipsec_info.h index b014bdade0..0348e10b91 100644 --- a/usr/src/uts/common/inet/ipsec_info.h +++ b/usr/src/uts/common/inet/ipsec_info.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _INET_IPSEC_INFO_H #define _INET_IPSEC_INFO_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -114,12 +112,11 @@ typedef struct ipsec_in_s { ipsec_in_decaps : 1, /* Was this packet decapsulated from */ /* a matching inner packet? */ - ipsec_in_attach_if : 1, /* Don't load spread this packet */ ipsec_in_accelerated : 1, /* hardware accelerated packet */ ipsec_in_icmp_loopback : 1, /* Looped-back ICMP packet, */ /* all should trust this. 
*/ - ipsec_in_pad_bits : 24; + ipsec_in_pad_bits : 25; int ipsec_in_ill_index; /* interface on which ipha_dst was */ /* configured when pkt was recv'd */ @@ -197,12 +194,11 @@ typedef struct ipsec_out_s { ipsec_out_reserved : 1, ipsec_out_v4 : 1, - ipsec_out_attach_if : 1, ipsec_out_unspec_src : 1, /* IPv6 ip6i_t info */ ipsec_out_reachable : 1, /* NDP reachability info */ ipsec_out_failed: 1, - ipsec_out_se_done: 1, + ipsec_out_esp_done: 1, ipsec_out_ah_done: 1, ipsec_out_need_policy: 1, @@ -225,7 +221,7 @@ typedef struct ipsec_out_s { */ ipsec_out_icmp_loopback: 1, ipsec_out_ip_nexthop : 1, /* IP_NEXTHOP option is set */ - ipsec_out_pad_bits : 12; + ipsec_out_pad_bits : 13; cred_t *ipsec_out_cred; uint32_t ipsec_out_capab_ill_index; diff --git a/usr/src/uts/common/inet/mib2.h b/usr/src/uts/common/inet/mib2.h index 5abfc06581..a467abaee9 100644 --- a/usr/src/uts/common/inet/mib2.h +++ b/usr/src/uts/common/inet/mib2.h @@ -17,9 +17,8 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -27,8 +26,6 @@ #ifndef _INET_MIB2_H #define _INET_MIB2_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <netinet/in.h> /* For in6_addr_t */ #include <sys/tsol/label.h> /* For brange_t */ #include <sys/tsol/label_macro.h> /* For brange_t */ @@ -65,9 +62,14 @@ extern "C" { * #define OPTLEN(x) ((((x) + sizeof(long) - 1) / sizeof(long)) * sizeof(long)) * #define OPTVAL(opt) ((char *)(opt + 1)) * - * For get requests (T_NEGOTIATE), any MIB2_xxx value can be used (only + * For get requests (T_CURRENT), any MIB2_xxx value can be used (only * "get all" is supported, so all modules get a copy of the request to - * return everything it knows. Recommend: Use MIB2_IP + * return everything it knows. In general, we use MIB2_IP. 
There is + * one exception: in general, IP will not report information related to + * IRE_MARK_TESTHIDDEN routes (e.g., in the MIB2_IP_ROUTE table). + * However, using the special value EXPER_IP_AND_TESTHIDDEN will cause + * all information to be reported. This special value should only be + * used by IPMP-aware low-level utilities (e.g. in.mpathd). * * IMPORTANT: some fields are grouped in a different structure than * suggested by MIB-II, e.g., checksum error counts. The original MIB-2 @@ -79,7 +81,6 @@ extern "C" { #define IPPROTO_MAX 256 #endif - #define MIB2_SYSTEM (IPPROTO_MAX+1) #define MIB2_INTERFACES (IPPROTO_MAX+2) #define MIB2_AT (IPPROTO_MAX+3) @@ -108,12 +109,13 @@ extern "C" { #define EXPER_IGMP (EXPER+1) #define EXPER_DVMRP (EXPER+2) #define EXPER_RAWIP (EXPER+3) +#define EXPER_IP_AND_TESTHIDDEN (EXPER+4) /* * Define range of levels for experimental use */ #define EXPER_RANGE_START (EXPER+1) -#define EXPER_RANGE_END (EXPER+3) +#define EXPER_RANGE_END (EXPER+4) #define BUMP_MIB(s, x) { \ extern void __dtrace_probe___mib_##x(int, void *); \ diff --git a/usr/src/uts/common/inet/sctp/sctp_addr.c b/usr/src/uts/common/inet/sctp/sctp_addr.c index 1761396031..94cc8e8883 100644 --- a/usr/src/uts/common/inet/sctp/sctp_addr.c +++ b/usr/src/uts/common/inet/sctp/sctp_addr.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/systm.h> #include <sys/stream.h> diff --git a/usr/src/uts/common/inet/sctp_ip.h b/usr/src/uts/common/inet/sctp_ip.h index 16ab99abab..7b20d3fd2b 100644 --- a/usr/src/uts/common/inet/sctp_ip.h +++ b/usr/src/uts/common/inet/sctp_ip.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ #ifndef _INET_SCTP_IP_H #define _INET_SCTP_IP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 488f8ee021..68e0883222 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -31,7 +31,6 @@ #include <sys/strsubr.h> #include <sys/stropts.h> #include <sys/strlog.h> -#include <sys/strsun.h> #define _SUN_TPI_VERSION 2 #include <sys/tihdr.h> #include <sys/timod.h> @@ -4683,18 +4682,10 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, /* ifindex must be already set */ ASSERT(ifindex != 0); - if (ltcp->tcp_bound_if != 0) { - /* - * Set newtcp's bound_if equal to - * listener's value. If ifindex is - * not the same as ltcp->tcp_bound_if, - * it must be a packet for the ipmp group - * of interfaces - */ + if (ltcp->tcp_bound_if != 0) tcp->tcp_bound_if = ltcp->tcp_bound_if; - } else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { + else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) tcp->tcp_bound_if = ifindex; - } tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary; tcp->tcp_recvifindex = 0; @@ -10716,9 +10707,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, ipp->ipp_fields |= IPPF_USE_MIN_MTU; ipp->ipp_use_min_mtu = *i1; break; - case IPV6_BOUND_PIF: - /* Handled at the IP level */ - return (-EINVAL); case IPV6_SEC_OPT: /* * We should not allow policy setting after @@ -18895,7 +18883,6 @@ tcp_zcopy_check(tcp_t *tcp) connp->conn_dontroute == 0 && !connp->conn_nexthop_set && connp->conn_outgoing_ill == NULL && - connp->conn_nofailover_ill == NULL && do_tcpzcopy == 1) { /* * the checks above closely resemble the fast path checks @@ -19139,7 +19126,6 @@ tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp) ipaddr_t dst; ire_t *ire; ill_t *ill; - conn_t *connp = tcp->tcp_connp; mblk_t *ire_fp_mp; tcp_stack_t *tcps = tcp->tcp_tcps; 
@@ -19164,14 +19150,6 @@ tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp) } ill = ire_to_ill(ire); - if (connp->conn_outgoing_ill != NULL) { - ill_t *conn_outgoing_ill = NULL; - /* - * Choose a good ill in the group to send the packets on. - */ - ire = conn_set_outgoing_ill(connp, ire, &conn_outgoing_ill); - ill = ire_to_ill(ire); - } ASSERT(ill != NULL); if (!tcp->tcp_ire_ill_check_done) { diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c index 15b5d04d61..8c8eee3b58 100644 --- a/usr/src/uts/common/inet/tcp/tcp_fusion.c +++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +35,7 @@ #include <inet/common.h> #include <inet/optcom.h> #include <inet/ip.h> +#include <inet/ip_if.h> #include <inet/ip_impl.h> #include <inet/tcp.h> #include <inet/tcp_impl.h> diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index d977c27e53..e2314f8104 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -151,9 +151,6 @@ opdes_t tcp_opt_arr[] = { { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (in_addr_t), -1 /* not initialized */ }, -{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, sizeof (int), 0 }, diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 7c9433caa0..1178315cb5 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -80,6 +80,7 @@ #include <inet/ipp_common.h> #include <sys/squeue_impl.h> #include <inet/ipnet.h> +#include <sys/ethernet.h> /* * The ipsec_info.h header file is here since it has the definition for the @@ -2141,7 +2142,6 @@ udp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) case MCAST_UNBLOCK_SOURCE: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: - case IP_DONTFAILOVER_IF: /* cannot "get" the value for these */ return (-1); case IP_BOUND_IF: @@ -3152,9 +3152,7 @@ udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, ipp->ipp_use_min_mtu = *i1; break; - case IPV6_BOUND_PIF: case IPV6_SEC_OPT: - case IPV6_DONTFAILOVER_IF: case IPV6_SRC_PREFERENCES: case IPV6_V6ONLY: /* Handled at the IP level */ @@ -5351,7 +5349,6 @@ udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, if ((connp->conn_flags & IPCL_CHECK_POLICY) != 0 || CONN_OUTBOUND_POLICY_PRESENT(connp, ipss) || connp->conn_dontroute || - connp->conn_nofailover_ill != NULL || connp->conn_outgoing_ill != NULL || optinfo.ip_opt_flags != 0 || optinfo.ip_opt_ill_index != 0 || ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION || @@ -5419,8 +5416,7 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) ill_t *stq_ill = (ill_t *)ire->ire_stq->q_ptr; ASSERT(ipif != NULL); - if (stq_ill != ipif->ipif_ill && (stq_ill->ill_group == NULL || - stq_ill->ill_group != ipif->ipif_ill->ill_group)) + if 
(!IS_ON_SAME_LAN(stq_ill, ipif->ipif_ill)) retry_caching = B_TRUE; } @@ -5444,7 +5440,7 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) ASSERT(ipif != NULL); ire = ire_ctable_lookup(dst, 0, 0, ipif, connp->conn_zoneid, MBLK_GETLABEL(mp), - MATCH_IRE_ILL_GROUP, ipst); + MATCH_IRE_ILL, ipst); } else { ASSERT(ipif == NULL); ire = ire_cache_lookup(dst, connp->conn_zoneid, @@ -5622,12 +5618,7 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) } if (CLASSD(dst)) { - boolean_t ilm_exists; - - ILM_WALKER_HOLD(ill); - ilm_exists = (ilm_lookup_ill(ill, dst, ALL_ZONES) != NULL); - ILM_WALKER_RELE(ill); - if (ilm_exists) { + if (ilm_lookup_ill(ill, dst, ALL_ZONES) != NULL) { ip_multicast_loopback(q, ill, mp, connp->conn_multicast_loop ? 0 : IP_FF_NO_MCAST_LOOP, zoneid); diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index 0ec5a2c45e..65729b82f1 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -132,9 +132,6 @@ opdes_t udp_opt_arr[] = { { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 /* no ifindex */ }, -{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (struct in_addr), 0 /* not initialized */ }, - { IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (int), 0 }, @@ -191,12 +188,6 @@ opdes_t udp_opt_arr[] = { { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 /* no ifindex */ }, -{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - -{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, sizeof (int), 0 }, diff --git a/usr/src/uts/common/inet/vni/vni.c b/usr/src/uts/common/inet/vni/vni.c deleted file mode 100644 index a370a7b4be..0000000000 --- a/usr/src/uts/common/inet/vni/vni.c +++ /dev/null @@ -1,359 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - - -#include "vni_impl.h" -#include <sys/conf.h> -#include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/dlpi.h> -#include <sys/stat.h> -#include <sys/ethernet.h> -#include <sys/strsun.h> -#include <sys/stropts.h> - -static int vniopen(queue_t *, dev_t *, int, int, cred_t *); -static int vniclose(queue_t *, int, cred_t *); -static int vniwput(queue_t *, mblk_t *); -static int vniattach(dev_info_t *, ddi_attach_cmd_t); -static int vnidetach(dev_info_t *, ddi_detach_cmd_t); - -static struct module_info minfo = { - VNIIDNUM, /* mi_idnum */ - VNINAME, /* mi_idname */ - VNIMINPSZ, /* mi_minpsz */ - VNIMAXPSZ, /* mi_maxpsz */ - VNIHIWAT, /* mi_hiwat */ - VNILOWAT /* mi_lowat */ -}; - -static struct qinit vnirinit = { - NULL, /* qi_putp */ - NULL, /* qi_srvp */ - vniopen, /* qi_qopen */ - vniclose, /* qi_qclose */ - NULL, /* qi_qadmin */ - &minfo, /* qi_minfo */ - NULL /* qi_mstat */ -}; - -static struct qinit vniwinit = { - vniwput, /* qi_putp */ - NULL, /* qi_srvp */ - NULL, /* qi_qopen */ - NULL, /* qi_qclose */ - NULL, /* qi_qadmin */ - &minfo, /* qi_minfo */ - NULL /* qi_mstat */ -}; - -static struct streamtab vni_info = { - &vnirinit, /* st_rdinit */ - &vniwinit, /* st_wrinit */ - NULL, /* st_muxrinit */ - NULL /* st_muxwrinit */ -}; - -DDI_DEFINE_STREAM_OPS(vni_ops, nulldev, nulldev, vniattach, \ - vnidetach, nodev, nodev, VNIFLAGS, &vni_info, ddi_quiesce_not_supported); - -static struct modldrv modldrv = { - &mod_driverops, - "Virtual network interface", - &vni_ops, -}; - -static struct modlinkage modlinkage = { - MODREV_1, &modldrv, NULL -}; - -static vni_str_t *vni_strlist_head; - -/* - * DL_INFO_ACK template for VNI pseudo interface. 
- */ -static dl_info_ack_t dlvni_infoack = { - DL_INFO_ACK, /* dl_primitive */ - 0, /* dl_max_sdu */ - 0, /* dl_min_sdu */ - 0, /* dl_addr_length */ - SUNW_DL_VNI, /* dl_mac_type */ - 0, /* dl_reserved */ - 0, /* dl_current_state */ - 0, /* dl_sap_length */ - DL_CLDLS, /* dl_service_mode */ - 0, /* dl_qos_length */ - 0, /* dl_qos_offset */ - 0, /* dl_range_length */ - 0, /* dl_range_offset */ - DL_STYLE2, /* dl_provider_style */ - 0, /* dl_addr_offset */ - DL_VERSION_2, /* dl_version */ - 0, /* dl_brdcst_addr_length */ - 0, /* dl_brdcst_addr_offset */ - 0 /* dl_growth */ -}; - -int -_init(void) -{ - return (mod_install(&modlinkage)); -} - -int -_fini(void) -{ - return (mod_remove(&modlinkage)); -} - -int -_info(struct modinfo *modinfop) -{ - return (mod_info(&modlinkage, modinfop)); -} - -static int -vniattach(dev_info_t *devi, ddi_attach_cmd_t cmd) -{ - if (cmd != DDI_ATTACH) { - cmn_err(CE_NOTE, "vniattach failure: cmd != DDI_ATTACH\n"); - return (DDI_FAILURE); - } - - if (ddi_create_minor_node(devi, VNINAME, S_IFCHR, - ddi_get_instance(devi), DDI_PSEUDO, CLONE_DEV) == - DDI_FAILURE) { - ddi_remove_minor_node(devi, NULL); - cmn_err(CE_NOTE, "vniattach failure: ddi_create_minor_node\n"); - return (DDI_FAILURE); - } - - return (DDI_SUCCESS); -} - -static int -vnidetach(dev_info_t *devi, ddi_detach_cmd_t cmd) -{ - if (cmd != DDI_DETACH) - return (DDI_FAILURE); - - ddi_remove_minor_node(devi, NULL); - return (DDI_SUCCESS); -} - -/* ARGSUSED */ -static int -vniopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) -{ - vni_str_t *stp, *prevstp; - minor_t minordev = 0; - - if (sflag != CLONEOPEN) - return (EINVAL); - - prevstp = NULL; - - for (stp = vni_strlist_head; stp != NULL; stp = stp->st_next) { - if (minordev < stp->st_minor) - break; - minordev++; - prevstp = stp; - } - - stp = kmem_zalloc(sizeof (vni_str_t), KM_SLEEP); - - *devp = makedevice(getmajor(*devp), minordev); - - stp->st_minor = minordev; - stp->st_state = DL_UNATTACHED; - stp->st_next = 
NULL; - - q->q_ptr = stp; - WR(q)->q_ptr = stp; - - if (prevstp != NULL) { - stp->st_next = prevstp->st_next; - prevstp->st_next = stp; - } else { - stp->st_next = vni_strlist_head; - vni_strlist_head = stp; - } - - qprocson(q); - return (0); -} - -/* ARGSUSED */ -static int -vniclose(queue_t *q, int flag, cred_t *credp) -{ - vni_str_t *stp, **prevstpp; - - qprocsoff(q); - stp = (vni_str_t *)q->q_ptr; - stp->st_state = DL_UNATTACHED; - - /* Unlink the per-stream entry from the list and free it */ - stp = vni_strlist_head; - prevstpp = &vni_strlist_head; - - for (; stp != NULL; stp = stp->st_next) { - if (stp == (vni_str_t *)q->q_ptr) - break; - prevstpp = &stp->st_next; - } - - ASSERT(stp != NULL); - - *prevstpp = stp->st_next; - - kmem_free(stp, sizeof (vni_str_t)); - - q->q_ptr = WR(q)->q_ptr = NULL; - return (0); -} - -static int -vniwput(queue_t *q, mblk_t *mp) -{ - union DL_primitives *dlp; - vni_str_t *stp; - dl_info_ack_t *dlip; - t_scalar_t prim; - - stp = q->q_ptr; - - switch ((mp)->b_datap->db_type) { - case M_PROTO: - case M_PCPROTO: - if (MBLKL(mp) < sizeof (t_scalar_t)) { - dlerrorack(q, mp, DL_PRIM_INVAL, DL_UNSUPPORTED, 0); - return (0); - } - dlp = (void *)mp->b_rptr; - prim = dlp->dl_primitive; - switch (prim) { - case DL_ATTACH_REQ: - if (MBLKL(mp) < DL_ATTACH_REQ_SIZE) { - dlerrorack(q, mp, DL_ATTACH_REQ, DL_BADPRIM, 0); - return (0); - } - if (stp->st_state != DL_UNATTACHED) { - dlerrorack(q, mp, DL_ATTACH_REQ, DL_OUTSTATE, - 0); - return (0); - } - stp->st_ppa = dlp->attach_req.dl_ppa; - stp->st_state = DL_UNBOUND; - dlokack(q, mp, DL_ATTACH_REQ); - break; - case DL_BIND_REQ: - if (MBLKL(mp) < DL_BIND_REQ_SIZE) { - dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0); - return (0); - } - if (stp->st_state != DL_UNBOUND) { - dlerrorack(q, mp, DL_BIND_REQ, DL_OUTSTATE, 0); - return (0); - } - stp->st_state = DL_IDLE; - dlbindack(q, mp, dlp->bind_req.dl_sap, NULL, 0, 0, 0); - break; - case DL_INFO_REQ: - if (MBLKL(mp) < DL_INFO_REQ_SIZE) { - 
dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0); - return (0); - } - if ((mp = mexchange(q, mp, sizeof (dl_info_ack_t), - M_PCPROTO, DL_INFO_ACK)) == NULL) { - return (0); - } - dlip = (void *)mp->b_rptr; - *dlip = dlvni_infoack; - dlip->dl_current_state = stp->st_state; - qreply(q, mp); - break; - case DL_PHYS_ADDR_REQ: - if (MBLKL(mp) < DL_PHYS_ADDR_REQ_SIZE) { - dlerrorack(q, mp, DL_PHYS_ADDR_REQ, DL_BADPRIM, - 0); - return (0); - } - dlphysaddrack(q, mp, NULL, 0); - break; - case DL_UNBIND_REQ: - if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) { - dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0); - return (0); - } - if (stp->st_state != DL_IDLE) { - dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, - 0); - return (0); - } - /* Nothing to flush. But DLPI spec says to; so do it */ - flushq(q, FLUSHALL); - flushq(RD(q), FLUSHALL); - stp->st_state = DL_UNBOUND; - dlokack(q, mp, DL_UNBIND_REQ); - break; - case DL_DETACH_REQ: - if (MBLKL(mp) < DL_DETACH_REQ_SIZE) { - dlerrorack(q, mp, DL_DETACH_REQ, DL_BADPRIM, 0); - return (0); - } - if (stp->st_state != DL_UNBOUND) { - dlerrorack(q, mp, DL_DETACH_REQ, DL_OUTSTATE, - 0); - return (0); - } - stp->st_state = DL_UNATTACHED; - dlokack(q, mp, DL_DETACH_REQ); - break; - default: - dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0); - } - break; - case M_IOCTL: - /* - * No ioctl's currently supported. 
Need to have the NAK since - * ifconfig calls SIOCGTUNPARAM during the end of plumb - */ - miocnak(q, mp, 0, EINVAL); - break; - case M_FLUSH: - /* Really nothing to flush since no msgs enqueued */ - if (*mp->b_rptr & FLUSHR) { - qreply(q, mp); - } else { - freemsg(mp); - } - break; - default: - freemsg(mp); - break; - } - return (0); -} diff --git a/usr/src/uts/common/inet/vni/vni_impl.h b/usr/src/uts/common/inet/vni/vni_impl.h deleted file mode 100644 index ffba1b08bf..0000000000 --- a/usr/src/uts/common/inet/vni/vni_impl.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _INET_VNI_IMPL_H -#define _INET_VNI_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef __cplusplus -extern "C" { -#endif - -#include <sys/modctl.h> -#include <sys/stream.h> - -typedef struct vni_str { - struct vni_str *st_next; /* next in list */ - t_uscalar_t st_state; /* DLPI state */ - minor_t st_minor; /* corresponding minor */ - uint32_t st_ppa; /* physical point of attachment */ -} vni_str_t; - -#define DL_MAXPRIM DL_GET_STATISTICS_ACK -#define VNIIDNUM 0x2a84 -#define VNINAME "vni" -#define VNIFLAGS (D_MP|D_MTPERMOD) -#define VNIHIWAT 1024 -#define VNILOWAT 512 -#define VNIMINPSZ 0 -#define VNIMAXPSZ INFPSZ - -#ifdef __cplusplus -} -#endif - -#endif /* _INET_VNI_IMPL_H */ diff --git a/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c b/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c index 69feb36606..03d82fbcab 100644 --- a/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c +++ b/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c @@ -19,14 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - -#define AF_INET_OFFLOAD 30 - #include <sys/sockio.h> #include <sys/stream.h> #include <sys/errno.h> @@ -34,27 +30,24 @@ #include <sys/strsun.h> #include <inet/common.h> #include <net/if.h> +#include <net/if_types.h> #include <inet/mi.h> #include <sys/t_kuser.h> #include <sys/stropts.h> #include <sys/pathname.h> #include <sys/kstr.h> #include <sys/timod.h> +#include <sys/sunddi.h> #include <sys/ib/clients/rds/rds.h> #include <sys/ib/clients/rds/rds_transport.h> static sin_t sin_null; /* Zero address for quick clears */ -#define isdigit(ch) ((ch) >= '0' && (ch) <= '9') - -#define isalpha(ch) (((ch) >= 'a' && (ch) <= 'z') || \ - ((ch) >= 'A' && (ch) <= 'Z')) - /* * Just pass the ioctl to IP and the result to the caller. 
*/ int -rds_do_ip_ioctl(int cmd, int len, caddr_t arg) +rds_do_ip_ioctl(int cmd, int len, void *arg) { vnode_t *kvp, *vp; TIUSER *tiptr; @@ -62,8 +55,7 @@ rds_do_ip_ioctl(int cmd, int len, caddr_t arg) k_sigset_t smask; int err = 0; - if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, - &kvp) == 0) { + if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) { if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE, &tiptr, CRED()) == 0) { vp = tiptr->fp->f_vnode; @@ -72,13 +64,13 @@ rds_do_ip_ioctl(int cmd, int len, caddr_t arg) return (EPROTO); } } else { - return (EPROTO); + return (EPROTO); } iocb.ic_cmd = cmd; iocb.ic_timout = 0; iocb.ic_len = len; - iocb.ic_dp = arg; + iocb.ic_dp = (caddr_t)arg; sigintr(&smask, 0); err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); sigunintr(&smask); @@ -88,197 +80,166 @@ rds_do_ip_ioctl(int cmd, int len, caddr_t arg) } /* - * Return 0 if the interface is IB. - * Return error (>0) if any error is encountered during processing. - * Return -1 if the interface is not IB and no error. + * Check if the IP interface named by `lifrp' is RDS-capable. */ -static int -rds_is_ib_interface(char *name) +static boolean_t +rds_capable_interface(struct lifreq *lifrp) { + char ifname[LIFNAMSIZ]; + char drv[MAXLINKNAMELEN]; + uint_t ppa; + char *cp; - char dev_path[MAXPATHLEN]; - char devname[MAXNAMELEN]; - ldi_handle_t lh; - dl_info_ack_t info; - int ret = 0; - int i; - k_sigset_t smask; + if (lifrp->lifr_type == IFT_IB) + return (B_TRUE); /* - * ibd devices are only style 2 devices - * so we will open only style 2 devices - * by ignoring the ppa + * Strip off the logical interface portion before getting + * intimate with the name. 
*/ - i = strlen(name) - 1; - while ((i >= 0) && (!isalpha(name[i]))) i--; - if (i < 0) { - /* Invalid interface name, no alphabet */ - return (-1); - } - (void) strncpy(devname, name, i + 1); - devname[i + 1] = '\0'; + (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); + if ((cp = strchr(ifname, ':')) != NULL) + *cp = '\0'; - if (strcmp("lo", devname) == 0) { + if (strcmp("lo0", ifname) == 0) { /* - * loopback interface is considered RDS capable + * loopback is considered RDS-capable */ - return (0); + return (B_TRUE); } - (void) strncpy(dev_path, "/dev/", MAXPATHLEN); - if (strlcat(dev_path, devname, MAXPATHLEN) >= MAXPATHLEN) { - /* string overflow */ - return (-1); - } + return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS && + rds_transport_ops->rds_transport_if_lookup_by_name(drv)); +} - ret = ldi_open_by_name(dev_path, FREAD|FWRITE, kcred, &lh, rds_li); - if (ret != 0) { - return (ret); - } +/* + * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. + * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. + */ +static int +rds_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) +{ + int err; + int nifs; - sigintr(&smask, 0); - ret = dl_info(lh, &info, NULL, NULL, NULL); - sigunintr(&smask); - (void) ldi_close(lh, FREAD|FWRITE, kcred); - if (ret != 0) { - return (ret); - } + if ((err = rds_do_ip_ioctl(SIOCGIFNUM, sizeof (int), &nifs)) != 0) + return (err); - if (info.dl_mac_type != DL_IB && - !rds_transport_ops->rds_transport_if_lookup_by_name(devname)) { - return (-1); + /* + * Pad the interface count to account for additional interfaces that + * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 
+ */ + nifs += 4; + + bzero(lifcp, sizeof (struct lifconf)); + lifcp->lifc_family = AF_INET; + lifcp->lifc_len = *bufsizep = (nifs * sizeof (struct lifreq)); + lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_NOSLEEP); + if (lifcp->lifc_buf == NULL) + return (ENOMEM); + + err = rds_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); + if (err != 0) { + kmem_free(lifcp->lifc_buf, *bufsizep); + return (err); } - return (0); } void rds_ioctl_copyin_done(queue_t *q, mblk_t *mp) { - char *addr; + void *addr; mblk_t *mp1; int err = 0; - struct iocblk *iocp = (struct iocblk *)(uintptr_t)mp->b_rptr; + struct iocblk *iocp = (void *)mp->b_rptr; if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) { err = EPROTO; goto done; } - addr = (char *)mp1->b_rptr; + addr = mp1->b_rptr; switch (iocp->ioc_cmd) { - case SIOCGIFNUM: { - /* Get number of interfaces. */ - struct ifconf kifc; - struct ifreq *ifr; - int num_ifs; - int n; - - err = rds_do_ip_ioctl(iocp->ioc_cmd, sizeof (int), - (char *)&num_ifs); - if (err != 0) { - break; - } + uint_t bufsize; + struct lifconf lifc; + struct lifreq *lifrp; + int i, nifs, retval = 0; - kifc.ifc_len = num_ifs * sizeof (struct ifreq); - kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP); - err = rds_do_ip_ioctl(SIOCGIFCONF, - sizeof (struct ifconf), (caddr_t)&kifc); - if (err != 0) { - kmem_free(kifc.ifc_buf, kifc.ifc_len); + if ((err = rds_do_lifconf(&lifc, &bufsize)) != 0) break; - } - ifr = kifc.ifc_req; - n = num_ifs; - for (num_ifs = 0; n > 0; ifr++) { - err = rds_is_ib_interface(ifr->ifr_name); - if (err == 0) { - num_ifs++; - } else if (err > 0) { - num_ifs = 0; - break; - } else { - err = 0; + + nifs = lifc.lifc_len / sizeof (struct lifreq); + for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { + if (strlen(lifrp->lifr_name) <= IFNAMSIZ && + rds_capable_interface(lifrp)) { + retval++; } - n--; } - *((int *)(uintptr_t)addr) = num_ifs; - kmem_free(kifc.ifc_buf, kifc.ifc_len); - } + *((int *)addr) = retval; + kmem_free(lifc.lifc_buf, 
bufsize); break; + } case O_SIOCGIFCONF: case SIOCGIFCONF: { STRUCT_HANDLE(ifconf, ifc); caddr_t ubuf_addr; int ubuf_size; - struct ifconf kifc; - struct ifreq *ifr, *ptr; - int num_ifs; - - STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, - (struct ifconf *)(uintptr_t)addr); + uint_t bufsize; + int i, nifs; + struct lifconf lifc; + struct lifreq *lifrp; + struct ifreq *ifrp; + STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, (struct ifconf *)addr); ubuf_size = STRUCT_FGET(ifc, ifc_len); ubuf_addr = STRUCT_FGETP(ifc, ifc_buf); - err = rds_do_ip_ioctl(SIOCGIFNUM, sizeof (int), - (char *)&num_ifs); - if (err != 0) { + if ((err = rds_do_lifconf(&lifc, &bufsize)) != 0) break; - } - kifc.ifc_len = num_ifs * sizeof (struct ifreq); - kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP); - err = rds_do_ip_ioctl(iocp->ioc_cmd, - sizeof (struct ifconf), (caddr_t)&kifc); - if (err != 0) { - kmem_free(kifc.ifc_buf, kifc.ifc_len); - break; - } mp1 = mi_copyout_alloc(q, mp, ubuf_addr, ubuf_size, B_FALSE); if (mp1 == NULL) { err = ENOMEM; - kmem_free(kifc.ifc_buf, ubuf_size); + kmem_free(lifc.lifc_buf, bufsize); break; } - ifr = kifc.ifc_req; - ptr = (struct ifreq *)(uintptr_t)mp1->b_rptr; - for (; num_ifs > 0 && - (int)((uintptr_t)mp1->b_wptr - (uintptr_t)mp1->b_rptr) < - ubuf_size; num_ifs--, ifr++) { - err = rds_is_ib_interface(ifr->ifr_name); - if (err == 0) { - ifr->ifr_addr.sa_family = AF_INET_OFFLOAD; - bcopy((caddr_t)ifr, ptr, sizeof (struct ifreq)); - ptr++; - mp1->b_wptr = (uchar_t *)ptr; - } else if (err > 0) { - break; - } else { - err = 0; + ifrp = (void *)mp1->b_rptr; + nifs = lifc.lifc_len / sizeof (struct lifreq); + for (lifrp = lifc.lifc_req, i = 0; i < nifs && + MBLKTAIL(mp1) >= sizeof (struct ifreq); i++, lifrp++) { + /* + * Skip entries that are impossible to return with + * SIOCGIFCONF, or not RDS-capable. 
+ */ + if (strlen(lifrp->lifr_name) > IFNAMSIZ || + !rds_capable_interface(lifrp)) { + continue; } + + ifrp->ifr_addr = *(struct sockaddr *)&lifrp->lifr_addr; + ifrp->ifr_addr.sa_family = AF_INET_OFFLOAD; + (void) strlcpy(ifrp->ifr_name, lifrp->lifr_name, + IFNAMSIZ); + ifrp++; + mp1->b_wptr += sizeof (struct ifreq); } - STRUCT_FSET(ifc, ifc_len, (int)((uintptr_t)mp1->b_wptr - - (uintptr_t)mp1->b_rptr)); - kmem_free(kifc.ifc_buf, kifc.ifc_len); - } + STRUCT_FSET(ifc, ifc_len, MBLKL(mp1)); + kmem_free(lifc.lifc_buf, bufsize); break; + } case SIOCGIFMTU: - err = rds_do_ip_ioctl(iocp->ioc_cmd, - sizeof (struct ifreq), addr); - break; - case SIOCGIFFLAGS: - err = rds_do_ip_ioctl(iocp->ioc_cmd, - sizeof (struct ifreq), addr); + err = rds_do_ip_ioctl(iocp->ioc_cmd, sizeof (struct ifreq), + addr); break; - case TI_GETMYNAME: { + case TI_GETMYNAME: { rds_t *rds; STRUCT_HANDLE(strbuf, sb); ipaddr_t v4addr; @@ -287,8 +248,7 @@ rds_ioctl_copyin_done(queue_t *q, mblk_t *mp) sin_t *sin; STRUCT_SET_HANDLE(sb, - ((struct iocblk *)(uintptr_t)mp->b_rptr)->ioc_flag, - (void *)(uintptr_t)addr); + ((struct iocblk *)(uintptr_t)mp->b_rptr)->ioc_flag, addr); rds = (rds_t *)q->q_ptr; ASSERT(rds->rds_family == AF_INET_OFFLOAD); addrlen = sizeof (sin_t); @@ -320,7 +280,6 @@ done: mi_copy_done(q, mp, err); } - void rds_ioctl_copyin_setup(queue_t *q, mblk_t *mp) { @@ -383,38 +342,26 @@ rds_ioctl(queue_t *q, mblk_t *mp) boolean_t rds_verify_bind_address(ipaddr_t addr) { - int numifs; - struct ifconf kifc; - struct ifreq *ifr; - boolean_t ret = B_FALSE; - - - if (rds_do_ip_ioctl(SIOCGIFNUM, sizeof (int), (caddr_t)&numifs)) { - return (ret); - } - - kifc.ifc_len = numifs * sizeof (struct ifreq); - kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP); - - if (rds_do_ip_ioctl(SIOCGIFCONF, sizeof (struct ifconf), - (caddr_t)&kifc)) { - goto done; - } - - ifr = kifc.ifc_req; - for (numifs = kifc.ifc_len / sizeof (struct ifreq); - numifs > 0; numifs--, ifr++) { - struct sockaddr_in *sin; - - sin = 
(struct sockaddr_in *)(uintptr_t)&ifr->ifr_addr; - if ((sin->sin_addr.s_addr == addr) && - (rds_is_ib_interface(ifr->ifr_name) == 0)) { - ret = B_TRUE; - break; + int i, nifs; + uint_t bufsize; + struct lifconf lifc; + struct lifreq *lifrp; + struct sockaddr_in *sinp; + boolean_t retval = B_FALSE; + + if (rds_do_lifconf(&lifc, &bufsize) != 0) + return (B_FALSE); + + nifs = lifc.lifc_len / sizeof (struct lifreq); + for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { + sinp = (struct sockaddr_in *)&lifrp->lifr_addr; + if (rds_capable_interface(lifrp) && + sinp->sin_addr.s_addr == addr) { + retval = B_TRUE; + break; } } -done: - kmem_free(kifc.ifc_buf, kifc.ifc_len); - return (ret); + kmem_free(lifc.lifc_buf, bufsize); + return (retval); } diff --git a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c index bcb3c235be..dd7c9554a5 100644 --- a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c +++ b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,6 +33,7 @@ #include <sys/socket.h> #include <sys/stat.h> #include <net/if_arp.h> +#include <net/if_types.h> #include <sys/file.h> #include <sys/sockio.h> #include <sys/pathname.h> @@ -528,62 +529,112 @@ ibcm_arp_get_ibd_insts(ibcm_arp_ibd_insts_t *ibds) } /* - * Return ibd interfaces and ibd instances. + * Issue an ioctl down to IP. There are several similar versions of this + * function (e.g., rpcib_do_ip_ioctl()); clearly a utility routine is needed. 
*/ static int -ibcm_arp_get_ibd_ipaddr(ibcm_arp_ibd_insts_t *ibds) +ibcm_do_ip_ioctl(int cmd, int len, void *arg) { - TIUSER *tiptr; - vnode_t *kvp; - vnode_t *vp = NULL; - struct strioctl iocb; - struct lifreq lif_req; - int k, ip_cnt; - ibcm_arp_ip_t *ipp; + vnode_t *kvp; + TIUSER *tiptr; + struct strioctl iocb; + int err = 0; - if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) { - if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE, - &tiptr, CRED()) == 0) { - vp = tiptr->fp->f_vnode; - } else { - VN_RELE(kvp); - } - } + if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) != 0) + return (EPROTO); - if (vp == NULL) - return (-1); + if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE, &tiptr, CRED()) != 0) { + VN_RELE(kvp); + return (EPROTO); + } - /* Get ibd ip's */ - ip_cnt = 0; - for (k = 0, ipp = ibds->ibcm_arp_ip; k < ibds->ibcm_arp_ibd_cnt; - k++, ipp++) { + iocb.ic_cmd = cmd; + iocb.ic_timout = 0; + iocb.ic_len = len; + iocb.ic_dp = (caddr_t)arg; + err = kstr_ioctl(tiptr->fp->f_vnode, I_STR, (intptr_t)&iocb); + (void) t_kclose(tiptr, 0); + VN_RELE(kvp); + return (err); +} - (void) bzero((void *)&lif_req, sizeof (struct lifreq)); - (void) snprintf(lif_req.lifr_name, sizeof (lif_req.lifr_name), - "%s%d", IBCM_ARP_IBD_NAME, ipp->ip_inst); +/* + * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. + * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. + */ +static int +ibcm_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) +{ + int err; + struct lifnum lifn; + + bzero(&lifn, sizeof (struct lifnum)); + lifn.lifn_family = AF_UNSPEC; + + err = ibcm_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn); + if (err != 0) + return (err); + + /* + * Pad the interface count to account for additional interfaces that + * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 
+ */ + lifn.lifn_count += 4; + + bzero(lifcp, sizeof (struct lifconf)); + lifcp->lifc_family = AF_UNSPEC; + lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); + lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); + + err = ibcm_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); + if (err != 0) { + kmem_free(lifcp->lifc_buf, *bufsizep); + return (err); + } + return (0); +} - (void) bzero((void *)&iocb, sizeof (struct strioctl)); - iocb.ic_cmd = SIOCGLIFADDR; - iocb.ic_timout = 0; - iocb.ic_len = sizeof (struct lifreq); - iocb.ic_dp = (caddr_t)&lif_req; +/* + * Fill in `ibds' with IP addresses tied to IFT_IB IP interfaces. Returns + * B_TRUE if at least one address was filled in. + */ +static boolean_t +ibcm_arp_get_ibd_ipaddr(ibcm_arp_ibd_insts_t *ibds) +{ + int i, nifs, naddr = 0; + uint_t bufsize; + struct lifconf lifc; + struct lifreq *lifrp; + ibcm_arp_ip_t *ipp; + + if (ibcm_do_lifconf(&lifc, &bufsize) != 0) + return (B_FALSE); + + nifs = lifc.lifc_len / sizeof (struct lifreq); + for (lifrp = lifc.lifc_req, i = 0; + i < nifs && naddr < ibds->ibcm_arp_ibd_cnt; i++, lifrp++) { + if (lifrp->lifr_type != IFT_IB) + continue; - if (kstr_ioctl(vp, I_STR, (intptr_t)&iocb) == 0) { + ipp = &ibds->ibcm_arp_ip[naddr]; + switch (lifrp->lifr_addr.ss_family) { + case AF_INET: ipp->ip_inet_family = AF_INET; - bcopy(&lif_req.lifr_addr, &ipp->ip_cm_sin, + bcopy(&lifrp->lifr_addr, &ipp->ip_cm_sin, sizeof (struct sockaddr_in)); - ip_cnt++; - continue; + naddr++; + break; + case AF_INET6: + ipp->ip_inet_family = AF_INET6; + bcopy(&lifrp->lifr_addr, &ipp->ip_cm_sin6, + sizeof (struct sockaddr_in6)); + naddr++; + break; } } - (void) t_kclose(tiptr, 0); - VN_RELE(kvp); - - if (ip_cnt == 0) - return (-1); - else - return (0); + kmem_free(lifc.lifc_buf, bufsize); + return (naddr > 0); } ibt_status_t @@ -600,7 +651,7 @@ ibcm_arp_get_ibds(ibcm_arp_ibd_insts_t *ibdp) return (IBT_SRC_IP_NOT_FOUND); /* Get the IP addresses of active ports. 
*/ - if (ibcm_arp_get_ibd_ipaddr(ibdp) != 0) { + if (!ibcm_arp_get_ibd_ipaddr(ibdp)) { IBTF_DPRINTF_L2(cmlog, "ibcm_arp_get_ibds: failed to get " "ibd instance: IBT_SRC_IP_NOT_FOUND"); return (IBT_SRC_IP_NOT_FOUND); diff --git a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c index af622d5c8f..29b5116446 100644 --- a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c +++ b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #include <sys/dlpi.h> @@ -35,24 +33,13 @@ #include <sys/ddi.h> #include <sys/cmn_err.h> #include <sys/socket.h> -#include <sys/tihdr.h> #include <net/if.h> -#include <net/if_arp.h> #include <net/if_types.h> -#include <net/if_dl.h> -#include <net/route.h> -#include <sys/sockio.h> #include <netinet/in.h> -#include <netinet/ip6.h> -#include <netinet/icmp6.h> #include <sys/ethernet.h> -#include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ -#include <inet/mi.h> #include <inet/arp.h> #include <inet/ip.h> -#include <inet/ip_multi.h> #include <inet/ip_ire.h> -#include <inet/ip_rts.h> #include <inet/ip_if.h> #include <sys/ib/mgt/ibcm/ibcm_arp.h> #include <inet/ip_ftable.h> @@ -389,21 +376,16 @@ ibcm_arp_pr_callback(ibcm_arp_prwqn_t *wqnp, int status) wqnp->func((void *)wqnp, status); } +/* + * Check if the interface is loopback or IB. 
+ */ static int -ibcm_arp_check_interface(ibcm_arp_prwqn_t *wqnp, int length) +ibcm_arp_check_interface(ill_t *ill) { - /* - * if the i/f is not ib or lo device, fail the request - */ - if (bcmp(wqnp->ifname, "ibd", 3) == 0) { - if (length != IPOIB_ADDRL) { - return (EINVAL); - } - } else if (bcmp(wqnp->ifname, "lo", 2)) { - return (ETIMEDOUT); - } + if (IS_LOOPBACK(ill) || ill->ill_type == IFT_IB) + return (0); - return (0); + return (ETIMEDOUT); } #define IBTL_IPV4_ADDR(a) (a->un.ip4addr) @@ -414,11 +396,10 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, ibcm_arp_pr_comp_func_t func) { ibcm_arp_prwqn_t *wqnp; - ire_t *ire; - ire_t *src_ire; + ire_t *ire = NULL; + ire_t *src_ire = NULL; ipif_t *ipif; - ill_t *ill; - int length; + ill_t *ill, *hwaddr_ill = NULL; ip_stack_t *ipst; IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup(src %p dest %p)", @@ -449,13 +430,10 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, if (src_ire == NULL) { IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " "ire_ctable_lookup failed"); - netstack_rele(ipst->ips_netstack); - ibcm_arp_prwqn_delete(wqnp); ib_s->status = EFAULT; - return (1); + goto fail; } - /* * get an ire for the destination adress with the matching source * address @@ -463,16 +441,11 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, ire = ire_ftable_lookup(IBTL_IPV4_ADDR(dst_addr), 0, 0, 0, src_ire->ire_ipif, 0, src_ire->ire_zoneid, 0, NULL, MATCH_IRE_SRC, ipst); - - netstack_rele(ipst->ips_netstack); - if (ire == NULL) { IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " "ire_ftable_lookup failed"); - IRE_REFRELE(src_ire); - ibcm_arp_prwqn_delete(wqnp); ib_s->status = EFAULT; - return (1); + goto fail; } wqnp->src_addr.un.ip4addr = ire->ire_src_addr; @@ -480,35 +453,56 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, ipif = src_ire->ire_ipif; ill = ipif->ipif_ill; - length = ill->ill_name_length; - bcopy(ill->ill_name, &wqnp->ifname, 
ill->ill_name_length); - wqnp->ifname[length] = '\0'; - bcopy(ill->ill_phys_addr, &wqnp->src_mac, - ill->ill_phys_addr_length); + (void) strlcpy(wqnp->ifname, ill->ill_name, sizeof (wqnp->ifname)); - IRE_REFRELE(ire); - IRE_REFRELE(src_ire); + /* + * For IPMP data addresses, we need to use the hardware address of the + * interface bound to the given address. + */ + if (IS_IPMP(ill)) { + if ((hwaddr_ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { + IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: no bound " + "ill for IPMP interface %s", ill->ill_name); + ib_s->status = EFAULT; + goto fail; + } + } else { + hwaddr_ill = ill; + ill_refhold(hwaddr_ill); /* for symmetry */ + } - ib_s->status = - ibcm_arp_check_interface(wqnp, ill->ill_phys_addr_length); - if (ib_s->status) { + bcopy(hwaddr_ill->ill_phys_addr, &wqnp->src_mac, + hwaddr_ill->ill_phys_addr_length); + + if ((ib_s->status = ibcm_arp_check_interface(hwaddr_ill)) != 0) { IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " "ibcm_arp_check_interface failed"); - ibcm_arp_prwqn_delete(wqnp); - return (1); + goto fail; } - ib_s->status = ibcm_arp_squery_arp(wqnp); - if (ib_s->status) { + if ((ib_s->status = ibcm_arp_squery_arp(wqnp)) != 0) { IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " "ibcm_arp_squery_arp failed"); - ibcm_arp_prwqn_delete(wqnp); - return (1); + goto fail; } - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup: Return: 0x%p", wqnp); + ill_refrele(hwaddr_ill); + IRE_REFRELE(ire); + IRE_REFRELE(src_ire); + netstack_rele(ipst->ips_netstack); + IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup: Return: 0x%p", wqnp); return (0); +fail: + if (hwaddr_ill != NULL) + ill_refrele(hwaddr_ill); + if (ire != NULL) + IRE_REFRELE(ire); + if (src_ire != NULL) + IRE_REFRELE(src_ire); + ibcm_arp_prwqn_delete(wqnp); + netstack_rele(ipst->ips_netstack); + return (1); } #define IBCM_H2N_GID(gid) \ diff --git a/usr/src/uts/common/ipp/ipgpc/classifier-objects.h b/usr/src/uts/common/ipp/ipgpc/classifier-objects.h index f1cb20b88d..4002a39573 
100644 --- a/usr/src/uts/common/ipp/ipgpc/classifier-objects.h +++ b/usr/src/uts/common/ipp/ipgpc/classifier-objects.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IPP_IPGPC_CLASSIFIER_OBJECTS_H #define _IPP_IPGPC_CLASSIFIER_OBJECTS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/time.h> #include <ipp/ipp.h> #include <ipp/ipgpc/ipgpc.h> @@ -64,14 +61,12 @@ extern "C" { #define IPGPC_TABLE_UID 8 #define IPGPC_TABLE_PROJID 9 #define IPGPC_TABLE_IF 10 -#define IPGPC_TABLE_IF_GRPNM 11 -#define IPGPC_TABLE_DIR 12 +#define IPGPC_TABLE_DIR 11 #define TABLE_ID_OFFSET IPGPC_TABLE_PROTOID #define PROTOID_IDX (IPGPC_TABLE_PROTOID - TABLE_ID_OFFSET) #define UID_IDX (IPGPC_TABLE_UID - TABLE_ID_OFFSET) #define PROJID_IDX (IPGPC_TABLE_PROJID - TABLE_ID_OFFSET) #define IF_IDX (IPGPC_TABLE_IF - TABLE_ID_OFFSET) -#define IF_GRPNM_IDX (IPGPC_TABLE_IF_GRPNM - TABLE_ID_OFFSET) #define DIR_IDX (IPGPC_TABLE_DIR - TABLE_ID_OFFSET) /* Match types for selector searching */ @@ -91,11 +86,10 @@ extern "C" { #define UID_MASK 0x40 #define PROJID_MASK 0x80 #define IF_MASK 0x100 -#define IF_GRPNM_MASK 0x200 -#define DIR_MASK 0x400 +#define DIR_MASK 0x200 #define ALL_MATCH_MASK (DS_MASK | PROTO_MASK | SADDR_MASK | DADDR_MASK | \ SPORT_MASK | DPORT_MASK | UID_MASK | PROJID_MASK | \ - IF_MASK | IF_GRPNM_MASK | DIR_MASK) + 
IF_MASK | DIR_MASK) #define HASH_SIZE 11 /* default hash table size */ @@ -108,7 +102,6 @@ typedef struct ipgpc_filter_s { char filter_name[MAXNAMELEN]; /* null terminated name of filter */ /* exact match selectors */ - char if_groupname[LIFNAMSIZ]; /* null terminated iface groupname */ uid_t uid; /* uid key, value = exact or IPGPC_WILDCARD */ projid_t projid; /* project id, " " */ uint_t if_index; /* interface index, " " or 0 for wildcard */ diff --git a/usr/src/uts/common/ipp/ipgpc/classifier.c b/usr/src/uts/common/ipp/ipgpc/classifier.c index bb09a3ca89..9137fcba9a 100644 --- a/usr/src/uts/common/ipp/ipgpc/classifier.c +++ b/usr/src/uts/common/ipp/ipgpc/classifier.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/kmem.h> #include <sys/systm.h> #include <sys/socket.h> @@ -78,7 +76,6 @@ common_classify(ipgpc_packet_t *packet, ht_match_t *fid_table, uint16_t *slctrs_srchd) { int match_status; - int if_grpnm_hv; /* Find on packet direction */ match_status = @@ -96,19 +93,6 @@ common_classify(ipgpc_packet_t *packet, ht_match_t *fid_table, return (match_status); } - /* Find on IF_GRPNM of packet */ - if (packet->if_groupname_len > 0) { - if_grpnm_hv = name_hash(packet->if_groupname, TABLE_SIZE); - } else { - if_grpnm_hv = IPGPC_WILDCARD; - } - match_status = - ipgpc_findfilters(IPGPC_TABLE_IF_GRPNM, if_grpnm_hv, fid_table); - if (CHECK_MATCH_STATUS(match_status, slctrs_srchd, - ipgpc_table_list[IF_GRPNM_IDX].info.mask) != NORMAL_MATCH) { - return (match_status); - } - /* Find on DS field */ match_status = ipgpc_findfilters(IPGPC_BA_DSID, packet->dsfield, fid_table); @@ -149,9 +133,8 @@ common_classify(ipgpc_packet_t *packet, ht_match_t *fid_table, /* Find on IP Source Port field */ if (packet->sport > 0) { - match_status = - ipgpc_findfilters(IPGPC_TRIE_SPORTID, packet->sport, 
- fid_table); + match_status = ipgpc_findfilters(IPGPC_TRIE_SPORTID, + packet->sport, fid_table); if (CHECK_MATCH_STATUS(match_status, slctrs_srchd, ipgpc_trie_list[IPGPC_TRIE_SPORTID].info.mask) != NORMAL_MATCH) { @@ -164,9 +147,8 @@ common_classify(ipgpc_packet_t *packet, ht_match_t *fid_table, /* Find on IP Destination Port field */ if (packet->dport > 0) { - match_status = - ipgpc_findfilters(IPGPC_TRIE_DPORTID, packet->dport, - fid_table); + match_status = ipgpc_findfilters(IPGPC_TRIE_DPORTID, + packet->dport, fid_table); if (CHECK_MATCH_STATUS(match_status, slctrs_srchd, ipgpc_trie_list[IPGPC_TRIE_DPORTID].info.mask) != NORMAL_MATCH) { @@ -261,12 +243,11 @@ ipgpc_classify(int af, ipgpc_packet_t *packet) match_status = 0; slctrs_srchd = ALL_MATCH_MASK; - bzero(fid_table, sizeof (ht_match_t) * HASH_SIZE); /* first search all address family independent selectors */ - if ((rc = common_classify(packet, fid_table, &slctrs_srchd)) != - NORMAL_MATCH) { + rc = common_classify(packet, fid_table, &slctrs_srchd); + if (rc != NORMAL_MATCH) { /* free all dynamic allocated memory */ FREE_FID_TABLE(fid_table, p, q, i); if (rc == NO_MATCHES) { @@ -453,7 +434,7 @@ bestmatch(ht_match_t *fid_table, uint16_t bestmask) */ real_prio = ((uint64_t)ipgpc_fid_list[key].filter.priority - << 32) | + << 32) | (uint64_t)~ipgpc_fid_list[key].filter.precedence; /* check to see if this is the new bestmatch */ @@ -689,35 +670,32 @@ parse_packet6(ipgpc_packet_t *packet, mblk_t *mp) void print_packet(int af, ipgpc_packet_t *pkt) { + char saddrbuf[INET6_ADDRSTRLEN]; + char daddrbuf[INET6_ADDRSTRLEN]; + if (af == AF_INET) { - char saddrbuf[INET_ADDRSTRLEN]; - char daddrbuf[INET_ADDRSTRLEN]; + (void) inet_ntop(af, &V4_PART_OF_V6(pkt->saddr), saddrbuf, + sizeof (saddrbuf)); + (void) inet_ntop(af, &V4_PART_OF_V6(pkt->daddr), daddrbuf, + sizeof (daddrbuf)); + ipgpc4dbg(("print_packet: saddr = %s, daddr = %s, sport = %u" \ ", dport = %u, proto = %u, dsfield = %x, uid = %d," \ - " if_index = %d, 
if_groupname = %s, projid = %d, " \ - "direction = %d", - inet_ntop(af, &V4_PART_OF_V6(pkt->saddr), saddrbuf, - sizeof (saddrbuf)), - inet_ntop(af, &V4_PART_OF_V6(pkt->daddr), daddrbuf, - sizeof (daddrbuf)), - ntohs(pkt->sport), ntohs(pkt->dport), pkt->proto, + " if_index = %d, projid = %d, direction = %d", saddrbuf, + daddrbuf, ntohs(pkt->sport), ntohs(pkt->dport), pkt->proto, pkt->dsfield, pkt->uid, pkt->if_index, - (pkt->if_groupname != NULL) ? pkt->if_groupname : "NULL", pkt->projid, pkt->direction)); } else if (af == AF_INET6) { - char saddrbuf[INET6_ADDRSTRLEN]; - char daddrbuf[INET6_ADDRSTRLEN]; + (void) inet_ntop(af, pkt->saddr.s6_addr32, saddrbuf, + sizeof (saddrbuf)); + (void) inet_ntop(af, pkt->daddr.s6_addr32, daddrbuf, + sizeof (daddrbuf)); + ipgpc4dbg(("print_packet: saddr = %s, daddr = %s, sport = %u" \ ", dport = %u, proto = %u, dsfield = %x, uid = %d," \ - " if_index = %d, if_groupname = %s, projid = %d, " \ - "direction = %d", - inet_ntop(af, pkt->saddr.s6_addr32, saddrbuf, - sizeof (saddrbuf)), - inet_ntop(af, pkt->daddr.s6_addr32, daddrbuf, - sizeof (daddrbuf)), - ntohs(pkt->sport), ntohs(pkt->dport), pkt->proto, + " if_index = %d, projid = %d, direction = %d", saddrbuf, + daddrbuf, ntohs(pkt->sport), ntohs(pkt->dport), pkt->proto, pkt->dsfield, pkt->uid, pkt->if_index, - (pkt->if_groupname != NULL) ? pkt->if_groupname : "NULL", pkt->projid, pkt->direction)); } } diff --git a/usr/src/uts/common/ipp/ipgpc/classifier.h b/usr/src/uts/common/ipp/ipgpc/classifier.h index 4ee36ae32b..629aeab2f5 100644 --- a/usr/src/uts/common/ipp/ipgpc/classifier.h +++ b/usr/src/uts/common/ipp/ipgpc/classifier.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IPP_IPGPC_CLASSIFIER_H #define _IPP_IPGPC_CLASSIFIER_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/cmn_err.h> #include <ipp/ipgpc/filters.h> @@ -74,8 +71,6 @@ typedef struct ipgpc_packet_s { projid_t projid; /* project id for packet */ uint_t if_index; /* interface index */ uint32_t direction; /* packet direction */ - char *if_groupname; /* interface group name */ - uint_t if_groupname_len; /* interface group name length */ uint_t len; /* length of packet */ } ipgpc_packet_t; diff --git a/usr/src/uts/common/ipp/ipgpc/classifierddi.c b/usr/src/uts/common/ipp/ipgpc/classifierddi.c index d9955d84a6..4d31da6396 100644 --- a/usr/src/uts/common/ipp/ipgpc/classifierddi.c +++ b/usr/src/uts/common/ipp/ipgpc/classifierddi.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/systm.h> #include <sys/socket.h> #include <netinet/in.h> @@ -433,12 +431,6 @@ ipgpc_invoke_action(ipp_action_id_t aid, ipp_packet_t *packet) } } - /* The ill_index could be 0 when called from forwarding (read) path */ - if (ill_idx > 0) { - ill = ill_lookup_on_ifindex_global_instance(ill_idx, B_FALSE, - NULL, NULL, NULL, NULL); - } - /* parse the packet from the message block */ ipha = (ipha_t *)mp->b_rptr; /* Determine IP Header Version */ @@ -452,23 +444,27 @@ ipgpc_invoke_action(ipp_action_id_t aid, ipp_packet_t *packet) pkt.direction = callout_pos; /* set packet direction */ + /* The ill_index could be 0 when called from forwarding (read) path */ + if (ill_idx > 0) { + ill = ill_lookup_on_ifindex_global_instance(ill_idx, B_FALSE, + NULL, NULL, NULL, NULL); + } if (ill != NULL) { - pkt.if_index = ill->ill_phyint->phyint_ifindex; - pkt.if_groupname_len = - ill->ill_phyint->phyint_groupname_len; - if (pkt.if_groupname_len > 0) { - pkt.if_groupname = - ill->ill_phyint->phyint_groupname; - } else { - pkt.if_groupname = NULL; - } - /* Got the fields from the ILL, go ahead and refrele */ + /* + * Since all IPP actions in an IPMP group are performed + * relative to the IPMP group interface, if this is an + * underlying interface in an IPMP group, use the IPMP + * group interface's index. 
+ */ + if (IS_UNDER_IPMP(ill)) + pkt.if_index = ipmp_ill_get_ipmp_ifindex(ill); + else + pkt.if_index = ill->ill_phyint->phyint_ifindex; + /* Got the field from the ILL, go ahead and refrele */ ill_refrele(ill); } else { - /* unknown if_index and if_group */ + /* unknown if_index */ pkt.if_index = IPGPC_UNSPECIFIED; - pkt.if_groupname = NULL; - pkt.if_groupname_len = 0; } if (ipgpc_debug > 5) { diff --git a/usr/src/uts/common/ipp/ipgpc/filters.c b/usr/src/uts/common/ipp/ipgpc/filters.c index 7dd4dce48e..3a2f954d0a 100644 --- a/usr/src/uts/common/ipp/ipgpc/filters.c +++ b/usr/src/uts/common/ipp/ipgpc/filters.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/atomic.h> #include <sys/types.h> #include <sys/systm.h> @@ -83,7 +81,6 @@ static ht_node_t proto_table[TABLE_SIZE]; /* protocol table */ static ht_node_t uid_table[TABLE_SIZE]; /* IPGPC_UID table */ static ht_node_t projid_table[TABLE_SIZE]; /* IPGPC_PROJID table */ static ht_node_t if_table[TABLE_SIZE]; /* Interface ID table */ -static ht_node_t if_grpnm_table[TABLE_SIZE]; /* Interface Group Name table */ static ht_node_t dir_table[TABLE_SIZE]; /* packet direction table */ static ipp_action_id_t ipgpc_aid; /* the action id for ipgpc */ @@ -262,9 +259,6 @@ initialize_tables(void) /* IF_INDEX selector structure */ insert_ipgpc_table_list_info(IF_IDX, if_table, IPGPC_UNSPECIFIED, IF_MASK); - /* IF_GRPNM_INDEX selector structure */ - insert_ipgpc_table_list_info(IF_GRPNM_IDX, if_grpnm_table, - IPGPC_WILDCARD, IF_GRPNM_MASK); /* DIR selector structure */ insert_ipgpc_table_list_info(DIR_IDX, dir_table, IPGPC_UNSPECIFIED, DIR_MASK); @@ -617,19 +611,6 @@ ipgpc_parse_filter(ipgpc_filter_t *filter, nvlist_t *nvlp) bcopy(s, filter->filter_name, (strlen(s) + 1)); - /* parse interface group name */ - if 
(nvlist_lookup_string(nvlp, IPGPC_IF_GROUPNAME, &s) != 0) { - filter->if_groupname[0] = '\0'; - } else { - /* check max interface group name lenght */ - if ((strlen(s) + 1) > LIFNAMSIZ) { - ipgpc0dbg(("ipgpc_parse_filter: interface group name" \ - " > LIFNAMSIZ")); - return (EINVAL); - } - bcopy(s, filter->if_groupname, (strlen(s) + 1)); - } - /* parse uid */ if (nvlist_lookup_uint32(nvlp, IPGPC_UID, &filter->uid) != 0) { filter->uid = (uid_t)IPGPC_WILDCARD; @@ -976,8 +957,6 @@ insertfid(int filter_id, ipgpc_filter_t *filter, uint_t class_id) static void common_addfilter(fid_t *fid, int filter_id) { - int if_grpnm_hv; - /* start trie inserts */ /* add source port selector */ if (t_insert(&ipgpc_trie_list[IPGPC_TRIE_SPORTID], filter_id, @@ -1025,17 +1004,6 @@ common_addfilter(fid_t *fid, int filter_id) fid->insert_map |= IF_MASK; } - /* add interface groupname selector */ - if (fid->filter.if_groupname[0] == '\0') { - if_grpnm_hv = IPGPC_WILDCARD; - } else { - if_grpnm_hv = name_hash(fid->filter.if_groupname, TABLE_SIZE); - } - if (ht_insert(&ipgpc_table_list[IF_GRPNM_IDX], filter_id, if_grpnm_hv) - == NORMAL_VALUE) { - fid->insert_map |= IF_GRPNM_MASK; - } - /* add direction selector */ if (ht_insert(&ipgpc_table_list[DIR_IDX], filter_id, fid->filter.direction) == NORMAL_VALUE) { @@ -1102,8 +1070,8 @@ ipgpc_addfilter(ipgpc_filter_t *filter, char *class_name, ipp_flags_t flags) fid_t *fid; unsigned class_id; - if ((err = class_name2id(&class_id, class_name, ipgpc_num_cls)) != - EEXIST) { + err = class_name2id(&class_id, class_name, ipgpc_num_cls); + if (err != EEXIST) { ipgpc0dbg(("ipgpc_addfilter: class lookup error %d", err)); return (err); } @@ -1376,9 +1344,8 @@ insertcid(ipgpc_class_t *in_class, int *out_class_id) /* init kstat entry */ if ((rc = class_statinit(in_class, class_id)) != 0) { ipgpc_cid_list[class_id].info = -1; - ipgpc0dbg(("insertcid: " \ - "class_statinit failed with " \ - "error %d", rc)); + ipgpc0dbg(("insertcid: " + "class_statinit failed 
with error %d", rc)); mutex_exit(&ipgpc_cid_list_lock); return (rc); } @@ -1409,8 +1376,6 @@ insertcid(ipgpc_class_t *in_class, int *out_class_id) static void common_removefilter(int in_filter_id, fid_t *fid) { - int if_grpnm_hv; - /* start trie removes */ t_remove(&ipgpc_trie_list[IPGPC_TRIE_SPORTID], in_filter_id, fid->filter.sport, fid->filter.sport_mask); @@ -1438,14 +1403,6 @@ common_removefilter(int in_filter_id, fid_t *fid) /* remove id from interface id table */ ht_remove(&ipgpc_table_list[IF_IDX], in_filter_id, fid->filter.if_index); - - /* remove id from interface group name table */ - if (fid->filter.if_groupname[0] == '\0') { - if_grpnm_hv = IPGPC_WILDCARD; - } else { - if_grpnm_hv = name_hash(fid->filter.if_groupname, TABLE_SIZE); - } - ht_remove(&ipgpc_table_list[IF_GRPNM_IDX], in_filter_id, if_grpnm_hv); /* remove id from direction table */ ht_remove(&ipgpc_table_list[DIR_IDX], in_filter_id, fid->filter.direction); @@ -1782,7 +1739,6 @@ int ipgpc_modifyclass(nvlist_t **nvlpp, ipp_flags_t flags) { unsigned class_id; - ipp_stat_t *cl_stats; ipgpc_class_t in_class; char *name; int rc; @@ -1837,15 +1793,14 @@ ipgpc_modifyclass(nvlist_t **nvlpp, ipp_flags_t flags) /* check to see if gather_stats booleans differ */ if ((ipgpc_cid_list[class_id].aclass.gather_stats != in_class.gather_stats)) { - if (ipgpc_cid_list[class_id].aclass.gather_stats == - B_TRUE) { - /* delete kstat entry */ - if (ipgpc_cid_list[class_id].cl_stats != NULL) { - cl_stats = - ipgpc_cid_list[class_id].cl_stats; - ipp_stat_destroy(cl_stats); - ipgpc_cid_list[class_id].cl_stats = NULL; - } + if (ipgpc_cid_list[class_id].aclass.gather_stats) { + /* delete kstat entry */ + if (ipgpc_cid_list[class_id].cl_stats != NULL) { + ipp_stat_destroy( + ipgpc_cid_list[class_id].cl_stats); + ipgpc_cid_list[class_id].cl_stats = + NULL; + } } else { /* gather_stats == B_FALSE */ if ((rc = class_statinit(&in_class, class_id)) != 0) { @@ -2326,14 +2281,6 @@ build_filter_nvlist(nvlist_t **nvlpp, 
ipgpc_filter_t *in_filter, return (rc); } - /* add interface groupname */ - if (in_filter->if_groupname[0] != '\0') { - if ((rc = nvlist_add_string(nvlp, IPGPC_IF_GROUPNAME, - in_filter->if_groupname)) != 0) { - return (rc); - } - } - /* add uid */ if (in_filter->uid != IPGPC_WILDCARD) { if ((rc = nvlist_add_uint32(nvlp, IPGPC_UID, in_filter->uid)) diff --git a/usr/src/uts/common/ipp/ipgpc/ipgpc.h b/usr/src/uts/common/ipp/ipgpc/ipgpc.h index f2e1354132..51edc313f8 100644 --- a/usr/src/uts/common/ipp/ipgpc/ipgpc.h +++ b/usr/src/uts/common/ipp/ipgpc/ipgpc.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IPP_IPGPC_IPGPC_H #define _IPP_IPGPC_IPGPC_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/kmem.h> #include <sys/socket.h> @@ -48,7 +45,6 @@ extern "C" { #define IPGPC_NAME "ipgpc" /* config names of name-value pairs and type */ -#define IPGPC_IF_GROUPNAME "ipgpc.if_groupname" /* string */ #define IPGPC_UID "ipgpc.user" /* int32_t */ #define IPGPC_PROJID "ipgpc.projid" /* int32_t */ #define IPGPC_IF_INDEX "ipgpc.if_index" /* uint32_t */ diff --git a/usr/src/uts/common/net/if.h b/usr/src/uts/common/net/if.h index 904fe078cb..05f013e4dc 100644 --- a/usr/src/uts/common/net/if.h +++ b/usr/src/uts/common/net/if.h @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -12,7 +12,6 @@ #ifndef _NET_IF_H #define _NET_IF_H -#pragma ident "%Z%%M% %I% %E% SMI" /* if.h 1.26 90/05/29 SMI; from UCB 7.1 6/4/86 */ #include <sys/feature_tests.h> @@ -105,7 +104,7 @@ struct ifnet { * If you define a flag here, you need to define one in ip_if.h before * using the new flag in IP. Don't use these flags directly in IP. */ -#define IFF_UP 0x0000000001 /* interface is up */ +#define IFF_UP 0x0000000001 /* address is up */ #define IFF_BROADCAST 0x0000000002 /* broadcast address valid */ #define IFF_DEBUG 0x0000000004 /* turn on debugging */ #define IFF_LOOPBACK 0x0000000008 /* is a loopback net */ @@ -138,7 +137,7 @@ struct ifnet { */ #define IFF_NOXMIT 0x0000010000 /* Do not transmit packets */ #define IFF_NOLOCAL 0x0000020000 /* No address - just on-link subnet */ -#define IFF_DEPRECATED 0x0000040000 /* interface address deprecated */ +#define IFF_DEPRECATED 0x0000040000 /* Address is deprecated */ #define IFF_ADDRCONF 0x0000080000 /* address from stateless addrconf */ #define IFF_ROUTER 0x0000100000 /* router on this interface */ @@ -149,14 +148,12 @@ struct ifnet { #define IFF_IPV4 0x0001000000 /* IPv4 interface */ #define IFF_IPV6 0x0002000000 /* IPv6 interface */ /* 0x0004000000 was IFF_MIPRUNNING */ -#define IFF_NOFAILOVER 0x0008000000 /* Don't failover on NIC failure */ +#define IFF_NOFAILOVER 0x0008000000 /* in.mpathd(1M) test address */ -#define IFF_FAILED 0x0010000000 /* NIC has failed */ -#define IFF_STANDBY 0x0020000000 /* Standby NIC to be used on failures */ -#define IFF_INACTIVE 0x0040000000 /* NIC active or not ? 
*/ - /* Used for Standby NIC or */ - /* when FAILBACK is disabled by user */ -#define IFF_OFFLINE 0x0080000000 /* NIC has been offlined */ +#define IFF_FAILED 0x0010000000 /* Interface has failed */ +#define IFF_STANDBY 0x0020000000 /* Interface is a hot-spare */ +#define IFF_INACTIVE 0x0040000000 /* Functioning but not used for data */ +#define IFF_OFFLINE 0x0080000000 /* Interface is offline */ /* * The IFF_XRESOLV flag is an evolving interface and is subject @@ -170,14 +167,22 @@ struct ifnet { #define IFF_FIXEDMTU 0x1000000000ll /* MTU manually set with SIOCSLIFMTU */ #define IFF_VIRTUAL 0x2000000000ll /* Does not send or receive packets */ #define IFF_DUPLICATE 0x4000000000ll /* Local address already in use */ +#define IFF_IPMP 0x8000000000ll /* IPMP IP interface */ -/* flags set internally only: */ +/* flags that cannot be changed by userland on any interface */ #define IFF_CANTCHANGE \ (IFF_BROADCAST | IFF_POINTOPOINT | IFF_RUNNING | IFF_PROMISC | \ IFF_MULTICAST | IFF_MULTI_BCAST | IFF_UNNUMBERED | IFF_IPV4 | \ - IFF_IPV6 | IFF_INACTIVE | IFF_FIXEDMTU | IFF_VIRTUAL | \ + IFF_IPV6 | IFF_IPMP | IFF_FIXEDMTU | IFF_VIRTUAL | \ IFF_LOOPBACK | IFF_ALLMULTI | IFF_DUPLICATE | IFF_COS_ENABLED) +/* flags that cannot be changed by userland on an IPMP interface */ +#define IFF_IPMP_CANTCHANGE IFF_FAILED + +/* flags that can never be set on an IPMP interface */ +#define IFF_IPMP_INVALID (IFF_STANDBY | IFF_INACTIVE | IFF_OFFLINE | \ + IFF_NOFAILOVER | IFF_NOARP | IFF_NONUD | IFF_XRESOLV) + /* * Output queues (ifp->if_snd) and internetwork datagram level (pup level 1) * input routines have queues of messages stored on ifqueue structures @@ -354,7 +359,7 @@ struct lifreq { } lifr_lifru1; #define lifr_addrlen lifr_lifru1.lifru_addrlen #define lifr_ppa lifr_lifru1.lifru_ppa /* Driver's ppa */ - uint_t lifr_movetoindex; /* FAILOVER/FAILBACK ifindex */ + uint_t lifr_type; /* IFT_ETHER, ... 
*/ union { struct sockaddr_storage lifru_addr; struct sockaddr_storage lifru_dstaddr; @@ -371,6 +376,7 @@ struct lifreq { struct lif_nd_req lifru_nd_req; struct lif_ifinfo_req lifru_ifinfo_req; char lifru_groupname[LIFGRNAMSIZ]; /* SIOC[GS]LIFGROUPNAME */ + char lifru_binding[LIFNAMSIZ]; /* SIOCGLIFBINDING */ uint_t lifru_delay; /* SIOC[GS]LIFNOTIFYDELAY */ zoneid_t lifru_zoneid; /* SIOC[GS]LIFZONE */ } lifr_lifru; @@ -392,6 +398,7 @@ struct lifreq { #define lifr_nd lifr_lifru.lifru_nd_req /* SIOCLIF*ND */ #define lifr_ifinfo lifr_lifru.lifru_ifinfo_req /* SIOC[GS]LIFLNKINFO */ #define lifr_groupname lifr_lifru.lifru_groupname +#define lifr_binding lifr_lifru.lifru_binding #define lifr_delay lifr_lifru.lifru_delay #define lifr_zoneid lifr_lifru.lifru_zoneid }; @@ -556,6 +563,7 @@ struct lifsrcof { #define LIFC_TEMPORARY 0x04 /* Include IFF_TEMPORARY interfaces */ #define LIFC_ALLZONES 0x08 /* Include all zones */ /* (must be issued from global zone) */ +#define LIFC_UNDER_IPMP 0x10 /* Include underlying IPMP interfaces */ #if defined(_SYSCALL32) @@ -582,6 +590,22 @@ struct lifsrcof32 { #endif /* _SYSCALL32 */ /* + * IPMP group information, for use with SIOCGLIFGROUPINFO. + */ +typedef struct lifgroupinfo { + char gi_grname[LIFGRNAMSIZ]; /* group name (set by caller) */ + char gi_grifname[LIFNAMSIZ]; /* IPMP meta-interface name */ + char gi_m4ifname[LIFNAMSIZ]; /* v4 mcast interface name */ + char gi_m6ifname[LIFNAMSIZ]; /* v6 mcast interface name */ + char gi_bcifname[LIFNAMSIZ]; /* v4 bcast interface name */ + boolean_t gi_v4; /* group is plumbed for v4 */ + boolean_t gi_v6; /* group is plumbed for v6 */ + uint_t gi_nv4; /* # of underlying v4 if's */ + uint_t gi_nv6; /* # of underlying v6 if's */ + uint_t gi_mactype; /* DLPI mac type of group */ +} lifgroupinfo_t; + +/* * OBSOLETE: Structure used in SIOCGIFCONF request. 
* Used to retrieve interface configuration * for machine (useful for programs which diff --git a/usr/src/uts/common/net/route.h b/usr/src/uts/common/net/route.h index 078971918d..3e4307f25e 100644 --- a/usr/src/uts/common/net/route.h +++ b/usr/src/uts/common/net/route.h @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -45,7 +45,6 @@ #ifndef _NET_ROUTE_H #define _NET_ROUTE_H -#pragma ident "%Z%%M% %I% %E% SMI" /* from UCB 8.5 (Berkeley) 2/8/95 */ #include <sys/tsol/label.h> @@ -254,6 +253,18 @@ typedef struct tsol_rtsecattr_s { #define RTSA_CIPSO 0x100 /* CIPSO protocol */ #define RTSA_SLRANGE (RTSA_MINSL|RTSA_MAXSL) +/* + * Routing socket options. + */ +#define RT_AWARE 0x0001 /* set awareness of hidden interfaces */ + +/* + * Supported RT_AWARE values. As a convenience, the bit-values here mirror + * the LIFC_* values. + */ +#define RTAW_DEFAULT 0x0000 /* unaware application */ +#define RTAW_UNDER_IPMP 0x0010 /* aware of underlying IPMP interfaces */ + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/netinet/in.h b/usr/src/uts/common/netinet/in.h index 782e2dc340..fc2c750ba7 100644 --- a/usr/src/uts/common/netinet/in.h +++ b/usr/src/uts/common/netinet/in.h @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -932,15 +932,7 @@ typedef struct ipsec_req { #define IP_BOUND_IF 0x41 /* bind socket to an ifindex */ #define IP_UNSPEC_SRC 0x42 /* use unspecified source address */ #define IP_BROADCAST_TTL 0x43 /* use specific TTL for broadcast */ - -/* - * IP_DONTFAILOVER_IF option is used to indicate that outbound unicast and - * multicast packets go through the specified interface, no load spreading, - * no failover. - * This is a Sun private interface. 
- */ -#define IP_DONTFAILOVER_IF 0x44 - +/* can be reused 0x44 */ #define IP_DHCPINIT_IF 0x45 /* accept all unicast DHCP traffic */ /* @@ -1258,15 +1250,6 @@ typedef struct { #define IPV6_BOUND_IF 0x41 /* bind to an ifindex */ #define IPV6_UNSPEC_SRC 0x42 /* source of packets set to */ /* unspecified (all zeros) */ -#define IPV6_BOUND_PIF 0x43 /* Bind to Physical interface */ - /* No load balancing or failover */ -/* - * IPV6_DONTFAILOVER_IF option is used to indicate that outbound unicast and - * multicast packets go through the specified interface, no load spreading, - * no failover. - * This is a Sun private interface. - */ -#define IPV6_DONTFAILOVER_IF 0x44 /* * Miscellaneous IPv6 constants. diff --git a/usr/src/uts/common/rpc/rpcib.c b/usr/src/uts/common/rpc/rpcib.c index d0edb2e8f0..aba7803131 100644 --- a/usr/src/uts/common/rpc/rpcib.c +++ b/usr/src/uts/common/rpc/rpcib.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -56,7 +56,6 @@ #include <sys/errno.h> #include <sys/kmem.h> #include <sys/debug.h> -#include <sys/systm.h> #include <sys/pathname.h> #include <sys/kstat.h> #include <sys/t_lock.h> @@ -67,47 +66,43 @@ #include <sys/callb.h> #include <sys/sunddi.h> #include <sys/sunndi.h> -#include <sys/sunldi.h> #include <sys/sdt.h> -#include <sys/dlpi.h> #include <sys/ib/ibtl/ibti.h> #include <rpc/rpc.h> #include <rpc/ib.h> - #include <sys/modctl.h> - -#include <sys/pathname.h> #include <sys/kstr.h> #include <sys/sockio.h> #include <sys/vnode.h> #include <sys/tiuser.h> #include <net/if.h> +#include <net/if_types.h> #include <sys/cred.h> #include <rpc/rpc_rdma.h> - #include <nfs/nfs.h> -#include <sys/kstat.h> #include <sys/atomic.h> #define NFS_RDMA_PORT 2050 -extern char *inet_ntop(int, const void *, char *, int); - +/* + * Convenience structure used by rpcib_get_ib_addresses() + */ +typedef struct rpcib_ipaddrs { + void *ri_list; /* pointer to list of addresses */ + uint_t ri_count; /* number of addresses in list */ + uint_t ri_size; /* size of ri_list in bytes */ +} rpcib_ipaddrs_t; /* * Prototype declarations for driver ops */ - static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t); static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t); -static int rpcib_is_ib_interface(char *); -static int rpcib_dl_info(ldi_handle_t, dl_info_ack_t *); -static int rpcib_do_ip_ioctl(int, int, caddr_t); -static boolean_t rpcib_get_ib_addresses(struct sockaddr_in *, - struct sockaddr_in6 *, uint_t *, uint_t *); -static uint_t rpcib_get_number_interfaces(void); +static boolean_t rpcib_rdma_capable_interface(struct lifreq *); +static int rpcib_do_ip_ioctl(int, int, void *); +static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *); static int rpcib_cache_kstat_update(kstat_t *, int); static void rib_force_cleanup(void *); @@ -147,9 +142,6 @@ static struct cb_ops rpcib_cbops = { nodev /* int 
(*cb_awrite)() */ }; - - - /* * Device options */ @@ -205,8 +197,7 @@ typedef struct cache_struct { avl_node_t avl_link; } cache_avl_struct_t; - -static uint64_t rib_total_buffers = 0; +static uint64_t rib_total_buffers = 0; uint64_t cache_limit = 100 * 1024 * 1024; static volatile uint64_t cache_allocation = 0; static uint64_t cache_watermark = 80 * 1024 * 1024; @@ -409,12 +400,10 @@ rpcib_t rpcib; */ int rib_debug = 0; - int _init(void) { - int error; - int ret; + int error; error = mod_install((struct modlinkage *)&rib_modlinkage); if (error != 0) { @@ -423,11 +412,7 @@ _init(void) */ return (error); } - ret = ldi_ident_from_mod(&rib_modlinkage, &rpcib_li); - if (ret != 0) - rpcib_li = NULL; mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); - return (0); } @@ -448,7 +433,6 @@ _fini() return (status); } mutex_destroy(&plugin_state_lock); - ldi_ident_release(rpcib_li); return (0); } @@ -458,7 +442,6 @@ _info(struct modinfo *modinfop) return (mod_info(&rib_modlinkage, modinfop)); } - /* * rpcib_getinfo() * Given the device number, return the devinfo pointer or the @@ -1822,124 +1805,100 @@ refresh: rdma_stat rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca) { - struct sockaddr_in *sin4, *sin4arr; - struct sockaddr_in6 *sin6, *sin6arr; - uint_t nif, nif4, nif6, i; + uint_t i; ibt_path_info_t path; ibt_status_t ibt_status; uint8_t num_paths_p; ibt_ip_path_attr_t ipattr; ibt_ip_addr_t dstip; ibt_path_ip_src_t srcip; - + rpcib_ipaddrs_t addrs4; + rpcib_ipaddrs_t addrs6; + struct sockaddr_in *sinp; + struct sockaddr_in6 *sin6p; + rdma_stat retval = RDMA_SUCCESS; *hca = NULL; - ASSERT(raddr->buf != NULL); bzero(&path, sizeof (ibt_path_info_t)); bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); bzero(&srcip, sizeof (ibt_path_ip_src_t)); - /* Obtain the source IP addresses for the system */ - nif = rpcib_get_number_interfaces(); - sin4arr = (struct sockaddr_in *) - kmem_zalloc(sizeof (struct sockaddr_in) * nif, KM_SLEEP); - sin6arr = (struct 
sockaddr_in6 *) - kmem_zalloc(sizeof (struct sockaddr_in6) * nif, KM_SLEEP); - - (void) rpcib_get_ib_addresses(sin4arr, sin6arr, &nif4, &nif6); - - /* Are there really any IB interfaces available */ - if (nif4 == 0 && nif6 == 0) { - kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif); - kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif); - return (RDMA_FAILED); + if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || + (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { + retval = RDMA_FAILED; + goto done; } /* Prep the destination address */ switch (addr_type) { case AF_INET: - sin4 = (struct sockaddr_in *)raddr->buf; + sinp = (struct sockaddr_in *)raddr->buf; dstip.family = AF_INET; - dstip.un.ip4addr = sin4->sin_addr.s_addr; + dstip.un.ip4addr = sinp->sin_addr.s_addr; + sinp = addrs4.ri_list; - for (i = 0; i < nif4; i++) { + for (i = 0; i < addrs4.ri_count; i++) { num_paths_p = 0; ipattr.ipa_dst_ip = &dstip; ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; ipattr.ipa_ndst = 1; ipattr.ipa_max_paths = 1; ipattr.ipa_src_ip.family = dstip.family; - ipattr.ipa_src_ip.un.ip4addr = - sin4arr[i].sin_addr.s_addr; + ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr; ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, - IBT_PATH_NO_FLAGS, - &ipattr, - &path, - &num_paths_p, + IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p, &srcip); if (ibt_status == IBT_SUCCESS && num_paths_p != 0 && path.pi_hca_guid == rib_stat->hca->hca_guid) { *hca = rib_stat->hca; - - kmem_free(sin4arr, - sizeof (struct sockaddr_in) * nif); - kmem_free(sin6arr, - sizeof (struct sockaddr_in6) * nif); - - return (RDMA_SUCCESS); + goto done; } } + retval = RDMA_FAILED; break; case AF_INET6: - sin6 = (struct sockaddr_in6 *)raddr->buf; + sin6p = (struct sockaddr_in6 *)raddr->buf; dstip.family = AF_INET6; - dstip.un.ip6addr = sin6->sin6_addr; + dstip.un.ip6addr = sin6p->sin6_addr; + sin6p = addrs6.ri_list; - for (i = 0; i < nif6; i++) { + for (i = 0; i < addrs6.ri_count; i++) { num_paths_p = 0; 
ipattr.ipa_dst_ip = &dstip; ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; ipattr.ipa_ndst = 1; ipattr.ipa_max_paths = 1; ipattr.ipa_src_ip.family = dstip.family; - ipattr.ipa_src_ip.un.ip6addr = sin6arr[i].sin6_addr; + ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr; ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, - IBT_PATH_NO_FLAGS, - &ipattr, - &path, - &num_paths_p, + IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p, &srcip); if (ibt_status == IBT_SUCCESS && num_paths_p != 0 && path.pi_hca_guid == rib_stat->hca->hca_guid) { *hca = rib_stat->hca; - - kmem_free(sin4arr, - sizeof (struct sockaddr_in) * nif); - kmem_free(sin6arr, - sizeof (struct sockaddr_in6) * nif); - - return (RDMA_SUCCESS); + goto done; } } - + retval = RDMA_FAILED; break; default: - kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif); - kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif); - return (RDMA_INVAL); + retval = RDMA_INVAL; + break; } - - kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif); - kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif); - return (RDMA_FAILED); +done: + if (addrs4.ri_size > 0) + kmem_free(addrs4.ri_list, addrs4.ri_size); + if (addrs6.ri_size > 0) + kmem_free(addrs6.ri_list, addrs6.ri_size); + return (retval); } /* @@ -4668,123 +4627,31 @@ rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle) return (RDMA_SUCCESS); } - /* - * Return 0 if the interface is IB. - * Return error (>0) if any error is encountered during processing. - * Return -1 if the interface is not IB and no error. + * Check if the IP interface named by `lifrp' is RDMA-capable. 
*/ -#define isalpha(ch) (((ch) >= 'a' && (ch) <= 'z') || \ - ((ch) >= 'A' && (ch) <= 'Z')) -static int -rpcib_is_ib_interface(char *name) +static boolean_t +rpcib_rdma_capable_interface(struct lifreq *lifrp) { + char ifname[LIFNAMSIZ]; + char *cp; - char dev_path[MAXPATHLEN]; - char devname[MAXNAMELEN]; - ldi_handle_t lh; - dl_info_ack_t info; - int ret = 0; - int i; + if (lifrp->lifr_type == IFT_IB) + return (B_TRUE); /* - * ibd devices are only style 2 devices - * so we will open only style 2 devices - * by ignoring the ppa + * Strip off the logical interface portion before getting + * intimate with the name. */ + (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); + if ((cp = strchr(ifname, ':')) != NULL) + *cp = '\0'; - i = strlen(name) - 1; - while ((i >= 0) && (!isalpha(name[i]))) i--; - - if (i < 0) { - /* Invalid interface name, no alphabet */ - return (-1); - } - - (void) strncpy(devname, name, i + 1); - devname[i + 1] = '\0'; - - if (strcmp("lo", devname) == 0) { - /* - * loopback interface not rpc/rdma capable - */ - return (-1); - } - - (void) strncpy(dev_path, "/dev/", MAXPATHLEN); - if (strlcat(dev_path, devname, MAXPATHLEN) >= MAXPATHLEN) { - /* string overflow */ - return (-1); - } - - ret = ldi_open_by_name(dev_path, FREAD|FWRITE, kcred, &lh, rpcib_li); - if (ret != 0) { - return (ret); - } - ret = rpcib_dl_info(lh, &info); - (void) ldi_close(lh, FREAD|FWRITE, kcred); - if (ret != 0) { - return (ret); - } - - if (info.dl_mac_type != DL_IB) { - return (-1); - } - - return (0); + return (strcmp("lo0", ifname) == 0); } static int -rpcib_dl_info(ldi_handle_t lh, dl_info_ack_t *info) -{ - dl_info_req_t *info_req; - union DL_primitives *dl_prim; - mblk_t *mp; - k_sigset_t smask; - int error; - - if ((mp = allocb(sizeof (dl_info_req_t), BPRI_MED)) == NULL) { - return (ENOMEM); - } - - mp->b_datap->db_type = M_PROTO; - - info_req = (dl_info_req_t *)(uintptr_t)mp->b_wptr; - mp->b_wptr += sizeof (dl_info_req_t); - info_req->dl_primitive = DL_INFO_REQ; - - 
sigintr(&smask, 0); - if ((error = ldi_putmsg(lh, mp)) != 0) { - sigunintr(&smask); - return (error); - } - if ((error = ldi_getmsg(lh, &mp, (timestruc_t *)NULL)) != 0) { - sigunintr(&smask); - return (error); - } - sigunintr(&smask); - - dl_prim = (union DL_primitives *)(uintptr_t)mp->b_rptr; - switch (dl_prim->dl_primitive) { - case DL_INFO_ACK: - if (((uintptr_t)mp->b_wptr - (uintptr_t)mp->b_rptr) < - sizeof (dl_info_ack_t)) { - error = -1; - } else { - *info = *(dl_info_ack_t *)(uintptr_t)mp->b_rptr; - error = 0; - } - break; - default: - error = -1; - break; - } - - freemsg(mp); - return (error); -} -static int -rpcib_do_ip_ioctl(int cmd, int len, caddr_t arg) +rpcib_do_ip_ioctl(int cmd, int len, void *arg) { vnode_t *kvp, *vp; TIUSER *tiptr; @@ -4792,23 +4659,22 @@ rpcib_do_ip_ioctl(int cmd, int len, caddr_t arg) k_sigset_t smask; int err = 0; - if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, - &kvp) == 0) { - if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE, + if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) { + if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE, &tiptr, CRED()) == 0) { - vp = tiptr->fp->f_vnode; - } else { - VN_RELE(kvp); - return (EPROTO); + vp = tiptr->fp->f_vnode; + } else { + VN_RELE(kvp); + return (EPROTO); } } else { - return (EPROTO); + return (EPROTO); } iocb.ic_cmd = cmd; iocb.ic_timout = 0; iocb.ic_len = len; - iocb.ic_dp = arg; + iocb.ic_dp = (caddr_t)arg; sigintr(&smask, 0); err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); sigunintr(&smask); @@ -4817,65 +4683,89 @@ rpcib_do_ip_ioctl(int cmd, int len, caddr_t arg) return (err); } -static uint_t rpcib_get_number_interfaces(void) { -uint_t numifs; - if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (uint_t), (caddr_t)&numifs)) { - return (0); +/* + * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. + * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. 
+ */ +static int +rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) +{ + int err; + struct lifnum lifn; + + bzero(&lifn, sizeof (struct lifnum)); + lifn.lifn_family = AF_UNSPEC; + + err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn); + if (err != 0) + return (err); + + /* + * Pad the interface count to account for additional interfaces that + * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. + */ + lifn.lifn_count += 4; + + bzero(lifcp, sizeof (struct lifconf)); + lifcp->lifc_family = AF_UNSPEC; + lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); + lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); + + err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); + if (err != 0) { + kmem_free(lifcp->lifc_buf, *bufsizep); + return (err); } - return (numifs); + return (0); } static boolean_t -rpcib_get_ib_addresses( - struct sockaddr_in *saddr4, - struct sockaddr_in6 *saddr6, - uint_t *number4, - uint_t *number6) +rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6) { - int numifs; - struct ifconf kifc; - struct ifreq *ifr; - boolean_t ret = B_FALSE; + uint_t i, nifs; + uint_t bufsize; + struct lifconf lifc; + struct lifreq *lifrp; + struct sockaddr_in *sinp; + struct sockaddr_in6 *sin6p; - *number4 = 0; - *number6 = 0; + bzero(addrs4, sizeof (rpcib_ipaddrs_t)); + bzero(addrs6, sizeof (rpcib_ipaddrs_t)); - if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (int), (caddr_t)&numifs)) { - return (ret); + if (rpcib_do_lifconf(&lifc, &bufsize) != 0) + return (B_FALSE); + + if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) { + kmem_free(lifc.lifc_buf, bufsize); + return (B_FALSE); } - kifc.ifc_len = numifs * sizeof (struct ifreq); - kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP); + /* + * Worst case is that all of the addresses are IB-capable and have + * the same address family, so size our buffers accordingly. 
+ */ + addrs4->ri_size = nifs * sizeof (struct sockaddr_in); + addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP); + addrs6->ri_size = nifs * sizeof (struct sockaddr_in6); + addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP); - if (rpcib_do_ip_ioctl(SIOCGIFCONF, sizeof (struct ifconf), - (caddr_t)&kifc)) { - goto done; - } + for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { + if (!rpcib_rdma_capable_interface(lifrp)) + continue; - ifr = kifc.ifc_req; - for (numifs = kifc.ifc_len / sizeof (struct ifreq); - numifs > 0; numifs--, ifr++) { - struct sockaddr_in *sin4; - struct sockaddr_in6 *sin6; - - if ((rpcib_is_ib_interface(ifr->ifr_name) == 0)) { - sin4 = (struct sockaddr_in *)(uintptr_t)&ifr->ifr_addr; - sin6 = (struct sockaddr_in6 *)(uintptr_t)&ifr->ifr_addr; - if (sin4->sin_family == AF_INET) { - saddr4[*number4] = *(struct sockaddr_in *) - (uintptr_t)&ifr->ifr_addr; - *number4 = *number4 + 1; - } else if (sin6->sin6_family == AF_INET6) { - saddr6[*number6] = *(struct sockaddr_in6 *) - (uintptr_t)&ifr->ifr_addr; - *number6 = *number6 + 1; - } + if (lifrp->lifr_addr.ss_family == AF_INET) { + sinp = addrs4->ri_list; + bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++], + sizeof (struct sockaddr_in)); + } else if (lifrp->lifr_addr.ss_family == AF_INET6) { + sin6p = addrs6->ri_list; + bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++], + sizeof (struct sockaddr_in6)); } } - ret = B_TRUE; -done: - kmem_free(kifc.ifc_buf, kifc.ifc_len); - return (ret); + + kmem_free(lifc.lifc_buf, bufsize); + return (B_TRUE); } /* ARGSUSED */ diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index aa01ddeed6..9f9c95c78d 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -266,13 +266,16 @@ typedef struct dl_ipnetinfo { #define DL_OTHER 0x09 /* Any other medium not listed above */ /* * Private media types. These must be above the value 0x80000000 as - * stated in the DLPI specification. + * stated in the DLPI specification. NOTE: The SUNW_ prefix is used + * to denote synthetic DLPI types that are internal to the stack. */ #define DL_IPV4 0x80000001ul /* IPv4 Tunnel Link */ #define DL_IPV6 0x80000002ul /* IPv6 Tunnel Link */ #define SUNW_DL_VNI 0x80000003ul /* Virtual network interface */ #define DL_WIFI 0x80000004ul /* IEEE 802.11 */ #define DL_IPNET 0x80000005ul /* ipnet(7D) link */ +#define SUNW_DL_IPMP 0x80000006ul /* IPMP stub interface */ + /* * DLPI provider service supported. * These must be allowed to be bitwise-OR for dl_service_mode in diff --git a/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h b/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h index e421c0b9c0..7bb54ad12e 100644 --- a/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h +++ b/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h @@ -19,34 +19,23 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_IB_MGT_IBCM_IBCM_ARP_H #define _SYS_IB_MGT_IBCM_IBCM_ARP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif - #include <sys/ib/mgt/ibcm/ibcm_impl.h> #include <sys/modhash.h> #include <sys/ib/clients/ibd/ibd.h> #include <sys/strsun.h> -#include <sys/strsubr.h> #include <sys/socket.h> #include <sys/stat.h> /* for S_IFCHR */ -#include <inet/common.h> -#include <inet/ip.h> -#include <inet/ip_if.h> -#include <inet/ip_ire.h> -#include <inet/ip_rts.h> -#include <sys/dlpi.h> -#include <net/route.h> /* * IPoIB addr lookup completion function @@ -103,7 +92,6 @@ typedef struct ibcm_arp_streams_s { /* GID to IP-Addr and Ip-Addr to GID look-up functions. 
*/ -#define IBCM_ARP_IBD_NAME "ibd" #define IBCM_ARP_IBD_INSTANCES 4 typedef struct ibcm_arp_ip_s { diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h index 593505a426..4e3b2b5778 100644 --- a/usr/src/uts/common/sys/socket.h +++ b/usr/src/uts/common/sys/socket.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -156,12 +156,10 @@ struct so_snd_bufinfo { /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x1010 /* access rights (array of int) */ - #define SO_SECATTR 0x1011 /* socket's security attributes */ #define SCM_UCRED 0x1012 /* sender's ucred */ #define SO_TIMESTAMP 0x1013 /* socket-level timestamp option */ #define SCM_TIMESTAMP SO_TIMESTAMP /* socket control message timestamp */ - #define SO_ALLZONES 0x1014 /* bind in all zones */ #define SO_EXCLBIND 0x1015 /* exclusive binding */ @@ -203,9 +201,12 @@ struct linger { }; /* - * Level number for (get/set)sockopt() to apply to socket itself. + * Levels for (get/set)sockopt() that don't apply to a specific protocol. */ #define SOL_SOCKET 0xffff /* options for socket level */ +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) +#define SOL_ROUTE 0xfffe /* options for routing socket level */ +#endif /* * Address families. diff --git a/usr/src/uts/common/sys/sockio.h b/usr/src/uts/common/sys/sockio.h index 9e107ff3ef..0ef5394fea 100644 --- a/usr/src/uts/common/sys/sockio.h +++ b/usr/src/uts/common/sys/sockio.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -175,7 +175,7 @@ extern "C" { #define SIOCSLIFNETMASK _IOW('i', 126, struct lifreq) /* set subnetmask */ #define SIOCGLIFMETRIC _IOWR('i', 127, struct lifreq) /* get if metric */ #define SIOCSLIFMETRIC _IOW('i', 128, struct lifreq) /* set if metric */ -#define SIOCSLIFNAME _IOWR('i', 129, struct lifreq) /* set interface name */ +#define SIOCSLIFNAME _IOWR('i', 129, struct lifreq) /* set interface name */ #define SIOCGLIFNUM _IOWR('i', 130, struct lifnum) /* get number of ifs */ #define SIOCGLIFMUXID _IOWR('i', 131, struct lifreq) /* get if muxid */ #define SIOCSLIFMUXID _IOW('i', 132, struct lifreq) /* set if muxid */ @@ -223,22 +223,21 @@ extern "C" { #define SIOCLIPSECONFIG _IOW('i', 152, 0) /* List Policy */ /* - * IOCTLS for implementing load balancing and failover within IP. + * 153 can be reused (was consolidation-private SIOCLIFFAILOVER). */ -#define SIOCLIFFAILOVER _IOW('i', 153, struct lifreq) /* Failover */ -#define SIOCLIFFAILBACK _IOW('i', 154, struct lifreq) /* Failback */ -#define SIOCSLIFGROUPNAME _IOW('i', 155, struct lifreq) /* Group interfaces */ -#define SIOCGLIFGROUPNAME _IOWR('i', 156, struct lifreq) /* Get group name */ -#define SIOCGLIFOINDEX _IOWR('i', 157, struct lifreq) /* get orig if index */ /* - * Leave 158 - 160 unused; used to be SIOC*IFARP ioctls. + * IP Multipathing ioctls. */ +#define SIOCGLIFBINDING _IOWR('i', 154, struct lifreq) +#define SIOCSLIFGROUPNAME _IOW('i', 155, struct lifreq) +#define SIOCGLIFGROUPNAME _IOWR('i', 156, struct lifreq) +#define SIOCGLIFGROUPINFO _IOWR('i', 157, struct lifgroupinfo) /* - * IOCTL for implementing load balancing and failover within IP. + * Leave 158 - 160 unused; used to be SIOC*IFARP ioctls. + * However, 161 can be reused (was consolidation-private SIOCSLIFOINDEX). */ -#define SIOCSLIFOINDEX _IOWR('i', 161, struct lifreq) /* set orig if index */ /* * IOCTLS which provide an interface to the IPv6 address selection policy. 
@@ -309,10 +308,8 @@ extern "C" { #define SIOCSIPMSFILTER _IOW('i', 181, 0) /* - * IOCTL for implementing "disable FAILBACK" IPMP configuration. + * 182 can be reused (was consolidation-private SIOCSIPMPFAILBACK). */ -#define SIOCSIPMPFAILBACK _IOW('i', 182, int) /* enable/disable */ - /* FAILBACK */ #define SIOCSENABLESDP _IOWR('i', 183, int) /* Enable SDP */ diff --git a/usr/src/uts/common/sys/sysevent/eventdefs.h b/usr/src/uts/common/sys/sysevent/eventdefs.h index ac21686e84..dcf36f748c 100644 --- a/usr/src/uts/common/sys/sysevent/eventdefs.h +++ b/usr/src/uts/common/sys/sysevent/eventdefs.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -179,6 +179,8 @@ extern "C" { /* Interface within an IPMP group has changed state or type */ #define ESC_IPMP_IF_CHANGE "ESC_ipmp_if_change" +/* IPMP probe has changed state */ +#define ESC_IPMP_PROBE_STATE "ESC_ipmp_probe_state" /* * EC_DEV_ADD and EC_DEV_REMOVE subclass definitions - supporting attributes diff --git a/usr/src/uts/common/sys/sysevent/ipmp.h b/usr/src/uts/common/sys/sysevent/ipmp.h index 137fa918cd..ba39a5bb2b 100644 --- a/usr/src/uts/common/sys/sysevent/ipmp.h +++ b/usr/src/uts/common/sys/sysevent/ipmp.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,16 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SYSEVENT_IPMP_H #define _SYS_SYSEVENT_IPMP_H -#pragma ident "%Z%%M% %I% %E% SMI" - - /* * IPMP sysevent definitions. Note that all of these definitions are * Sun-private and are subject to change at any time. @@ -39,13 +35,18 @@ extern "C" { #endif +/* + * Event channel associated with these events + */ +#define IPMP_EVENT_CHAN "com.sun:ipmp:events" /* * Event type EC_IPMP/ESC_IPMP_GROUP_STATE event schema * * Event Class - EC_IPMP * Event Sub-Class - ESC_IPMP_GROUP_STATE - * Event Publisher - SUNW:usr:in.mpathd + * Event Vendor - com.sun + * Event Publisher - in.mpathd * * Attribute Name - IPMP_EVENT_VERSION * Attribute Type - SE_DATA_TYPE_UINT32 @@ -70,18 +71,20 @@ extern "C" { #define IPMP_GROUP_STATE "ipmp_group_state" typedef enum { - IPMP_GROUP_OK, /* at least one interface in group is ok */ - IPMP_GROUP_FAILED /* all interfaces in the group have failed */ + IPMP_GROUP_OK, /* all interfaces in the group are ok */ + IPMP_GROUP_FAILED, /* all interfaces in the group are unusable */ + IPMP_GROUP_DEGRADED /* some interfaces in the group are unusable */ } ipmp_group_state_t; -#define IPMP_EVENT_CUR_VERSION 1 +#define IPMP_EVENT_CUR_VERSION 2 /* * Event type EC_IPMP/ESC_IPMP_GROUP_CHANGE event schema * * Event Class - EC_IPMP * Event Sub-Class - ESC_IPMP_GROUP_CHANGE - * Event Publisher - SUNW:usr:in.mpathd + * Event Vendor - com.sun + * Event Publisher - in.mpathd * * Attribute Name - IPMP_GROUP_NAME * Attribute Type - SE_DATA_TYPE_STRING @@ -113,7 +116,8 @@ typedef enum { * * Event Class - EC_IPMP * Event Sub-Class - ESC_IPMP_GROUP_MEMBER_CHANGE - * Event Publisher - SUNW:usr:in.mpathd + * Event Vendor - com.sun + * Event Publisher - in.mpathd * * Attribute Name - IPMP_GROUP_NAME * Attribute Type - SE_DATA_TYPE_STRING @@ -171,7 +175,8 @@ typedef enum { * * Event Class - EC_IPMP * Event Sub-Class - ESC_IPMP_IF_CHANGE - * Event Publisher - SUNW:usr:in.mpathd + * Event Vendor - com.sun + * Event 
Publisher - in.mpathd * * Attribute Name - IPMP_GROUP_NAME * Attribute Type - SE_DATA_TYPE_STRING @@ -198,6 +203,75 @@ typedef enum { * Attribute Value - <if-type> */ +#define IPMP_PROBE_ID "ipmp_probe_id" +#define IPMP_PROBE_STATE "ipmp_probe_state" +#define IPMP_PROBE_START_TIME "ipmp_probe_start_time" +#define IPMP_PROBE_SENT_TIME "ipmp_probe_sent_time" +#define IPMP_PROBE_ACKRECV_TIME "ipmp_probe_ackrecv_time" +#define IPMP_PROBE_ACKPROC_TIME "ipmp_probe_ackproc_time" +#define IPMP_PROBE_TARGET "ipmp_probe_target" +#define IPMP_PROBE_TARGET_RTTAVG "ipmp_probe_target_rttavg" +#define IPMP_PROBE_TARGET_RTTDEV "ipmp_probe_target_rttdev" + +typedef enum { + IPMP_PROBE_SENT, /* the probe has been sent */ + IPMP_PROBE_ACKED, /* the probe has been acked */ + IPMP_PROBE_LOST /* the probe has been lost */ +} ipmp_probe_state_t; + +/* + * Event type EC_IPMP/ESC_IPMP_PROBE_STATE event schema + * + * Event Class - EC_IPMP + * Event Sub-Class - ESC_IPMP_PROBE_STATE + * Event Vendor - com.sun + * Event Publisher - in.mpathd + * + * Attribute Name - IPMP_PROBE_ID + * Attribute Type - SE_DATA_TYPE_UINT32 + * Attribute Value - <probe-id> + * + * Attribute Name - IPMP_EVENT_VERSION + * Attribute Type - SE_DATA_TYPE_UINT32 + * Attribute Value - <version> + * + * Attribute Name - IPMP_IF_NAME + * Attribute Type - SE_DATA_TYPE_STRING + * Attribute Value - <if-name> + * + * Attribute Name - IPMP_PROBE_STATE + * Attribute Type - SE_DATA_TYPE_UINT32 + * Attribute Value - <probe-state> + * + * Attribute Name - IPMP_PROBE_START_TIME + * Attribute Type - SE_DATA_TYPE_TIME + * Attribute Value - <probe-start-time> + * + * Attribute Name - IPMP_PROBE_SENT_TIME + * Attribute Type - SE_DATA_TYPE_TIME + * Attribute Value - <probe-sent-time> + * + * Attribute Name - IPMP_PROBE_ACKRECV_TIME + * Attribute Type - SE_DATA_TYPE_TIME + * Attribute Value - <probe-ackrecv-time> + * + * Attribute Name - IPMP_PROBE_ACKPROC_TIME + * Attribute Type - SE_DATA_TYPE_TIME + * Attribute Value - 
<probe-ackproc-time> + * + * Attribute Name - IPMP_PROBE_TARGET + * Attribute Type - SE_DATA_TYPE_BYTES + * Attribute Value - <probe-target-ip> + * + * Attribute Name - IPMP_PROBE_TARGET_RTTAVG + * Attribute Type - SE_DATA_TYPE_UINT32 + * Attribute Value - <probe-target-rttavg> + * + * Attribute Name - IPMP_PROBE_TARGET_RTTDEV + * Attribute Type - SE_DATA_TYPE_UINT32 + * Attribute Value - <probe-target-rttdev> + */ + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/intel/Makefile.intel.shared b/usr/src/uts/intel/Makefile.intel.shared index 9585034efb..b25c2fb0cc 100644 --- a/usr/src/uts/intel/Makefile.intel.shared +++ b/usr/src/uts/intel/Makefile.intel.shared @@ -216,6 +216,7 @@ DRV_KMODS += cryptoadm DRV_KMODS += dda DRV_KMODS += devinfo DRV_KMODS += dld +DRV_KMODS += dlpistub DRV_KMODS += dmd DRV_KMODS_32 += dnet DRV_KMODS += dump @@ -321,7 +322,6 @@ DRV_KMODS += udp6 DRV_KMODS += ucode DRV_KMODS += ural DRV_KMODS += vgatext -DRV_KMODS += vni DRV_KMODS += vnic DRV_KMODS += vscan DRV_KMODS += wc diff --git a/usr/src/uts/intel/vni/Makefile b/usr/src/uts/intel/dlpistub/Makefile index aa32704615..53cf2092a7 100644 --- a/usr/src/uts/intel/vni/Makefile +++ b/usr/src/uts/intel/dlpistub/Makefile @@ -18,18 +18,11 @@ # # CDDL HEADER END # -# -# uts/intel/vni/Makefile -# -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# -# This makefile drives the production of the vni streams kernel -# module. -# -# intel architecture dependent +# This makefile drives the production of the dlpistub STREAMS module. +# intel architecture dependent # # @@ -40,11 +33,11 @@ UTSBASE = ../.. # # Define the module and object file sets. 
# -MODULE = vni -OBJECTS = $(VNI_OBJS:%=$(OBJS_DIR)/%) -LINTS = $(VNI_OBJS:%.o=$(LINTS_DIR)/%.ln) +MODULE = dlpistub +OBJECTS = $(DLPISTUB_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(DLPISTUB_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) -CONF_SRCDIR = $(UTSBASE)/common/inet/vni +CONF_SRCDIR = $(UTSBASE)/common/inet/dlpistub # # Include common rules. diff --git a/usr/src/uts/intel/ip/ip.global-objs.debug64 b/usr/src/uts/intel/ip/ip.global-objs.debug64 index 3972f1b4ec..d89224677b 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.debug64 +++ b/usr/src/uts/intel/ip/ip.global-objs.debug64 @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -161,6 +161,9 @@ ipinfov4 ipinfov6 iplrinit iplwinit +ipmp_aract_template +ipmp_ardeact_template +ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache diff --git a/usr/src/uts/intel/ip/ip.global-objs.obj64 b/usr/src/uts/intel/ip/ip.global-objs.obj64 index f6a97be29b..0e58fdc219 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.obj64 +++ b/usr/src/uts/intel/ip/ip.global-objs.obj64 @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# @@ -160,6 +160,9 @@ ipinfov4 ipinfov6 iplrinit iplwinit +ipmp_aract_template +ipmp_ardeact_template +ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache diff --git a/usr/src/uts/intel/os/name_to_major b/usr/src/uts/intel/os/name_to_major index 3d58c314b7..eb70695abd 100644 --- a/usr/src/uts/intel/os/name_to_major +++ b/usr/src/uts/intel/os/name_to_major @@ -102,7 +102,7 @@ kmdb 171 sctp 172 sctp6 173 scsi_vhci 174 -vni 175 +dlpistub 175 cpuid 176 bmc 177 dld 178 diff --git a/usr/src/uts/sparc/Makefile.sparc.shared b/usr/src/uts/sparc/Makefile.sparc.shared index 3723be6f32..39fba551aa 100644 --- a/usr/src/uts/sparc/Makefile.sparc.shared +++ b/usr/src/uts/sparc/Makefile.sparc.shared @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This makefile contains the common definitions for all sparc @@ -216,7 +216,8 @@ DRV_KMODS += ippctl sctp sctp6 DRV_KMODS += dld DRV_KMODS += ipf DRV_KMODS += rpcib -DRV_KMODS += vni vnic +DRV_KMODS += dlpistub +DRV_KMODS += vnic DRV_KMODS += xge DRV_KMODS += rds DRV_KMODS += chxge diff --git a/usr/src/uts/sparc/vni/Makefile b/usr/src/uts/sparc/dlpistub/Makefile index 6a96edc17e..548361738a 100644 --- a/usr/src/uts/sparc/vni/Makefile +++ b/usr/src/uts/sparc/dlpistub/Makefile @@ -18,18 +18,11 @@ # # CDDL HEADER END # -# -# uts/sparc/vni/Makefile -# -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# -# This makefile drives the production of the vni streams kernel -# module. -# -# sparc architecture dependent +# This makefile drives the production of the dlpistub STREAMS module. +# sparc architecture dependent # # @@ -40,11 +33,11 @@ UTSBASE = ../.. # # Define the module and object file sets. 
# -MODULE = vni -OBJECTS = $(VNI_OBJS:%=$(OBJS_DIR)/%) -LINTS = $(VNI_OBJS:%.o=$(LINTS_DIR)/%.ln) +MODULE = dlpistub +OBJECTS = $(DLPISTUB_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(DLPISTUB_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) -CONF_SRCDIR = $(UTSBASE)/common/inet/vni +CONF_SRCDIR = $(UTSBASE)/common/inet/dlpistub # # Include common rules. diff --git a/usr/src/uts/sparc/ip/ip.global-objs.debug64 b/usr/src/uts/sparc/ip/ip.global-objs.debug64 index 279bd92d0b..6606b472bf 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.debug64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.debug64 @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -161,6 +161,9 @@ ipinfov4 ipinfov6 iplrinit iplwinit +ipmp_aract_template +ipmp_ardeact_template +ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache diff --git a/usr/src/uts/sparc/ip/ip.global-objs.obj64 b/usr/src/uts/sparc/ip/ip.global-objs.obj64 index 4f4bc3e376..89d40afbbb 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.obj64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.obj64 @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -160,6 +160,9 @@ ipinfov4 ipinfov6 iplrinit iplwinit +ipmp_aract_template +ipmp_ardeact_template +ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache diff --git a/usr/src/uts/sparc/os/name_to_major b/usr/src/uts/sparc/os/name_to_major index ff58cf5113..9702d00ad7 100644 --- a/usr/src/uts/sparc/os/name_to_major +++ b/usr/src/uts/sparc/os/name_to_major @@ -182,7 +182,7 @@ pic16f819 233 kmdb 234 sctp 235 sctp6 236 -vni 237 +dlpistub 237 cpuid 238 did 239 ntwdt 240 |