diff options
Diffstat (limited to 'usr')
168 files changed, 17264 insertions, 18066 deletions
diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c index 34bb772632..5a4779cfa5 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -133,6 +133,7 @@ main(int argc, char **argv) boolean_t is_verbose; int ipc_fd; int c; + int aware = RTAW_UNDER_IPMP; struct rlimit rl; debug_level = df_get_int("", B_FALSE, DF_DEBUG_LEVEL); @@ -301,6 +302,17 @@ main(int argc, char **argv) dhcpmsg(MSG_ERR, "cannot open routing socket"); return (EXIT_FAILURE); } + + /* + * We're IPMP-aware and can manage IPMP test addresses, so issue + * RT_AWARE to get routing socket messages for interfaces under IPMP. + */ + if (setsockopt(rtsock_fd, SOL_ROUTE, RT_AWARE, &aware, + sizeof (aware)) == -1) { + dhcpmsg(MSG_ERR, "cannot set RT_AWARE on routing socket"); + return (EXIT_FAILURE); + } + if (iu_register_event(eh, rtsock_fd, POLLIN, rtsock_event, 0) == -1) { dhcpmsg(MSG_ERR, "cannot register routing socket for messages"); return (EXIT_FAILURE); @@ -1182,7 +1194,7 @@ check_lif(dhcp_lif_t *lif, const struct ifa_msghdr *ifam, int msglen) lif->lif_name); lif_mark_decline(lif, "duplicate address"); close_ip_lif(lif); - (void) open_ip_lif(lif, INADDR_ANY); + (void) open_ip_lif(lif, INADDR_ANY, B_TRUE); } dad_wait = lif->lif_dad_wait; diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c index 4637ecc346..6cfce9f0a9 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c @@ -19,14 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
* * BOUND state of the DHCP client state machine. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/socket.h> #include <sys/types.h> #include <string.h> @@ -358,7 +356,8 @@ dhcp_bound_complete(dhcp_smach_t *dsmp) lif = dsmp->dsm_lif; if (router_list != NULL && (router_list->len % sizeof (ipaddr_t)) == 0 && - strchr(lif->lif_name, ':') == NULL) { + strchr(lif->lif_name, ':') == NULL && + !lif->lif_pif->pif_under_ipmp) { dsmp->dsm_nrouters = router_list->len / sizeof (ipaddr_t); dsmp->dsm_routers = malloc(router_list->len); diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c index 0cfdad40e3..5d2d5fb99e 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -76,6 +76,7 @@ insert_pif(const char *pname, boolean_t isv6, int *error) { dhcp_pif_t *pif; struct lifreq lifr; + lifgroupinfo_t lifgr; dlpi_handle_t dh = NULL; int fd = isv6 ? v6_sock_fd : v4_sock_fd; @@ -127,12 +128,60 @@ insert_pif(const char *pname, boolean_t isv6, int *error) } /* - * For IPv4, use DLPI to determine the hardware type, hardware - * address, and hardware address length. + * Check if the pif is in an IPMP group. Interfaces using IPMP don't + * have dedicated hardware addresses, and get their hardware type from + * the SIOCGLIFGROUPINFO ioctl rather than DLPI. 
*/ - if (!isv6) { - int rc; - dlpi_info_t dlinfo; + if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) == -1) { + *error = DHCP_IPC_E_INT; + dhcpmsg(MSG_ERR, "insert_pif: SIOCGLIFGROUPNAME for %s", pname); + goto failure; + } + + if (lifr.lifr_groupname[0] != '\0') { + (void) strlcpy(lifgr.gi_grname, lifr.lifr_groupname, + LIFGRNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPINFO, &lifgr) == -1) { + *error = DHCP_IPC_E_INT; + dhcpmsg(MSG_ERR, "insert_pif: SIOCGLIFGROUPINFO for %s", + lifgr.gi_grname); + goto failure; + } + + pif->pif_hwtype = dlpi_arptype(lifgr.gi_mactype); + pif->pif_under_ipmp = (strcmp(pname, lifgr.gi_grifname) != 0); + (void) strlcpy(pif->pif_grifname, lifgr.gi_grifname, LIFNAMSIZ); + + /* + * For IPMP underlying interfaces, stash the interface index + * of the IPMP meta-interface; we'll use it to send/receive + * traffic. This is both necessary (since IP_BOUND_IF for + * non-unicast traffic won't work on underlying interfaces) + * and preferred (since a test address lease will be able to + * be maintained as long as another interface in the group is + * still functioning). + */ + if (pif->pif_under_ipmp) { + (void) strlcpy(lifr.lifr_name, pif->pif_grifname, + LIFNAMSIZ); + + if (ioctl(fd, SIOCGLIFINDEX, &lifr) == -1) { + *error = DHCP_IPC_E_INT; + dhcpmsg(MSG_ERR, "insert_pif: SIOCGLIFINDEX " + "for %s", lifr.lifr_name); + goto failure; + } + pif->pif_grindex = lifr.lifr_index; + } + } + + /* + * For IPv4, if the hardware type is still unknown, use DLPI to + * determine it, the hardware address, and hardware address length. 
+ */ + if (!isv6 && pif->pif_hwtype == 0) { + int rc; + dlpi_info_t dlinfo; if ((rc = dlpi_open(pname, &dh, 0)) != DLPI_SUCCESS) { dhcpmsg(MSG_ERROR, "insert_pif: dlpi_open: %s", @@ -661,11 +710,12 @@ verify_lif(const dhcp_lif_t *lif) boolean_t isv6; int fd; struct lifreq lifr; + dhcp_pif_t *pif = lif->lif_pif; (void) memset(&lifr, 0, sizeof (struct lifreq)); (void) strlcpy(lifr.lifr_name, lif->lif_name, LIFNAMSIZ); - isv6 = lif->lif_pif->pif_isv6; + isv6 = pif->pif_isv6; fd = isv6 ? v6_sock_fd : v4_sock_fd; if (ioctl(fd, SIOCGLIFFLAGS, &lifr) == -1) { @@ -689,43 +739,41 @@ verify_lif(const dhcp_lif_t *lif) } /* - * Special case: if the interface has gone down as a duplicate, then - * this alone does _not_ mean that we're abandoning it just yet. Allow - * the state machine to handle this normally by trying to get a new - * lease. - */ - if ((lifr.lifr_flags & (IFF_UP|IFF_DUPLICATE)) == IFF_DUPLICATE) { - dhcpmsg(MSG_DEBUG, "verify_lif: duplicate address on %s", - lif->lif_name); - return (B_TRUE); - } - - /* - * If the user has torn down or started up the interface manually, then - * abandon the lease. - */ - if ((lif->lif_flags ^ lifr.lifr_flags) & IFF_UP) { - dhcpmsg(MSG_DEBUG, "verify_lif: user has %s %s", - lifr.lifr_flags & IFF_UP ? "started up" : "shut down", - lif->lif_name); - return (B_FALSE); - } - - /* * Check for delete and recreate. 
*/ if (ioctl(fd, SIOCGLIFINDEX, &lifr) == -1) { - dhcpmsg(MSG_ERR, "verify_lif: SIOCGLIFINDEX failed on %s", - lif->lif_name); + if (errno != ENXIO) { + dhcpmsg(MSG_ERR, "verify_lif: SIOCGLIFINDEX failed " + "on %s", lif->lif_name); + } return (B_FALSE); } - if (lifr.lifr_index != lif->lif_pif->pif_index) { + if (lifr.lifr_index != pif->pif_index) { dhcpmsg(MSG_DEBUG, "verify_lif: ifindex on %s changed: %u to %u", - lif->lif_name, lif->lif_pif->pif_index, lifr.lifr_index); + lif->lif_name, pif->pif_index, lifr.lifr_index); return (B_FALSE); } + if (pif->pif_under_ipmp) { + (void) strlcpy(lifr.lifr_name, pif->pif_grifname, LIFNAMSIZ); + + if (ioctl(fd, SIOCGLIFINDEX, &lifr) == -1) { + if (errno != ENXIO) { + dhcpmsg(MSG_ERR, "verify_lif: SIOCGLIFINDEX " + "failed on %s", lifr.lifr_name); + } + return (B_FALSE); + } + + if (lifr.lifr_index != pif->pif_grindex) { + dhcpmsg(MSG_DEBUG, "verify_lif: IPMP group ifindex " + "on %s changed: %u to %u", lifr.lifr_name, + pif->pif_grindex, lifr.lifr_index); + return (B_FALSE); + } + } + /* * If the IP address, netmask, or broadcast address have changed, or * the interface has been unplumbed, then we act like there has been an @@ -934,6 +982,13 @@ plumb_lif(dhcp_pif_t *pif, const in6_addr_t *addr) lifr.lifr_name); goto failure; } + + /* + * See comment in set_lif_dhcp(). + */ + if (pif->pif_under_ipmp && !(lifr.lifr_flags & IFF_NOFAILOVER)) + lifr.lifr_flags |= IFF_NOFAILOVER | IFF_DEPRECATED; + lifr.lifr_flags |= IFF_UP | IFF_DHCPRUNNING; if (ioctl(v6_sock_fd, SIOCSLIFFLAGS, &lifr) == -1) { dhcpmsg(MSG_ERR, "plumb_lif: SIOCSLIFFLAGS %s", @@ -1060,8 +1115,9 @@ set_lif_dhcp(dhcp_lif_t *lif, boolean_t is_adopting) int fd; int err; struct lifreq lifr; + dhcp_pif_t *pif = lif->lif_pif; - fd = lif->lif_pif->pif_isv6 ? v6_sock_fd : v4_sock_fd; + fd = pif->pif_isv6 ? 
v6_sock_fd : v4_sock_fd; (void) strlcpy(lifr.lifr_name, lif->lif_name, LIFNAMSIZ); @@ -1098,6 +1154,17 @@ set_lif_dhcp(dhcp_lif_t *lif, boolean_t is_adopting) "set on %s", lif->lif_name); } } else { + /* + * If the lif is on an interface under IPMP, IFF_NOFAILOVER + * must be set or the kernel will prevent us from setting + * IFF_DHCPRUNNING (since the subsequent IFF_UP would lead to + * migration). We set IFF_DEPRECATED too since the kernel + * will set it automatically when setting IFF_NOFAILOVER, + * causing our lif_flags value to grow stale. + */ + if (pif->pif_under_ipmp && !(lifr.lifr_flags & IFF_NOFAILOVER)) + lifr.lifr_flags |= IFF_NOFAILOVER | IFF_DEPRECATED; + lifr.lifr_flags |= IFF_DHCPRUNNING; if (ioctl(fd, SIOCSLIFFLAGS, &lifr) == -1) { dhcpmsg(MSG_ERR, "set_lif_dhcp: SIOCSLIFFLAGS for %s", @@ -1207,6 +1274,13 @@ clear_lif_deprecated(dhcp_lif_t *lif) return (B_FALSE); } + /* + * Don't try to clear IFF_DEPRECATED if this is a test address, + * since IPMP's use of IFF_DEPRECATED is not compatible with ours. + */ + if (lifr.lifr_flags & IFF_NOFAILOVER) + return (B_TRUE); + if (!(lifr.lifr_flags & IFF_DEPRECATED)) return (B_TRUE); @@ -1226,16 +1300,19 @@ clear_lif_deprecated(dhcp_lif_t *lif) * * input: dhcp_lif_t *: the logical interface to operate on * in_addr_t: the address the socket will be bound to (in hbo) + * boolean_t: B_TRUE if the address should be brought up (if needed) * output: boolean_t: B_TRUE if the socket was opened successfully. 
*/ boolean_t -open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo) +open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo, boolean_t bringup) { const char *errmsg; struct lifreq lifr; int on = 1; uchar_t ttl = 255; + uint32_t ifindex; + dhcp_pif_t *pif = lif->lif_pif; if (lif->lif_sock_ip_fd != -1) { dhcpmsg(MSG_WARNING, "open_ip_lif: socket already open on %s", @@ -1270,7 +1347,7 @@ open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo) } if (setsockopt(lif->lif_sock_ip_fd, IPPROTO_IP, IP_DHCPINIT_IF, - &lif->lif_pif->pif_index, sizeof (int)) == -1) { + &pif->pif_index, sizeof (int)) == -1) { errmsg = "cannot set IP_DHCPINIT_IF"; goto failure; } @@ -1288,23 +1365,40 @@ open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo) goto failure; } - if (setsockopt(lif->lif_sock_ip_fd, IPPROTO_IP, IP_BOUND_IF, - &lif->lif_pif->pif_index, sizeof (int)) == -1) { + ifindex = pif->pif_under_ipmp ? pif->pif_grindex : pif->pif_index; + if (setsockopt(lif->lif_sock_ip_fd, IPPROTO_IP, IP_BOUND_IF, &ifindex, + sizeof (int)) == -1) { errmsg = "cannot set IP_BOUND_IF"; goto failure; } - /* - * Make sure at least one lif on the interface we used in IP_BOUND_IF - * is IFF_UP so that we can send and receive IP packets. - */ (void) strlcpy(lifr.lifr_name, lif->lif_name, LIFNAMSIZ); if (ioctl(v4_sock_fd, SIOCGLIFFLAGS, &lifr) == -1) { errmsg = "cannot get interface flags"; goto failure; } - if (!(lifr.lifr_flags & IFF_UP)) { + /* + * If the lif is part of an interface under IPMP, IFF_NOFAILOVER must + * be set or the kernel will prevent us from setting IFF_DHCPRUNNING + * (since the subsequent IFF_UP would lead to migration). We set + * IFF_DEPRECATED too since the kernel will set it automatically when + * setting IFF_NOFAILOVER, causing our lif_flags value to grow stale. 
+ */ + if (pif->pif_under_ipmp && !(lifr.lifr_flags & IFF_NOFAILOVER)) { + lifr.lifr_flags |= IFF_NOFAILOVER | IFF_DEPRECATED; + if (ioctl(v4_sock_fd, SIOCSLIFFLAGS, &lifr) == -1) { + errmsg = "cannot set IFF_NOFAILOVER"; + goto failure; + } + } + lif->lif_flags = lifr.lifr_flags; + + /* + * If this is initial bringup, make sure the address we're acquiring a + * lease on is IFF_UP. + */ + if (bringup && !(lifr.lifr_flags & IFF_UP)) { /* * Start from a clean slate. */ @@ -1330,6 +1424,30 @@ open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo) ((struct sockaddr_in *)&lifr.lifr_addr)->sin_addr.s_addr; } + /* + * Usually, bringing up the address we're acquiring a lease on is + * sufficient to allow packets to be sent and received via the + * IP_BOUND_IF we did earlier. However, if we're acquiring a lease on + * an underlying IPMP interface, the group interface will be used for + * sending and receiving IP packets via IP_BOUND_IF. Thus, ensure at + * least one address on the group interface is IFF_UP. + */ + if (bringup && pif->pif_under_ipmp) { + (void) strlcpy(lifr.lifr_name, pif->pif_grifname, LIFNAMSIZ); + if (ioctl(v4_sock_fd, SIOCGLIFFLAGS, &lifr) == -1) { + errmsg = "cannot get IPMP group interface flags"; + goto failure; + } + + if (!(lifr.lifr_flags & IFF_UP)) { + lifr.lifr_flags |= IFF_UP; + if (ioctl(v4_sock_fd, SIOCSLIFFLAGS, &lifr) == -1) { + errmsg = "cannot bring up IPMP group interface"; + goto failure; + } + } + } + lif->lif_packet_id = iu_register_event(eh, lif->lif_sock_ip_fd, POLLIN, dhcp_packet_lif, lif); if (lif->lif_packet_id == -1) { diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h index a59e3ea68d..46cf30bedb 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ #ifndef INTERFACE_H #define INTERFACE_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Interface.[ch] encapsulate all of the agent's knowledge of network * interfaces from the DHCP agent's perspective. See interface.c for @@ -66,6 +64,9 @@ struct dhcp_pif_s { boolean_t pif_running; /* interface is running */ uint_t pif_hold_count; /* reference count */ char pif_name[LIFNAMSIZ]; + char pif_grifname[LIFNAMSIZ]; + uint32_t pif_grindex; /* interface index for pif_grifname */ + boolean_t pif_under_ipmp; /* is an ipmp underlying interface */ }; struct dhcp_lif_s { @@ -182,7 +183,7 @@ dhcp_lif_t *attach_lif(const char *, boolean_t, int *); int set_lif_dhcp(dhcp_lif_t *, boolean_t); void set_lif_deprecated(dhcp_lif_t *); boolean_t clear_lif_deprecated(dhcp_lif_t *); -boolean_t open_ip_lif(dhcp_lif_t *, in_addr_t); +boolean_t open_ip_lif(dhcp_lif_t *, in_addr_t, boolean_t); void close_ip_lif(dhcp_lif_t *); void lif_mark_decline(dhcp_lif_t *, const char *); boolean_t schedule_lif_timer(dhcp_lif_t *, dhcp_timer_t *, diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c index 8a32b55ea5..a763530436 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <string.h> #include <sys/types.h> #include <stdlib.h> @@ -970,7 +968,10 @@ send_pkt_internal(dhcp_smach_t *dsmp) ipi6->ipi6_addr = lif->lif_v6addr; else ipi6->ipi6_addr = my_in6addr_any; - ipi6->ipi6_ifindex = lif->lif_pif->pif_index; + if (lif->lif_pif->pif_under_ipmp) + ipi6->ipi6_ifindex = lif->lif_pif->pif_grindex; + else + ipi6->ipi6_ifindex = lif->lif_pif->pif_index; cmsg->cmsg_len = (char *)(ipi6 + 1) - (char *)cmsg; /* diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c index a8c05de986..78da07aebf 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c @@ -19,14 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * REQUESTING state of the client state machine. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdlib.h> #include <string.h> #include <search.h> @@ -1008,7 +1006,8 @@ dhcp_acknak_global(iu_eh_t *ehp, int fd, short events, iu_event_id_t id, for (dsmp = lookup_smach_by_xid(xid, NULL, isv6); dsmp != NULL; dsmp = lookup_smach_by_xid(xid, dsmp, isv6)) { pif = dsmp->dsm_lif->lif_pif; - if (pif->pif_index == plp->ifindex) + if (pif->pif_index == plp->ifindex || + pif->pif_under_ipmp && pif->pif_grindex == plp->ifindex) break; } diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c index 9ae7fd7aba..852b428551 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * This module contains core functions for managing DHCP state machine * instances. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdlib.h> #include <search.h> #include <string.h> @@ -151,7 +149,7 @@ insert_smach(dhcp_lif_t *lif, int *error) /* * With IPv4 DHCP, we use a socket per lif. */ - if (!open_ip_lif(lif, INADDR_ANY)) { + if (!open_ip_lif(lif, INADDR_ANY, B_TRUE)) { dhcpmsg(MSG_ERR, "unable to open socket for %s", lif->lif_name); /* This will also dispose of the LIF */ @@ -696,14 +694,15 @@ set_smach_state(dhcp_smach_t *dsmp, DHCPSTATE state) if (is_bound_state(dsmp->dsm_state)) { if (!is_bound_state(state)) { close_ip_lif(lif); - if (!open_ip_lif(lif, INADDR_ANY)) + if (!open_ip_lif(lif, INADDR_ANY, + B_FALSE)) return (B_FALSE); } } else { if (is_bound_state(state)) { close_ip_lif(lif); if (!open_ip_lif(lif, - ntohl(lif->lif_addr))) + ntohl(lif->lif_addr), B_FALSE)) return (B_FALSE); } } @@ -952,11 +951,14 @@ no_specified_id: * unable to parse it. We need to determine if a Client ID is required * and, if so, generate one. * - * If it's IPv4 and not a logical interface, then we need to preserve - * backward-compatibility by avoiding new-fangled DUID/IAID - * construction. + * If it's IPv4, not in an IPMP group, and not a logical interface, + * then we need to preserve backward-compatibility by avoiding + * new-fangled DUID/IAID construction. (Note: even for IPMP test + * addresses, we construct a DUID/IAID since we may renew a lease for + * an IPMP test address on any functioning IP interface in the group.) */ - if (!pif->pif_isv6 && strchr(dsmp->dsm_name, ':') == NULL) { + if (!pif->pif_isv6 && pif->pif_grifname[0] == '\0' && + strchr(dsmp->dsm_name, ':') == NULL) { if (pif->pif_hwtype == ARPHRD_IB) { /* * This comes from the DHCP over IPoIB specification. 
diff --git a/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c b/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c index 47e1202b32..d73722cc55 100644 --- a/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c +++ b/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -8,8 +8,6 @@ * specifies the terms and conditions for redistribution. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Ifparse splits up an ifconfig command line, and was written for use * with the networking boot scripts; see $SRC/cmd/svc/shell/net_include.sh @@ -184,6 +182,7 @@ struct cmd { { "auto-revarp", 0, AF_INET, PARSEFIXED}, { "plumb", 0, AF_ANY, PARSENOW }, { "unplumb", 0, AF_ANY, PARSENOW }, + { "ipmp", 0, AF_ANY, PARSELOG0 }, { "subnet", NEXTARG, AF_ANY, 0 }, { "token", NEXTARG, AF_INET6, PARSELOG0 }, { "tsrc", NEXTARG, AF_ANY, PARSELOG0 }, diff --git a/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c b/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c index b9a02b54e7..2d115e221b 100644 --- a/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c +++ b/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -29,8 +29,6 @@ * MROUTING Revision 3.5 */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * simple netstat based on snmp/mib-2 interface to the TCP/IP stack * @@ -221,6 +219,7 @@ static char *plural(int n); static char *pluraly(int n); static char *plurales(int n); static void process_filter(char *arg); +static char *ifindex2str(uint_t, char *); static boolean_t family_selected(int family); static void usage(char *); @@ -680,8 +679,14 @@ mibget(int sd) tor->OPT_offset = sizeof (struct T_optmgmt_req); tor->OPT_length = sizeof (struct opthdr); tor->MGMT_flags = T_CURRENT; + + + /* + * Note: we use the special level value below so that IP will return + * us information concerning IRE_MARK_TESTHIDDEN routes. + */ req = (struct opthdr *)&tor[1]; - req->level = MIB2_IP; /* any MIB2_xxx value ok here */ + req->level = EXPER_IP_AND_TESTHIDDEN; req->name = 0; req->len = 0; @@ -712,7 +717,7 @@ mibget(int sd) stderr); i = 0; for (last_item = first_item; last_item; - last_item = last_item->next_item) + last_item = last_item->next_item) (void) printf("%d %4d %5d %d\n", ++i, last_item->group, @@ -1707,19 +1712,19 @@ mib_get_constants(mib_item_t *item) ipRouteAttributeSize = ip->ipRouteAttributeSize; transportMLPSize = ip->transportMLPSize; assert(IS_P2ALIGNED(ipAddrEntrySize, - sizeof (mib2_ipAddrEntry_t *)) && - IS_P2ALIGNED(ipRouteEntrySize, - sizeof (mib2_ipRouteEntry_t *)) && - IS_P2ALIGNED(ipNetToMediaEntrySize, - sizeof (mib2_ipNetToMediaEntry_t *)) && - IS_P2ALIGNED(ipMemberEntrySize, - sizeof (ip_member_t *)) && - IS_P2ALIGNED(ipGroupSourceEntrySize, - sizeof (ip_grpsrc_t *)) && - IS_P2ALIGNED(ipRouteAttributeSize, - sizeof (mib2_ipAttributeEntry_t *)) && - IS_P2ALIGNED(transportMLPSize, - sizeof (mib2_transportMLPEntry_t *))); + sizeof (mib2_ipAddrEntry_t *))); + assert(IS_P2ALIGNED(ipRouteEntrySize, + sizeof (mib2_ipRouteEntry_t *))); + assert(IS_P2ALIGNED(ipNetToMediaEntrySize, + sizeof (mib2_ipNetToMediaEntry_t *))); + assert(IS_P2ALIGNED(ipMemberEntrySize, + sizeof 
(ip_member_t *))); + assert(IS_P2ALIGNED(ipGroupSourceEntrySize, + sizeof (ip_grpsrc_t *))); + assert(IS_P2ALIGNED(ipRouteAttributeSize, + sizeof (mib2_ipAttributeEntry_t *))); + assert(IS_P2ALIGNED(transportMLPSize, + sizeof (mib2_transportMLPEntry_t *))); break; } case EXPER_DVMRP: { @@ -1728,8 +1733,9 @@ mib_get_constants(mib_item_t *item) vifctlSize = mrts->mrts_vifctlSize; mfcctlSize = mrts->mrts_mfcctlSize; assert(IS_P2ALIGNED(vifctlSize, - sizeof (struct vifclt *)) && - IS_P2ALIGNED(mfcctlSize, sizeof (struct mfcctl *))); + sizeof (struct vifclt *))); + assert(IS_P2ALIGNED(mfcctlSize, + sizeof (struct mfcctl *))); break; } case MIB2_IP6: { @@ -1745,17 +1751,17 @@ mib_get_constants(mib_item_t *item) ipv6GroupSourceEntrySize = ip6->ipv6GroupSourceEntrySize; assert(IS_P2ALIGNED(ipv6IfStatsEntrySize, - sizeof (mib2_ipv6IfStatsEntry_t *)) && - IS_P2ALIGNED(ipv6AddrEntrySize, - sizeof (mib2_ipv6AddrEntry_t *)) && - IS_P2ALIGNED(ipv6RouteEntrySize, - sizeof (mib2_ipv6RouteEntry_t *)) && - IS_P2ALIGNED(ipv6NetToMediaEntrySize, - sizeof (mib2_ipv6NetToMediaEntry_t *)) && - IS_P2ALIGNED(ipv6MemberEntrySize, - sizeof (ipv6_member_t *)) && - IS_P2ALIGNED(ipv6GroupSourceEntrySize, - sizeof (ipv6_grpsrc_t *))); + sizeof (mib2_ipv6IfStatsEntry_t *))); + assert(IS_P2ALIGNED(ipv6AddrEntrySize, + sizeof (mib2_ipv6AddrEntry_t *))); + assert(IS_P2ALIGNED(ipv6RouteEntrySize, + sizeof (mib2_ipv6RouteEntry_t *))); + assert(IS_P2ALIGNED(ipv6NetToMediaEntrySize, + sizeof (mib2_ipv6NetToMediaEntry_t *))); + assert(IS_P2ALIGNED(ipv6MemberEntrySize, + sizeof (ipv6_member_t *))); + assert(IS_P2ALIGNED(ipv6GroupSourceEntrySize, + sizeof (ipv6_grpsrc_t *))); break; } case MIB2_ICMP6: { @@ -1774,9 +1780,9 @@ mib_get_constants(mib_item_t *item) tcpConnEntrySize = tcp->tcpConnTableSize; tcp6ConnEntrySize = tcp->tcp6ConnTableSize; assert(IS_P2ALIGNED(tcpConnEntrySize, - sizeof (mib2_tcpConnEntry_t *)) && - IS_P2ALIGNED(tcp6ConnEntrySize, - sizeof (mib2_tcp6ConnEntry_t *))); + sizeof 
(mib2_tcpConnEntry_t *))); + assert(IS_P2ALIGNED(tcp6ConnEntrySize, + sizeof (mib2_tcp6ConnEntry_t *))); break; } case MIB2_UDP: { @@ -1785,9 +1791,9 @@ mib_get_constants(mib_item_t *item) udpEntrySize = udp->udpEntrySize; udp6EntrySize = udp->udp6EntrySize; assert(IS_P2ALIGNED(udpEntrySize, - sizeof (mib2_udpEntry_t *)) && - IS_P2ALIGNED(udp6EntrySize, - sizeof (mib2_udp6Entry_t *))); + sizeof (mib2_udpEntry_t *))); + assert(IS_P2ALIGNED(udp6EntrySize, + sizeof (mib2_udp6Entry_t *))); break; } case MIB2_SCTP: { @@ -1843,7 +1849,6 @@ stat_report(mib_item_t *item) { int jtemp = 0; char ifname[LIFNAMSIZ + 1]; - char *ifnamep; /* 'for' loop 1: */ for (; item; item = item->next_item) { @@ -1891,12 +1896,10 @@ stat_report(mib_item_t *item) bzero(&sum6, sizeof (sum6)); /* 'for' loop 2a: */ for (ip6 = (mib2_ipv6IfStatsEntry_t *)item->valp; - (char *)ip6 < (char *)item->valp - + item->length; + (char *)ip6 < (char *)item->valp + item->length; /* LINTED: (note 1) */ ip6 = (mib2_ipv6IfStatsEntry_t *)((char *)ip6 + ipv6IfStatsEntrySize)) { - if (ip6->ipv6IfIndex == 0) { /* * The "unknown interface" ip6 @@ -1905,19 +1908,10 @@ stat_report(mib_item_t *item) sum_ip6_stats(ip6, &sum6); continue; /* 'for' loop 2a */ } - ifnamep = if_indextoname( - ip6->ipv6IfIndex, - ifname); - if (ifnamep == NULL) { - (void) printf( - "Invalid ifindex %d\n", - ip6->ipv6IfIndex); - continue; /* 'for' loop 2a */ - } - if (Aflag) { (void) printf("\nIPv6 for %s\n", - ifnamep); + ifindex2str(ip6->ipv6IfIndex, + ifname)); print_ip6_stats(ip6); } sum_ip6_stats(ip6, &sum6); @@ -1935,15 +1929,10 @@ stat_report(mib_item_t *item) break; bzero(&sum6, sizeof (sum6)); /* 'for' loop 2b: */ - for (icmp6 = - (mib2_ipv6IfIcmpEntry_t *)item->valp; - (char *)icmp6 < (char *)item->valp - + item->length; - icmp6 = - /* LINTED: (note 1) */ - (mib2_ipv6IfIcmpEntry_t *)((char *)icmp6 - + ipv6IfIcmpEntrySize)) { - + for (icmp6 = (mib2_ipv6IfIcmpEntry_t *)item->valp; + (char *)icmp6 < (char *)item->valp + item->length; + 
icmp6 = (void *)((char *)icmp6 + + ipv6IfIcmpEntrySize)) { if (icmp6->ipv6IfIcmpIfIndex == 0) { /* * The "unknown interface" icmp6 @@ -1952,19 +1941,10 @@ stat_report(mib_item_t *item) sum_icmp6_stats(icmp6, &sum6); continue; /* 'for' loop 2b: */ } - ifnamep = if_indextoname( - icmp6->ipv6IfIcmpIfIndex, ifname); - if (ifnamep == NULL) { - (void) printf( - "Invalid ifindex %d\n", - icmp6->ipv6IfIcmpIfIndex); - continue; /* 'for' loop 2b: */ - } - if (Aflag) { - (void) printf( - "\nICMPv6 for %s\n", - ifnamep); + (void) printf("\nICMPv6 for %s\n", + ifindex2str( + icmp6->ipv6IfIcmpIfIndex, ifname)); print_icmp6_stats(icmp6); } sum_icmp6_stats(icmp6, &sum6); @@ -2369,51 +2349,49 @@ print_mrt_stats(struct mrtstat *mrts) { (void) puts("DVMRP multicast routing:"); (void) printf(" %10u hit%s - kernel forwarding cache hits\n", - mrts->mrts_mfc_hits, PLURAL(mrts->mrts_mfc_hits)); + mrts->mrts_mfc_hits, PLURAL(mrts->mrts_mfc_hits)); (void) printf(" %10u miss%s - kernel forwarding cache misses\n", - mrts->mrts_mfc_misses, PLURALES(mrts->mrts_mfc_misses)); + mrts->mrts_mfc_misses, PLURALES(mrts->mrts_mfc_misses)); (void) printf(" %10u packet%s potentially forwarded\n", - mrts->mrts_fwd_in, PLURAL(mrts->mrts_fwd_in)); + mrts->mrts_fwd_in, PLURAL(mrts->mrts_fwd_in)); (void) printf(" %10u packet%s actually sent out\n", - mrts->mrts_fwd_out, PLURAL(mrts->mrts_fwd_out)); + mrts->mrts_fwd_out, PLURAL(mrts->mrts_fwd_out)); (void) printf(" %10u upcall%s - upcalls made to mrouted\n", - mrts->mrts_upcalls, PLURAL(mrts->mrts_upcalls)); + mrts->mrts_upcalls, PLURAL(mrts->mrts_upcalls)); (void) printf(" %10u packet%s not sent out due to lack of resources\n", - mrts->mrts_fwd_drop, PLURAL(mrts->mrts_fwd_drop)); + mrts->mrts_fwd_drop, PLURAL(mrts->mrts_fwd_drop)); (void) printf(" %10u datagram%s with malformed tunnel options\n", - mrts->mrts_bad_tunnel, PLURAL(mrts->mrts_bad_tunnel)); + mrts->mrts_bad_tunnel, PLURAL(mrts->mrts_bad_tunnel)); (void) printf(" %10u datagram%s with no room for 
tunnel options\n", - mrts->mrts_cant_tunnel, PLURAL(mrts->mrts_cant_tunnel)); + mrts->mrts_cant_tunnel, PLURAL(mrts->mrts_cant_tunnel)); (void) printf(" %10u datagram%s arrived on wrong interface\n", - mrts->mrts_wrong_if, PLURAL(mrts->mrts_wrong_if)); + mrts->mrts_wrong_if, PLURAL(mrts->mrts_wrong_if)); (void) printf(" %10u datagram%s dropped due to upcall Q overflow\n", - mrts->mrts_upq_ovflw, PLURAL(mrts->mrts_upq_ovflw)); + mrts->mrts_upq_ovflw, PLURAL(mrts->mrts_upq_ovflw)); (void) printf(" %10u datagram%s cleaned up by the cache\n", - mrts->mrts_cache_cleanups, PLURAL(mrts->mrts_cache_cleanups)); + mrts->mrts_cache_cleanups, PLURAL(mrts->mrts_cache_cleanups)); (void) printf(" %10u datagram%s dropped selectively by ratelimiter\n", - mrts->mrts_drop_sel, PLURAL(mrts->mrts_drop_sel)); + mrts->mrts_drop_sel, PLURAL(mrts->mrts_drop_sel)); (void) printf(" %10u datagram%s dropped - bucket Q overflow\n", - mrts->mrts_q_overflow, PLURAL(mrts->mrts_q_overflow)); + mrts->mrts_q_overflow, PLURAL(mrts->mrts_q_overflow)); (void) printf(" %10u datagram%s dropped - larger than bkt size\n", - mrts->mrts_pkt2large, PLURAL(mrts->mrts_pkt2large)); + mrts->mrts_pkt2large, PLURAL(mrts->mrts_pkt2large)); (void) printf("\nPIM multicast routing:\n"); (void) printf(" %10u datagram%s dropped - bad version number\n", - mrts->mrts_pim_badversion, PLURAL(mrts->mrts_pim_badversion)); + mrts->mrts_pim_badversion, PLURAL(mrts->mrts_pim_badversion)); (void) printf(" %10u datagram%s dropped - bad checksum\n", - mrts->mrts_pim_rcv_badcsum, PLURAL(mrts->mrts_pim_rcv_badcsum)); + mrts->mrts_pim_rcv_badcsum, PLURAL(mrts->mrts_pim_rcv_badcsum)); (void) printf(" %10u datagram%s dropped - bad register packets\n", - mrts->mrts_pim_badregisters, - PLURAL(mrts->mrts_pim_badregisters)); + mrts->mrts_pim_badregisters, PLURAL(mrts->mrts_pim_badregisters)); (void) printf( - " %10u datagram%s potentially forwarded - register packets\n", - mrts->mrts_pim_regforwards, PLURAL(mrts->mrts_pim_regforwards)); + " 
%10u datagram%s potentially forwarded - register packets\n", + mrts->mrts_pim_regforwards, PLURAL(mrts->mrts_pim_regforwards)); (void) printf(" %10u datagram%s dropped - register send drops\n", - mrts->mrts_pim_regsend_drops, - PLURAL(mrts->mrts_pim_regsend_drops)); + mrts->mrts_pim_regsend_drops, PLURAL(mrts->mrts_pim_regsend_drops)); (void) printf(" %10u datagram%s dropped - packet malformed\n", - mrts->mrts_pim_malformed, PLURAL(mrts->mrts_pim_malformed)); + mrts->mrts_pim_malformed, PLURAL(mrts->mrts_pim_malformed)); (void) printf(" %10u datagram%s dropped - no memory to forward\n", - mrts->mrts_pim_nomemory, PLURAL(mrts->mrts_pim_nomemory)); + mrts->mrts_pim_nomemory, PLURAL(mrts->mrts_pim_nomemory)); } static void @@ -2674,7 +2652,7 @@ if_report(mib_item_t *item, char *matchname, "Ierrs", "Opkts", "Oerrs", "Collis", "Queue"); - first = B_FALSE; + first = B_FALSE; } if_report_ip4(ap, ifname, logintname, &stat, B_TRUE); @@ -2717,7 +2695,7 @@ if_report(mib_item_t *item, char *matchname, + item->length; ap++) { (void) octetstr(&ap->ipAdEntIfIndex, - 'a', ifname, sizeof (ifname)); + 'a', ifname, sizeof (ifname)); (void) strtok(ifname, ":"); if (matchname) { @@ -3387,7 +3365,7 @@ dhcp_walk_interfaces(uint_t flags_on, uint_t flags_off, int af, */ (void) memset(&lifn, 0, sizeof (lifn)); lifn.lifn_family = af; - lifn.lifn_flags = LIFC_ALLZONES | LIFC_NOXMIT; + lifn.lifn_flags = LIFC_ALLZONES | LIFC_NOXMIT | LIFC_UNDER_IPMP; if (ioctl(sock_fd, SIOCGLIFNUM, &lifn) == -1) n_ifs = LIFN_GUARD_VALUE; else @@ -3471,7 +3449,6 @@ group_report(mib_item_t *item) ip_grpsrc_t *ips; ipv6_member_t *ipmp6; ipv6_grpsrc_t *ips6; - char *ifnamep; boolean_t first, first_src; /* 'for' loop 1: */ @@ -3604,7 +3581,7 @@ group_report(mib_item_t *item) (char *)ipmp6 < (char *)v6grp->valp + v6grp->length; /* LINTED: (note 1) */ ipmp6 = (ipv6_member_t *)((char *)ipmp6 + - ipv6MemberEntrySize)) { + ipv6MemberEntrySize)) { if (first) { (void) puts("Group Memberships: " "IPv6"); @@ -3615,15 +3592,8 
@@ group_report(mib_item_t *item) first = B_FALSE; } - ifnamep = if_indextoname( - ipmp6->ipv6GroupMemberIfIndex, ifname); - if (ifnamep == NULL) { - (void) printf("Invalid ifindex %d\n", - ipmp6->ipv6GroupMemberIfIndex); - continue; - } (void) printf("%-5s %-27s %5u\n", - ifnamep, + ifindex2str(ipmp6->ipv6GroupMemberIfIndex, ifname), pr_addr6(&ipmp6->ipv6GroupMemberAddress, abuf, sizeof (abuf)), ipmp6->ipv6GroupMemberRefCnt); @@ -3784,7 +3754,6 @@ ndp_report(mib_item_t *item) char xbuf[STR_EXPAND * OCTET_LENGTH + 1]; mib2_ipv6NetToMediaEntry_t *np6; char ifname[LIFNAMSIZ + 1]; - char *ifnamep; boolean_t first; if (!(family_selected(AF_INET6))) @@ -3820,13 +3789,6 @@ ndp_report(mib_item_t *item) first = B_FALSE; } - ifnamep = if_indextoname(np6->ipv6NetToMediaIfIndex, - ifname); - if (ifnamep == NULL) { - (void) printf("Invalid ifindex %d\n", - np6->ipv6NetToMediaIfIndex); - continue; /* 'for' loop 2 */ - } switch (np6->ipv6NetToMediaState) { case ND_INCOMPLETE: state = "INCOMPLETE"; @@ -3865,7 +3827,7 @@ ndp_report(mib_item_t *item) break; } (void) printf("%-5s %-17s %-7s %-12s %-27s\n", - ifnamep, + ifindex2str(np6->ipv6NetToMediaIfIndex, ifname), octetstr(&np6->ipv6NetToMediaPhysAddress, 'h', xbuf, sizeof (xbuf)), type, @@ -4472,7 +4434,7 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, (void) printf("%-27s %-27s %-5s %5u%c %5u %3u " "%-5s %6u %6u %s\n", pr_prefix6(&rp6->ipv6RouteDest, - rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)), + rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)), IN6_IS_ADDR_UNSPECIFIED(&rp6->ipv6RouteNextHop) ? " --" : pr_addr6(&rp6->ipv6RouteNextHop, gwbuf, sizeof (gwbuf)), @@ -4489,7 +4451,7 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, } else { (void) printf("%-27s %-27s %-5s %3u %7u %-5s %s\n", pr_prefix6(&rp6->ipv6RouteDest, - rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)), + rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)), IN6_IS_ADDR_UNSPECIFIED(&rp6->ipv6RouteNextHop) ? 
" --" : pr_addr6(&rp6->ipv6RouteNextHop, gwbuf, sizeof (gwbuf)), @@ -4690,9 +4652,9 @@ tcp_report_item_v4(const mib2_tcpConnEntry_t *tp, boolean_t first, (void) printf("%-20s\n%-20s %5u %08x %08x %5u %08x %08x " "%5u %5u %s\n", pr_ap(tp->tcpConnLocalAddress, - tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)), + tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)), pr_ap(tp->tcpConnRemAddress, - tp->tcpConnRemPort, "tcp", fname, sizeof (fname)), + tp->tcpConnRemPort, "tcp", fname, sizeof (fname)), tp->tcpConnEntryInfo.ce_swnd, tp->tcpConnEntryInfo.ce_snxt, tp->tcpConnEntryInfo.ce_suna, @@ -4710,9 +4672,9 @@ tcp_report_item_v4(const mib2_tcpConnEntry_t *tp, boolean_t first, (void) printf("%-20s %-20s %5u %6d %5u %6d %s\n", pr_ap(tp->tcpConnLocalAddress, - tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)), + tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)), pr_ap(tp->tcpConnRemAddress, - tp->tcpConnRemPort, "tcp", fname, sizeof (fname)), + tp->tcpConnRemPort, "tcp", fname, sizeof (fname)), tp->tcpConnEntryInfo.ce_swnd, (sq >= 0) ? 
sq : 0, tp->tcpConnEntryInfo.ce_rwnd, @@ -4756,9 +4718,9 @@ tcp_report_item_v6(const mib2_tcp6ConnEntry_t *tp6, boolean_t first, (void) printf("%-33s\n%-33s %5u %08x %08x %5u %08x %08x " "%5u %5u %-11s %s\n", pr_ap6(&tp6->tcp6ConnLocalAddress, - tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)), + tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)), pr_ap6(&tp6->tcp6ConnRemAddress, - tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)), + tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)), tp6->tcp6ConnEntryInfo.ce_swnd, tp6->tcp6ConnEntryInfo.ce_snxt, tp6->tcp6ConnEntryInfo.ce_suna, @@ -4777,9 +4739,9 @@ tcp_report_item_v6(const mib2_tcp6ConnEntry_t *tp6, boolean_t first, (void) printf("%-33s %-33s %5u %6d %5u %6d %-11s %s\n", pr_ap6(&tp6->tcp6ConnLocalAddress, - tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)), + tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)), pr_ap6(&tp6->tcp6ConnRemAddress, - tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)), + tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)), tp6->tcp6ConnEntryInfo.ce_swnd, (sq >= 0) ? sq : 0, tp6->tcp6ConnEntryInfo.ce_rwnd, @@ -5112,7 +5074,7 @@ sctp_pr_addr(int type, char *name, int namelen, const in6_addr_t *addr, * displaying. 
*/ switch (type) { - case MIB2_SCTP_ADDR_V4: + case MIB2_SCTP_ADDR_V4: /* v4 */ v6addr = *addr; @@ -5124,7 +5086,7 @@ sctp_pr_addr(int type, char *name, int namelen, const in6_addr_t *addr, } break; - case MIB2_SCTP_ADDR_V6: + case MIB2_SCTP_ADDR_V6: /* v6 */ if (port > 0) { (void) pr_ap6(addr, port, "sctp", name, namelen); @@ -5133,7 +5095,7 @@ sctp_pr_addr(int type, char *name, int namelen, const in6_addr_t *addr, } break; - default: + default: (void) snprintf(name, namelen, "<unknown addr type>"); break; } @@ -5379,7 +5341,7 @@ mrt_report(mib_item_t *item) case EXPER_DVMRP_MRT: if (Dflag) (void) printf("%u records for ipMfcTable:\n", - item->length/sizeof (struct vifctl)); + item->length/sizeof (struct vifctl)); if (item->length/sizeof (struct vifctl) == 0) { (void) puts("\nMulticast Forwarding Cache is " "empty"); @@ -5402,10 +5364,10 @@ mrt_report(mib_item_t *item) abuf, sizeof (abuf))); (void) printf("%-15.15s %6s %3u ", pr_net(mfccp->mfcc_mcastgrp.s_addr, - mfccp->mfcc_mcastgrp.s_addr, - abuf, sizeof (abuf)), + mfccp->mfcc_mcastgrp.s_addr, + abuf, sizeof (abuf)), pktscale((int)mfccp->mfcc_pkt_cnt), - mfccp->mfcc_parent); + mfccp->mfcc_parent); for (vifi = 0; vifi < MAXVIFS; ++vifi) { if (mfccp->mfcc_ttls[vifi]) { @@ -5468,7 +5430,7 @@ kmem_cache_stats(char *title, char *name, int prefix, int64_t *total_bytes) strncmp(ksp->ks_name, "streams_dblk", 12) == 0) { (void) safe_kstat_read(kc, ksp, NULL); total_buf_inuse -= - kstat_named_value(ksp, "buf_constructed"); + kstat_named_value(ksp, "buf_constructed"); continue; /* 'for' loop 1 */ } @@ -5501,7 +5463,7 @@ kmem_cache_stats(char *title, char *name, int prefix, int64_t *total_bytes) if (buf_size == 0) { (void) printf("%-22s [couldn't find statistics for %s]\n", - title, name); + title, name); return; } @@ -5511,7 +5473,7 @@ kmem_cache_stats(char *title, char *name, int prefix, int64_t *total_bytes) (void) snprintf(buf, sizeof (buf), "%s", title); (void) printf("%-22s %6d %9d %11lld %11d\n", buf, - 
total_buf_inuse, total_buf_max, total_alloc, total_alloc_fail); + total_buf_inuse, total_buf_max, total_alloc, total_alloc_fail); } static void @@ -5534,7 +5496,7 @@ m_report(void) kmem_cache_stats("qband", "qband_cache", 0, &total_bytes); (void) printf("\n%lld Kbytes allocated for streams data\n", - total_bytes / 1024); + total_bytes / 1024); (void) putchar('\n'); (void) fflush(stdout); @@ -5967,7 +5929,7 @@ portname(uint_t port, char *proto, char *dst, uint_t dstlen) sp = getservbyport(htons(port), proto); if (sp || port == 0) (void) snprintf(dst, dstlen, "%.*s", MAXHOSTNAMELEN, - sp ? sp->s_name : "*"); + sp ? sp->s_name : "*"); else (void) snprintf(dst, dstlen, "%d", port); dst[dstlen - 1] = 0; @@ -6161,8 +6123,8 @@ process_filter(char *arg) */ if (hp->h_addr_list[0] != NULL && /* LINTED: (note 1) */ - IN6_IS_ADDR_V4MAPPED((in6_addr_t - *)hp->h_addr_list[0])) { + IN6_IS_ADDR_V4MAPPED((in6_addr_t *) + hp->h_addr_list[0])) { maxv = IP_ABITS; } else { maxv = IPV6_ABITS; @@ -6226,6 +6188,21 @@ family_selected(int family) } /* + * Convert the interface index to a string using the buffer `ifname', which + * must be at least LIFNAMSIZ bytes. We first try to map it to name. If that + * fails (e.g., because we're inside a zone and it does not have access to + * interface for the index in question), just return "if#<num>". + */ +static char * +ifindex2str(uint_t ifindex, char *ifname) +{ + if (if_indextoname(ifindex, ifname) == NULL) + (void) snprintf(ifname, LIFNAMSIZ, "if#%d", ifindex); + + return (ifname); +} + +/* * print the usage line */ static void diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile index f0c4c03250..f3ce9fae4b 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile @@ -19,51 +19,58 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
# Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# - -PROG = in.mpathd -OBJS = mpd_tables.o mpd_main.o mpd_probe.o -SRCS = $(OBJS:%.o=%.c) -DEFAULTFILES = mpathd.dfl +PROG = in.mpathd +ROOTFS_PROG = $(PROG) +OBJS = mpd_tables.o mpd_main.o mpd_probe.o +SRCS = $(OBJS:%.o=%.c) +DEFAULTFILES = mpathd.dfl include ../../../Makefile.cmd -POFILE = $(PROG).po -POFILES = $(SRCS:%.c=%.po) +ROOTCMDDIR = $(ROOT)/lib/inet + +POFILE = $(PROG).po +POFILES = $(SRCS:%.c=%.po) -C99MODE= $(C99_ENABLE) +C99MODE = $(C99_ENABLE) # # We need access to the ancillary data features which are only available # via the SUS standards. Further, C99 support requires SUSv3 or higher. # CPPFLAGS += -D_XOPEN_SOURCE=600 -D__EXTENSIONS__ -LDLIBS += -lsocket -lnsl -lsysevent -lnvpair -lipmp -lc +LDLIBS += -lsocket -lnsl -lsysevent -lnvpair -lipmp -linetutil -ldlpi +LINTFLAGS += -erroff=E_INCONS_ARG_DECL2 -erroff=E_INCONS_ARG_USED2 -LINTFLAGS += -erroff=E_FUNC_DECL_VAR_ARG2 -erroff=E_INCONS_VAL_TYPE_DECL2 \ - -erroff=E_FUNC_USED_VAR_ARG2 -erroff=E_INCONS_ARG_DECL2 \ - -erroff=E_NAME_USED_NOT_DEF2 -erroff=E_INCONS_ARG_USED2 \ - -errtags=yes +# +# Instrument in.mpathd with CTF data to ease debugging. 
+# +CTFCONVERT_HOOK = && $(CTFCONVERT_O) +CTFMERGE_HOOK = && $(CTFMERGE) -L VERSION -o $@ $(OBJS) +$(OBJS) := CFLAGS += $(CTF_FLAGS) .KEEP_STATE: all: $(PROG) $(PROG): $(OBJS) - $(LINK.c) -o $@ $(OBJS) $(LDLIBS) + $(LINK.c) -o $@ $(OBJS) $(LDLIBS) $(CTFMERGE_HOOK) $(POST_PROCESS) include ../Makefile.lib +$(ROOTLIBINETPROG): + $(RM) $@; $(SYMLINK) ../../../lib/inet/$(PROG) $@ + $(ROOTSBINPROG): - $(RM) $@; $(SYMLINK) ../usr/lib/inet/$(PROG) $@ + $(RM) $@; $(SYMLINK) ../lib/inet/$(PROG) $@ -install: all $(ROOTLIBINETPROG) $(ROOTSBINPROG) $(ROOTETCDEFAULTFILES) +install: all $(ROOTLIBINETPROG) $(ROOTSBINPROG) $(ROOTCMD) \ + $(ROOTETCDEFAULTFILES) clean: $(RM) $(OBJS) diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h index 9b07e2a7a3..e7cb096bf7 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _MPD_DEFS_H #define _MPD_DEFS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -78,12 +76,13 @@ extern "C" { #include <locale.h> #include <deflt.h> +#include <libdlpi.h> +#include <libinetutil.h> #include <libnvpair.h> #include <libsysevent.h> #include <sys/sysevent.h> #include <sys/sysevent/eventdefs.h> #include <sys/sysevent/ipmp.h> -#include <zone.h> #include <ipmp_mpathd.h> #include <ipmp_query_impl.h> @@ -92,7 +91,7 @@ extern "C" { /* Debug flags */ #define D_ALL 0xffff /* enable all debug */ #define D_PROBE 0x0001 /* probe mechanism */ -#define D_FAILOVER 0x0002 /* failover mechanism */ +#define D_FAILREP 0x0002 /* failure/repair mechanism */ #define D_PHYINT 0x0004 /* phyint table */ #define D_LOGINT 0x0008 /* logint table */ #define D_TARGET 0x0010 /* target table */ @@ -199,10 +198,8 @@ extern int user_failure_detection_time; /* User specified fdt */ extern int ifsock_v4; /* IPv4 socket for ioctls */ extern int ifsock_v6; /* IPv6 socket for ioctls */ -extern boolean_t full_scan_required; /* Do full scans */ - extern int debug; /* debug option */ - +extern boolean_t cleanup_started; /* true if we're shutting down */ extern boolean_t handle_link_notifications; /* @@ -212,6 +209,7 @@ extern void timer_schedule(uint_t delay); extern void logmsg(int pri, const char *fmt, ...); extern void logperror(const char *str); extern int poll_add(int fd); +extern int poll_remove(int fd); extern uint64_t getcurrentsec(void); extern uint_t getcurrenttime(void); diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c index aa6a99fb9c..e1e22e12d4 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mpd_defs.h" #include "mpd_tables.h" @@ -46,7 +44,6 @@ static int lsock_v6; /* Listen socket to detect mpathd */ static int mibfd = -1; /* fd to get mib info */ static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */ -boolean_t full_scan_required = _B_FALSE; static uint_t last_initifs_time; /* Time when initifs was last run */ static char **argv0; /* Saved for re-exec on SIGHUP */ boolean_t handle_link_notifications = _B_TRUE; @@ -58,10 +55,6 @@ static void check_if_removed(struct phyint_instance *pii); static void select_test_ifs(void); static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len); static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len); -static void router_add_v4(mib2_ipRouteEntry_t *rp1, - struct in_addr nexthop_v4); -static void router_add_v6(mib2_ipv6RouteEntry_t *rp1, - struct in6_addr nexthop_v6); static void router_add_common(int af, char *ifname, struct in6_addr nexthop); static void init_router_targets(); @@ -74,17 +67,17 @@ static void check_addr_unique(struct phyint_instance *, static void init_host_targets(void); static void dup_host_targets(struct phyint_instance *desired_pii); static void loopback_cmd(int sock, int family); -static int poll_remove(int fd); static boolean_t daemonize(void); static int closefunc(void *, int); static unsigned int process_cmd(int newfd, union mi_commands *mpi); static unsigned int process_query(int fd, mi_query_t *miq); +static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop); static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); static unsigned int send_result(int fd, unsigned int error, int syserror); -struct local_addr *laddr_list = NULL; +addrlist_t *localaddrs; /* * Return the current time in milliseconds (from an arbitrary reference) @@ -153,7 +146,7 @@ retry: 
/* * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. */ -static int +int poll_remove(int fd) { int i; @@ -205,17 +198,11 @@ pii_process(int af, char *name, struct phyint_instance **pii_p) break; case PI_GROUP_CHANGED: - /* - * The phyint has changed group. - */ - restore_phyint(pii->pii_phyint); - /* FALLTHRU */ - case PI_IFINDEX_CHANGED: /* - * Interface index has changed. Delete and - * recreate the phyint as it is quite likely - * the interface has been unplumbed and replumbed. + * Interface index or group membership has changed. + * Delete the old state and recreate based on the new + * state (it may no longer be in a group). */ pii_other = phyint_inst_other(pii); if (pii_other != NULL) @@ -249,51 +236,26 @@ pii_process(int af, char *name, struct phyint_instance **pii_p) } /* - * This phyint is leaving the group. Try to restore the phyint to its - * initial state. Return the addresses that belong to other group members, - * to the group, and take back any addresses owned by this phyint - */ -void -restore_phyint(struct phyint *pi) -{ - if (pi->pi_group == phyint_anongroup) - return; - - /* - * Move everthing to some other member in the group. - * The phyint has changed group in the kernel. But we - * have yet to do it in our tables. 
- */ - if (!pi->pi_empty) - (void) try_failover(pi, FAILOVER_TO_ANY); - /* - * Move all addresses owned by 'pi' back to pi, from each - * of the other members of the group - */ - (void) try_failback(pi); -} - -/* * Scan all interfaces to detect changes as well as new and deleted interfaces */ static void initifs() { - int n; + int i, nlifr; int af; char *cp; char *buf; - int numifs; + int sockfd; + uint64_t flags; struct lifnum lifn; struct lifconf lifc; + struct lifreq lifreq; struct lifreq *lifr; struct logint *li; struct phyint_instance *pii; struct phyint_instance *next_pii; - char pi_name[LIFNAMSIZ + 1]; - boolean_t exists; - struct phyint *pi; - struct local_addr *next; + struct phyint_group *pg, *next_pg; + char pi_name[LIFNAMSIZ + 1]; if (debug & D_PHYINT) logdebug("initifs: Scanning interfaces\n"); @@ -301,13 +263,9 @@ initifs() last_initifs_time = getcurrenttime(); /* - * Free the laddr_list before collecting the local addresses. + * Free the existing local address list; we'll build a new list below. */ - while (laddr_list != NULL) { - next = laddr_list->next; - free(laddr_list); - laddr_list = next; - } + addrlist_free(&localaddrs); /* * Mark the interfaces so that we can find phyints and logints @@ -326,122 +284,142 @@ initifs() } } + /* + * As above, mark groups so that we can detect IPMP interfaces which + * have been removed from the kernel. Also, delete the group address + * list since we'll iteratively recreate it below. 
+ */ + for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { + pg->pg_in_use = _B_FALSE; + addrlist_free(&pg->pg_addrs); + } + lifn.lifn_family = AF_UNSPEC; - lifn.lifn_flags = LIFC_ALLZONES; + lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP; +again: if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { - logperror("initifs: ioctl (get interface numbers)"); + logperror("initifs: ioctl (get interface count)"); return; } - numifs = lifn.lifn_count; + /* + * Pad the interface count to detect when additional interfaces have + * been configured between SIOCGLIFNUM and SIOCGLIFCONF. + */ + lifn.lifn_count += 4; - buf = (char *)calloc(numifs, sizeof (struct lifreq)); - if (buf == NULL) { + if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) { logperror("initifs: calloc"); return; } lifc.lifc_family = AF_UNSPEC; - lifc.lifc_flags = LIFC_ALLZONES; - lifc.lifc_len = numifs * sizeof (struct lifreq); + lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP; + lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq); lifc.lifc_buf = buf; if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { - /* - * EINVAL is commonly encountered, when things change - * underneath us rapidly, (eg. at boot, when new interfaces - * are plumbed successively) and the kernel finds the buffer - * size we passed as too small. We will retry again - * when we see the next routing socket msg, or at worst after - * IF_SCAN_INTERVAL ms. - */ - if (errno != EINVAL) { - logperror("initifs: ioctl" - " (get interface configuration)"); - } + logperror("initifs: ioctl (get interface configuration)"); free(buf); return; } - lifr = (struct lifreq *)lifc.lifc_req; - /* - * For each lifreq returned by SIOGGLIFCONF, call pii_process() - * and get the state of the corresponding phyint_instance. If it is - * successful, then call logint_init_from_k() to get the state of the - * logint. 
+ * If every lifr_req slot is taken, then additional interfaces must + * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF. + * Recalculate to make sure we didn't miss any interfaces. */ - for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) { - int sockfd; - struct local_addr *taddr; - struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; - struct lifreq lifreq; + nlifr = lifc.lifc_len / sizeof (struct lifreq); + if (nlifr >= lifn.lifn_count) { + free(buf); + goto again; + } + /* + * Walk through the lifreqs returned by SIOGGLIFCONF, and refresh the + * global list of addresses, phyint groups, phyints, and logints. + */ + for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) { af = lifr->lifr_addr.ss_family; - - /* - * Collect all local addresses. - */ sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6; - (void) memset(&lifreq, 0, sizeof (lifreq)); - (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, - sizeof (lifreq.lifr_name)); + (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ); if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) { if (errno != ENXIO) logperror("initifs: ioctl (SIOCGLIFFLAGS)"); continue; } + flags = lifreq.lifr_flags; + + /* + * If the address is IFF_UP, add it to the local address list. + * (We ignore addresses that aren't IFF_UP since another node + * might legitimately have that address IFF_UP.) + */ + if (flags & IFF_UP) { + (void) addrlist_add(&localaddrs, lifr->lifr_name, flags, + &lifr->lifr_addr); + } /* - * Add the interface address to laddr_list. - * Another node might have the same IP address which is up. - * In that case, it is appropriate to use the address as a - * target, even though it is also configured (but not up) on - * the local system. - * Hence,the interface address is not added to laddr_list - * unless it is IFF_UP. 
+ * If this address is on an IPMP meta-interface, update our + * phyint_group information (either by recording that group + * still exists or creating a new group), and track what + * group the address is part of. */ - if (lifreq.lifr_flags & IFF_UP) { - taddr = malloc(sizeof (struct local_addr)); - if (taddr == NULL) { - logperror("initifs: malloc"); + if (flags & IFF_IPMP) { + if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) { + if (errno != ENXIO) + logperror("initifs: ioctl " + "(SIOCGLIFGROUPNAME)"); continue; } - if (af == AF_INET) { - sin = (struct sockaddr_in *)&lifr->lifr_addr; - IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, - &taddr->addr); - } else { - sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr; - taddr->addr = sin6->sin6_addr; + + pg = phyint_group_lookup(lifreq.lifr_groupname); + if (pg == NULL) { + pg = phyint_group_create(lifreq.lifr_groupname); + if (pg == NULL) { + logerr("initifs: cannot create group " + "%s\n", lifreq.lifr_groupname); + continue; + } + phyint_group_insert(pg); + } + pg->pg_in_use = _B_TRUE; + + /* + * Add this to the group's list of data addresses. + */ + if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags, + &lifr->lifr_addr)) { + logerr("initifs: insufficient memory to track " + "data address information for %s\n", + lifr->lifr_name); } - taddr->next = laddr_list; - laddr_list = taddr; + continue; } /* - * Need to pass a phyint name to pii_process. Insert the - * null where the ':' IF_SEPARATOR is found in the logical - * name. + * This isn't an address on an IPMP meta-interface, so it's + * either on an underlying interface or not related to any + * group. Update our phyint and logint information (via + * pii_process() and logint_init_from_k()) -- but first, + * convert the logint name to a phyint name so we can call + * pii_process(). 
*/ (void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name)); if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) *cp = '\0'; - exists = pii_process(af, pi_name, &pii); - if (exists) { + if (pii_process(af, pi_name, &pii)) { /* The phyint is fine. So process the logint */ logint_init_from_k(pii, lifr->lifr_name); check_addr_unique(pii, &lifr->lifr_addr); } - } - free(buf); /* - * Scan for phyints and logints that have disappeared from the + * Scan for groups, phyints and logints that have disappeared from the * kernel, and delete them. */ for (pii = phyint_instances; pii != NULL; pii = next_pii) { @@ -449,70 +427,31 @@ initifs() check_if_removed(pii); } + for (pg = phyint_groups; pg != NULL; pg = next_pg) { + next_pg = pg->pg_next; + if (!pg->pg_in_use) { + phyint_group_delete(pg); + continue; + } + /* + * Refresh the group's state. This is necessary since the + * group's state is defined by the set of usable interfaces in + * the group, and an interface is considered unusable if all + * of its addresses are down. When an address goes down/up, + * the RTM_DELADDR/RTM_NEWADDR brings us through here. + */ + phyint_group_refresh_state(pg); + } + /* * Select a test address for sending probes on each phyint instance */ select_test_ifs(); /* - * Handle link up/down notifications from the NICs. + * Handle link up/down notifications. */ process_link_state_changes(); - - for (pi = phyints; pi != NULL; pi = pi->pi_next) { - /* - * If this is a case of group failure, we don't have much - * to do until the group recovers again. - */ - if (GROUP_FAILED(pi->pi_group)) - continue; - - /* - * Try/Retry any pending failovers / failbacks, that did not - * not complete, or that could not be initiated previously. 
- * This implements the 3 invariants described in the big block - * comment at the beginning of probe.c - */ - if (pi->pi_flags & IFF_INACTIVE) { - if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) - (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); - } else { - struct phyint_instance *pii; - - /* - * Skip LINK UP interfaces which are not capable - * of probing. - */ - pii = pi->pi_v4; - if (pii == NULL || - (LINK_UP(pi) && !PROBE_CAPABLE(pii))) { - pii = pi->pi_v6; - if (pii == NULL || - (LINK_UP(pi) && !PROBE_CAPABLE(pii))) - continue; - } - - /* - * It is possible that the phyint has started - * receiving packets, after it has been marked - * PI_FAILED. Don't initiate failover, if the - * phyint has started recovering. failure_state() - * captures this check. A similar logic is used - * for failback/repair case. - */ - if (pi->pi_state == PI_FAILED && !pi->pi_empty && - (failure_state(pii) == PHYINT_FAILURE)) { - (void) try_failover(pi, FAILOVER_NORMAL); - } else if (pi->pi_state == PI_RUNNING && !pi->pi_full) { - if (try_failback(pi) != IPMP_FAILURE) { - (void) change_lif_flags(pi, IFF_FAILED, - _B_FALSE); - /* Per state diagram */ - pi->pi_empty = 0; - } - } - } - } } /* @@ -569,7 +508,7 @@ check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss) * The probe socket is closed on each interface instance, and the * interface state set to PI_OFFLINE. */ -static void +void stop_probing(struct phyint *pi) { struct phyint_instance *pii; @@ -631,7 +570,6 @@ select_test_ifs(void) struct logint *li; struct logint *probe_logint; boolean_t target_scan_reqd = _B_FALSE; - struct target *tg; int rating; if (debug & D_PHYINT) @@ -645,8 +583,8 @@ select_test_ifs(void) probe_logint = NULL; /* - * An interface that is offline, should not be probed. - * Offline interfaces should always in PI_OFFLINE state, + * An interface that is offline should not be probed. 
+ * IFF_OFFLINE interfaces should always be PI_OFFLINE * unless some other entity has set the offline flag. */ if (pii->pii_phyint->pi_flags & IFF_OFFLINE) { @@ -659,6 +597,15 @@ select_test_ifs(void) stop_probing(pii->pii_phyint); } continue; + } else { + /* + * If something cleared IFF_OFFLINE (e.g., by accident + * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is + * inherently racy), the phyint may still be offline. + * Just ignore it. + */ + if (pii->pii_phyint->pi_state == PI_OFFLINE) + continue; } li = pii->pii_probe_logint; @@ -776,17 +723,6 @@ select_test_ifs(void) phyint_chstate(pii->pii_phyint, PI_NOTARGETS); } - if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) { - tg = pii->pii_targets; - if (tg != NULL) - target_delete(tg); - assert(pii->pii_targets == NULL); - assert(pii->pii_target_next == NULL); - assert(pii->pii_ntargets == 0); - target_create(pii, probe_logint->li_dstaddr, - _B_TRUE); - } - /* * If no targets are currently known for this phyint * we need to call init_router_targets. Since @@ -806,15 +742,16 @@ select_test_ifs(void) } /* - * Check the interface list for any interfaces that are marked - * PI_FAILED but no longer enabled to send probes, and call - * phyint_check_for_repair() to see if the link now indicates that the - * interface should be repaired. Also see the state diagram in + * Scan the interface list for any interfaces that are PI_FAILED or + * PI_NOTARGETS but no longer enabled to send probes, and call + * phyint_check_for_repair() to see if the link state indicates that + * the interface should be repaired. Also see the state diagram in * mpd_probe.c. 
*/ for (pi = phyints; pi != NULL; pi = pi->pi_next) { - if (pi->pi_state == PI_FAILED && - !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { + if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) && + (pi->pi_state == PI_FAILED || + pi->pi_state == PI_NOTARGETS)) { phyint_check_for_repair(pi); } } @@ -875,15 +812,14 @@ check_testconfig(void) pi->pi_v6->pii_probe_logint->li_dupaddr) li = pi->pi_v6->pii_probe_logint; - if (li != NULL) { - if (!pi->pi_duptaddrmsg_printed) { - (void) pr_addr(li->li_phyint_inst->pii_af, - li->li_addr, abuf, sizeof (abuf)); - logerr("Test address %s is not unique in " - "group; disabling probe-based failure " - "detection on %s\n", abuf, pi->pi_name); - pi->pi_duptaddrmsg_printed = 1; - } + if (li != NULL && li->li_dupaddr) { + if (pi->pi_duptaddrmsg_printed) + continue; + logerr("Test address %s is not unique in group; " + "disabling probe-based failure detection on %s\n", + pr_addr(li->li_phyint_inst->pii_af, + li->li_addr, abuf, sizeof (abuf)), pi->pi_name); + pi->pi_duptaddrmsg_printed = 1; continue; } @@ -915,10 +851,10 @@ check_config(void) boolean_t v6_in_group; /* - * All phyints of a group must be homogenous to ensure that - * failover or failback can be done. If any phyint in a group - * has IPv4 plumbed, check that all phyints have IPv4 plumbed. - * Do a similar check for IPv6. + * All phyints of a group must be homogeneous to ensure that they can + * take over for one another. If any phyint in a group has IPv4 + * plumbed, check that all phyints have IPv4 plumbed. Do a similar + * check for IPv6. 
*/ for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { if (pg == phyint_anongroup) @@ -949,9 +885,9 @@ check_config(void) if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) { if (!pi->pi_cfgmsg_printed) { - logerr("NIC %s of group %s is" - " not plumbed for IPv4 and may" - " affect failover capability\n", + logerr("IP interface %s in group %s is" + " not plumbed for IPv4, affecting" + " IPv4 connectivity\n", pi->pi_name, pi->pi_group->pg_name); pi->pi_cfgmsg_printed = 1; @@ -959,9 +895,9 @@ check_config(void) } else if (v6_in_group == _B_TRUE && pi->pi_v6 == NULL) { if (!pi->pi_cfgmsg_printed) { - logerr("NIC %s of group %s is" - " not plumbed for IPv6 and may" - " affect failover capability\n", + logerr("IP interface %s in group %s is" + " not plumbed for IPv6, affecting" + " IPv6 connectivity\n", pi->pi_name, pi->pi_group->pg_name); pi->pi_cfgmsg_printed = 1; @@ -974,10 +910,10 @@ check_config(void) * error recovery message */ if (pi->pi_cfgmsg_printed) { - logerr("NIC %s is now consistent with " - "group %s and failover capability " - "is restored\n", pi->pi_name, - pi->pi_group->pg_name); + logerr("IP interface %s is now" + " consistent with group %s " + " and connectivity is restored\n", + pi->pi_name, pi->pi_group->pg_name); pi->pi_cfgmsg_printed = 0; } } @@ -1117,8 +1053,8 @@ run_timeouts(void) static int eventpipe_read = -1; /* Used for synchronous signal delivery */ static int eventpipe_write = -1; -static boolean_t cleanup_started = _B_FALSE; - /* Don't write to eventpipe if in cleanup */ +boolean_t cleanup_started = _B_FALSE; /* true if we're going away */ + /* * Ensure that signals are processed synchronously with the rest of * the code by just writing a one character signal number on the pipe. 
@@ -1228,7 +1164,7 @@ in_signal(int fd) "Number of probes sent %lld\n" "Number of probe acks received %lld\n" "Number of probes/acks lost %lld\n" - "Number of valid unacknowled probes %lld\n" + "Number of valid unacknowledged probes %lld\n" "Number of ambiguous probe acks received %lld\n", AF_STR(pii->pii_af), pii->pii_name, sent, acked, lost, unacked, unknown); @@ -1321,12 +1257,20 @@ setup_rtsock(int af) { int s; int flags; + int aware = RTAW_UNDER_IPMP; s = socket(PF_ROUTE, SOCK_RAW, af); if (s == -1) { logperror("setup_rtsock: socket PF_ROUTE"); exit(1); } + + if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) { + logperror("setup_rtsock: setsockopt RT_AWARE"); + (void) close(s); + exit(1); + } + if ((flags = fcntl(s, F_GETFL, 0)) < 0) { logperror("setup_rtsock: fcntl F_GETFL"); (void) close(s); @@ -1347,8 +1291,7 @@ setup_rtsock(int af) /* * Process an RTM_IFINFO message received on a routing socket. * The return value indicates whether a full interface scan is required. - * Link up/down notifications from the NICs are reflected in the - * IFF_RUNNING flag. + * Link up/down notifications are reflected in the IFF_RUNNING flag. * If just the state of the IFF_RUNNING interface flag has changed, a * a full interface scan isn't required. */ @@ -1400,7 +1343,7 @@ process_rtm_ifinfo(if_msghdr_t *ifm, int type) /* * We want to try and avoid doing a full interface scan for - * link state notifications from the NICs, as indicated + * link state notifications from the datalink layer, as indicated * by the state of the IFF_RUNNING flag. If just the * IFF_RUNNING flag has changed state, the link state changes * are processed without a full scan. @@ -1441,25 +1384,7 @@ process_rtm_ifinfo(if_msghdr_t *ifm, int type) * types. */ if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) - phyint_newtype(pi); - - /* - * If IFF_INACTIVE has been set, then no data addresses should be - * hosted on the interface. 
If IFF_INACTIVE has been cleared, then - * move previously failed-over addresses back to it, provided it is - * not failed. For details, see the state diagram in mpd_probe.c. - */ - if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) { - if (pii->pii_flags & IFF_INACTIVE) { - if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) - (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); - } else { - if (pi->pi_state == PI_RUNNING && !pi->pi_full) { - pi->pi_empty = 0; - (void) try_failback(pi); - } - } - } + phyint_changed(pi); /* Has just the IFF_RUNNING flag changed state ? */ if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) { @@ -1620,22 +1545,24 @@ update_router_list(int fd) t_scalar_t prim; tor = (struct T_optmgmt_req *)&buf; - tor->PRIM_type = T_SVR4_OPTMGMT_REQ; tor->OPT_offset = sizeof (struct T_optmgmt_req); tor->OPT_length = sizeof (struct opthdr); tor->MGMT_flags = T_CURRENT; + /* + * Note: we use the special level value below so that IP will return + * us information concerning IRE_MARK_TESTHIDDEN routes. 
+ */ req = (struct opthdr *)&tor[1]; - req->level = MIB2_IP; /* any MIB2_xxx value ok here */ + req->level = EXPER_IP_AND_TESTHIDDEN; req->name = 0; req->len = 0; ctlbuf.buf = (char *)&buf; ctlbuf.len = tor->OPT_length + tor->OPT_offset; ctlbuf.maxlen = sizeof (buf); - flags = 0; - if (putmsg(fd, &ctlbuf, NULL, flags) == -1) { + if (putmsg(fd, &ctlbuf, NULL, 0) == -1) { logperror("update_router_list: putmsg(ctl)"); return (_B_FALSE); } @@ -1689,7 +1616,8 @@ update_router_list(int fd) case T_OPTMGMT_ACK: toa = &buf.uprim.optmgmt_ack; optp = (struct opthdr *)&toa[1]; - if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) { + if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) + + sizeof (struct opthdr))) { logerr("update_router_list: ctlbuf.len %d\n", ctlbuf.len); return (_B_FALSE); @@ -1707,7 +1635,7 @@ update_router_list(int fd) return (_B_FALSE); } - /* Process the T_OPGMGMT_ACK below */ + /* Process the T_OPTMGMT_ACK below */ assert(prim == T_OPTMGMT_ACK); switch (status) { @@ -1717,9 +1645,8 @@ update_router_list(int fd) * message. If this is the last message i.e EOD, * return, else process the next T_OPTMGMT_ACK msg. */ - if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) + - sizeof (struct opthdr)) && optp->len == 0 && - optp->name == 0 && optp->level == 0) { + if (optp->len == 0 && optp->name == 0 && + optp->level == 0) { /* * This is the EOD message. 
Return */ @@ -1747,17 +1674,14 @@ update_router_list(int fd) databuf.len = 0; flags = 0; for (;;) { - status = getmsg(fd, NULL, &databuf, &flags); - if (status >= 0) { + if (getmsg(fd, NULL, &databuf, &flags) >= 0) break; - } else if (errno == EINTR) { + if (errno == EINTR) continue; - } else { - logperror("update_router_list:" - " getmsg(data)"); - free(databuf.buf); - return (_B_FALSE); - } + + logperror("update_router_list: getmsg(data)"); + free(databuf.buf); + return (_B_FALSE); } if (optp->level == MIB2_IP && @@ -1777,18 +1701,35 @@ update_router_list(int fd) /* NOTREACHED */ } + +/* + * Convert octet `octp' to a phyint name and store in `ifname' + */ +static void +oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize) +{ + char *cp; + size_t len = MIN(octp->o_length, ifsize - 1); + + (void) strncpy(ifname, octp->o_bytes, len); + ifname[len] = '\0'; + + if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL) + *cp = '\0'; +} + /* - * Examine the IPv4 routing table, for default routers. For each default - * router, populate the list of targets of each phyint that is on the same - * link as the default router + * Examine the IPv4 routing table `buf' for possible targets. For each + * possible target, if it's on the same subnet as an interface route, pass + * it to router_add_common() for further consideration. */ static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) { - mib2_ipRouteEntry_t *rp; - mib2_ipRouteEntry_t *rp1; - struct in_addr nexthop_v4; - mib2_ipRouteEntry_t *endp; + char ifname[LIFNAMSIZ]; + mib2_ipRouteEntry_t *rp, *rp1, *endp; + struct in_addr nexthop_v4; + struct in6_addr nexthop; if (len == 0) return; @@ -1797,75 +1738,40 @@ ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) endp = buf + (len / sizeof (mib2_ipRouteEntry_t)); /* - * Loop thru the routing table entries. Process any IRE_DEFAULT, - * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. - * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
- * This is a potential target for probing, which we try to add - * to the list of probe targets. + * Scan the routing table entries for any IRE_OFFSUBNET entries, and + * cross-reference them with the interface routes to determine if + * they're possible probe targets. */ for (rp = buf; rp < endp; rp++) { if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) continue; - /* Get the nexthop address. */ + /* Get the nexthop address. */ nexthop_v4.s_addr = rp->ipRouteNextHop; /* - * Get the nexthop address. Then determine the outgoing - * interface, by examining all interface IREs, and picking the - * match. We don't look at the interface specified in the route - * because we need to add the router target on all matching - * interfaces anyway; the goal is to avoid falling back to - * multicast when some interfaces are in the same subnet but - * not in the same group. + * Rescan the routing table looking for interface routes that + * are on the same subnet, and try to add them. If they're + * not relevant (e.g., the interface route isn't part of an + * IPMP group), router_add_common() will discard them. */ for (rp1 = buf; rp1 < endp; rp1++) { - if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) { + if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) || + rp1->ipRouteIfIndex.o_length == 0) continue; - } - /* - * Determine the interface IRE that matches the nexthop. - * i.e.
(IRE addr & IRE mask) == (nexthop & IRE mask) - */ - if ((rp1->ipRouteDest & rp1->ipRouteMask) == - (nexthop_v4.s_addr & rp1->ipRouteMask)) { - /* - * We found the interface ire - */ - router_add_v4(rp1, nexthop_v4); - } + if ((rp1->ipRouteDest & rp1->ipRouteMask) != + (nexthop_v4.s_addr & rp1->ipRouteMask)) + continue; + + oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ); + IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); + router_add_common(AF_INET, ifname, nexthop); } } } void -router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4) -{ - char *cp; - char ifname[LIFNAMSIZ + 1]; - struct in6_addr nexthop; - int len; - - if (debug & D_TARGET) - logdebug("router_add_v4()\n"); - - len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1); - (void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len); - ifname[len] = '\0'; - - if (ifname[0] == '\0') - return; - - cp = strchr(ifname, IF_SEPARATOR); - if (cp != NULL) - *cp = '\0'; - - IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); - router_add_common(AF_INET, ifname, nexthop); -} - -void router_add_common(int af, char *ifname, struct in6_addr nexthop) { struct phyint_instance *pii; @@ -1906,16 +1812,17 @@ router_add_common(int af, char *ifname, struct in6_addr nexthop) } /* - * Examine the IPv6 routing table, for default routers. For each default - * router, populate the list of targets of each phyint that is on the same - * link as the default router + * Examine the IPv6 routing table `buf' for possible link-local targets, and + * pass any contenders to router_add_common() for further consideration. 
*/ static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) { - mib2_ipv6RouteEntry_t *rp; - mib2_ipv6RouteEntry_t *endp; - struct in6_addr nexthop_v6; + struct lifreq lifr; + char ifname[LIFNAMSIZ]; + char grname[LIFGRNAMSIZ]; + mib2_ipv6RouteEntry_t *rp, *rp1, *endp; + struct in6_addr nexthop_v6; if (debug & D_TARGET) logdebug("ire_process_v6(len %d)\n", len); @@ -1927,62 +1834,51 @@ ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t)); /* - * Loop thru the routing table entries. Process any IRE_DEFAULT, - * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. - * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. - * This is a potential target for probing, which we try to add - * to the list of probe targets. + * Scan the routing table entries for any IRE_OFFSUBNET entries, and + * cross-reference them with the interface routes to determine if + * they're possible probe targets. */ for (rp = buf; rp < endp; rp++) { - if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET)) + if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) || + !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop)) continue; - /* - * We have the outgoing interface in ipv6RouteIfIndex - * if ipv6RouteIfindex.o_length is non-zero. The outgoing - * interface must be present for link-local addresses. Since - * we use only link-local addreses for probing, we don't - * consider the case when the outgoing interface is not - * known and we need to scan interface ires - */ + /* Get the nexthop address. */ nexthop_v6 = rp->ipv6RouteNextHop; - if (rp->ipv6RouteIfIndex.o_length != 0) { - /* - * We already have the outgoing interface - * in ipv6RouteIfIndex. 
- */ - router_add_v6(rp, nexthop_v6); - } - } -} - -void -router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6) -{ - char ifname[LIFNAMSIZ + 1]; - char *cp; - int len; - - if (debug & D_TARGET) - logdebug("router_add_v6()\n"); - - len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1); - (void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len); - ifname[len] = '\0'; + /* + * The interface name should always exist for link-locals; + * we use it to map this entry to an IPMP group name. + */ + if (rp->ipv6RouteIfIndex.o_length == 0) + continue; - if (ifname[0] == '\0') - return; + oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ); + if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 || + strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) { + continue; + } - cp = strchr(ifname, IF_SEPARATOR); - if (cp != NULL) - *cp = '\0'; + /* + * Rescan the list of routes for interface routes, and add the + * above target to any interfaces in the same IPMP group. + */ + for (rp1 = buf; rp1 < endp; rp1++) { + if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) || + rp1->ipv6RouteIfIndex.o_length == 0) { + continue; + } + oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ); + (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); - router_add_common(AF_INET6, ifname, nexthop_v6); + if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 && + strcmp(lifr.lifr_groupname, grname) == 0) { + router_add_common(AF_INET6, ifname, nexthop_v6); + } + } + } } - - /* * Build a list of target routers, by scanning the routing tables. * It is assumed that interface routes exist, to reach the routers. @@ -2001,11 +1897,9 @@ init_router_targets(void) for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { pi = pii->pii_phyint; /* - * Exclude ptp and host targets. Set tg_in_use to false, - * only for router targets. + * Set tg_in_use to false only for router targets. 
*/ - if (!pii->pii_targets_are_routers || - (pi->pi_flags & IFF_POINTOPOINT)) + if (!pii->pii_targets_are_routers) continue; for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) @@ -2026,15 +1920,21 @@ init_router_targets(void) } for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { - if (!pii->pii_targets_are_routers || - (pi->pi_flags & IFF_POINTOPOINT)) + pi = pii->pii_phyint; + if (!pii->pii_targets_are_routers) continue; for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { next_tg = tg->tg_next; - if (!tg->tg_in_use) { + /* + * If the group has failed, it's likely the route was + * removed by an application affected by that failure. + * In that case, we keep the target so that we can + * reliably repair, at which point we'll refresh the + * target list again. + */ + if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group)) target_delete(tg); - } } } } @@ -2140,7 +2040,7 @@ getdefault(char *name) * Command line options below */ boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ -boolean_t track_all_phyints = _B_FALSE; /* option to track all NICs */ +boolean_t track_all_phyints = _B_FALSE; /* track all IP interfaces */ static boolean_t adopt = _B_FALSE; static boolean_t foreground = _B_FALSE; @@ -2149,6 +2049,7 @@ main(int argc, char *argv[]) { int i; int c; + struct phyint *pi; struct phyint_instance *pii; char *value; @@ -2173,14 +2074,15 @@ main(int argc, char *argv[]) if (user_failure_detection_time <= 0) { user_failure_detection_time = FAILURE_DETECTION_TIME; logerr("Invalid failure detection time %s, assuming " - "default %d\n", value, user_failure_detection_time); + "default of %d ms\n", value, + user_failure_detection_time); } else if (user_failure_detection_time < MIN_FAILURE_DETECTION_TIME) { user_failure_detection_time = MIN_FAILURE_DETECTION_TIME; logerr("Too small failure detection time of %s, " - "assuming minimum %d\n", value, + "assuming minimum of %d ms\n", value, user_failure_detection_time); } free(value); @@ 
-2211,9 +2113,9 @@ main(int argc, char *argv[]) */ value = getdefault("FAILBACK"); if (value != NULL) { - if (strncasecmp(value, "yes", 3) == 0) + if (strcasecmp(value, "yes") == 0) failback_enabled = _B_TRUE; - else if (strncasecmp(value, "no", 2) == 0) + else if (strcasecmp(value, "no") == 0) failback_enabled = _B_FALSE; else logerr("Invalid value for FAILBACK %s\n", value); @@ -2229,9 +2131,9 @@ main(int argc, char *argv[]) */ value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); if (value != NULL) { - if (strncasecmp(value, "yes", 3) == 0) + if (strcasecmp(value, "yes") == 0) track_all_phyints = _B_FALSE; - else if (strncasecmp(value, "no", 2) == 0) + else if (strcasecmp(value, "no") == 0) track_all_phyints = _B_TRUE; else logerr("Invalid value for " @@ -2340,12 +2242,6 @@ main(int argc, char *argv[]) initifs(); - /* Inform kernel whether failback is enabled or disabled */ - if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) { - logperror("main: ioctl (SIOCSIPMPFAILBACK)"); - exit(1); - } - /* * If we're operating in "adopt" mode and no interfaces need to be * tracked, shut down (ifconfig(1M) will restart us on demand if @@ -2379,6 +2275,7 @@ main(int argc, char *argv[]) process_rtsock(rtsock_v4, rtsock_v6); break; } + for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { if (pollfds[i].fd == pii->pii_probe_sock) { @@ -2389,15 +2286,21 @@ main(int argc, char *argv[]) break; } } + + for (pi = phyints; pi != NULL; pi = pi->pi_next) { + if (pi->pi_notes != 0 && + pollfds[i].fd == dlpi_fd(pi->pi_dh)) { + (void) dlpi_recv(pi->pi_dh, NULL, NULL, + NULL, NULL, 0, NULL); + break; + } + } + if (pollfds[i].fd == lsock_v4) loopback_cmd(lsock_v4, AF_INET); else if (pollfds[i].fd == lsock_v6) loopback_cmd(lsock_v6, AF_INET6); } - if (full_scan_required) { - initifs(); - full_scan_required = _B_FALSE; - } } /* NOTREACHED */ return (EXIT_SUCCESS); @@ -2481,29 +2384,23 @@ static struct { { "MI_PING", sizeof (uint32_t) }, { "MI_OFFLINE", sizeof 
(mi_offline_t) }, { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) }, - { "MI_SETOINDEX", sizeof (mi_setoindex_t) }, { "MI_QUERY", sizeof (mi_query_t) } }; /* - * Commands received over the loopback interface come here. Currently - * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP - * module. ifconfig only makes a connection, and closes it to check if - * in.mpathd is running. - * if_mpadm sends commands in the format specified by the mpathd_interface - * structure. + * Commands received over the loopback interface come here (via libipmp). */ static void loopback_cmd(int sock, int family) { int newfd; ssize_t len; + boolean_t is_priv = _B_FALSE; struct sockaddr_storage peer; struct sockaddr_in *peer_sin; struct sockaddr_in6 *peer_sin6; socklen_t peerlen; union mi_commands mpi; - struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; char abuf[INET6_ADDRSTRLEN]; uint_t cmd; int retval; @@ -2528,10 +2425,11 @@ loopback_cmd(int sock, int family) return; } peer_sin = (struct sockaddr_in *)&peer; - if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) || - (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) { - (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, - abuf, sizeof (abuf)); + is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED; + (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, + abuf, sizeof (abuf)); + + if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) { logerr("Attempt to connect from addr %s port %d\n", abuf, ntohs(peer_sin->sin_port)); (void) close(newfd); @@ -2551,11 +2449,10 @@ loopback_cmd(int sock, int family) * talking to us. 
*/ peer_sin6 = (struct sockaddr_in6 *)&peer; - if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) || - (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr, - &loopback_addr))) { - (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, - sizeof (abuf)); + is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED; + (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, + sizeof (abuf)); + if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) { logerr("Attempt to connect from addr %s port %d\n", abuf, ntohs(peer_sin6->sin6_port)); (void) close(newfd); @@ -2575,15 +2472,6 @@ loopback_cmd(int sock, int family) len = read(newfd, &mpi, sizeof (mpi)); /* - * ifconfig does not send any data. Just tests to see if mpathd - * is already running. - */ - if (len <= 0) { - (void) close(newfd); - return; - } - - /* * In theory, we can receive any sized message for a stream socket, * but we don't expect that to happen for a small message over a * loopback connection. @@ -2591,6 +2479,8 @@ loopback_cmd(int sock, int family) if (len < sizeof (uint32_t)) { logerr("loopback_cmd: bad command format or read returns " "partial data %d\n", len); + (void) close(newfd); + return; } cmd = mpi.mi_command; @@ -2600,6 +2490,16 @@ loopback_cmd(int sock, int family) return; } + /* + * Only MI_PING and MI_QUERY can come from unprivileged sources. + */ + if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) { + logerr("Unprivileged request from %s for privileged " + "command %s\n", abuf, commands[cmd].name); + (void) close(newfd); + return; + } + if (len < commands[cmd].size) { logerr("loopback_cmd: short %s command (expected %d, got %d)\n", commands[cmd].name, commands[cmd].size, len); @@ -2615,179 +2515,46 @@ loopback_cmd(int sock, int family) (void) close(newfd); } -extern int global_errno; /* set by failover() or failback() */ - /* - * Process the offline, undo offline and set original index commands, - * received from if_mpadm(1M) + * Process the commands received via libipmp. 
*/ static unsigned int process_cmd(int newfd, union mi_commands *mpi) { - uint_t nif = 0; - uint32_t cmd; struct phyint *pi; - struct phyint *pi2; - struct phyint_group *pg; - boolean_t success; - int error; struct mi_offline *mio; struct mi_undo_offline *miu; - struct lifreq lifr; - int ifsock; - struct mi_setoindex *mis; + unsigned int retval; - cmd = mpi->mi_command; + switch (mpi->mi_command) { + case MI_PING: + return (send_result(newfd, IPMP_SUCCESS, 0)); - switch (cmd) { case MI_OFFLINE: mio = &mpi->mi_ocmd; - /* - * Lookup the interface that needs to be offlined. - * If it does not exist, return a suitable error. - */ + pi = phyint_lookup(mio->mio_ifname); if (pi == NULL) - return (send_result(newfd, IPMP_FAILURE, EINVAL)); - - /* - * Verify that the minimum redundancy requirements are met. - * The multipathing group must have at least the specified - * number of functional interfaces after offlining the - * requested interface. Otherwise return a suitable error. - */ - pg = pi->pi_group; - nif = 0; - if (pg != phyint_anongroup) { - for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL; - pi2 = pi2->pi_pgnext) { - if ((pi2->pi_state == PI_RUNNING) || - (pg->pg_groupfailed && - !(pi2->pi_flags & IFF_OFFLINE))) - nif++; - } - } - if (nif < mio->mio_min_redundancy) - return (send_result(newfd, IPMP_EMINRED, 0)); + return (send_result(newfd, IPMP_EUNKIF, 0)); - /* - * The order of operation is to set IFF_OFFLINE, followed by - * failover. Setting IFF_OFFLINE ensures that no new ipif's - * can be created. Subsequent failover moves everything on - * the OFFLINE interface to some other functional interface. - */ - success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE); - if (success) { - if (!pi->pi_empty) { - error = try_failover(pi, FAILOVER_NORMAL); - if (error != 0) { - if (!change_lif_flags(pi, IFF_OFFLINE, - _B_FALSE)) { - logerr("process_cmd: couldn't" - " clear OFFLINE flag on" - " %s\n", pi->pi_name); - /* - * Offline interfaces should - * not be probed. 
- */ - stop_probing(pi); - } - return (send_result(newfd, error, - global_errno)); - } - } - } else { + retval = phyint_offline(pi, mio->mio_min_redundancy); + if (retval == IPMP_FAILURE) return (send_result(newfd, IPMP_FAILURE, errno)); - } - /* - * The interface is now Offline, so stop probing it. - * Note that if_mpadm(1M) will down the test addresses, - * after receiving a success reply from us. The routing - * socket message will then make us close the socket used - * for sending probes. But it is more logical that an - * offlined interface must not be probed, even if it has - * test addresses. - */ - stop_probing(pi); - return (send_result(newfd, IPMP_SUCCESS, 0)); + return (send_result(newfd, retval, 0)); case MI_UNDO_OFFLINE: miu = &mpi->mi_ucmd; - /* - * Undo the offline command. As usual lookup the interface. - * Send an error if it does not exist or is not offline. - */ - pi = phyint_lookup(miu->miu_ifname); - if (pi == NULL || pi->pi_state != PI_OFFLINE) - return (send_result(newfd, IPMP_FAILURE, EINVAL)); - - /* - * Reset the state of the interface based on the current link - * state; if this phyint subsequently acquires a test address, - * the state will be updated later as a result of the probes. - */ - if (LINK_UP(pi)) - phyint_chstate(pi, PI_RUNNING); - else - phyint_chstate(pi, PI_FAILED); - - if (pi->pi_state == PI_RUNNING) { - /* - * Note that the success of MI_UNDO_OFFLINE is not - * contingent on actually failing back; in the odd - * case where we cannot do it here, we will try again - * in initifs() since pi->pi_full will still be zero. - */ - if (do_failback(pi) != IPMP_SUCCESS) { - logdebug("process_cmd: cannot failback from " - "%s during MI_UNDO_OFFLINE\n", pi->pi_name); - } - } - - /* - * Clear the IFF_OFFLINE flag. We have to do this last - * because do_failback() relies on it being set to decide - * when to display messages. 
- */ - (void) change_lif_flags(pi, IFF_OFFLINE, _B_FALSE); - - /* - * Give the requestor time to configure test addresses - * before complaining that they're missing. - */ - pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME; - - return (send_result(newfd, IPMP_SUCCESS, 0)); - - case MI_SETOINDEX: - mis = &mpi->mi_scmd; - /* Get the socket for doing ioctls */ - ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6; - - /* - * Get index of new original interface. - * The index is returned in lifr.lifr_index. - */ - (void) strlcpy(lifr.lifr_name, mis->mis_new_pifname, - sizeof (lifr.lifr_name)); + pi = phyint_lookup(miu->miu_ifname); + if (pi == NULL) + return (send_result(newfd, IPMP_EUNKIF, 0)); - if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) + retval = phyint_undo_offline(pi); + if (retval == IPMP_FAILURE) return (send_result(newfd, IPMP_FAILURE, errno)); - /* - * Set new original interface index. - * The new index was put into lifr.lifr_index by the - * SIOCGLIFINDEX ioctl. 
- */ - (void) strlcpy(lifr.lifr_name, mis->mis_lifname, - sizeof (lifr.lifr_name)); - - if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0) - return (send_result(newfd, IPMP_FAILURE, errno)); - - return (send_result(newfd, IPMP_SUCCESS, 0)); + return (send_result(newfd, retval, 0)); case MI_QUERY: return (process_query(newfd, &mpi->mi_qcmd)); @@ -2806,6 +2573,8 @@ process_cmd(int newfd, union mi_commands *mpi) static unsigned int process_query(int fd, mi_query_t *miq) { + ipmp_addrinfo_t *adinfop; + ipmp_addrinfolist_t *adlp; ipmp_groupinfo_t *grinfop; ipmp_groupinfolist_t *grlp; ipmp_grouplist_t *grlistp; @@ -2815,6 +2584,19 @@ process_query(int fd, mi_query_t *miq) unsigned int retval; switch (miq->miq_inforeq) { + case IPMP_ADDRINFO: + retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr, + &adinfop); + if (retval != IPMP_SUCCESS) + return (send_result(fd, retval, errno)); + + retval = send_result(fd, IPMP_SUCCESS, 0); + if (retval == IPMP_SUCCESS) + retval = send_addrinfo(fd, adinfop); + + ipmp_freeaddrinfo(adinfop); + return (retval); + case IPMP_GROUPLIST: retval = getgrouplist(&grlistp); if (retval != IPMP_SUCCESS) @@ -2829,7 +2611,7 @@ process_query(int fd, mi_query_t *miq) case IPMP_GROUPINFO: miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; - retval = getgroupinfo(miq->miq_ifname, &grinfop); + retval = getgroupinfo(miq->miq_grname, &grinfop); if (retval != IPMP_SUCCESS) return (send_result(fd, retval, errno)); @@ -2854,6 +2636,11 @@ process_query(int fd, mi_query_t *miq) return (retval); case IPMP_SNAP: + /* + * Before taking the snapshot, sync with the kernel. 
+ */ + initifs(); + retval = getsnap(&snap); if (retval != IPMP_SUCCESS) return (send_result(fd, retval, errno)); @@ -2883,6 +2670,13 @@ process_query(int fd, mi_query_t *miq) if (retval != IPMP_SUCCESS) goto out; } + + adlp = snap->sn_adinfolistp; + for (; adlp != NULL; adlp = adlp->adl_next) { + retval = send_addrinfo(fd, adlp->adl_adinfop); + if (retval != IPMP_SUCCESS) + goto out; + } out: ipmp_snap_free(snap); return (retval); @@ -2902,14 +2696,20 @@ static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) { ipmp_iflist_t *iflistp = grinfop->gr_iflistp; + ipmp_addrlist_t *adlistp = grinfop->gr_adlistp; unsigned int retval; retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop); if (retval != IPMP_SUCCESS) return (retval); - return (ipmp_writetlv(fd, IPMP_IFLIST, - IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp)); + retval = ipmp_writetlv(fd, IPMP_IFLIST, + IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp); + if (retval != IPMP_SUCCESS) + return (retval); + + return (ipmp_writetlv(fd, IPMP_ADDRLIST, + IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp)); } /* @@ -2919,7 +2719,31 @@ send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) { - return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop)); + ipmp_addrlist_t *adlist4p = ifinfop->if_targinfo4.it_targlistp; + ipmp_addrlist_t *adlist6p = ifinfop->if_targinfo6.it_targlistp; + unsigned int retval; + + retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop); + if (retval != IPMP_SUCCESS) + return (retval); + + retval = ipmp_writetlv(fd, IPMP_ADDRLIST, + IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p); + if (retval != IPMP_SUCCESS) + return (retval); + + return (ipmp_writetlv(fd, IPMP_ADDRLIST, + IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p)); +} + +/* + * Send the address information pointed to by `adinfop' on file descriptor + * `fd'. Returns an IPMP error code. 
+ */ +static unsigned int +send_addrinfo(int fd, ipmp_addrinfo_t *adinfop) +{ + return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop)); } /* @@ -3109,3 +2933,32 @@ close_probe_socket(struct phyint_instance *pii, boolean_t polled) pii->pii_probe_sock = -1; pii->pii_basetime_inited = 0; } + +boolean_t +addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags, + struct sockaddr_storage *ssp) +{ + addrlist_t *addrp; + + if ((addrp = malloc(sizeof (addrlist_t))) == NULL) + return (_B_FALSE); + + (void) strlcpy(addrp->al_name, name, LIFNAMSIZ); + addrp->al_flags = flags; + addrp->al_addr = *ssp; + addrp->al_next = *addrsp; + *addrsp = addrp; + return (_B_TRUE); +} + +void +addrlist_free(addrlist_t **addrsp) +{ + addrlist_t *addrp, *next_addrp; + + for (addrp = *addrsp; addrp != NULL; addrp = next_addrp) { + next_addrp = addrp->al_next; + free(addrp); + } + *addrsp = NULL; +} diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c index a2ff76a983..cf327fbaff 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -20,8 +20,6 @@ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mpd_defs.h" #include "mpd_tables.h" @@ -45,7 +43,7 @@ struct pr_icmp uint16_t pr_icmp_cksum; /* checksum field */ uint16_t pr_icmp_id; /* Identification */ uint16_t pr_icmp_seq; /* sequence number */ - uint32_t pr_icmp_timestamp; /* Time stamp */ + uint64_t pr_icmp_timestamp; /* Time stamp (in ns) */ uint32_t pr_icmp_mtype; /* Message type */ }; @@ -58,11 +56,12 @@ static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } }; static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */ -static void *find_ancillary(struct msghdr *msg, int cmsg_type); -static void pi_set_crtt(struct target *tg, int m, +static void *find_ancillary(struct msghdr *msg, int cmsg_level, + int cmsg_type); +static void pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni); static void incoming_echo_reply(struct phyint_instance *pii, - struct pr_icmp *reply, struct in6_addr fromaddr); + struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp); static void incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, struct in6_addr fromaddr); static void incoming_mcast_reply(struct phyint_instance *pii, @@ -78,13 +77,11 @@ static void probe_success_info(struct phyint_instance *pii, struct target *cur_tg, struct probe_success_count *psinfo); static boolean_t phyint_repaired(struct phyint *pi); -static int failover(struct phyint *from, struct phyint *to); -static int failback(struct phyint *from, struct phyint *to); -static struct phyint *get_failover_dst(struct phyint *pi, int failover_type); - static boolean_t highest_ack_tg(uint16_t seq, struct target *tg); static int in_cksum(ushort_t *addr, int len); static void reset_snxt_basetimes(void); +static int ns2ms(int64_t ns); +static int64_t tv2ns(struct timeval *); /* * CRTT - Conservative Round Trip Time Estimate @@ -104,7 +101,7 @@ static void reset_snxt_basetimes(void); * Phyint state diagram * * The state of a phyint that is 
capable of being probed, is completely - * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>. + * specified by the 3-tuple <pi_state, pg_state, I>. * * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state * of the link (according to the driver). If the phyint is also configured @@ -117,8 +114,8 @@ static void reset_snxt_basetimes(void); * state, which indicates that the link is apparently functional but that * in.mpathd is unable to send probes to verify functionality (in this case, * in.mpathd makes the optimistic assumption that the interface is working - * correctly and thus does not perform a failover, but reports the interface - * as IPMP_IF_UNKNOWN through the async events and query interfaces). + * correctly and thus does not mark the interface FAILED, but reports it as + * IPMP_IF_UNKNOWN through the async events and query interfaces). * * At any point, a phyint may be administratively marked offline via if_mpadm. * In this case, the interface always transitions to PI_OFFLINE, regardless @@ -131,8 +128,11 @@ static void reset_snxt_basetimes(void); * PI_RUNNING: The failure detection logic says the phyint is good. * PI_FAILED: The failure detection logic says the phyint has failed. * - * pg_groupfailed - Group failure, all interfaces in the group have failed. - * The pi_state may be either PI_FAILED or PI_NOTARGETS. + * pg_state - PG_OK, PG_DEGRADED, or PG_FAILED. + * PG_OK: All interfaces in the group are OK. + * PG_DEGRADED: Some interfaces in the group are unusable. + * PG_FAILED: All interfaces in the group are unusable. + * * In the case of router targets, we assume that the current list of * targets obtained from the routing table, is still valid, so the * phyint stat is PI_FAILED. In the case of host targets, we delete the @@ -140,144 +140,46 @@ static void reset_snxt_basetimes(void); * target list. So the phyints are in the PI_NOTARGETS state. 
* * I - value of (pi_flags & IFF_INACTIVE) - * IFF_INACTIVE: No failovers have been done to this phyint, from - * other phyints. This phyint is inactive. Phyint can be a Standby. - * When failback has been disabled (FAILOVER=no configured), - * phyint can also be a non-STANDBY. In this case IFF_INACTIVE - * is set when phyint subsequently recovers after a failure. - * - * pi_empty - * This phyint has failed over successfully to another phyint, and - * this phyint is currently "empty". It does not host any addresses or - * multicast membership etc. This is the state of a phyint after a - * failover from the phyint has completed successfully and no subsequent - * 'failover to' or 'failback to' has occurred on the phyint. - * IP guarantees that no new logicals will be hosted nor any multicast - * joins permitted on the phyint, since the phyint is either failed or - * inactive. pi_empty is set implies the phyint is either failed or - * inactive. - * - * pi_full - * The phyint hosts all of its own addresses that it "owns". If the - * phyint was previously failed or inactive, failbacks to the phyint - * has completed successfully. i.e. No more failbacks to this phyint - * can produce any change in system state whatsoever. - * - * Not all 32 possible combinations of the above 5-tuple are possible. - * Furthermore some of the above combinations are transient. They may occur - * only because the failover or failback did not complete successfully. The - * failover/failback will be retried and eventually a stable state will be - * reached. - * - * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd. - * The following are the state machines. 
'from' and 'to' are the src and - * dst of the failover/failback, below - * - * pi_empty state machine - * --------------------------------------------------------------------------- - * Event State -> New State - * --------------------------------------------------------------------------- - * successful completion from.pi_empty = 0 -> from.pi_empty = 1 - * of failover + * IFF_INACTIVE: This phyint will not send or receive packets. + * Usually, inactive is tied to standby interfaces that are not yet + * needed (e.g., no non-standby interfaces in the group have failed). + * When failback has been disabled (FAILBACK=no configured), phyint can + * also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint + * subsequently recovers after a failure. * - * Initiate failover to.pi_empty = X -> to.pi_empty = 0 + * Not all 9 possible combinations of the above 3-tuple are possible. * - * Initiate failback to.pi_empty = X -> to.pi_empty = 0 - * - * group failure pi_empty = X -> pi_empty = 0 - * --------------------------------------------------------------------------- - * - * pi_full state machine - * --------------------------------------------------------------------------- - * Event State -> New State - * --------------------------------------------------------------------------- - * successful completion to.pi_full = 0 -> to.pi_full = 1 - * of failback from - * each of the other phyints - * - * Initiate failover from.pi_full = X -> from.pi_full = 0 - * - * group failure pi_full = X -> pi_full = 0 - * --------------------------------------------------------------------------- + * I is tracked by IP. pi_state is tracked by mpathd. 
* * pi_state state machine * --------------------------------------------------------------------------- * Event State New State * Action: * --------------------------------------------------------------------------- - * NIC failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) + * IP interface failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) * detection : set IFF_FAILED on this phyint - * : failover from this phyint to another * - * NIC failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) + * IP interface failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) * detection : set IFF_FAILED on this phyint * - * NIC repair (PI_FAILED, I == 0, FAILBACK=yes) + * IP interface repair (PI_FAILED, I == 0, FAILBACK=yes) * detection -> (PI_RUNNING, I == 0) - * : to.pi_empty = 0 * : clear IFF_FAILED on this phyint - * : failback to this phyint if enabled * - * NIC repair (PI_FAILED, I == 0, FAILBACK=no) + * IP interface repair (PI_FAILED, I == 0, FAILBACK=no) * detection -> (PI_RUNNING, I == 1) - * : to.pi_empty = 0 * : clear IFF_FAILED on this phyint * : if failback is disabled set I == 1 * * Group failure (perform on all phyints in the group) * detection PI_RUNNING PI_FAILED * (Router targets) : set IFF_FAILED - * : clear pi_empty and pi_full * * Group failure (perform on all phyints in the group) * detection PI_RUNNING PI_NOTARGETS * (Host targets) : set IFF_FAILED - * : clear pi_empty and pi_full * : delete the target list on all phyints * --------------------------------------------------------------------------- - * - * I state machine - * --------------------------------------------------------------------------- - * Event State Action: - * --------------------------------------------------------------------------- - * Turn on I pi_empty == 0, STANDBY : failover from standby - * - * Turn off I PI_RUNNING, STANDBY : pi_empty = 0 - * pi_full == 0 : failback to this if enabled - * --------------------------------------------------------------------------- - * - * Assertions: 
(Read '==>' as implies) - * - * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED) - * (pi_empty == 1) ==> (pi_full == 0) - * (pi_full == 1) ==> (pi_empty == 0) - * - * Invariants - * - * pg_groupfailed = 0 && - * 1. (I == 1, pi_empty == 0) ==> initiate failover from standby - * 2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint - * 3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint - * - * 1. says that an inactive standby, that is not empty, has to be failed - * over. For a standby to be truly inactive, it should not host any - * addresses. So we move them to some other phyint. Usually we catch the - * turn on of IFF_INACTIVE, and perform this action. However if the failover - * did not complete successfully, then subsequently we have lost the edge - * trigger, and this invariant kicks in and completes the action. - * - * 2. says that any failed phyint that is not empty must be failed over. - * Usually we do the failover when we detect NIC failure. However if the - * failover does not complete successfully, this invariant kicks in and - * completes the failover. We exclude inactive standby which is covered by 1. - * - * 3. says that any running phyint that is not full must be failed back. - * Usually we do the failback when we detect NIC repair. However if the - * failback does not complete successfully, this invariant kicks in and - * completes the failback. Note that we don't want to failback to an inactive - * standby. - * - * The invariants 1 - 3 and the actions are in initifs(). */ struct probes_missed probes_missed; @@ -295,7 +197,7 @@ struct probes_missed probes_missed; * not less than the current CRTT. pii_probes[] stores data * about these probes. These packets consume sequence number space. * - * PROBE_RTT: This type is used to make only rtt measurments. Normally these + * PROBE_RTT: This type is used to make only rtt measurements. Normally these * are not used. 
Under heavy network load, the rtt may go up very high, * due to a spike, or may appear to go high, due to extreme scheduling * delays. Once the network stress is removed, mpathd takes long time to @@ -310,17 +212,19 @@ struct probes_missed probes_missed; * no targets are known. The packet is multicast to the all hosts addr. */ static void -probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) +probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime) { + hrtime_t sent_hrtime; + struct timeval sent_tv; struct pr_icmp probe_pkt; /* Probe packet */ - struct sockaddr_in6 whereto6; /* target address IPv6 */ - struct sockaddr_in whereto; /* target address IPv4 */ + struct sockaddr_storage targ; /* target address */ + uint_t targaddrlen; /* targed address length */ int pr_ndx; /* probe index in pii->pii_probes[] */ boolean_t sent = _B_TRUE; if (debug & D_TARGET) { - logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af), - pii->pii_name, probe_type, cur_time); + logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af), + pii->pii_name, probe_type, start_hrtime); } assert(pii->pii_probe_sock != -1); @@ -339,7 +243,7 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) * network byte order at initialization itself. 
*/ probe_pkt.pr_icmp_id = pii->pii_icmpid; - probe_pkt.pr_icmp_timestamp = htonl(cur_time); + probe_pkt.pr_icmp_timestamp = htonll(start_hrtime); probe_pkt.pr_icmp_mtype = htonl(probe_type); /* @@ -349,38 +253,34 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && pii->pii_rtt_target_next != NULL)); + bzero(&targ, sizeof (targ)); + targ.ss_family = pii->pii_af; + if (pii->pii_af == AF_INET6) { - bzero(&whereto6, sizeof (whereto6)); - whereto6.sin6_family = AF_INET6; + struct in6_addr *addr6; + + addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr; + targaddrlen = sizeof (struct sockaddr_in6); if (probe_type == PROBE_MULTI) { - whereto6.sin6_addr = all_nodes_mcast_v6; + *addr6 = all_nodes_mcast_v6; } else if (probe_type == PROBE_UNI) { - whereto6.sin6_addr = pii->pii_target_next->tg_address; - } else { - /* type is PROBE_RTT */ - whereto6.sin6_addr = - pii->pii_rtt_target_next->tg_address; - } - if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, - sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6, - sizeof (whereto6)) != sizeof (probe_pkt)) { - logperror_pii(pii, "probe: probe sendto"); - sent = _B_FALSE; + *addr6 = pii->pii_target_next->tg_address; + } else { /* type is PROBE_RTT */ + *addr6 = pii->pii_rtt_target_next->tg_address; } } else { - bzero(&whereto, sizeof (whereto)); - whereto.sin_family = AF_INET; + struct in_addr *addr4; + + addr4 = &((struct sockaddr_in *)&targ)->sin_addr; + targaddrlen = sizeof (struct sockaddr_in); if (probe_type == PROBE_MULTI) { - whereto.sin_addr = all_nodes_mcast_v4; + *addr4 = all_nodes_mcast_v4; } else if (probe_type == PROBE_UNI) { IN6_V4MAPPED_TO_INADDR( - &pii->pii_target_next->tg_address, - &whereto.sin_addr); - } else { - /* type is PROBE_RTT */ + &pii->pii_target_next->tg_address, addr4); + } else { /* type is PROBE_RTT */ IN6_V4MAPPED_TO_INADDR( - &pii->pii_rtt_target_next->tg_address, - &whereto.sin_addr); + 
&pii->pii_rtt_target_next->tg_address, addr4); } /* @@ -388,12 +288,18 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) */ probe_pkt.pr_icmp_cksum = in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); - if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, - sizeof (probe_pkt), 0, (struct sockaddr *)&whereto, - sizeof (whereto)) != sizeof (probe_pkt)) { - logperror_pii(pii, "probe: probe sendto"); - sent = _B_FALSE; - } + } + + /* + * Use the current time as the time we sent. Not atomic, but the best + * we can do from here. + */ + sent_hrtime = gethrtime(); + (void) gettimeofday(&sent_tv, NULL); + if (sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0, + (struct sockaddr *)&targ, targaddrlen) != sizeof (probe_pkt)) { + logperror_pii(pii, "probe: probe sendto"); + sent = _B_FALSE; } /* @@ -415,9 +321,13 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) pii->pii_cum_stats.acked++; pii->pii_cum_stats.sent++; - pii->pii_probes[pr_ndx].pr_status = PR_UNACKED; + pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt; + pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv; + pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime; + pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime; pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; - pii->pii_probes[pr_ndx].pr_time_sent = cur_time; + probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED); + pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); pii->pii_target_next = target_next(pii->pii_target_next); assert(pii->pii_target_next != NULL); @@ -448,33 +358,42 @@ in_data(struct phyint_instance *pii) { struct sockaddr_in from; struct in6_addr fromaddr; - uint_t fromlen; - static uint_t in_packet[(IP_MAXPACKET + 1)/4]; + static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; + static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; struct ip *ip; int iphlen; int len; char abuf[INET_ADDRSTRLEN]; - struct pr_icmp *reply; + struct msghdr msg; + struct iovec iov; + struct 
pr_icmp *reply; + struct timeval *recv_tvp; if (debug & D_PROBE) { logdebug("in_data(%s %s)\n", AF_STR(pii->pii_af), pii->pii_name); } + iov.iov_base = (char *)in_packet; + iov.iov_len = sizeof (in_packet); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_name = (struct sockaddr *)&from; + msg.msg_namelen = sizeof (from); + msg.msg_control = ancillary_data; + msg.msg_controllen = sizeof (ancillary_data); + /* * Poll has already told us that a message is waiting, * on this socket. Read it now. We should not block. */ - fromlen = sizeof (from); - len = recvfrom(pii->pii_probe_sock, (char *)in_packet, - sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen); - if (len < 0) { - logperror_pii(pii, "in_data: recvfrom"); + if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { + logperror_pii(pii, "in_data: recvmsg"); return; } /* - * If the NIC has indicated the link is down, don't go + * If the datalink has indicated the link is down, don't go * any further. */ if (LINK_DOWN(pii->pii_phyint)) @@ -483,6 +402,15 @@ in_data(struct phyint_instance *pii) /* Get the printable address for error reporting */ (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); + /* Ignore packets > 64k or control buffers that don't fit */ + if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { + if (debug & D_PKTBAD) { + logdebug("Truncated message: msg_flags 0x%x from %s\n", + msg.msg_flags, abuf); + } + return; + } + /* Make sure packet contains at least minimum ICMP header */ ip = (struct ip *)in_packet; iphlen = ip->ip_hl << 2; @@ -528,10 +456,17 @@ in_data(struct phyint_instance *pii) return; } + recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); + if (recv_tvp == NULL) { + logtrace("message without timestamp from %s on %s\n", + abuf, pii->pii_name); + return; + } + IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) /* Unicast probe reply */ - incoming_echo_reply(pii, reply, fromaddr); + incoming_echo_reply(pii, reply, 
fromaddr, recv_tvp); else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { /* Multicast reply */ incoming_mcast_reply(pii, reply, fromaddr); @@ -543,7 +478,6 @@ in_data(struct phyint_instance *pii) reply->pr_icmp_mtype, abuf, pii->pii_name); return; } - } /* @@ -559,8 +493,9 @@ in6_data(struct phyint_instance *pii) char abuf[INET6_ADDRSTRLEN]; struct msghdr msg; struct iovec iov; - uchar_t *opt; + void *opt; struct pr_icmp *reply; + struct timeval *recv_tvp; if (debug & D_PROBE) { logdebug("in6_data(%s %s)\n", @@ -577,12 +512,12 @@ in6_data(struct phyint_instance *pii) msg.msg_controllen = sizeof (ancillary_data); if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { - logperror_pii(pii, "in6_data: recvfrom"); + logperror_pii(pii, "in6_data: recvmsg"); return; } /* - * If the NIC has indicated that the link is down, don't go + * If the datalink has indicated that the link is down, don't go * any further. */ if (LINK_DOWN(pii->pii_phyint)) @@ -623,13 +558,14 @@ in6_data(struct phyint_instance *pii) "%s on %s\n", abuf, pii->pii_name); return; } - opt = find_ancillary(&msg, IPV6_RTHDR); + opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR); if (opt != NULL) { /* Can't allow routing headers in probe replies */ logtrace("message with routing header from %s on %s\n", abuf, pii->pii_name); return; } + if (reply->pr_icmp_code != 0) { logtrace("probe reply code: %d from %s on %s\n", reply->pr_icmp_code, abuf, pii->pii_name); @@ -640,8 +576,16 @@ in6_data(struct phyint_instance *pii) len, abuf, pii->pii_name); return; } + + recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); + if (recv_tvp == NULL) { + logtrace("message without timestamp from %s on %s\n", + abuf, pii->pii_name); + return; + } + if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { - incoming_echo_reply(pii, reply, from.sin6_addr); + incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp); } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { incoming_mcast_reply(pii, reply, from.sin6_addr); } 
else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { @@ -663,11 +607,9 @@ static void incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, struct in6_addr fromaddr) { - int m; /* rtt measurment in ms */ - uint32_t cur_time; /* in ms from some arbitrary point */ + int64_t m; /* rtt measurement in ns */ char abuf[INET6_ADDRSTRLEN]; struct target *target; - uint32_t pr_icmp_timestamp; struct phyint_group *pg; /* Get the printable address for error reporting */ @@ -683,10 +625,7 @@ incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, if (target == NULL) return; - pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); - cur_time = getcurrenttime(); - m = (int)(cur_time - pr_icmp_timestamp); - + m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp)); /* Invalid rtt. It has wrapped around */ if (m < 0) return; @@ -754,29 +693,30 @@ incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, */ static void incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, - struct in6_addr fromaddr) + struct in6_addr fromaddr, struct timeval *recv_tvp) { - int m; /* rtt measurment in ms */ - uint32_t cur_time; /* in ms from some arbitrary point */ + int64_t m; /* rtt measurement in ns */ + hrtime_t cur_hrtime; /* in ns from some arbitrary point */ char abuf[INET6_ADDRSTRLEN]; int pr_ndx; struct target *target; boolean_t exception; - uint32_t pr_icmp_timestamp; + uint64_t pr_icmp_timestamp; uint16_t pr_icmp_seq; + struct probe_stats *pr_statp; struct phyint_group *pg = pii->pii_phyint->pi_group; /* Get the printable address for error reporting */ (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); if (debug & D_PROBE) { - logdebug("incoming_echo_reply: %s %s %s seq %u\n", + logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n", AF_STR(pii->pii_af), pii->pii_name, abuf, - ntohs(reply->pr_icmp_seq)); + ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp)); } - pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 
- pr_icmp_seq = ntohs(reply->pr_icmp_seq); + pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp); + pr_icmp_seq = ntohs(reply->pr_icmp_seq); /* Reject out of window probe replies */ if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) || @@ -786,15 +726,16 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, pii->pii_cum_stats.unknown++; return; } - cur_time = getcurrenttime(); - m = (int)(cur_time - pr_icmp_timestamp); + + cur_hrtime = gethrtime(); + m = (int64_t)(cur_hrtime - pr_icmp_timestamp); if (m < 0) { /* * This is a ridiculously high value of rtt. rtt has wrapped * around. Log a message, and ignore the rtt. */ - logerr("incoming_echo_reply: rtt wraparound cur_time %u reply " - "timestamp %u\n", cur_time, pr_icmp_timestamp); + logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld " + "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp); } /* @@ -868,10 +809,10 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, * debugger, or the system was hung or too busy for a * substantial time that we didn't get a chance to run. */ - if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) { + if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) { /* - * If the probe corresponding to this receieved response - * was truly sent 'm' ms. ago, then this response must + * If the probe corresponding to this received response + * was truly sent 'm' ns. ago, then this response must * have been rejected by the sequence number checks. The * fact that it has passed the sequence number checks * means that the measured rtt is wrong. 
We were probably @@ -947,7 +888,7 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, * adjusts pii->pii_target_next */ target_delete(target); - probe(pii, PROBE_MULTI, cur_time); + probe(pii, PROBE_MULTI, cur_hrtime); } } else { /* @@ -999,8 +940,12 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, } } out: - pii->pii_probes[pr_ndx].pr_status = PR_ACKED; - pii->pii_probes[pr_ndx].pr_time_acked = cur_time; + pr_statp = &pii->pii_probes[pr_ndx]; + pr_statp->pr_hrtime_ackproc = cur_hrtime; + pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent + + (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent)); + + probe_chstate(pr_statp, pii, PR_ACKED); /* * Update pii->pii_rack, i.e. the sequence number of the last received @@ -1240,13 +1185,13 @@ incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, * * New scaled average and deviation are passed back via sap and svp */ -static int -compute_crtt(int *sap, int *svp, int m) +static int64_t +compute_crtt(int64_t *sap, int64_t *svp, int64_t m) { - int sa = *sap; - int sv = *svp; - int crtt; - int saved_m = m; + int64_t sa = *sap; + int64_t sv = *svp; + int64_t crtt; + int64_t saved_m = m; assert(*sap >= -1); assert(*svp >= 0); @@ -1285,8 +1230,8 @@ compute_crtt(int *sap, int *svp, int m) crtt = (sa >> 3) + sv; if (debug & D_PROBE) { - logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = " - "%d\n", saved_m, sa, sv, crtt); + logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> " + "crtt = %lld\n", saved_m, sa, sv, crtt); } *sap = sa; @@ -1300,22 +1245,22 @@ compute_crtt(int *sap, int *svp, int m) } static void -pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) +pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni) { struct phyint_instance *pii = tg->tg_phyint_inst; int probe_interval = pii->pii_phyint->pi_group->pg_probeint; - int sa = tg->tg_rtt_sa; - int sv = tg->tg_rtt_sd; + int64_t sa = tg->tg_rtt_sa; + int64_t sv = tg->tg_rtt_sd; 
int new_crtt; int i; if (debug & D_PROBE) - logdebug("pi_set_crtt: target - m %d\n", m); + logdebug("pi_set_crtt: target - m %lld\n", m); /* store the round trip time, in case we need to defer computation */ tg->tg_deferred[tg->tg_num_deferred] = m; - new_crtt = compute_crtt(&sa, &sv, m); + new_crtt = ns2ms(compute_crtt(&sa, &sv, m)); /* * If this probe's round trip time would singlehandedly cause an @@ -1342,8 +1287,8 @@ pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) } for (i = 0; i <= tg->tg_num_deferred; i++) { - tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa, - &tg->tg_rtt_sd, tg->tg_deferred[i]); + tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa, + &tg->tg_rtt_sd, tg->tg_deferred[i])); } tg->tg_num_deferred = 0; @@ -1373,13 +1318,13 @@ pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) * If not found return NULL. */ static void * -find_ancillary(struct msghdr *msg, int cmsg_type) +find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type) { struct cmsghdr *cmsg; for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { - if (cmsg->cmsg_level == IPPROTO_IPV6 && + if (cmsg->cmsg_level == cmsg_level && cmsg->cmsg_type == cmsg_type) { return (CMSG_DATA(cmsg)); } @@ -1388,107 +1333,194 @@ find_ancillary(struct msghdr *msg, int cmsg_type) } /* - * See if a previously failed interface has started working again. + * Try to activate another INACTIVE interface in the same group as `pi'. + * Prefer STANDBY INACTIVE to just INACTIVE. 
*/ void -phyint_check_for_repair(struct phyint *pi) +phyint_activate_another(struct phyint *pi) { - if (phyint_repaired(pi)) { - if (pi->pi_group == phyint_anongroup) { - logerr("NIC repair detected on %s\n", pi->pi_name); - } else { - logerr("NIC repair detected on %s of group %s\n", - pi->pi_name, pi->pi_group->pg_name); - } + struct phyint *pi2; + struct phyint *inactivepi = NULL; - /* - * If the interface is offline, just clear the FAILED flag, - * delaying the state change and failback operation until it - * is brought back online. - */ - if (pi->pi_state == PI_OFFLINE) { - (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); - return; - } + if (pi->pi_group == phyint_anongroup) + return; - if (pi->pi_flags & IFF_STANDBY) { - (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); - } else { - if (try_failback(pi) != IPMP_FAILURE) { - (void) change_lif_flags(pi, - IFF_FAILED, _B_FALSE); - /* Per state diagram */ - pi->pi_empty = 0; + for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { + if (pi == pi2 || pi2->pi_state != PI_RUNNING || + !(pi2->pi_flags & IFF_INACTIVE)) + continue; + + inactivepi = pi2; + if (pi2->pi_flags & IFF_STANDBY) + break; + } + + if (inactivepi != NULL) + (void) change_pif_flags(inactivepi, 0, IFF_INACTIVE); +} + +/* + * Transition a phyint back to PI_RUNNING (from PI_FAILED or PI_OFFLINE). The + * caller must ensure that the transition is appropriate. Clears IFF_OFFLINE + * or IFF_FAILED, as appropriate. Also sets IFF_INACTIVE on this or other + * interfaces as appropriate (see comment below). Finally, also updates the + * phyint's group state to account for the change. + */ +void +phyint_transition_to_running(struct phyint *pi) +{ + struct phyint *pi2; + struct phyint *actstandbypi = NULL; + uint_t nactive = 0, nnonstandby = 0; + boolean_t onlining = (pi->pi_state == PI_OFFLINE); + uint64_t set, clear; + + /* + * The interface is running again, but should it or another interface + * in the group end up INACTIVE? 
There are three cases: + * + * 1. If it's a STANDBY interface, it should be end up INACTIVE if + * the group is operating at capacity (i.e., there are at least as + * many active interfaces as non-STANDBY interfaces in the group). + * No other interfaces should be changed. + * + * 2. If it's a non-STANDBY interface and we're onlining it or + * FAILBACK is enabled, then it should *not* end up INACTIVE. + * Further, if the group is above capacity as a result of this + * interface, then an active STANDBY interface in the group should + * end up INACTIVE. + * + * 3. If it's a non-STANDBY interface, we're repairing it, and + * FAILBACK is disabled, then it should end up INACTIVE *unless* + * the group was failed (in which case we have no choice but to + * use it). No other interfaces should be changed. + */ + if (pi->pi_group != phyint_anongroup) { + pi2 = pi->pi_group->pg_phyint; + for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { + if (!(pi2->pi_flags & IFF_STANDBY)) + nnonstandby++; + + if (pi2->pi_state == PI_RUNNING) { + if (!(pi2->pi_flags & IFF_INACTIVE)) { + nactive++; + if (pi2->pi_flags & IFF_STANDBY) + actstandbypi = pi2; + } } } + } - phyint_chstate(pi, PI_RUNNING); + set = 0; + clear = (onlining ? IFF_OFFLINE : IFF_FAILED); - if (GROUP_FAILED(pi->pi_group)) { - /* - * This is the 1st phyint to receive a response - * after group failure. 
- */ - logerr("At least 1 interface (%s) of group %s has " - "repaired\n", pi->pi_name, pi->pi_group->pg_name); - phyint_group_chstate(pi->pi_group, PG_RUNNING); - } + if (pi->pi_flags & IFF_STANDBY) { /* case 1 */ + if (nactive >= nnonstandby) + set |= IFF_INACTIVE; + else + clear |= IFF_INACTIVE; + } else if (onlining || failback_enabled) { /* case 2 */ + if (nactive >= nnonstandby && actstandbypi != NULL) + (void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0); + } else if (!GROUP_FAILED(pi->pi_group)) { /* case 3 */ + set |= IFF_INACTIVE; + } + (void) change_pif_flags(pi, set, clear); + + phyint_chstate(pi, PI_RUNNING); + + /* + * Update the group state to account for the change. + */ + phyint_group_refresh_state(pi->pi_group); +} + +/* + * See if a previously failed interface has started working again. + */ +void +phyint_check_for_repair(struct phyint *pi) +{ + if (!phyint_repaired(pi)) + return; + + if (pi->pi_group == phyint_anongroup) { + logerr("IP interface repair detected on %s\n", pi->pi_name); + } else { + logerr("IP interface repair detected on %s of group %s\n", + pi->pi_name, pi->pi_group->pg_name); } + + /* + * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet. + * So just clear IFF_OFFLINE and defer phyint_transition_to_running() + * until it is brought back online. + */ + if (pi->pi_state == PI_OFFLINE) { + (void) change_pif_flags(pi, 0, IFF_FAILED); + return; + } + + phyint_transition_to_running(pi); /* calls phyint_chstate() */ } /* - * See if a previously functioning interface has failed, or if the - * whole group of interfaces has failed. + * See if an interface has failed, or if the whole group of interfaces has + * failed. 
*/ static void phyint_inst_check_for_failure(struct phyint_instance *pii) { - struct phyint *pi; - struct phyint *pi2; - - pi = pii->pii_phyint; + struct phyint *pi = pii->pii_phyint; + struct phyint *pi2; + boolean_t was_active; switch (failure_state(pii)) { case PHYINT_FAILURE: - (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); + was_active = ((pi->pi_flags & IFF_INACTIVE) == 0); + + (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE); if (pi->pi_group == phyint_anongroup) { - logerr("NIC failure detected on %s\n", pii->pii_name); + logerr("IP interface failure detected on %s\n", + pii->pii_name); } else { - logerr("NIC failure detected on %s of group %s\n", - pii->pii_name, pi->pi_group->pg_name); + logerr("IP interface failure detected on %s of group" + " %s\n", pii->pii_name, pi->pi_group->pg_name); } + /* - * Do the failover, unless the interface is offline (in - * which case we've already failed over). + * If the interface is offline, the state change will be + * noted when it comes back online. */ if (pi->pi_state != PI_OFFLINE) { + /* + * If the failed interface was active, activate + * another INACTIVE interface in the group if + * possible. (If the interface is PI_OFFLINE, + * we already activated another.) 
+ */ + if (was_active) + phyint_activate_another(pi); + phyint_chstate(pi, PI_FAILED); reset_crtt_all(pi); - if (!(pi->pi_flags & IFF_INACTIVE)) - (void) try_failover(pi, FAILOVER_NORMAL); } break; case GROUP_FAILURE: - logerr("All Interfaces in group %s have failed\n", - pi->pi_group->pg_name); - for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; - pi2 = pi2->pi_pgnext) { - if (pi2->pi_flags & IFF_OFFLINE) + pi2 = pi->pi_group->pg_phyint; + for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { + (void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE); + if (pi2->pi_state == PI_OFFLINE) /* see comment above */ continue; - (void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE); - reset_crtt_all(pi2); + reset_crtt_all(pi2); /* - * In the case of host targets, we - * would have flushed the targets, - * and gone to PI_NOTARGETS state. + * In the case of host targets, we would have flushed + * the targets, and gone to PI_NOTARGETS state. */ if (pi2->pi_state == PI_RUNNING) phyint_chstate(pi2, PI_FAILED); - - pi2->pi_empty = 0; - pi2->pi_full = 0; } break; @@ -1519,7 +1551,8 @@ phyint_inst_timer(struct phyint_instance *pii) hrtime_t cur_hrtime; int probe_interval = pii->pii_phyint->pi_group->pg_probeint; - cur_time = getcurrenttime(); + cur_hrtime = gethrtime(); + cur_time = ns2ms(cur_hrtime); if (debug & D_TIMER) { logdebug("phyint_inst_timer(%s %s)\n", @@ -1621,7 +1654,7 @@ phyint_inst_timer(struct phyint_instance *pii) * the failure detection (fd) probe timer has not yet fired. * Need to send only an rtt probe. The probe type is PROBE_RTT. */ - probe(pii, PROBE_RTT, cur_time); + probe(pii, PROBE_RTT, cur_hrtime); return (interval); } /* @@ -1651,7 +1684,7 @@ phyint_inst_timer(struct phyint_instance *pii) * We can have at most, the latest 2 probes that we sent, in * the PR_UNACKED state. All previous probes sent, are either * PR_LOST or PR_ACKED. An unacknowledged probe is considered - * timed out if the probe's time_sent + the CRTT < currenttime. 
+ * timed out if the probe's time_start + the CRTT < currenttime. * For each of the last 2 probes, examine whether it has timed * out. If so, mark it PR_LOST. The probe stats is a circular array. */ @@ -1686,16 +1719,15 @@ phyint_inst_timer(struct phyint_instance *pii) * not available use group's probe interval, * which is a worst case estimate. */ + timeout = ns2ms(pr_statp->pr_hrtime_start); if (cur_tg->tg_crtt != 0) { - timeout = pr_statp->pr_time_sent + - cur_tg->tg_crtt; + timeout += cur_tg->tg_crtt; } else { - timeout = pr_statp->pr_time_sent + - probe_interval; + timeout += probe_interval; } if (TIME_LT(timeout, cur_time)) { - pr_statp->pr_status = PR_LOST; pr_statp->pr_time_lost = timeout; + probe_chstate(pr_statp, pii, PR_LOST); } else if (i == 1) { /* * We are forced to consider this probe @@ -1711,8 +1743,8 @@ phyint_inst_timer(struct phyint_instance *pii) * when the timer fires, we find 2 valid * unacked probes, and they are yet to timeout */ - pr_statp->pr_status = PR_LOST; pr_statp->pr_time_lost = cur_time; + probe_chstate(pr_statp, pii, PR_LOST); } else { /* * Only the most recent probe can enter @@ -1740,16 +1772,15 @@ phyint_inst_timer(struct phyint_instance *pii) * The timer has fired. Take appropriate action depending * on the current state of the phyint. * - * PI_RUNNING state - Failure detection and failover - * PI_FAILED state - Repair detection and failback + * PI_RUNNING state - Failure detection + * PI_FAILED state - Repair detection */ switch (pii->pii_phyint->pi_state) { case PI_FAILED: /* * If the most recent probe (excluding unacked probes that * are yet to time out) has been acked, check whether the - * phyint is now repaired. If the phyint is repaired, then - * attempt failback, unless it is an inactive standby. + * phyint is now repaired. 
*/ if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { phyint_check_for_repair(pii->pii_phyint); @@ -1760,10 +1791,8 @@ phyint_inst_timer(struct phyint_instance *pii) /* * It's possible our probes have been lost because of a * spanning-tree mandated quiet period on the switch. If so, - * ignore the lost probes and consider the interface to still - * be functioning. + * ignore the lost probes. */ - cur_hrtime = gethrtime(); if (pii->pii_fd_hrtime - cur_hrtime > 0) break; @@ -1771,8 +1800,7 @@ phyint_inst_timer(struct phyint_instance *pii) /* * We have 1 or more failed probes (excluding unacked * probes that are yet to time out). Determine if the - * phyint has failed. If so attempt a failover, - * unless it is an inactive standby + * phyint has failed. */ phyint_inst_check_for_failure(pii); } @@ -1790,16 +1818,16 @@ phyint_inst_timer(struct phyint_instance *pii) * was called, the target list may be empty. */ if (pii->pii_target_next != NULL) { - probe(pii, PROBE_UNI, cur_time); + probe(pii, PROBE_UNI, cur_hrtime); /* * If we have just the one probe target, and we're not using * router targets, try to find another as we presently have * no resilience. */ if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) - probe(pii, PROBE_MULTI, cur_time); + probe(pii, PROBE_MULTI, cur_hrtime); } else { - probe(pii, PROBE_MULTI, cur_time); + probe(pii, PROBE_MULTI, cur_hrtime); } return (interval); } @@ -1859,8 +1887,8 @@ process_link_state_down(struct phyint *pi) /* * Clear the probe statistics arrays, we don't want the repair - * detection logic relying on probes that were succesful prior - * to the link going down. + * detection logic relying on probes that were successful prior + * to the link going down. 
*/ if (PROBE_CAPABLE(pi->pi_v4)) clear_pii_probe_stats(pi->pi_v4); @@ -2016,7 +2044,7 @@ phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) pii->pii_target_next = target_next(cur_tg); } else { target_delete(cur_tg); - probe(pii, PROBE_MULTI, getcurrenttime()); + probe(pii, PROBE_MULTI, gethrtime()); } return (PHYINT_OK); } @@ -2065,13 +2093,13 @@ failure_state(struct phyint_instance *pii) struct probe_success_count psinfo; uint_t pi2_tls; /* time last success */ uint_t pi_tff; /* time first fail */ - struct phyint *pi2; + struct phyint *pi2; struct phyint *pi; struct phyint_instance *pii2; struct phyint_group *pg; - boolean_t alone; + int retval; - if (debug & D_FAILOVER) + if (debug & D_FAILREP) logdebug("phyint_failed(%s)\n", pii->pii_name); pi = pii->pii_phyint; @@ -2082,24 +2110,13 @@ failure_state(struct phyint_instance *pii) return (PHYINT_OK); /* - * At this point, the link is down, or the phyint is suspect, - * as it has lost NUM_PROBE_FAILS or more probes. If the phyint - * does not belong to any group, or is the only member of the - * group capable of being probed, return PHYINT_FAILURE. + * At this point, the link is down, or the phyint is suspect, as it + * has lost NUM_PROBE_FAILS or more probes. If the phyint does not + * belong to any group, this is a PHYINT_FAILURE. Otherwise, continue + * on to determine whether this should be considered a PHYINT_FAILURE + * or GROUP_FAILURE. */ - alone = _B_TRUE; - if (pg != phyint_anongroup) { - for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { - if (pi2 == pi) - continue; - if (PROBE_CAPABLE(pi2->pi_v4) || - PROBE_CAPABLE(pi2->pi_v6)) { - alone = _B_FALSE; - break; - } - } - } - if (alone) + if (pg == phyint_anongroup) return (PHYINT_FAILURE); /* @@ -2116,6 +2133,7 @@ failure_state(struct phyint_instance *pii) * after it was received, so there is no point looking at the tls * of other phyints. 
*/ + retval = GROUP_FAILURE; for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { /* Exclude ourself from comparison */ if (pi2 == pi) @@ -2123,76 +2141,86 @@ failure_state(struct phyint_instance *pii) if (LINK_DOWN(pi)) { /* - * We use FLAGS_TO_LINK_STATE() to test the - * flags directly, rather then LINK_UP() or - * LINK_DOWN(), as we may not have got round - * to processing the link state for the other - * phyints in the group yet. + * We use FLAGS_TO_LINK_STATE() to test the flags + * directly, rather then LINK_UP() or LINK_DOWN(), as + * we may not have got round to processing the link + * state for the other phyints in the group yet. * - * The check for PI_RUNNING and group - * failure handles the case when the - * group begins to recover. The first - * phyint to recover should not trigger - * a failover from the soon-to-recover - * other phyints to the first recovered - * phyint. PI_RUNNING will be set, and - * pg_groupfailed cleared only after - * receipt of NUM_PROBE_REPAIRS, by - * which time the other phyints should - * have received at least 1 packet, - * and so will not have NUM_PROBE_FAILS. + * The check for PI_RUNNING and group failure handles + * the case when the group begins to recover. + * PI_RUNNING will be set, and group failure cleared + * only after receipt of NUM_PROBE_REPAIRS, by which + * time the other phyints should have received at + * least 1 packet, and so will not have NUM_PROBE_FAILS. */ if ((pi2->pi_state == PI_RUNNING) && - !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) - return (PHYINT_FAILURE); - } else { - /* - * Need to compare against both IPv4 and - * IPv6 instances. - */ - pii2 = pi2->pi_v4; - if (pii2 != NULL) { - probe_success_info(pii2, NULL, &psinfo); - if (psinfo.ps_tls_valid) { - pi2_tls = psinfo.ps_tls; - /* - * See comment above regarding check - * for PI_RUNNING and group failure. 
- */ - if (TIME_GT(pi2_tls, pi_tff) && - (pi2->pi_state == PI_RUNNING) && - !GROUP_FAILED(pg) && - FLAGS_TO_LINK_STATE(pi2)) - return (PHYINT_FAILURE); + !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) { + retval = PHYINT_FAILURE; + break; + } + continue; + } + + if (LINK_DOWN(pi2)) + continue; + + /* + * If there's no probe-based failure detection on this + * interface, and its link is still up, then it's still + * working and thus the group has not failed. + */ + if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) { + retval = PHYINT_FAILURE; + break; + } + + /* + * Need to compare against both IPv4 and IPv6 instances. + */ + pii2 = pi2->pi_v4; + if (pii2 != NULL) { + probe_success_info(pii2, NULL, &psinfo); + if (psinfo.ps_tls_valid) { + pi2_tls = psinfo.ps_tls; + /* + * See comment above regarding check + * for PI_RUNNING and group failure. + */ + if (TIME_GT(pi2_tls, pi_tff) && + (pi2->pi_state == PI_RUNNING) && + !GROUP_FAILED(pg) && + FLAGS_TO_LINK_STATE(pi2)) { + retval = PHYINT_FAILURE; + break; } } + } - pii2 = pi2->pi_v6; - if (pii2 != NULL) { - probe_success_info(pii2, NULL, &psinfo); - if (psinfo.ps_tls_valid) { - pi2_tls = psinfo.ps_tls; - /* - * See comment above regarding check - * for PI_RUNNING and group failure. - */ - if (TIME_GT(pi2_tls, pi_tff) && - (pi2->pi_state == PI_RUNNING) && - !GROUP_FAILED(pg) && - FLAGS_TO_LINK_STATE(pi2)) - return (PHYINT_FAILURE); + pii2 = pi2->pi_v6; + if (pii2 != NULL) { + probe_success_info(pii2, NULL, &psinfo); + if (psinfo.ps_tls_valid) { + pi2_tls = psinfo.ps_tls; + /* + * See comment above regarding check + * for PI_RUNNING and group failure. + */ + if (TIME_GT(pi2_tls, pi_tff) && + (pi2->pi_state == PI_RUNNING) && + !GROUP_FAILED(pg) && + FLAGS_TO_LINK_STATE(pi2)) { + retval = PHYINT_FAILURE; + break; } } } } /* - * Change the group state to PG_FAILED if it's not already. + * Update the group state to account for the changes. 
*/ - if (!GROUP_FAILED(pg)) - phyint_group_chstate(pg, PG_FAILED); - - return (GROUP_FAILURE); + phyint_group_refresh_state(pg); + return (retval); } /* @@ -2215,7 +2243,7 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg, uint_t timeout; struct target *tg; - if (debug & D_FAILOVER) + if (debug & D_FAILREP) logdebug("probe_success_info(%s)\n", pii->pii_name); bzero(psinfo, sizeof (*psinfo)); @@ -2248,10 +2276,11 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg, * not available use the value of the group's probe * interval which is a worst case estimate. */ + timeout = ns2ms(pr_statp->pr_hrtime_start); if (tg->tg_crtt != 0) { - timeout = pr_statp->pr_time_sent + tg->tg_crtt; + timeout += tg->tg_crtt; } else { - timeout = pr_statp->pr_time_sent + + timeout += pii->pii_phyint->pi_group->pg_probeint; } @@ -2261,7 +2290,7 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg, * recent consecutive successes. */ pr_statp->pr_time_lost = timeout; - pr_statp->pr_status = PR_LOST; + probe_chstate(pr_statp, pii, PR_LOST); pi_found_failure = _B_TRUE; if (cur_tg != NULL && tg == cur_tg) { /* @@ -2292,7 +2321,8 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg, * the most recent probe success. */ if (!psinfo->ps_tls_valid) { - psinfo->ps_tls = pr_statp->pr_time_acked; + psinfo->ps_tls = + ns2ms(pr_statp->pr_hrtime_ackproc); psinfo->ps_tls_valid = _B_TRUE; } break; @@ -2339,7 +2369,7 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, uint_t timeout; struct target *tg; - if (debug & D_FAILOVER) + if (debug & D_FAILREP) logdebug("probe_fail_info(%s)\n", pii->pii_name); bzero(pfinfo, sizeof (*pfinfo)); @@ -2377,10 +2407,11 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, * not available use the group's probe interval, * which is a worst case estimate. 
*/ + timeout = ns2ms(pr_statp->pr_hrtime_start); if (tg->tg_crtt != 0) { - timeout = pr_statp->pr_time_sent + tg->tg_crtt; + timeout += tg->tg_crtt; } else { - timeout = pr_statp->pr_time_sent + + timeout += pii->pii_phyint->pi_group->pg_probeint; } @@ -2388,7 +2419,7 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, break; pr_statp->pr_time_lost = timeout; - pr_statp->pr_status = PR_LOST; + probe_chstate(pr_statp, pii, PR_LOST); /* FALLTHRU */ case PR_LOST: @@ -2421,6 +2452,19 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, } /* + * Change the state of probe `pr' on phyint_instance `pii' to state `state'. + */ +void +probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state) +{ + if (pr->pr_status == state) + return; + + pr->pr_status = state; + (void) probe_state_event(pr, pii); +} + +/* * Check if the phyint has been repaired. If no test address has been * configured, then consider the interface repaired if the link is up (unless * the link is flapping; see below). Otherwise, look for proof of probes @@ -2436,7 +2480,7 @@ phyint_repaired(struct phyint *pi) int pr_ndx; uint_t cur_time; - if (debug & D_FAILOVER) + if (debug & D_FAILREP) logdebug("phyint_repaired(%s)\n", pi->pi_name); if (LINK_DOWN(pi)) @@ -2458,7 +2502,7 @@ phyint_repaired(struct phyint *pi) } if (!pi->pi_lfmsg_printed) { logerr("The link has come up on %s more than %d times " - "in the last minute; disabling failback until it " + "in the last minute; disabling repair until it " "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); pi->pi_lfmsg_printed = 1; } @@ -2490,354 +2534,41 @@ phyint_repaired(struct phyint *pi) } /* - * Try failover from phyint 'pi' to a suitable destination. - */ -int -try_failover(struct phyint *pi, int failover_type) -{ - struct phyint *dst; - int err; - - if (debug & D_FAILOVER) - logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type); - - /* - * Attempt to find a failover destination 'dst'. 
- * dst will be null if any of the following is true - * Phyint is not part of a group OR - * Phyint is the only member of a group OR - * No suitable failover dst was available - */ - dst = get_failover_dst(pi, failover_type); - if (dst == NULL) - return (IPMP_EMINRED); - - dst->pi_empty = 0; /* Per state diagram */ - pi->pi_full = 0; /* Per state diagram */ - - err = failover(pi, dst); - - if (debug & D_FAILOVER) { - logdebug("failed over from %s to %s ret %d\n", - pi->pi_name, dst->pi_name, err); - } - if (err == 0) { - pi->pi_empty = 1; /* Per state diagram */ - /* - * we don't want to print out this message if a - * phyint is leaving the group, nor for failover from - * standby - */ - if (failover_type == FAILOVER_NORMAL) { - logerr("Successfully failed over from NIC %s to NIC " - "%s\n", pi->pi_name, dst->pi_name); - } - return (0); - } else { - /* - * The failover did not succeed. We must retry the failover - * only after resyncing our state based on the kernel's. - * For eg. either the src or the dst might have been unplumbed - * causing this failure. initifs() will be called again, - * from main, since full_scan_required has been set to true - * by failover(); - */ - return (IPMP_FAILURE); - } -} - -/* - * global_errno captures the errno value, if failover() or failback() - * fails. This is sent to if_mpadm(1M). - */ -int global_errno; - -/* - * Attempt failover from phyint 'from' to phyint 'to'. - * IP moves everything from phyint 'from' to phyint 'to'. - */ -static int -failover(struct phyint *from, struct phyint *to) -{ - struct lifreq lifr; - int ret; - - if (debug & D_FAILOVER) { - logdebug("failing over from %s to %s\n", - from->pi_name, to->pi_name); - } - - /* - * Perform the failover. Both IPv4 and IPv6 are failed over - * using a single ioctl by passing in AF_UNSPEC family. 
- */ - lifr.lifr_addr.ss_family = AF_UNSPEC; - (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_movetoindex = to->pi_ifindex; - - ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr); - if (ret < 0) { - global_errno = errno; - logperror("failover: ioctl (failover)"); - } - - /* - * Set full_scan_required to true. This will make us read - * the state from the kernel in initifs() and update our tables, - * to reflect the current state after the failover. If the - * failover has failed it will then reissue the failover. - */ - full_scan_required = _B_TRUE; - return (ret); -} - -/* - * phyint 'pi' has recovered. Attempt failback from every phyint in the same - * group as phyint 'pi' that is a potential failback source, to phyint 'pi'. - * Return values: - * IPMP_SUCCESS: Failback successful from each of the other - * phyints in the group. - * IPMP_EFBPARTIAL: Failback successful from some of the other - * phyints in the group. - * IPMP_FAILURE: Failback syscall failed with some error. - * - * Note that failback is attempted regardless of the setting of the - * failback_enabled flag. - */ -int -do_failback(struct phyint *pi) -{ - struct phyint *from; - boolean_t done; - boolean_t partial; - boolean_t attempted_failback = _B_FALSE; - - if (debug & D_FAILOVER) - logdebug("do_failback(%s)\n", pi->pi_name); - - /* If this phyint is not part of a named group, return. */ - if (pi->pi_group == phyint_anongroup) { - pi->pi_full = 1; - return (IPMP_SUCCESS); - } - - /* - * Attempt failback from every phyint in the group to 'pi'. - * The reason for doing this, instead of only from the - * phyint to which we did the failover is given below. - * - * After 'pi' failed, if any app. tries to join on a multicast - * address (IPv6), on the failed phyint, IP picks any arbitrary - * non-failed phyint in the group, instead of the failed phyint, - * in.mpathd is not aware of this. 
Thus failing back only from the - * interface to which 'pi' failed over, will failback the ipif's - * but not the ilm's. So we need to failback from all members of - * the phyint group - */ - done = _B_TRUE; - partial = _B_FALSE; - for (from = pi->pi_group->pg_phyint; from != NULL; - from = from->pi_pgnext) { - /* Exclude ourself as a failback src */ - if (from == pi) - continue; - - /* - * If the 'from' phyint has IPv4 plumbed, the 'to' - * phyint must also have IPv4 plumbed. Similar check - * for IPv6. IP makes the same check. Otherwise the - * failback will fail. - */ - if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) || - (from->pi_v6 != NULL && pi->pi_v6 == NULL)) { - partial = _B_TRUE; - continue; - } - - pi->pi_empty = 0; /* Per state diagram */ - attempted_failback = _B_TRUE; - if (failback(from, pi) != 0) { - done = _B_FALSE; - break; - } - } - - /* - * We are done. No more phyint from which we can src the failback - */ - if (done) { - if (!partial) - pi->pi_full = 1; /* Per state diagram */ - /* - * Don't print out a message unless there is a - * transition from FAILED to RUNNING. For eg. - * we don't want to print out this message if a - * phyint is leaving the group, or at startup - */ - if (attempted_failback && (pi->pi_flags & - (IFF_FAILED | IFF_OFFLINE))) { - logerr("Successfully failed back to NIC %s\n", - pi->pi_name); - } - return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS); - } - - return (IPMP_FAILURE); -} - -/* - * This function is similar to do_failback() above, but respects the - * failback_enabled flag for phyints in named groups. - */ -int -try_failback(struct phyint *pi) -{ - if (debug & D_FAILOVER) - logdebug("try_failback(%s)\n", pi->pi_name); - - if (pi->pi_group != phyint_anongroup && !failback_enabled) - return (IPMP_EFBDISABLED); - - return (do_failback(pi)); -} - -/* - * Failback everything from phyint 'from' that has the same ifindex - * as phyint to's ifindex. 
- */ -static int -failback(struct phyint *from, struct phyint *to) -{ - struct lifreq lifr; - int ret; - - if (debug & D_FAILOVER) - logdebug("failback(%s %s)\n", from->pi_name, to->pi_name); - - lifr.lifr_addr.ss_family = AF_UNSPEC; - (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_movetoindex = to->pi_ifindex; - - ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr); - if (ret < 0) { - global_errno = errno; - logperror("failback: ioctl (failback)"); - } - - /* - * Set full_scan_required to true. This will make us read - * the state from the kernel in initifs() and update our tables, - * to reflect the current state after the failback. If the - * failback has failed it will then reissue the failback. - */ - full_scan_required = _B_TRUE; - - return (ret); -} - -/* - * Select a target phyint for failing over from 'pi'. - * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred - * target phyint is chosen as follows, - * 1. Pick any inactive standby interface. - * 2. If no inactive standby is available, select any phyint in the - * same group that has the least number of logints, (excluding - * IFF_NOFAILOVER and !IFF_UP logints) - * If we are failing over from a standby, failover_type is - * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination. - * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY, - * and we won't return NULL, as long as there is at least 1 other phyint - * in the group. - */ -static struct phyint * -get_failover_dst(struct phyint *pi, int failover_type) -{ - struct phyint *maybe = NULL; - struct phyint *pi2; - struct phyint *last_choice = NULL; - - if (pi->pi_group == phyint_anongroup) - return (NULL); - - /* - * Loop thru the phyints in the group, and pick the preferred - * phyint for the target. 
- */ - for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { - /* Exclude ourself and offlined interfaces */ - if (pi2 == pi || pi2->pi_state == PI_OFFLINE) - continue; - - /* - * The chosen target phyint must have IPv4 instance - * plumbed, if the src phyint has IPv4 plumbed. Similarly - * for IPv6. - */ - if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) || - (pi2->pi_v6 == NULL && pi->pi_v6 != NULL)) - continue; - - /* The chosen target must be PI_RUNNING. */ - if (pi2->pi_state != PI_RUNNING) { - last_choice = pi2; - continue; - } - - if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) && - (failover_type != FAILOVER_TO_NONSTANDBY)) { - return (pi2); - } else { - if (maybe == NULL) - maybe = pi2; - else if (logint_upcount(pi2) < logint_upcount(maybe)) - maybe = pi2; - } - } - if (maybe == NULL && failover_type == FAILOVER_TO_ANY) - return (last_choice); - else - return (maybe); -} - -/* * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. */ boolean_t -change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) +change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear) { int ifsock; struct lifreq lifr; uint64_t old_flags; - if (debug & D_FAILOVER) { - logdebug("change_lif_flags(%s): flags %llx setfl %d\n", - pi->pi_name, flags, (int)setfl); + if (debug & D_FAILREP) { + logdebug("change_pif_flags(%s): set %llx clear %llx\n", + pi->pi_name, set, clear); } - if (pi->pi_v4 != NULL) { + if (pi->pi_v4 != NULL) ifsock = ifsock_v4; - } else { + else ifsock = ifsock_v6; - } /* * Get the current flags from the kernel, and set/clear the * desired phyint flags. Since we set only phyint flags, we can * do it on either IPv4 or IPv6 instance. 
*/ - (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; + (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); + if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) - logperror("change_lif_flags: ioctl (get flags)"); + logperror("change_pif_flags: ioctl (get flags)"); return (_B_FALSE); } old_flags = lifr.lifr_flags; - if (setfl) - lifr.lifr_flags |= flags; - else - lifr.lifr_flags &= ~flags; + lifr.lifr_flags |= set; + lifr.lifr_flags &= ~clear; if (old_flags == lifr.lifr_flags) { /* No change in the flags. No need to send ioctl */ @@ -2846,7 +2577,7 @@ change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) - logperror("change_lif_flags: ioctl (set flags)"); + logperror("change_pif_flags: ioctl (set flags)"); return (_B_FALSE); } @@ -2854,15 +2585,13 @@ change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) * Keep pi_flags in synch. with actual flags. Assumes flags are * phyint flags. */ - if (setfl) - pi->pi_flags |= flags; - else - pi->pi_flags &= ~flags; + pi->pi_flags |= set; + pi->pi_flags &= ~clear; - if (pi->pi_v4) + if (pi->pi_v4 != NULL) pi->pi_v4->pii_flags = pi->pi_flags; - if (pi->pi_v6) + if (pi->pi_v6 != NULL) pi->pi_v6->pii_flags = pi->pi_flags; return (_B_TRUE); @@ -2928,18 +2657,31 @@ reset_snxt_basetimes(void) * and it is up, it is not possible to detect the interface failure. * SIOCTMYADDR also doesn't consider local zone address as own address. * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they - * are stored in laddr_list. + * are stored in `localaddrs' */ - boolean_t own_address(struct in6_addr addr) { - struct local_addr *taddr = laddr_list; + addrlist_t *addrp; + struct sockaddr_storage ss; + int af = IN6_IS_ADDR_V4MAPPED(&addr) ? 
AF_INET : AF_INET6; - for (; taddr != NULL; taddr = taddr->next) { - if (IN6_ARE_ADDR_EQUAL(&addr, &taddr->addr)) { + addr2storage(af, &addr, &ss); + for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) { + if (sockaddrcmp(&ss, &addrp->al_addr)) return (_B_TRUE); - } } return (_B_FALSE); } + +static int +ns2ms(int64_t ns) +{ + return (ns / (NANOSEC / MILLISEC)); +} + +static int64_t +tv2ns(struct timeval *tvp) +{ + return (tvp->tv_sec * NANOSEC + tvp->tv_usec * 1000); +} diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c index b56648cf12..def08d39ce 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mpd_defs.h" #include "mpd_tables.h" @@ -47,11 +45,7 @@ static void phyint_inst_print(struct phyint_instance *pii); static void phyint_insert(struct phyint *pi, struct phyint_group *pg); static void phyint_delete(struct phyint *pi); - -static void phyint_group_insert(struct phyint_group *pg); -static void phyint_group_delete(struct phyint_group *pg); -static struct phyint_group *phyint_group_lookup(const char *pg_name); -static struct phyint_group *phyint_group_create(const char *pg_name); +static boolean_t phyint_is_usable(struct phyint *pi); static void logint_print(struct logint *li); static void logint_insert(struct phyint_instance *pii, struct logint *li); @@ -68,16 +62,13 @@ static void reset_pii_probes(struct phyint_instance *pii, struct target *tg); static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii); static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii); -static void ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask); 
-static boolean_t prefix_equal(struct in6_addr p1, struct in6_addr p2, - int prefix_len); - static int phyint_state_event(struct phyint_group *pg, struct phyint *pi); static int phyint_group_state_event(struct phyint_group *pg); static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t); static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi, ipmp_if_op_t op); +static int logint_upcount(struct phyint *pi); static uint64_t gensig(void); /* Initialize any per-file global state. Returns 0 on success, -1 on failure */ @@ -110,6 +101,183 @@ phyint_lookup(const char *name) return (pi); } +/* + * Lookup a phyint in the group that has the same hardware address as `pi', or + * NULL if there's none. If `online_only' is set, then only online phyints + * are considered when matching. Otherwise, phyints that had been offlined + * due to a duplicate hardware address will also be considered. + */ +static struct phyint * +phyint_lookup_hwaddr(struct phyint *pi, boolean_t online_only) +{ + struct phyint *pi2; + + if (pi->pi_group == phyint_anongroup) + return (NULL); + + for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { + if (pi2 == pi) + continue; + + /* + * NOTE: even when online_only is B_FALSE, we ignore phyints + * that are administratively offline (rather than offline + * because they're dups); when they're brought back online, + * they'll be flagged as dups if need be. + */ + if (pi2->pi_state == PI_OFFLINE && + (online_only || !pi2->pi_hwaddrdup)) + continue; + + if (pi2->pi_hwaddrlen == pi->pi_hwaddrlen && + bcmp(pi2->pi_hwaddr, pi->pi_hwaddr, pi->pi_hwaddrlen) == 0) + return (pi2); + } + return (NULL); +} + +/* + * Respond to DLPI notifications. Currently, this only processes physical + * address changes for the phyint passed via `arg' by onlining or offlining + * phyints in the group. 
+ */ +/* ARGSUSED */ +static void +phyint_link_notify(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg) +{ + struct phyint *pi = arg; + struct phyint *oduppi = NULL, *duppi = NULL; + + assert((dnip->dni_note & pi->pi_notes) != 0); + + if (dnip->dni_note != DL_NOTE_PHYS_ADDR) + return; + + assert(dnip->dni_physaddrlen <= DLPI_PHYSADDR_MAX); + + /* + * If our hardware address hasn't changed, there's nothing to do. + */ + if (pi->pi_hwaddrlen == dnip->dni_physaddrlen && + bcmp(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen) == 0) + return; + + oduppi = phyint_lookup_hwaddr(pi, _B_FALSE); + pi->pi_hwaddrlen = dnip->dni_physaddrlen; + (void) memcpy(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen); + duppi = phyint_lookup_hwaddr(pi, _B_FALSE); + + if (oduppi != NULL || pi->pi_hwaddrdup) { + /* + * Our old hardware address was a duplicate. If we'd been + * offlined because of it, and our new hardware address is not + * a duplicate, then bring us online. Otherwise, `oduppi' + * must've been the one brought offline; bring it online. + */ + if (pi->pi_hwaddrdup) { + if (duppi == NULL) + (void) phyint_undo_offline(pi); + } else { + assert(oduppi->pi_hwaddrdup); + (void) phyint_undo_offline(oduppi); + } + } + + if (duppi != NULL && !pi->pi_hwaddrdup) { + /* + * Our new hardware address was a duplicate and we're not + * yet flagged as a duplicate; bring us offline. + */ + pi->pi_hwaddrdup = _B_TRUE; + (void) phyint_offline(pi, 0); + } +} + +/* + * Initialize information about the underlying link for `pi', and set us + * up to be notified about future changes. Returns _B_TRUE on success. 
+ */ +boolean_t +phyint_link_init(struct phyint *pi) +{ + int retval; + uint_t notes; + const char *errmsg; + dlpi_notifyid_t id; + + pi->pi_notes = 0; + retval = dlpi_open(pi->pi_name, &pi->pi_dh, 0); + if (retval != DLPI_SUCCESS) { + pi->pi_dh = NULL; + errmsg = "cannot open"; + goto failed; + } + + pi->pi_hwaddrlen = DLPI_PHYSADDR_MAX; + retval = dlpi_get_physaddr(pi->pi_dh, DL_CURR_PHYS_ADDR, pi->pi_hwaddr, + &pi->pi_hwaddrlen); + if (retval != DLPI_SUCCESS) { + errmsg = "cannot get hardware address"; + goto failed; + } + + retval = dlpi_bind(pi->pi_dh, DLPI_ANY_SAP, NULL); + if (retval != DLPI_SUCCESS) { + errmsg = "cannot bind to DLPI_ANY_SAP"; + goto failed; + } + + /* + * Check if the link supports DLPI link state notifications. For + * historical reasons, the actual changes are tracked through routing + * sockets, so we immediately disable the notification upon success. + */ + notes = DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN; + retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id); + if (retval == DLPI_SUCCESS) { + (void) dlpi_disabnotify(pi->pi_dh, id, NULL); + pi->pi_notes |= notes; + } + + /* + * Enable notification of hardware address changes to keep pi_hwaddr + * up-to-date and track if we need to offline/undo-offline phyints. + */ + notes = DL_NOTE_PHYS_ADDR; + retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id); + if (retval == DLPI_SUCCESS && poll_add(dlpi_fd(pi->pi_dh)) == 0) + pi->pi_notes |= notes; + + return (_B_TRUE); +failed: + logerr("%s: %s: %s\n", pi->pi_name, errmsg, dlpi_strerror(retval)); + if (pi->pi_dh != NULL) { + dlpi_close(pi->pi_dh); + pi->pi_dh = NULL; + } + return (_B_FALSE); +} + +/* + * Close use of link on `pi'. 
+ */ +void +phyint_link_close(struct phyint *pi) +{ + if (pi->pi_notes & DL_NOTE_PHYS_ADDR) { + (void) poll_remove(dlpi_fd(pi->pi_dh)); + pi->pi_notes &= ~DL_NOTE_PHYS_ADDR; + } + + /* + * NOTE: we don't clear pi_notes here so that iflinkstate() can still + * properly report the link state even when offline (which is possible + * since we use IFF_RUNNING to track link state). + */ + dlpi_close(pi->pi_dh); + pi->pi_dh = NULL; +} + /* Return the phyint instance with the given name and the given family */ struct phyint_instance * phyint_inst_lookup(int af, char *name) @@ -128,7 +296,7 @@ phyint_inst_lookup(int af, char *name) return (PHYINT_INSTANCE(pi, af)); } -static struct phyint_group * +struct phyint_group * phyint_group_lookup(const char *pg_name) { struct phyint_group *pg; @@ -173,6 +341,9 @@ phyint_insert(struct phyint *pi, struct phyint_group *pg) pi->pi_pgnext->pi_pgprev = pi; pg->pg_phyint = pi; + /* Refresh the group state now that this phyint has been added */ + phyint_group_refresh_state(pg); + pg->pg_sig++; (void) phyint_group_member_event(pg, pi, IPMP_IF_ADD); } @@ -214,24 +385,24 @@ phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex, } /* - * Record the phyint values. Also insert the phyint into the - * phyint group by calling phyint_insert(). + * Record the phyint values. */ (void) strlcpy(pi->pi_name, pi_name, sizeof (pi->pi_name)); pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME; pi->pi_ifindex = ifindex; - pi->pi_icmpid = - htons(((getpid() & 0xFF) << 8) | (pi->pi_ifindex & 0xFF)); + pi->pi_icmpid = htons(((getpid() & 0xFF) << 8) | (ifindex & 0xFF)); + /* - * We optimistically start in the PI_RUNNING state. Later (in - * process_link_state_changes()), we will readjust this to match the + * If the interface is offline, we set the state to PI_OFFLINE. + * Otherwise, we optimistically start in the PI_RUNNING state. Later + * (in process_link_state_changes()), we will adjust this to match the * current state of the link. 
Further, if test addresses are * subsequently assigned, we will transition to PI_NOTARGETS and then - * either PI_RUNNING or PI_FAILED, depending on the result of the test - * probes. + * to either PI_RUNNING or PI_FAILED depending on the probe results. */ - pi->pi_state = PI_RUNNING; + pi->pi_state = (flags & IFF_OFFLINE) ? PI_OFFLINE : PI_RUNNING; pi->pi_flags = PHYINT_FLAGS(flags); + /* * Initialise the link state. The link state is initialised to * up, so that if the link is down when IPMP starts monitoring @@ -241,19 +412,17 @@ phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex, */ INIT_LINK_STATE(pi); + if (!phyint_link_init(pi)) { + free(pi); + return (NULL); + } + /* * Insert the phyint in the list of all phyints, and the * list of phyint group members */ phyint_insert(pi, pg); - /* - * If we are joining a failed group, mark the interface as - * failed. - */ - if (GROUP_FAILED(pg)) - (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); - return (pi); } @@ -313,15 +482,14 @@ phyint_chstate(struct phyint *pi, enum pi_state state) return; pi->pi_state = state; - pi->pi_group->pg_sig++; - (void) phyint_state_event(pi->pi_group, pi); + phyint_changed(pi); } /* - * Note that the type of phyint `pi' has changed. + * Note that `pi' has changed state. */ void -phyint_newtype(struct phyint *pi) +phyint_changed(struct phyint *pi) { pi->pi_group->pg_sig++; (void) phyint_state_event(pi->pi_group, pi); @@ -331,7 +499,7 @@ phyint_newtype(struct phyint *pi) * Insert the phyint group in the linked list of all phyint groups * at the head of the list */ -static void +void phyint_group_insert(struct phyint_group *pg) { pg->pg_next = phyint_groups; @@ -347,7 +515,7 @@ phyint_group_insert(struct phyint_group *pg) /* * Create a new phyint group called 'name'. 
*/ -static struct phyint_group * +struct phyint_group * phyint_group_create(const char *name) { struct phyint_group *pg; @@ -363,9 +531,16 @@ phyint_group_create(const char *name) (void) strlcpy(pg->pg_name, name, sizeof (pg->pg_name)); pg->pg_sig = gensig(); - pg->pg_fdt = user_failure_detection_time; pg->pg_probeint = user_probe_interval; + pg->pg_in_use = _B_TRUE; + + /* + * Normal groups always start in the PG_FAILED state since they + * have no active interfaces. In contrast, anonymous groups are + * heterogeneous and thus always PG_OK. + */ + pg->pg_state = (name[0] == '\0' ? PG_OK : PG_FAILED); return (pg); } @@ -378,10 +553,20 @@ phyint_group_chstate(struct phyint_group *pg, enum pg_state state) { assert(pg != phyint_anongroup); + /* + * To simplify things, some callers always set a given state + * regardless of the previous state of the group (e.g., setting + * PG_DEGRADED when it's already set). We shouldn't bother + * generating an event or consuming a signature for these, since + * the actual state of the group is unchanged. + */ + if (pg->pg_state == state) + return; + + pg->pg_state = state; + switch (state) { case PG_FAILED: - pg->pg_groupfailed = 1; - /* * We can never know with certainty that a group has * failed. It is possible that all known targets have @@ -392,16 +577,15 @@ phyint_group_chstate(struct phyint_group *pg, enum pg_state state) * hosts, we have to discover it by multicast. So flush * all the host targets. The next probe will send out a * multicast echo request. If this is a group failure, we - * will still not see any response, otherwise we will - * clear the pg_groupfailed flag after we get - * NUM_PROBE_REPAIRS consecutive unicast replies on any - * phyint. + * will still not see any response, otherwise the group + * will be repaired after we get NUM_PROBE_REPAIRS + * consecutive unicast replies on any phyint. 
*/ target_flush_hosts(pg); break; - case PG_RUNNING: - pg->pg_groupfailed = 0; + case PG_OK: + case PG_DEGRADED: break; default: @@ -432,7 +616,6 @@ phyint_inst_init_from_k(int af, char *pi_name) struct lifreq lifr; struct phyint *pi; struct phyint_instance *pii; - boolean_t pg_created; boolean_t pi_created; struct phyint_group *pg; @@ -441,7 +624,6 @@ retry: pi = NULL; pg = NULL; pi_created = _B_FALSE; - pg_created = _B_FALSE; if (debug & D_PHYINT) { logdebug("phyint_inst_init_from_k(%s %s)\n", @@ -454,11 +636,11 @@ retry: ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6; /* - * Get the interface flags. Ignore loopback and multipoint - * interfaces. + * Get the interface flags. Ignore virtual interfaces, IPMP + * meta-interfaces, point-to-point interfaces, and interfaces + * that can't support multicast. */ - (void) strncpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; + (void) strlcpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name)); if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) { logperror("phyint_inst_init_from_k:" @@ -467,7 +649,8 @@ retry: return (NULL); } flags = lifr.lifr_flags; - if (!(flags & IFF_MULTICAST) || (flags & IFF_LOOPBACK)) + if (!(flags & IFF_MULTICAST) || + (flags & (IFF_VIRTUAL|IFF_IPMP|IFF_POINTOPOINT))) return (NULL); /* @@ -493,8 +676,7 @@ retry: } return (NULL); } - (void) strncpy(pg_name, lifr.lifr_groupname, sizeof (pg_name)); - pg_name[sizeof (pg_name) - 1] = '\0'; + (void) strlcpy(pg_name, lifr.lifr_groupname, sizeof (pg_name)); /* * If the phyint is not part of any group, pg_name is the @@ -503,12 +685,13 @@ retry: */ if (pg_name[0] == '\0' && !track_all_phyints) { /* - * If the IFF_FAILED or IFF_OFFLINE flags are set, reset - * them. These flags shouldn't be set if IPMP isn't - * tracking the interface. + * If the IFF_FAILED, IFF_INACTIVE, or IFF_OFFLINE flags are + * set, reset them. 
These flags shouldn't be set if in.mpathd + * isn't tracking the interface. */ - if ((flags & (IFF_FAILED | IFF_OFFLINE)) != 0) { - lifr.lifr_flags = flags & ~(IFF_FAILED | IFF_OFFLINE); + if ((flags & (IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE))) { + lifr.lifr_flags = flags & + ~(IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE); if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) { logperror("phyint_inst_init_from_k:" @@ -520,21 +703,20 @@ retry: } /* - * We need to create a new phyint instance. A phyint instance - * belongs to a phyint, and the phyint belongs to a phyint group. - * So we first lookup the 'parents' and if they don't exist then - * we create them. + * We need to create a new phyint instance. We may also need to + * create the group if e.g. the SIOCGLIFCONF loop in initifs() found + * an underlying interface before it found its IPMP meta-interface. + * Note that we keep created groups even if phyint_inst_init_from_k() + * fails since a group's existence is not dependent on the ability of + * in.mpathd to track the group's interfaces. */ - pg = phyint_group_lookup(pg_name); - if (pg == NULL) { - pg = phyint_group_create(pg_name); - if (pg == NULL) { - logerr("phyint_inst_init_from_k:" - " unable to create group %s\n", pg_name); + if ((pg = phyint_group_lookup(pg_name)) == NULL) { + if ((pg = phyint_group_create(pg_name)) == NULL) { + logerr("phyint_inst_init_from_k: cannot create group " + "%s\n", pg_name); return (NULL); } phyint_group_insert(pg); - pg_created = _B_TRUE; } /* @@ -546,8 +728,6 @@ retry: if (pi == NULL) { logerr("phyint_inst_init_from_k:" " unable to create phyint %s\n", pi_name); - if (pg_created) - phyint_group_delete(pg); return (NULL); } pi_created = _B_TRUE; @@ -564,8 +744,6 @@ retry: * while we are yet to update our tables. Do it now.
*/ if (pi->pi_ifindex != ifindex) { - if (pg_created) - phyint_group_delete(pg); phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af))); goto retry; } @@ -577,9 +755,6 @@ retry: * changed, while we are yet to update our tables. Do it now. */ if (strcmp(pi->pi_group->pg_name, pg_name) != 0) { - if (pg_created) - phyint_group_delete(pg); - restore_phyint(pi); phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af))); goto retry; @@ -594,16 +769,25 @@ retry: if (pii == NULL) { logerr("phyint_inst_init_from_k: unable to create" - "phyint inst %s\n", pi->pi_name); + " phyint inst %s\n", pi->pi_name); - if (pi_created) { - /* - * Deleting the phyint will delete the phyint group - * if this is the last phyint in the group. - */ + if (pi_created) phyint_delete(pi); - } + return (NULL); } + if (pi_created) { + /* + * If this phyint does not have a unique hardware address in its + * group, offline it. (The change_pif_flags() implementation + * requires that we defer this until after the phyint_instance + * is created.) + */ + if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) { + pi->pi_hwaddrdup = _B_TRUE; + (void) phyint_offline(pi, 0); + } + } + return (pii); } @@ -677,16 +861,16 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) { icmp6_filter_t filter; int hopcount = 1; - int int_op; + int off = 0; + int on = 1; struct sockaddr_in6 testaddr; /* * Open a raw socket with ICMPv6 protocol. * - * Use IPV6_DONTFAILOVER_IF to make sure that probes go out - * on the specified phyint only, and are not subject to load - * balancing. Bind to the src address chosen will ensure that - * the responses are received only on the specified phyint. + * Use IPV6_BOUND_IF to make sure that probes are sent and received on + * the specified phyint only. Bind to the test address to ensure that + * the responses are sent to the specified phyint. * * Set the hopcount to 1 so that probe packets are not routed. * Disable multicast loopback.
Set the receive filter to @@ -696,7 +880,7 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) if (pii->pii_probe_sock < 0) { logperror_pii(pii, "phyint_inst_v6_sockinit: socket"); return (_B_FALSE); -} + } bzero(&testaddr, sizeof (testaddr)); testaddr.sin6_family = AF_INET6; @@ -709,14 +893,17 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - /* - * IPV6_DONTFAILOVER_IF option takes precedence over setting - * IP_MULTICAST_IF. So we don't set IPV6_MULTICAST_IF again. - */ - if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_DONTFAILOVER_IF, + if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_IF, (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) { logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" - " IPV6_DONTFAILOVER_IF"); + " IPV6_MULTICAST_IF"); + return (_B_FALSE); + } + + if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_BOUND_IF, + &pii->pii_ifindex, sizeof (uint_t)) < 0) { + logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" + " IPV6_BOUND_IF"); return (_B_FALSE); } @@ -734,9 +921,8 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - int_op = 0; /* used to turn off option */ if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, - (char *)&int_op, sizeof (int_op)) < 0) { + (char *)&off, sizeof (off)) < 0) { logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" " IPV6_MULTICAST_LOOP"); return (_B_FALSE); @@ -755,15 +941,22 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - /* Enable receipt of ancillary data */ - int_op = 1; + /* Enable receipt of hoplimit */ if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT, - (char *)&int_op, sizeof (int_op)) < 0) { + &on, sizeof (on)) < 0) { logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" " IPV6_RECVHOPLIMIT"); return (_B_FALSE); } + /* Enable receipt of timestamp */ + if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP, + &on, sizeof (on)) < 0) { 
+ logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" + " SO_TIMESTAMP"); + return (_B_FALSE); + } + return (_B_TRUE); } @@ -775,20 +968,20 @@ static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii) { struct sockaddr_in testaddr; - char char_op; + char char_off = 0; int ttl = 1; char char_ttl = 1; + int on = 1; /* * Open a raw socket with ICMPv4 protocol. * - * Use IP_DONTFAILOVER_IF to make sure that probes go out - * on the specified phyint only, and are not subject to load - * balancing. Bind to the src address chosen will ensure that - * the responses are received only on the specified phyint. + * Use IP_BOUND_IF to make sure that probes are sent and received on + * the specified phyint only. Bind to the test address to ensure that + * the responses are sent to the specified phyint. * * Set the ttl to 1 so that probe packets are not routed. - * Disable multicast loopback. + * Disable multicast loopback. Enable receipt of timestamp. */ pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP); if (pii->pii_probe_sock < 0) { @@ -808,14 +1001,17 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - /* - * IP_DONTFAILOVER_IF option takes precedence over setting - * IP_MULTICAST_IF. So we don't set IP_MULTICAST_IF again. 
- */ - if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_DONTFAILOVER_IF, + if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_BOUND_IF, + &pii->pii_ifindex, sizeof (uint_t)) < 0) { + logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" + " IP_BOUND_IF"); + return (_B_FALSE); + } + + if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_IF, (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) { logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" - " IP_DONTFAILOVER"); + " IP_MULTICAST_IF"); return (_B_FALSE); } @@ -826,9 +1022,8 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - char_op = 0; /* used to turn off option */ if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP, - (char *)&char_op, sizeof (char_op)) == -1) { + (char *)&char_off, sizeof (char_off)) == -1) { logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" " IP_MULTICAST_LOOP"); return (_B_FALSE); @@ -841,6 +1036,13 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii) return (_B_FALSE); } + if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP, &on, + sizeof (on)) < 0) { + logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" + " SO_TIMESTAMP"); + return (_B_FALSE); + } + return (_B_TRUE); } @@ -848,7 +1050,7 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii) * Remove the phyint group from the list of 'all phyint groups' * and free it. */ -static void +void phyint_group_delete(struct phyint_group *pg) { /* @@ -881,10 +1083,69 @@ phyint_group_delete(struct phyint_group *pg) phyint_grouplistsig++; (void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE); + addrlist_free(&pg->pg_addrs); free(pg); } /* + * Refresh the state of `pg' based on its current members. + */ +void +phyint_group_refresh_state(struct phyint_group *pg) +{ + enum pg_state state; + enum pg_state origstate = pg->pg_state; + struct phyint *pi, *usablepi; + uint_t nif = 0, nusable = 0; + + /* + * Anonymous groups never change state. 
+ */ + if (pg == phyint_anongroup) + return; + + for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { + nif++; + if (phyint_is_usable(pi)) { + nusable++; + usablepi = pi; + } + } + + if (nusable == 0) + state = PG_FAILED; + else if (nif == nusable) + state = PG_OK; + else + state = PG_DEGRADED; + + phyint_group_chstate(pg, state); + + /* + * If we're shutting down, skip logging messages since otherwise our + * shutdown housecleaning will make us report that groups are unusable. + */ + if (cleanup_started) + return; + + /* + * NOTE: We use pg_failmsg_printed rather than origstate since + * otherwise at startup we'll log a "now usable" message when the + * first usable phyint is added to an empty group. + */ + if (state != PG_FAILED && pg->pg_failmsg_printed) { + assert(origstate == PG_FAILED); + logerr("At least 1 IP interface (%s) in group %s is now " + "usable\n", usablepi->pi_name, pg->pg_name); + pg->pg_failmsg_printed = _B_FALSE; + } else if (origstate != PG_FAILED && state == PG_FAILED) { + logerr("All IP interfaces in group %s are now unusable\n", + pg->pg_name); + pg->pg_failmsg_printed = _B_TRUE; + } +} + +/* * Extract information from the kernel about the desired phyint. * Look only for properties of the phyint and not properties of logints. * Take appropriate action on the changes. @@ -998,28 +1259,16 @@ phyint_inst_update_from_k(struct phyint_instance *pii) if (pi->pi_v6 != NULL) pi->pi_v6->pii_flags = pi->pi_flags; + /* + * Make sure the IFF_FAILED flag is set if and only if we think + * the interface should be failed. + */ if (pi->pi_flags & IFF_FAILED) { - /* - * If we are in the running and full state, we have - * completed failbacks successfully and we would have - * expected IFF_FAILED to have been clear. That it is - * set means there was a race condition. Some other - * process turned on the IFF_FAILED flag. Since the - * flag setting is not atomic, i.e. 
a get ioctl followed - * by a set ioctl, and since there is no way to set an - * individual flag bit, this could have occurred. - */ - if (pi->pi_state == PI_RUNNING && pi->pi_full) - (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); + if (pi->pi_state == PI_RUNNING) + (void) change_pif_flags(pi, 0, IFF_FAILED); } else { - /* - * If we are in the failed state, there was a race. - * we have completed failover successfully because our - * state is failed and empty. Some other process turned - * off the IFF_FAILED flag. Same comment as above - */ - if (pi->pi_state == PI_FAILED && pi->pi_empty) - (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); + if (pi->pi_state == PI_FAILED) + (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE); } /* No change in phyint status */ @@ -1028,12 +1277,12 @@ phyint_inst_update_from_k(struct phyint_instance *pii) /* * Delete the phyint. Remove it from the list of all phyints, and the - * list of phyint group members. If the group becomes empty, delete the - * group also. + * list of phyint group members. */ static void phyint_delete(struct phyint *pi) { + struct phyint *pi2; struct phyint_group *pg = pi->pi_group; if (debug & D_PHYINT) @@ -1065,6 +1314,9 @@ phyint_delete(struct phyint *pi) pi->pi_pgnext = NULL; pi->pi_pgprev = NULL; + /* Refresh the group state now that this phyint has been removed */ + phyint_group_refresh_state(pg); + /* Remove the phyint from the global list of phyints */ if (pi->pi_prev == NULL) { /* Phyint is the 1st in the list */ @@ -1077,11 +1329,153 @@ phyint_delete(struct phyint *pi) pi->pi_next = NULL; pi->pi_prev = NULL; + /* + * See if another phyint in the group had been offlined because + * it was a dup of `pi' -- and if so, online it. 
+ */ + if (!pi->pi_hwaddrdup && + (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) { + assert(pi2->pi_hwaddrdup); + (void) phyint_undo_offline(pi2); + } + phyint_link_close(pi); free(pi); +} + +/* + * Offline phyint `pi' if at least `minred' usable interfaces remain in the + * group. Returns an IPMP error code. + */ +int +phyint_offline(struct phyint *pi, uint_t minred) +{ + unsigned int nusable = 0; + struct phyint *pi2; + struct phyint_group *pg = pi->pi_group; + + /* + * Verify that enough usable interfaces in the group would remain. + * As a special case, if the group has failed, allow any non-offline + * phyints to be offlined. + */ + if (pg != phyint_anongroup) { + for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { + if (pi2 == pi) + continue; + if (phyint_is_usable(pi2) || + (GROUP_FAILED(pg) && pi2->pi_state != PI_OFFLINE)) + nusable++; + } + } + if (nusable < minred) + return (IPMP_EMINRED); + + if (!change_pif_flags(pi, IFF_OFFLINE, 0)) + return (IPMP_FAILURE); + + /* + * The interface is now offline, so stop probing it. Note that + * if_mpadm(1M) will down the test addresses, after receiving a + * success reply from us. The routing socket message will then make us + * close the socket used for sending probes. But it is more logical + * that an offlined interface must not be probed, even if it has test + * addresses. + * + * NOTE: stop_probing() also sets PI_OFFLINE. + */ + stop_probing(pi); + + /* + * If we're offlining the phyint because it has a duplicate hardware + * address, print a warning -- and leave the link open so that we can + * be notified of hardware address changes that make it usable again. + * Otherwise, close the link so that we won't prevent a detach. 
+ */ + if (pi->pi_hwaddrdup) { + logerr("IP interface %s has a hardware address which is not " + "unique in group %s; offlining\n", pi->pi_name, + pg->pg_name); + } else { + phyint_link_close(pi); + } + + /* + * If this phyint was preventing another phyint with a duplicate + * hardware address from being online, bring that one online now. + */ + if (!pi->pi_hwaddrdup && + (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) { + assert(pi2->pi_hwaddrdup); + (void) phyint_undo_offline(pi2); + } - /* Delete the phyint_group if the last phyint has been deleted */ - if (pg->pg_phyint == NULL) - phyint_group_delete(pg); + /* + * If this interface was active, try to activate another INACTIVE + * interface in the group. + */ + if (!(pi->pi_flags & IFF_INACTIVE)) + phyint_activate_another(pi); + + return (IPMP_SUCCESS); +} + +/* + * Undo a previous offline of `pi'. Returns an IPMP error code. + */ +int +phyint_undo_offline(struct phyint *pi) +{ + if (pi->pi_state != PI_OFFLINE) { + errno = EINVAL; + return (IPMP_FAILURE); + } + + /* + * If necessary, reinitialize our link information and verify that its + * hardware address is still unique across the group. + */ + if (pi->pi_dh == NULL && !phyint_link_init(pi)) { + errno = EIO; + return (IPMP_FAILURE); + } + + if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) { + pi->pi_hwaddrdup = _B_TRUE; + return (IPMP_EHWADDRDUP); + } + + if (pi->pi_hwaddrdup) { + logerr("IP interface %s now has a unique hardware address in " + "group %s; onlining\n", pi->pi_name, pi->pi_group->pg_name); + pi->pi_hwaddrdup = _B_FALSE; + } + + if (!change_pif_flags(pi, 0, IFF_OFFLINE)) + return (IPMP_FAILURE); + + /* + * While the interface was offline, it may have failed (e.g. the link + * may have gone down). phyint_inst_check_for_failure() will have + * already set pi_flags with IFF_FAILED, so we can use that to decide + * whether the phyint should transition to running. 
Note that after + * we transition to running, we will start sending probes again (if + * test addresses are configured), which may also reveal that the + * interface is in fact failed. + */ + if (pi->pi_flags & IFF_FAILED) { + phyint_chstate(pi, PI_FAILED); + } else { + /* calls phyint_chstate() */ + phyint_transition_to_running(pi); + } + + /* + * Give the requestor time to configure test addresses before + * complaining that they're missing. + */ + pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME; + + return (IPMP_SUCCESS); } /* @@ -1166,11 +1560,10 @@ phyint_inst_print(struct phyint_instance *pii) } logdebug("\nPhyint instance: %s %s index %u state %x flags %llx " - "sock %x in_use %d empty %x full %x\n", + "sock %x in_use %d\n", AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex, pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock, - pii->pii_in_use, pii->pii_phyint->pi_empty, - pii->pii_phyint->pi_full); + pii->pii_in_use); for (li = pii->pii_logint; li != NULL; li = li->li_next) logint_print(li); @@ -1211,9 +1604,11 @@ phyint_inst_print(struct phyint_instance *pii) } else { logdebug("#%d target NULL ", i); } - logdebug("time_sent %u status %d time_ack/lost %u\n", - pii->pii_probes[i].pr_time_sent, + logdebug("time_start %lld status %d " + "time_ackproc %lld time_lost %u", + pii->pii_probes[i].pr_hrtime_start, pii->pii_probes[i].pr_status, + pii->pii_probes[i].pr_hrtime_ackproc, pii->pii_probes[i].pr_time_lost); i = PROBE_INDEX_PREV(i); } while (i != most_recent); @@ -1293,7 +1688,6 @@ logint_init_from_k(struct phyint_instance *pii, char *li_name) struct logint *li; struct lifreq lifr; struct in6_addr test_subnet; - struct in6_addr test_subnet_mask; struct in6_addr testaddr; int test_subnet_len; struct sockaddr_in6 *sin6; @@ -1373,55 +1767,21 @@ logint_init_from_k(struct phyint_instance *pii, char *li_name) testaddr = sin6->sin6_addr; } - if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) { - ptp = _B_TRUE; - if (ioctl(ifsock, 
SIOCGLIFDSTADDR, (char *)&lifr) < 0) { - if (errno != ENXIO) { - logperror_li(li, "logint_init_from_k:" - " (get dstaddr)"); - } - goto error; - } - if (pii->pii_af == AF_INET) { - sin = (struct sockaddr_in *)&lifr.lifr_addr; - IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &tgaddr); - } else { - sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr; - tgaddr = sin6->sin6_addr; - } - } else { - if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) { - /* Interface may have vanished */ - if (errno != ENXIO) { - logperror_li(li, "logint_init_from_k:" - " (get subnet)"); - } - goto error; - } - if (lifr.lifr_subnet.ss_family == AF_INET6) { - sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet; - test_subnet = sin6->sin6_addr; - test_subnet_len = lifr.lifr_addrlen; - } else { - sin = (struct sockaddr_in *)&lifr.lifr_subnet; - IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet); - test_subnet_len = lifr.lifr_addrlen + - (IPV6_ABITS - IP_ABITS); - } - (void) ip_index_to_mask_v6(test_subnet_len, &test_subnet_mask); - } - - /* - * Also record the OINDEX for completeness. This information is - * not used. 
- */ - if (ioctl(ifsock, SIOCGLIFOINDEX, (char *)&lifr) < 0) { - if (errno != ENXIO) { - logperror_li(li, "logint_init_from_k:" - " (get lifoindex)"); - } + if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) { + /* Interface may have vanished */ + if (errno != ENXIO) + logperror_li(li, "logint_init_from_k: (get subnet)"); goto error; } + if (lifr.lifr_subnet.ss_family == AF_INET6) { + sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet; + test_subnet = sin6->sin6_addr; + test_subnet_len = lifr.lifr_addrlen; + } else { + sin = (struct sockaddr_in *)&lifr.lifr_subnet; + IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet); + test_subnet_len = lifr.lifr_addrlen + (IPV6_ABITS - IP_ABITS); + } /* * If this is the logint corresponding to the test address used for @@ -1454,7 +1814,6 @@ logint_init_from_k(struct phyint_instance *pii, char *li_name) /* Update the logint with the values obtained from the kernel. */ li->li_addr = testaddr; li->li_in_use = 1; - li->li_oifindex = lifr.lifr_index; if (ptp) { li->li_dstaddr = tgaddr; li->li_subnet_len = (pii->pii_af == AF_INET) ? @@ -1530,15 +1889,12 @@ static void logint_print(struct logint *li) { char abuf[INET6_ADDRSTRLEN]; - int af; - - af = li->li_phyint_inst->pii_af; + int af = li->li_phyint_inst->pii_af; logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name, pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len); - logdebug("\tFlags: %llx in_use %d oifindex %d\n", - li->li_flags, li->li_in_use, li->li_oifindex); + logdebug("\tFlags: %llx in_use %d\n", li->li_flags, li->li_in_use); } char * @@ -1555,6 +1911,33 @@ pr_addr(int af, struct in6_addr addr, char *abuf, int len) return (abuf); } +/* + * Fill in the sockaddr_storage pointed to by `ssp' with the IP address + * represented by the [`af',`addr'] pair. Needed because in.mpathd internally + * stores all addresses as in6_addrs, but we don't want to expose that. 
+ */ +void +addr2storage(int af, const struct in6_addr *addr, struct sockaddr_storage *ssp) +{ + struct sockaddr_in *sinp = (struct sockaddr_in *)ssp; + struct sockaddr_in6 *sin6p = (struct sockaddr_in6 *)ssp; + + assert(af == AF_INET || af == AF_INET6); + + switch (af) { + case AF_INET: + (void) memset(sinp, 0, sizeof (*sinp)); + sinp->sin_family = AF_INET; + IN6_V4MAPPED_TO_INADDR(addr, &sinp->sin_addr); + break; + case AF_INET6: + (void) memset(sin6p, 0, sizeof (*sin6p)); + sin6p->sin6_family = AF_INET6; + sin6p->sin6_addr = *addr; + break; + } +} + /* Lookup target on its address */ struct target * target_lookup(struct phyint_instance *pii, struct in6_addr addr) @@ -1686,7 +2069,7 @@ target_select_best(struct phyint_instance *pii) if (tg->tg_latime + MIN_RECOVERY_TIME < now) { slow_recovered = tg; /* - * Promote the slow_recoverd to unused + * Promote the slow_recovered to unused */ tg->tg_status = TG_UNUSED; } else { @@ -1698,7 +2081,7 @@ target_select_best(struct phyint_instance *pii) if (tg->tg_latime + MIN_RECOVERY_TIME < now) { dead_recovered = tg; /* - * Promote the dead_recoverd to slow + * Promote the dead_recovered to slow */ tg->tg_status = TG_SLOW; tg->tg_latime = now; @@ -1798,11 +2181,9 @@ target_create(struct phyint_instance *pii, struct in6_addr addr, /* * If there are multiple subnets associated with an interface, then - * add the target to this phyint instance, only if it belongs to the - * same subnet as the test address. The reason is that interface - * routes derived from non-test-addresses i.e. non-IFF_NOFAILOVER - * addresses, will disappear after failover, and the targets will not - * be reachable from this interface. + * add the target to this phyint instance only if it belongs to the + * same subnet as the test address. This assures us that we will + * be able to reach this target through our routing table. 
*/ if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len)) return; @@ -1906,11 +2287,12 @@ target_add(struct phyint_instance *pii, struct in6_addr addr, /* * If the target does not exist, create it; target_create() will set - * tg_in_use to true. If it exists already, and it is a router - * target, set tg_in_use to to true, so that init_router_targets() - * won't delete it + * tg_in_use to true. Even if it exists already, if it's a router + * target and we'd previously learned of it through multicast, then we + * need to recreate it as a router target. Otherwise, just set + * tg_in_use to true so that init_router_targets() won't delete it. */ - if (tg == NULL) + if (tg == NULL || (is_router && !pii->pii_targets_are_routers)) target_create(pii, addr, is_router); else if (is_router) tg->tg_in_use = 1; @@ -2034,16 +2416,17 @@ target_delete(struct target *tg) * relevant any longer. */ assert(pii->pii_targets == NULL); + pii->pii_targets_are_routers = _B_FALSE; clear_pii_probe_stats(pii); pii_other = phyint_inst_other(pii); /* - * If there are no targets on both instances and the interface is - * online, go back to PI_NOTARGETS state, since we cannot probe this - * phyint any more. For more details, please see phyint state - * diagram in mpd_probe.c. + * If there are no targets on both instances and the interface would + * otherwise be considered PI_RUNNING, go back to PI_NOTARGETS state, + * since we cannot probe this phyint any more. For more details, + * please see phyint state diagram in mpd_probe.c.
*/ - if (!PROBE_CAPABLE(pii_other) && + if (!PROBE_CAPABLE(pii_other) && LINK_UP(pii->pii_phyint) && pii->pii_phyint->pi_state != PI_OFFLINE) phyint_chstate(pii->pii_phyint, PI_NOTARGETS); } @@ -2101,9 +2484,11 @@ reset_pii_probes(struct phyint_instance *pii, struct target *tg) for (i = 0; i < PROBE_STATS_COUNT; i++) { if (pii->pii_probes[i].pr_target == tg) { + if (pii->pii_probes[i].pr_status == PR_UNACKED) { + probe_chstate(&pii->pii_probes[i], pii, + PR_LOST); + } pii->pii_probes[i].pr_target = NULL; - if (pii->pii_probes[i].pr_status == PR_UNACKED) - pii->pii_probes[i].pr_status = PR_LOST; } } @@ -2132,7 +2517,7 @@ target_print(struct target *tg) af = tg->tg_phyint_inst->pii_af; logdebug("Target on %s %s addr %s\n" - "status %d rtt_sa %d rtt_sd %d crtt %d tg_in_use %d\n", + "status %d rtt_sa %lld rtt_sd %lld crtt %d tg_in_use %d\n", AF_STR(af), tg->tg_phyint_inst->pii_name, pr_addr(af, tg->tg_address, abuf, sizeof (abuf)), tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd, @@ -2158,35 +2543,16 @@ phyint_inst_print_all(void) } /* - * Convert length for a mask to the mask. - */ -static void -ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask) -{ - int j; - - assert(masklen <= IPV6_ABITS); - bzero((char *)bitmask, sizeof (*bitmask)); - - /* Make the 'masklen' leftmost bits one */ - for (j = 0; masklen > 8; masklen -= 8, j++) - bitmask->s6_addr[j] = 0xff; - - bitmask->s6_addr[j] = 0xff << (8 - masklen); - -} - -/* * Compare two prefixes that have the same prefix length. * Fails if the prefix length is unreasonable. 
*/ -static boolean_t -prefix_equal(struct in6_addr p1, struct in6_addr p2, int prefix_len) +boolean_t +prefix_equal(struct in6_addr p1, struct in6_addr p2, uint_t prefix_len) { uchar_t mask; int j; - if (prefix_len < 0 || prefix_len > IPV6_ABITS) + if (prefix_len > IPV6_ABITS) return (_B_FALSE); for (j = 0; prefix_len > 8; prefix_len -= 8, j++) @@ -2202,35 +2568,25 @@ prefix_equal(struct in6_addr p1, struct in6_addr p2, int prefix_len) } /* - * Get the number of UP logints (excluding IFF_NOFAILOVERs), on both - * IPv4 and IPv6 put together. The phyint with the least such number - * will be used as the failover destination, if no standby interface is - * available + * Get the number of UP logints on phyint `pi'. */ -int +static int logint_upcount(struct phyint *pi) { struct logint *li; - struct phyint_instance *pii; int count = 0; - pii = pi->pi_v4; - if (pii != NULL) { - for (li = pii->pii_logint; li != NULL; li = li->li_next) { - if ((li->li_flags & - (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) { + if (pi->pi_v4 != NULL) { + for (li = pi->pi_v4->pii_logint; li != NULL; li = li->li_next) { + if (li->li_flags & IFF_UP) count++; - } } } - pii = pi->pi_v6; - if (pii != NULL) { - for (li = pii->pii_logint; li != NULL; li = li->li_next) { - if ((li->li_flags & - (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) { + if (pi->pi_v6 != NULL) { + for (li = pi->pi_v6->pii_logint; li != NULL; li = li->li_next) { + if (li->li_flags & IFF_UP) count++; - } } } @@ -2250,6 +2606,28 @@ phyint_inst_other(struct phyint_instance *pii) } /* + * Check whether a phyint is functioning. + */ +static boolean_t +phyint_is_functioning(struct phyint *pi) +{ + if (pi->pi_state == PI_RUNNING) + return (_B_TRUE); + return (pi->pi_state == PI_NOTARGETS && !(pi->pi_flags & IFF_FAILED)); +} + +/* + * Check whether a phyint is usable. 
+ */ +static boolean_t +phyint_is_usable(struct phyint *pi) +{ + if (logint_upcount(pi) == 0) + return (_B_FALSE); + return (phyint_is_functioning(pi)); +} + +/* * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'. * Before sending the event, it prepends the current version of the IPMP * sysevent API. Returns 0 on success, -1 on failure (in either case, @@ -2258,16 +2636,18 @@ phyint_inst_other(struct phyint_instance *pii) static int post_event(const char *subclass, nvlist_t *nvl) { - sysevent_id_t eid; + static evchan_t *evchp = NULL; /* - * Since sysevents don't work yet in non-global zones, there cannot - * possibly be any consumers yet, so don't bother trying to generate - * them. (Otherwise, we'll spew warnings.) + * Initialize the event channel if we haven't already done so. */ - if (getzoneid() != GLOBAL_ZONEID) { - nvlist_free(nvl); - return (0); + if (evchp == NULL) { + errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &evchp, EVCH_CREAT); + if (errno != 0) { + logerr("cannot create event channel `%s': %s\n", + IPMP_EVENT_CHAN, strerror(errno)); + goto failed; + } } errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION, @@ -2278,8 +2658,9 @@ post_event(const char *subclass, nvlist_t *nvl) goto failed; } - if (sysevent_post_event(EC_IPMP, (char *)subclass, SUNW_VENDOR, - "in.mpathd", nvl, &eid) == -1) { + errno = sysevent_evc_publish(evchp, EC_IPMP, subclass, "com.sun", + "in.mpathd", nvl, EVCH_NOSLEEP); + if (errno != 0) { logerr("cannot send `%s' event: %s\n", subclass, strerror(errno)); goto failed; @@ -2300,6 +2681,8 @@ ifstate(struct phyint *pi) { switch (pi->pi_state) { case PI_NOTARGETS: + if (pi->pi_flags & IFF_FAILED) + return (IPMP_IF_FAILED); return (IPMP_IF_UNKNOWN); case PI_OFFLINE: @@ -2330,12 +2713,203 @@ iftype(struct phyint *pi) } /* + * Return the external IPMP link state associated with phyint `pi'. 
+ */ +static ipmp_if_linkstate_t +iflinkstate(struct phyint *pi) +{ + if (!(pi->pi_notes & (DL_NOTE_LINK_UP|DL_NOTE_LINK_DOWN))) + return (IPMP_LINK_UNKNOWN); + + return (LINK_DOWN(pi) ? IPMP_LINK_DOWN : IPMP_LINK_UP); +} + +/* + * Return the external IPMP probe state associated with phyint `pi'. + */ +static ipmp_if_probestate_t +ifprobestate(struct phyint *pi) +{ + if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) + return (IPMP_PROBE_DISABLED); + + if (pi->pi_state == PI_FAILED) + return (IPMP_PROBE_FAILED); + + if (!PROBE_CAPABLE(pi->pi_v4) && !PROBE_CAPABLE(pi->pi_v6)) + return (IPMP_PROBE_UNKNOWN); + + return (IPMP_PROBE_OK); +} + +/* + * Return the external IPMP target mode associated with phyint instance `pii'. + */ +static ipmp_if_targmode_t +iftargmode(struct phyint_instance *pii) +{ + if (!PROBE_ENABLED(pii)) + return (IPMP_TARG_DISABLED); + else if (pii->pii_targets_are_routers) + return (IPMP_TARG_ROUTES); + else + return (IPMP_TARG_MULTICAST); +} + +/* + * Return the external IPMP flags associated with phyint `pi'. + */ +static ipmp_if_flags_t +ifflags(struct phyint *pi) +{ + ipmp_if_flags_t flags = 0; + + if (logint_upcount(pi) == 0) + flags |= IPMP_IFFLAG_DOWN; + if (pi->pi_flags & IFF_INACTIVE) + flags |= IPMP_IFFLAG_INACTIVE; + if (pi->pi_hwaddrdup) + flags |= IPMP_IFFLAG_HWADDRDUP; + if (phyint_is_functioning(pi) && flags == 0) + flags |= IPMP_IFFLAG_ACTIVE; + + return (flags); +} + +/* + * Store the test address used on phyint instance `pii' in `ssp'. If there's + * no test address, 0.0.0.0 is stored. + */ +static struct sockaddr_storage * +iftestaddr(struct phyint_instance *pii, struct sockaddr_storage *ssp) +{ + if (PROBE_ENABLED(pii)) + addr2storage(pii->pii_af, &pii->pii_probe_logint->li_addr, ssp); + else + addr2storage(AF_INET6, &in6addr_any, ssp); + + return (ssp); +} + +/* * Return the external IPMP group state associated with phyint group `pg'. 
*/ static ipmp_group_state_t groupstate(struct phyint_group *pg) { - return (GROUP_FAILED(pg) ? IPMP_GROUP_FAILED : IPMP_GROUP_OK); + switch (pg->pg_state) { + case PG_FAILED: + return (IPMP_GROUP_FAILED); + case PG_DEGRADED: + return (IPMP_GROUP_DEGRADED); + case PG_OK: + return (IPMP_GROUP_OK); + } + + logerr("groupstate: unknown state %d; aborting\n", pg->pg_state); + abort(); + /* NOTREACHED */ +} + +/* + * Return the external IPMP probe state associated with probe `ps'. + */ +static ipmp_probe_state_t +probestate(struct probe_stats *ps) +{ + switch (ps->pr_status) { + case PR_UNUSED: + case PR_LOST: + return (IPMP_PROBE_LOST); + case PR_UNACKED: + return (IPMP_PROBE_SENT); + case PR_ACKED: + return (IPMP_PROBE_ACKED); + } + + logerr("probestate: unknown state %d; aborting\n", ps->pr_status); + abort(); + /* NOTREACHED */ +} + +/* + * Generate an ESC_IPMP_PROBE_STATE sysevent for the probe described by `pr' + * on phyint instance `pii'. Returns 0 on success, -1 on failure. + */ +int +probe_state_event(struct probe_stats *pr, struct phyint_instance *pii) +{ + nvlist_t *nvl; + hrtime_t proc_time = 0, recv_time = 0; + struct sockaddr_storage ss; + struct target *tg = pr->pr_target; + + errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0); + if (errno != 0) { + logperror("cannot create `interface change' event"); + return (-1); + } + + errno = nvlist_add_uint32(nvl, IPMP_PROBE_ID, pr->pr_id); + if (errno != 0) + goto failed; + + errno = nvlist_add_string(nvl, IPMP_IF_NAME, pii->pii_phyint->pi_name); + if (errno != 0) + goto failed; + + errno = nvlist_add_uint32(nvl, IPMP_PROBE_STATE, probestate(pr)); + if (errno != 0) + goto failed; + + errno = nvlist_add_hrtime(nvl, IPMP_PROBE_START_TIME, + pr->pr_hrtime_start); + if (errno != 0) + goto failed; + + errno = nvlist_add_hrtime(nvl, IPMP_PROBE_SENT_TIME, + pr->pr_hrtime_sent); + if (errno != 0) + goto failed; + + if (pr->pr_status == PR_ACKED) { + recv_time = pr->pr_hrtime_ackrecv; + proc_time = pr->pr_hrtime_ackproc; + } + 
+ errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKRECV_TIME, recv_time); + if (errno != 0) + goto failed; + + errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKPROC_TIME, proc_time); + if (errno != 0) + goto failed; + + if (tg != NULL) + addr2storage(pii->pii_af, &tg->tg_address, &ss); + else + addr2storage(pii->pii_af, &in6addr_any, &ss); + + errno = nvlist_add_byte_array(nvl, IPMP_PROBE_TARGET, (uchar_t *)&ss, + sizeof (ss)); + if (errno != 0) + goto failed; + + errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTAVG, + tg->tg_rtt_sa / 8); + if (errno != 0) + goto failed; + + errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTDEV, + tg->tg_rtt_sd / 4); + if (errno != 0) + goto failed; + + return (post_event(ESC_IPMP_PROBE_STATE, nvl)); +failed: + logperror("cannot create `probe state' event"); + nvlist_free(nvl); + return (-1); } /* @@ -2529,10 +3103,15 @@ gensig(void) unsigned int getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp) { - struct phyint_group *pg; struct phyint *pi; + struct phyint_group *pg; char (*ifs)[LIFNAMSIZ]; - unsigned int nif, i; + unsigned int i, j; + unsigned int nif = 0, naddr = 0; + lifgroupinfo_t lifgr; + addrlist_t *addrp; + struct sockaddr_storage *addrs; + int fdt = 0; pg = phyint_group_lookup(grname); if (pg == NULL) @@ -2540,39 +3119,143 @@ getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp) /* * Tally up the number of interfaces, allocate an array to hold them, - * and insert their names into the array. + * and insert their names into the array. While we're at it, if any + * interface is actually enabled to send probes, save the group fdt. 
*/ - for (nif = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) + for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) nif++; ifs = alloca(nif * sizeof (*ifs)); for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) { assert(i < nif); (void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ); + if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) + fdt = pg->pg_fdt; } assert(i == nif); - *grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, - groupstate(pg), nif, ifs); + /* + * If this is the anonymous group, there's no other information to + * collect (since there's no IPMP interface). + */ + if (pg == phyint_anongroup) { + *grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt, + groupstate(pg), nif, ifs, "", "", "", "", 0, NULL); + return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); + } + + /* + * Grab some additional information about the group from the kernel. + * (NOTE: since SIOCGLIFGROUPINFO does not look up by interface name, + * we can use ifsock_v4 even for a V6-only group.) + */ + (void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ); + if (ioctl(ifsock_v4, SIOCGLIFGROUPINFO, &lifgr) == -1) { + if (errno == ENOENT) + return (IPMP_EUNKGROUP); + + logperror("getgroupinfo: SIOCGLIFGROUPINFO"); + return (IPMP_FAILURE); + } + + /* + * Tally up the number of data addresses, allocate an array to hold + * them, and insert their values into the array. + */ + for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) + naddr++; + + addrs = alloca(naddr * sizeof (*addrs)); + i = 0; + for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) { + /* + * It's possible to have duplicate addresses (if some are + * down). Weed the dups out to avoid confusing consumers. + * (If groups start having tons of addresses, we'll need a + * better algorithm here.) 
+ */ + for (j = 0; j < i; j++) { + if (sockaddrcmp(&addrs[j], &addrp->al_addr)) + break; + } + if (j == i) { + assert(i < naddr); + addrs[i++] = addrp->al_addr; + } + } + naddr = i; + + *grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt, + groupstate(pg), nif, ifs, lifgr.gi_grifname, lifgr.gi_m4ifname, + lifgr.gi_m6ifname, lifgr.gi_bcifname, naddr, addrs); return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); } /* + * Store the target information associated with phyint instance `pii' into a + * dynamically allocated structure pointed to by `*targinfopp'. Returns an + * IPMP error code. + */ +unsigned int +gettarginfo(struct phyint_instance *pii, const char *name, + ipmp_targinfo_t **targinfopp) +{ + uint_t ntarg = 0; + struct target *tg; + struct sockaddr_storage ss; + struct sockaddr_storage *targs = NULL; + + if (PROBE_CAPABLE(pii)) { + targs = alloca(pii->pii_ntargets * sizeof (*targs)); + tg = pii->pii_target_next; + do { + if (tg->tg_status == TG_ACTIVE) { + assert(ntarg < pii->pii_ntargets); + addr2storage(pii->pii_af, &tg->tg_address, + &targs[ntarg++]); + } + if ((tg = tg->tg_next) == NULL) + tg = pii->pii_targets; + } while (tg != pii->pii_target_next); + + assert(ntarg == pii->pii_ntargets); + } + + *targinfopp = ipmp_targinfo_create(name, iftestaddr(pii, &ss), + iftargmode(pii), ntarg, targs); + return (*targinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); +} + +/* * Store the information associated with interface `ifname' into a dynamically * allocated structure pointed to by `*ifinfopp'. Returns an IPMP error code. 
*/ unsigned int getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp) { + int retval; struct phyint *pi; + ipmp_targinfo_t *targinfo4; + ipmp_targinfo_t *targinfo6; pi = phyint_lookup(ifname); if (pi == NULL) return (IPMP_EUNKIF); + if ((retval = gettarginfo(pi->pi_v4, pi->pi_name, &targinfo4)) != 0 || + (retval = gettarginfo(pi->pi_v6, pi->pi_name, &targinfo6)) != 0) + goto out; + *ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name, - ifstate(pi), iftype(pi)); - return (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); + ifstate(pi), iftype(pi), iflinkstate(pi), ifprobestate(pi), + ifflags(pi), targinfo4, targinfo6); + retval = (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); +out: + if (targinfo4 != NULL) + ipmp_freetarginfo(targinfo4); + if (targinfo6 != NULL) + ipmp_freetarginfo(targinfo6); + return (retval); } /* @@ -2605,6 +3288,54 @@ getgrouplist(ipmp_grouplist_t **grlistpp) } /* + * Store the address information for `ssp' (in group `grname') into a + * dynamically allocated structure pointed to by `*adinfopp'. Returns an IPMP + * error code. (We'd call this function getaddrinfo(), but it would conflict + * with getaddrinfo(3SOCKET)). + */ +unsigned int +getgraddrinfo(const char *grname, struct sockaddr_storage *ssp, + ipmp_addrinfo_t **adinfopp) +{ + int ifsock; + addrlist_t *addrp, *addrmatchp = NULL; + ipmp_addr_state_t state; + const char *binding = ""; + struct lifreq lifr; + struct phyint_group *pg; + + if ((pg = phyint_group_lookup(grname)) == NULL) + return (IPMP_EUNKADDR); + + /* + * Walk through the data addresses, and find a match. Note that since + * some of the addresses may be down, more than one may match. We + * prefer an up address (if one exists). 
+ */ + for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) { + if (sockaddrcmp(ssp, &addrp->al_addr)) { + addrmatchp = addrp; + if (addrmatchp->al_flags & IFF_UP) + break; + } + } + + if (addrmatchp == NULL) + return (IPMP_EUNKADDR); + + state = (addrmatchp->al_flags & IFF_UP) ? IPMP_ADDR_UP : IPMP_ADDR_DOWN; + if (state == IPMP_ADDR_UP) { + ifsock = (ssp->ss_family == AF_INET) ? ifsock_v4 : ifsock_v6; + (void) strlcpy(lifr.lifr_name, addrmatchp->al_name, LIFNAMSIZ); + if (ioctl(ifsock, SIOCGLIFBINDING, &lifr) >= 0) + binding = lifr.lifr_binding; + } + + *adinfopp = ipmp_addrinfo_create(ssp, state, pg->pg_name, binding); + return (*adinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); +} + +/* * Store a snapshot of the IPMP subsystem into a dynamically allocated * structure pointed to by `*snapp'. Returns an IPMP error code. */ @@ -2613,10 +3344,12 @@ getsnap(ipmp_snap_t **snapp) { ipmp_grouplist_t *grlistp; ipmp_groupinfo_t *grinfop; + ipmp_addrinfo_t *adinfop; + ipmp_addrlist_t *adlistp; ipmp_ifinfo_t *ifinfop; ipmp_snap_t *snap; struct phyint *pi; - unsigned int i; + unsigned int i, j; int retval; snap = ipmp_snap_create(); @@ -2627,26 +3360,37 @@ getsnap(ipmp_snap_t **snapp) * Add group list. */ retval = getgrouplist(&snap->sn_grlistp); - if (retval != IPMP_SUCCESS) { - ipmp_snap_free(snap); - return (retval); - } + if (retval != IPMP_SUCCESS) + goto failed; /* - * Add information for each group in the list. + * Add information for each group in the list, along with all of its + * data addresses. 
*/ grlistp = snap->sn_grlistp; for (i = 0; i < grlistp->gl_ngroup; i++) { retval = getgroupinfo(grlistp->gl_groups[i], &grinfop); - if (retval != IPMP_SUCCESS) { - ipmp_snap_free(snap); - return (retval); - } + if (retval != IPMP_SUCCESS) + goto failed; + retval = ipmp_snap_addgroupinfo(snap, grinfop); if (retval != IPMP_SUCCESS) { ipmp_freegroupinfo(grinfop); - ipmp_snap_free(snap); - return (retval); + goto failed; + } + + adlistp = grinfop->gr_adlistp; + for (j = 0; j < adlistp->al_naddr; j++) { + retval = getgraddrinfo(grinfop->gr_name, + &adlistp->al_addrs[j], &adinfop); + if (retval != IPMP_SUCCESS) + goto failed; + + retval = ipmp_snap_addaddrinfo(snap, adinfop); + if (retval != IPMP_SUCCESS) { + ipmp_freeaddrinfo(adinfop); + goto failed; + } } } @@ -2655,18 +3399,19 @@ getsnap(ipmp_snap_t **snapp) */ for (pi = phyints; pi != NULL; pi = pi->pi_next) { retval = getifinfo(pi->pi_name, &ifinfop); - if (retval != IPMP_SUCCESS) { - ipmp_snap_free(snap); - return (retval); - } + if (retval != IPMP_SUCCESS) + goto failed; + retval = ipmp_snap_addifinfo(snap, ifinfop); if (retval != IPMP_SUCCESS) { ipmp_freeifinfo(ifinfop); - ipmp_snap_free(snap); - return (retval); + goto failed; } } *snapp = snap; return (IPMP_SUCCESS); +failed: + ipmp_snap_free(snap); + return (retval); } diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h index e4be3ccb30..39da2c3f1b 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _MPD_TABLES_H #define _MPD_TABLES_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -47,20 +45,11 @@ extern "C" { * switch AND * (ii) share the same phyint group name. 
* Load spreading and failover occur across members of the same phyint group. - * phyint group members must be homogenous. i.e. if a phyint belonging to a + * phyint group members must be homogeneous. i.e. if a phyint belonging to a * phyint group has a IPv6 protocol instance, then all members of the phyint * group, must have IPv6 protocol instances. (struct phyint_group) */ -/* - * Parameter passed to try_failover(), indicating the type of failover - * that is requested. - */ -#define FAILOVER_NORMAL 1 /* Failover to another phyint */ - /* that is preferably a standby */ -#define FAILOVER_TO_NONSTANDBY 2 /* Failover to non-standby phyint */ -#define FAILOVER_TO_ANY 3 /* Failover to any available phyint */ - #define MAXDEFERREDRTT 1 /* Maximum number of deferred rtts */ /* @@ -79,15 +68,9 @@ extern "C" { #define PI_IOCTL_ERROR 4 /* Some ioctl error */ #define PI_GROUP_CHANGED 5 /* The phyint has changed group. */ -/* - * Though IFF_POINTOPOINT is a logint property, for the purpose of - * failover, we treat it as a phyint property. Note that we cannot failover - * individual logints. - */ #define PHYINT_FLAGS(flags) \ - (((flags) & (IFF_STANDBY | IFF_INACTIVE | IFF_FAILED | IFF_OFFLINE | \ - IFF_POINTOPOINT | IFF_RUNNING)) | (handle_link_notifications ? \ - 0 : IFF_RUNNING)) + (((flags) & (IFF_STANDBY | IFF_INACTIVE | IFF_FAILED | IFF_OFFLINE | \ + IFF_RUNNING)) | (handle_link_notifications ? 0 : IFF_RUNNING)) /* A Phyint can have up to 2 instances, the IPv4 and the IPv6 instance */ #define PHYINT_INSTANCE(pi, af) \ @@ -152,29 +135,32 @@ extern "C" { * Phyint group states; see below for the phyint group definition. 
*/ enum pg_state { - PG_RUNNING = 1, /* at least one interface in group is working */ - PG_FAILED = 2 /* group has failed completely */ + PG_OK = 1, /* all interfaces in the group are working */ + PG_DEGRADED, /* some interfaces in the group are unusable */ + PG_FAILED /* all interfaces in the group are unusable */ }; /* * Convenience macro to check if the whole group has failed. */ -#define GROUP_FAILED(pg) ((pg)->pg_groupfailed) +#define GROUP_FAILED(pg) ((pg)->pg_state == PG_FAILED) /* * A doubly linked list of all phyint groups in the system. * A phyint group is identified by its group name. */ struct phyint_group { - char pg_name[LIFNAMSIZ + 1]; /* Phyint group name */ + char pg_name[LIFGRNAMSIZ]; /* Phyint group name */ struct phyint *pg_phyint; /* List of phyints in this group */ struct phyint_group *pg_next; /* Next phyint group */ struct phyint_group *pg_prev; /* Prev phyint group */ - uint64_t pg_sig; /* Current signature of this group */ - int pg_probeint; /* Interval between probes */ - int pg_fdt; /* Time needed to detect failure */ - uint_t - pg_groupfailed : 1; /* The whole group has failed */ + uint64_t pg_sig; /* Current signature of this group */ + int pg_probeint; /* Interval between probes */ + int pg_fdt; /* Time needed to detect failure */ + enum pg_state pg_state; /* Current group state */ + boolean_t pg_in_use; /* To detect removed groups */ + struct addrlist *pg_addrs; /* Data addresses in this group */ + boolean_t pg_failmsg_printed; /* Group failure msg printed */ }; /* @@ -207,6 +193,11 @@ struct phyint { uint16_t pi_icmpid; /* icmp id in icmp echo request */ uint64_t pi_taddrthresh; /* time (in secs) to delay logging */ /* about missing test addresses */ + dlpi_handle_t pi_dh; /* DLPI handle to underlying link */ + uint_t pi_notes; /* enabled DLPI notifications */ + uchar_t pi_hwaddr[DLPI_PHYSADDR_MAX]; /* phyint's hw address */ + size_t pi_hwaddrlen; /* phyint's hw address length */ + /* * The pi_whenup array is a circular buffer of 
the most recent * times (in milliseconds since some arbitrary point of time in @@ -217,14 +208,12 @@ struct phyint { unsigned int pi_whendx; uint_t - pi_empty : 1, /* failover done, empty */ - pi_full : 1, /* failback done, full */ - /* More details in probe.c */ pi_taddrmsg_printed : 1, /* testaddr msg printed */ pi_duptaddrmsg_printed : 1, /* dup testaddr msg printed */ pi_cfgmsg_printed : 1, /* bad config msg printed */ pi_lfmsg_printed : 1, /* link-flapping msg printed */ - pi_link_state : 1; /* interface link state */ + pi_link_state : 1, /* interface link state */ + pi_hwaddrdup : 1; /* disabled due to dup hw address */ }; /* @@ -260,19 +249,19 @@ struct phyint_instance { uint64_t pii_flags; /* Phyint flags from kernel */ struct probe_stats { - struct target *pr_target; /* Probe Target */ - uint_t pr_time_sent; /* Time probe was sent */ + uint_t pr_id; /* Full ID of probe */ + struct target *pr_target; /* Probe Target */ + uint_t pr_time_lost; /* Time probe declared lost */ + struct timeval pr_tv_sent; /* Wall time probe was sent */ + hrtime_t pr_hrtime_start; /* hrtime probe op started */ + hrtime_t pr_hrtime_sent; /* hrtime probe was sent */ + hrtime_t pr_hrtime_ackrecv; /* hrtime probe ack received */ + hrtime_t pr_hrtime_ackproc; /* hrtime probe ack processed */ uint_t pr_status; /* probe status as below */ #define PR_UNUSED 0 /* Probe slot unused */ #define PR_UNACKED 1 /* Probe is unacknowledged */ #define PR_ACKED 2 /* Probe has been acknowledged */ #define PR_LOST 3 /* Probe is declared lost */ - union { - uint_t tl; /* time probe is declared lost */ - uint_t ta; /* time probe is acked */ - } prt; -#define pr_time_lost prt.tl -#define pr_time_acked prt.ta } pii_probes[PROBE_STATS_COUNT]; uint_t @@ -319,7 +308,6 @@ struct logint { struct in6_addr li_subnet; /* prefix / subnet */ uint_t li_subnet_len; /* prefix / subnet length */ uint64_t li_flags; /* IFF_* flags */ - uint_t li_oifindex; /* original ifindex (SIOCGLIFOINDEX) */ uint_t li_in_use : 1, /* 
flag to detect deleted logints */ li_dupaddr : 1; /* test address is not unique */ @@ -345,12 +333,12 @@ struct target { #define TG_DEAD 4 /* Target is not responding */ hrtime_t tg_latime; /* Target's last active time */ - int tg_rtt_sa; /* Scaled round trip time(RTT) avg. */ - int tg_rtt_sd; /* Scaled RTT deviation */ - int tg_crtt; /* Conservative RTT = A + 4D */ + int64_t tg_rtt_sa; /* Scaled RTT average (in ns) */ + int64_t tg_rtt_sd; /* Scaled RTT deviation (in ns) */ + int tg_crtt; /* Conservative RTT = A + 4D (in ms) */ uint32_t tg_in_use : 1; /* In use flag */ - int tg_deferred[MAXDEFERREDRTT + 1]; + int64_t tg_deferred[MAXDEFERREDRTT + 1]; /* Deferred rtt data points */ int tg_num_deferred; /* Number of deferred rtt data points */ @@ -393,19 +381,20 @@ struct probe_success_count struct probes_missed { uint_t pm_nprobes; /* Cumulative number of missed probes */ - uint_t pm_ntimes; /* Total number of occassions */ + uint_t pm_ntimes; /* Total number of occasions */ }; -struct local_addr -{ - struct in6_addr addr; - struct local_addr *next; -}; +typedef struct addrlist { + struct addrlist *al_next; /* next address */ + char al_name[LIFNAMSIZ]; /* address lif name */ + uint64_t al_flags; /* address flags */ + struct sockaddr_storage al_addr; /* address */ +} addrlist_t; /* * Globals */ -extern struct local_addr *laddr_list; +extern addrlist_t *localaddrs; /* List of all local addresses, including local zones */ extern struct phyint *phyints; /* List of all phyints */ extern struct phyint_group *phyint_groups; /* List of all phyint groups */ @@ -428,10 +417,19 @@ extern void phyint_inst_delete(struct phyint_instance *pii); extern uint_t phyint_inst_timer(struct phyint_instance *pii); extern boolean_t phyint_inst_sockinit(struct phyint_instance *pii); -extern void phyint_newtype(struct phyint *pi); +extern void phyint_changed(struct phyint *pi); extern void phyint_chstate(struct phyint *pi, enum pi_state state); extern void phyint_group_chstate(struct 
phyint_group *pg, enum pg_state state); +extern struct phyint_group *phyint_group_create(const char *pg_name); +extern struct phyint_group *phyint_group_lookup(const char *pg_name); +extern void phyint_group_insert(struct phyint_group *pg); +extern void phyint_group_delete(struct phyint_group *pg); +extern void phyint_group_refresh_state(struct phyint_group *pg); extern void phyint_check_for_repair(struct phyint *pi); +extern void phyint_transition_to_running(struct phyint *pi); +extern void phyint_activate_another(struct phyint *pi); +extern int phyint_offline(struct phyint *pi, unsigned int); +extern int phyint_undo_offline(struct phyint *pi); extern void logint_init_from_k(struct phyint_instance *pii, char *li_name); extern void logint_delete(struct logint *li); @@ -448,34 +446,40 @@ extern void target_add(struct phyint_instance *pii, struct in6_addr addr, extern void in_data(struct phyint_instance *pii); extern void in6_data(struct phyint_instance *pii); -extern int try_failover(struct phyint *pi, int failover_type); -extern int try_failback(struct phyint *pi); -extern int do_failback(struct phyint *pi); -extern boolean_t change_lif_flags(struct phyint *pi, uint64_t flags, - boolean_t setfl); - extern void logperror_pii(struct phyint_instance *pii, const char *str); extern void logperror_li(struct logint *li, const char *str); extern char *pr_addr(int af, struct in6_addr addr, char *abuf, int len); +extern void addr2storage(int af, const struct in6_addr *addr, + struct sockaddr_storage *ssp); extern void phyint_inst_print_all(void); +extern boolean_t prefix_equal(struct in6_addr, struct in6_addr, uint_t); -extern int logint_upcount(struct phyint *pi); -extern void restore_phyint(struct phyint *pi); extern void reset_crtt_all(struct phyint *pi); extern int failure_state(struct phyint_instance *pii); extern void process_link_state_changes(void); extern void clear_pii_probe_stats(struct phyint_instance *pii); extern void start_timer(struct phyint_instance *pii); 
+extern void stop_probing(struct phyint *pi); extern boolean_t own_address(struct in6_addr addr); +extern boolean_t change_pif_flags(struct phyint *pi, uint64_t set, + uint64_t clear); extern void close_probe_socket(struct phyint_instance *pii, boolean_t flag); +extern int probe_state_event(struct probe_stats *, struct phyint_instance *); +extern void probe_chstate(struct probe_stats *, struct phyint_instance *, int); +extern unsigned int getgraddrinfo(const char *, struct sockaddr_storage *, + ipmp_addrinfo_t **); extern unsigned int getifinfo(const char *, ipmp_ifinfo_t **); extern unsigned int getgroupinfo(const char *, ipmp_groupinfo_t **); extern unsigned int getgrouplist(ipmp_grouplist_t **); extern unsigned int getsnap(ipmp_snap_t **); +extern boolean_t addrlist_add(addrlist_t **, const char *, uint64_t, + struct sockaddr_storage *); +extern void addrlist_free(addrlist_t **); + #ifdef __cplusplus } #endif diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c index 27716cabce..703ddcfaad 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c @@ -17,14 +17,11 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "defs.h" #include "tables.h" #include <fcntl.h> @@ -122,7 +119,7 @@ sendpacket(struct sockaddr_in6 *sin6, int sock, int size, int flags) char abuf[INET6_ADDRSTRLEN]; cc = sendto(sock, (char *)packet, size, flags, - (struct sockaddr *)sin6, sizeof (*sin6)); + (struct sockaddr *)sin6, sizeof (*sin6)); if (cc < 0 || cc != size) { if (cc < 0) { logperror("sendpacket: sendto"); @@ -135,6 +132,32 @@ sendpacket(struct sockaddr_in6 *sin6, int sock, int size, int flags) } } +/* + * If possible, place an ND_OPT_SOURCE_LINKADDR option at `optp'. + * Return the number of bytes placed in the option. + */ +static uint_t +add_opt_lla(struct phyint *pi, struct nd_opt_lla *optp) +{ + uint_t optlen; + uint_t hwaddrlen; + struct lifreq lifr; + + /* If this phyint doesn't have a link-layer address, bail */ + if (phyint_get_lla(pi, &lifr) == -1) + return (0); + + hwaddrlen = lifr.lifr_nd.lnr_hdw_len; + /* roundup to multiple of 8 and make padding zero */ + optlen = ((sizeof (struct nd_opt_hdr) + hwaddrlen + 7) / 8) * 8; + bzero(optp, optlen); + optp->nd_opt_lla_type = ND_OPT_SOURCE_LINKADDR; + optp->nd_opt_lla_len = optlen / 8; + bcopy(lifr.lifr_nd.lnr_hdw_addr, optp->nd_opt_lla_hdw_addr, hwaddrlen); + + return (optlen); +} + /* Send a Router Solicitation */ static void solicit(struct sockaddr_in6 *sin6, struct phyint *pi) @@ -151,24 +174,8 @@ solicit(struct sockaddr_in6 *sin6, struct phyint *pi) packetlen += sizeof (*rs); pptr += sizeof (*rs); - /* Attach any options */ - if (pi->pi_hdw_addr_len != 0) { - struct nd_opt_lla *lo = (struct nd_opt_lla *)pptr; - int optlen; - - /* roundup to multiple of 8 and make padding zero */ - optlen = ((sizeof (struct nd_opt_hdr) + - pi->pi_hdw_addr_len + 7) / 8) * 8; - bzero(pptr, optlen); - - lo->nd_opt_lla_type = ND_OPT_SOURCE_LINKADDR; - lo->nd_opt_lla_len = optlen / 8; - bcopy((char *)pi->pi_hdw_addr, - (char *)lo->nd_opt_lla_hdw_addr, - pi->pi_hdw_addr_len); - packetlen += optlen; - pptr += 
optlen; - } + /* add options */ + packetlen += add_opt_lla(pi, (struct nd_opt_lla *)pptr); if (debug & D_PKTOUT) { print_route_sol("Sending solicitation to ", pi, rs, packetlen, @@ -224,24 +231,9 @@ advertise(struct sockaddr_in6 *sin6, struct phyint *pi, boolean_t no_prefixes) return; } - /* Attach any options */ - if (pi->pi_hdw_addr_len != 0) { - struct nd_opt_lla *lo = (struct nd_opt_lla *)pptr; - int optlen; - - /* roundup to multiple of 8 and make padding zero */ - optlen = ((sizeof (struct nd_opt_hdr) + - pi->pi_hdw_addr_len + 7) / 8) * 8; - bzero(pptr, optlen); - - lo->nd_opt_lla_type = ND_OPT_SOURCE_LINKADDR; - lo->nd_opt_lla_len = optlen / 8; - bcopy((char *)pi->pi_hdw_addr, - (char *)lo->nd_opt_lla_hdw_addr, - pi->pi_hdw_addr_len); - packetlen += optlen; - pptr += optlen; - } + /* add options */ + packetlen += add_opt_lla(pi, (struct nd_opt_lla *)pptr); + pptr = (char *)packet + packetlen; if (pi->pi_AdvLinkMTU != 0) { struct nd_opt_mtu *mo = (struct nd_opt_mtu *)pptr; @@ -1671,10 +1663,10 @@ process_rtsock(int rtsock) return; } - if (ifm->ifm_flags != pi->pi_flags) { + if (ifm->ifm_flags != (uint_t)pi->pi_flags) { if (debug & D_IFSCAN) { logmsg(LOG_DEBUG, "process_rtsock: clr for " - "%s old flags 0x%x new flags 0x%x\n", + "%s old flags 0x%llx new flags 0x%x\n", pi->pi_name, pi->pi_flags, ifm->ifm_flags); } } @@ -1825,141 +1817,67 @@ process_mibsock(int mibsock) } /* - * Check whether the address formed by pr->pr_prefix and pi_token - * exists in the kernel. Cannot call SIOCTMYADDR/ONLINK as it - * does not check for down addresses. This function should not - * be called for onlink prefixes. 
- */ -static boolean_t -is_address_present(struct phyint *pi, struct prefix *pr, uint64_t flags) -{ - int s; - in6_addr_t addr, *token; - int i; - int ret; - struct sockaddr_in6 sin6; - - s = socket(AF_INET6, SOCK_DGRAM, 0); - if (s < 0) { - logperror("is_address_present: socket"); - /* - * By returning B_TRUE, we make the caller delete - * the prefix from the internal table. In the worst - * case the next RA will create the prefix. - */ - return (_B_TRUE); - } - if (flags & IFF_TEMPORARY) - token = &pi->pi_tmp_token; - else - token = &pi->pi_token; - for (i = 0; i < 16; i++) { - /* - * prefix_create ensures that pr_prefix has all-zero - * bits after prefixlen. - */ - addr.s6_addr[i] = pr->pr_prefix.s6_addr[i] | token->s6_addr[i]; - } - (void) memset(&sin6, 0, sizeof (struct sockaddr_in6)); - sin6.sin6_family = AF_INET6; - sin6.sin6_addr = addr; - ret = bind(s, (struct sockaddr *)&sin6, sizeof (struct sockaddr_in6)); - (void) close(s); - if (ret < 0 && errno == EADDRNOTAVAIL) - return (_B_FALSE); - else - return (_B_TRUE); -} - -/* * Look if the phyint or one of its prefixes have been removed from * the kernel and take appropriate action. - * Uses {pi,pr}_in_use. + * Uses pr_in_use and pi{,_kernel}_state. */ static void check_if_removed(struct phyint *pi) { - struct prefix *pr; - struct prefix *next_pr; + struct prefix *pr, *next_pr; /* - * Detect phyints that have been removed from the kernel. - * Since we can't recreate it here (would require ifconfig plumb - * logic) we just terminate use of that phyint. - */ - if (!(pi->pi_kernel_state & PI_PRESENT) && - (pi->pi_state & PI_PRESENT)) { - logmsg(LOG_ERR, "Interface %s has been removed from kernel. " - "in.ndpd will no longer use it\n", pi->pi_name); - /* - * Clear state so that should the phyint reappear - * we will start with initial advertisements or - * solicitations. - */ - phyint_cleanup(pi); - } - /* * Detect prefixes which are removed. 
- * - * We remove the prefix in all of the following cases : - * - * 1) Static prefixes are not the ones we create. So, - * just remove it from our tables. - * - * 2) On-link prefixes potentially move to a different - * phyint during failover. As it does not have - * an address, we can't use the logic in is_address_present - * to detect whether it is present in the kernel or not. - * Thus when it is manually removed we don't recreate it. - * - * 3) If there is a token mis-match and this prefix is not - * in the kernel, it means we don't need this prefix on - * this interface anymore. It must have been moved to a - * different interface by in.mpathd. This normally - * happens after a failover followed by a failback (or - * another failover) and we re-read the network - * configuration. For the failover from A to B, we would - * have created state on B about A's address, which will - * not be in use after the subsequent failback. So, we - * remove that prefix here. - * - * 4) If the physical interface is not present, then remove - * the prefix. In the cases where we are advertising - * prefixes, the state is kept in advertisement prefix and - * hence we can delete the prefix. - * - * 5) Similar to case (3), when we failover from A to B, the - * prefix in A will not be in use as it has been moved to B. - * We will delete it from our tables and recreate it when - * it fails back. is_address_present makes sure that the - * address is still valid in kernel. - * - * If none of the above is true, we recreate the prefix as it - * has been manually removed. We do it only when the interface - * is not FAILED or INACTIVE or OFFLINE. + * Static prefixes are just removed from our tables. + * Non-static prefixes are recreated i.e. in.ndpd takes precedence + * over manually removing prefixes via ifconfig. 
*/ for (pr = pi->pi_prefix_list; pr != NULL; pr = next_pr) { next_pr = pr->pr_next; if (!pr->pr_in_use) { - /* Clear PR_AUTO and PR_ONLINK */ + /* Clear everything except PR_STATIC */ pr->pr_kernel_state &= PR_STATIC; - if ((pr->pr_state & PR_STATIC) || - !(pr->pr_state & PR_AUTO) || - !(prefix_token_match(pi, pr, pr->pr_flags)) || - (!(pi->pi_kernel_state & PI_PRESENT)) || - (is_address_present(pi, pr, pr->pr_flags))) { + pr->pr_name[0] = '\0'; + if (pr->pr_state & PR_STATIC) { prefix_delete(pr); - } else if (!(pi->pi_flags & - (IFF_FAILED|IFF_INACTIVE|IFF_OFFLINE)) && - pr->pr_state != pr->pr_kernel_state) { - pr->pr_name[0] = '\0'; + } else if (!(pi->pi_kernel_state & PI_PRESENT)) { + /* + * Ensure that there are no future attempts to + * run prefix_update_k since the phyint is gone. + */ + pr->pr_state = pr->pr_kernel_state; + } else if (pr->pr_state != pr->pr_kernel_state) { logmsg(LOG_INFO, "Prefix manually removed " - "on %s - recreating it!\n", - pi->pi_name); + "on %s; recreating\n", pi->pi_name); prefix_update_k(pr); } } } + + /* + * Detect phyints that have been removed from the kernel, and tear + * down any prefixes we created that are associated with that phyint. + * (NOTE: IPMP depends on in.ndpd tearing down these prefixes so an + * administrator can easily place an IP interface with ADDRCONF'd + * addresses into an IPMP group.) + */ + if (!(pi->pi_kernel_state & PI_PRESENT) && + (pi->pi_state & PI_PRESENT)) { + logmsg(LOG_ERR, "Interface %s has been removed from kernel. " + "in.ndpd will no longer use it\n", pi->pi_name); + + for (pr = pi->pi_prefix_list; pr != NULL; pr = next_pr) { + next_pr = pr->pr_next; + if (pr->pr_state & PR_AUTO) + prefix_delete(pr); + } + + /* + * Clear state so that should the phyint reappear we will + * start with initial advertisements or solicitations. 
+ */ + phyint_cleanup(pi); + } } diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c index 5d64a9303d..0a9e1e6a13 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -383,29 +383,12 @@ incoming_ra(struct phyint *pi, struct nd_router_advert *ra, int len, if (no_loopback && loopback) return; - /* - * If the interface is FAILED or INACTIVE or OFFLINE, don't - * create any addresses on them. in.mpathd assumes that no new - * addresses will appear on these. This implies that we - * won't create any new prefixes advertised by the router - * on FAILED/INACTIVE/OFFLINE interfaces. When the state changes, - * the next RA will create the prefix on this interface. - */ - if (pi->pi_flags & (IFF_FAILED|IFF_INACTIVE|IFF_OFFLINE)) - return; + bzero(&lifr, sizeof (lifr)); + (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; - if (ioctl(pi->pi_sock, SIOCGLIFLNKINFO, (char *)&lifr) < 0) { - if (errno == ENXIO) - return; - logperror_pi(pi, "incoming_ra: SIOCGLIFLNKINFO"); - return; - } if (ra->nd_ra_curhoplimit != CURHOP_UNSPECIFIED && ra->nd_ra_curhoplimit != pi->pi_CurHopLimit) { pi->pi_CurHopLimit = ra->nd_ra_curhoplimit; - lifr.lifr_ifinfo.lir_maxhops = pi->pi_CurHopLimit; set_needed = _B_TRUE; } @@ -460,7 +443,7 @@ incoming_ra(struct phyint *pi, struct nd_router_advert *ra, int len, logmsg(LOG_DEBUG, "incoming_ra: trigger dhcp %s on %s\n", (ra->nd_ra_flags_reserved & ~pi->pi_ra_flags & - ND_RA_FLAG_MANAGED) ? "MANAGED" : "OTHER", + ND_RA_FLAG_MANAGED) ? 
"MANAGED" : "OTHER", pi->pi_name); } pi->pi_ra_flags |= ra->nd_ra_flags_reserved; @@ -999,11 +982,9 @@ incoming_prefix_addrconf_process(struct phyint *pi, struct prefix *pr, * Delete this prefix structure as kernel * does not allow duplicated addresses */ - logmsg(LOG_ERR, "incoming_prefix_addrconf_process: " - "Duplicate prefix %s received on interface %s\n", - inet_ntop(AF_INET6, - (void *)&po->nd_opt_pi_prefix, abuf, + "Duplicate prefix %s received on interface %s\n", + inet_ntop(AF_INET6, &po->nd_opt_pi_prefix, abuf, sizeof (abuf)), pi->pi_name); logmsg(LOG_ERR, "incoming_prefix_addrconf_process: " "Prefix already exists in interface %s\n", @@ -1129,12 +1110,8 @@ incoming_mtu_opt(struct phyint *pi, uchar_t *opt, } pi->pi_LinkMTU = mtu; - (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; - if (ioctl(pi->pi_sock, SIOCGLIFLNKINFO, (char *)&lifr) < 0) { - logperror_pi(pi, "incoming_mtu_opt: SIOCGLIFLNKINFO"); - return; - } + bzero(&lifr, sizeof (lifr)); + (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); lifr.lifr_ifinfo.lir_maxmtu = pi->pi_LinkMTU; if (ioctl(pi->pi_sock, SIOCSLIFLNKINFO, (char *)&lifr) < 0) { logperror_pi(pi, "incoming_mtu_opt: SIOCSLIFLNKINFO"); @@ -1155,33 +1132,33 @@ incoming_lla_opt(struct phyint *pi, uchar_t *opt, struct sockaddr_in6 *sin6; int max_content_len; - if (pi->pi_hdw_addr_len == 0) + /* + * Get our link-layer address length. We may not have one, in which + * case we can just bail. + */ + if (phyint_get_lla(pi, &lifr) != 0) return; /* * Can't remove padding since it is link type specific. - * However, we check against the length of our link-layer - * address. - * Note: assumes that all links have a fixed lengh address. + * However, we check against the length of our link-layer address. + * Note: assumes that all links have a fixed length address. 
*/ max_content_len = lo->nd_opt_lla_len * 8 - sizeof (struct nd_opt_hdr); - if (max_content_len < pi->pi_hdw_addr_len || + if (max_content_len < lifr.lifr_nd.lnr_hdw_len || (max_content_len >= 8 && - max_content_len - 7 > pi->pi_hdw_addr_len)) { + max_content_len - 7 > lifr.lifr_nd.lnr_hdw_len)) { char abuf[INET6_ADDRSTRLEN]; (void) inet_ntop(AF_INET6, (void *)&from->sin6_addr, abuf, sizeof (abuf)); logmsg(LOG_INFO, "lla option from %s on %s too long with bad " - "physaddr length (%d vs. %d bytes)\n", - abuf, pi->pi_name, - max_content_len, pi->pi_hdw_addr_len); + "physaddr length (%d vs. %d bytes)\n", abuf, pi->pi_name, + max_content_len, lifr.lifr_nd.lnr_hdw_len); return; } - lifr.lifr_nd.lnr_hdw_len = pi->pi_hdw_addr_len; - bcopy((char *)lo->nd_opt_lla_hdw_addr, - (char *)lifr.lifr_nd.lnr_hdw_addr, + bcopy(lo->nd_opt_lla_hdw_addr, lifr.lifr_nd.lnr_hdw_addr, lifr.lifr_nd.lnr_hdw_len); sin6 = (struct sockaddr_in6 *)&lifr.lifr_nd.lnr_addr; @@ -1196,8 +1173,7 @@ incoming_lla_opt(struct phyint *pi, uchar_t *opt, lifr.lifr_nd.lnr_state_same_lla = ND_UNCHANGED; lifr.lifr_nd.lnr_state_diff_lla = ND_STALE; lifr.lifr_nd.lnr_flags = isrouter; - (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; + (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); if (ioctl(pi->pi_sock, SIOCLIFSETND, (char *)&lifr) < 0) { logperror_pi(pi, "incoming_lla_opt: SIOCLIFSETND"); return; diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c index c8fc6381b7..09e6137965 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "defs.h" #include "tables.h" @@ -171,6 +169,7 @@ phyint_init_from_k(struct phyint *pi) struct ipv6_mreq v6mcastr; struct lifreq lifr; int fd; + int save_errno; boolean_t newsock; uint_t ttl; struct sockaddr_in6 *sin6; @@ -297,30 +296,6 @@ start_over: pi->pi_dst_token = in6addr_any; } - /* Get link-layer address */ - if (!(pi->pi_flags & IFF_MULTICAST) || - (pi->pi_flags & IFF_POINTOPOINT)) { - pi->pi_hdw_addr_len = 0; - } else { - sin6 = (struct sockaddr_in6 *)&lifr.lifr_nd.lnr_addr; - bzero(sin6, sizeof (struct sockaddr_in6)); - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = pi->pi_ifaddr; - - if (ioctl(fd, SIOCLIFGETND, (char *)&lifr) < 0) { - logperror_pi(pi, "phyint_init_from_k: SIOCLIFGETND"); - goto error; - } - - pi->pi_hdw_addr_len = lifr.lifr_nd.lnr_hdw_len; - - if (lifr.lifr_nd.lnr_hdw_len != 0) { - bcopy((char *)lifr.lifr_nd.lnr_hdw_addr, - (char *)pi->pi_hdw_addr, - lifr.lifr_nd.lnr_hdw_len); - } - } - if (newsock) { icmp6_filter_t filter; int on = 1; @@ -360,8 +335,21 @@ start_over: v6mcastr.ipv6mr_interface = pi->pi_index; if (setsockopt(fd, IPPROTO_IPV6, IPV6_JOIN_GROUP, (char *)&v6mcastr, sizeof (v6mcastr)) < 0) { - logperror_pi(pi, "phyint_init_from_k: " - "setsockopt IPV6_JOIN_GROUP"); + /* + * One benign reason IPV6_JOIN_GROUP could fail is + * when `pi' has been placed into an IPMP group and we + * haven't yet processed the routing socket message + * informing us of its disappearance. As such, if + * it's now in a group, don't print an error. 
+ */ + save_errno = errno; + (void) strlcpy(lifr.lifr_name, pi->pi_name, LIFNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) == -1 || + lifr.lifr_groupname[0] == '\0') { + errno = save_errno; + logperror_pi(pi, "phyint_init_from_k: " + "setsockopt IPV6_JOIN_GROUP"); + } goto error; } pi->pi_state |= PI_JOINED_ALLNODES; @@ -403,8 +391,17 @@ start_over: v6mcastr.ipv6mr_interface = pi->pi_index; if (setsockopt(fd, IPPROTO_IPV6, IPV6_JOIN_GROUP, (char *)&v6mcastr, sizeof (v6mcastr)) < 0) { - logperror_pi(pi, "phyint_init_from_k: setsockopt " - "IPV6_JOIN_GROUP"); + /* + * See IPV6_JOIN_GROUP comment above. + */ + save_errno = errno; + (void) strlcpy(lifr.lifr_name, pi->pi_name, LIFNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) == -1 || + lifr.lifr_groupname[0] == '\0') { + errno = save_errno; + logperror_pi(pi, "phyint_init_from_k: " + "setsockopt IPV6_JOIN_GROUP"); + } goto error; } pi->pi_state |= PI_JOINED_ALLROUTERS; @@ -569,22 +566,16 @@ phyint_print(struct phyint *pi) struct adv_prefix *adv_pr; struct router *dr; char abuf[INET6_ADDRSTRLEN]; - char llabuf[BUFSIZ]; logmsg(LOG_DEBUG, "Phyint %s index %d state %x, kernel %x, " "num routers %d\n", pi->pi_name, pi->pi_index, pi->pi_state, pi->pi_kernel_state, pi->pi_num_k_routers); - logmsg(LOG_DEBUG, "\taddress: %s flags %x\n", + logmsg(LOG_DEBUG, "\taddress: %s flags %llx\n", inet_ntop(AF_INET6, (void *)&pi->pi_ifaddr, abuf, sizeof (abuf)), pi->pi_flags); - logmsg(LOG_DEBUG, "\tsock %d mtu %d hdw_addr len %d <%s>\n", - pi->pi_sock, pi->pi_mtu, pi->pi_hdw_addr_len, - ((pi->pi_hdw_addr_len != 0) ? 
- fmt_lla(llabuf, sizeof (llabuf), pi->pi_hdw_addr, - pi->pi_hdw_addr_len) : "none")); - logmsg(LOG_DEBUG, "\ttoken: len %d %s\n", - pi->pi_token_length, + logmsg(LOG_DEBUG, "\tsock %d mtu %d\n", pi->pi_sock, pi->pi_mtu); + logmsg(LOG_DEBUG, "\ttoken: len %d %s\n", pi->pi_token_length, inet_ntop(AF_INET6, (void *)&pi->pi_token, abuf, sizeof (abuf))); if (pi->pi_TmpAddrsEnabled) { @@ -632,6 +623,43 @@ phyint_print(struct phyint *pi) logmsg(LOG_DEBUG, "\n"); } + +/* + * Store the LLA for the phyint `pi' `lifrp'. Returns 0 on success, or + * -1 on failure. + * + * Note that we do not cache the hardware address since there's no reliable + * mechanism to determine when it's become stale. + */ +int +phyint_get_lla(struct phyint *pi, struct lifreq *lifrp) +{ + struct sockaddr_in6 *sin6; + + /* If this phyint doesn't have a link-layer address, bail */ + if (!(pi->pi_flags & IFF_MULTICAST) || + (pi->pi_flags & IFF_POINTOPOINT)) { + return (-1); + } + + (void) strlcpy(lifrp->lifr_name, pi->pi_name, LIFNAMSIZ); + sin6 = (struct sockaddr_in6 *)&(lifrp->lifr_nd.lnr_addr); + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = pi->pi_ifaddr; + if (ioctl(pi->pi_sock, SIOCLIFGETND, lifrp) < 0) { + /* + * For IPMP interfaces, don't report ESRCH errors since that + * merely indicates that there are no active interfaces in the + * IPMP group (and thus there's no working hardware address), + * and the packet will thus never make it out anyway. + */ + if (!(pi->pi_flags & IFF_IPMP) || errno != ESRCH) + logperror_pi(pi, "phyint_get_lla: SIOCLIFGETND"); + return (-1); + } + return (0); +} + /* * Randomize pi->pi_ReachableTime. 
* Done periodically when there are no RAs and at a maximum frequency when @@ -642,20 +670,14 @@ phyint_print(struct phyint *pi) void phyint_reach_random(struct phyint *pi, boolean_t set_needed) { + struct lifreq lifr; + pi->pi_ReachableTime = GET_RANDOM( (int)(ND_MIN_RANDOM_FACTOR * pi->pi_BaseReachableTime), (int)(ND_MAX_RANDOM_FACTOR * pi->pi_BaseReachableTime)); if (set_needed) { - struct lifreq lifr; - - (void) strncpy(lifr.lifr_name, pi->pi_name, - sizeof (lifr.lifr_name)); - pi->pi_name[sizeof (pi->pi_name) - 1] = '\0'; - if (ioctl(pi->pi_sock, SIOCGLIFLNKINFO, (char *)&lifr) < 0) { - logperror_pi(pi, - "phyint_reach_random: SIOCGLIFLNKINFO"); - return; - } + bzero(&lifr, sizeof (lifr)); + (void) strlcpy(lifr.lifr_name, pi->pi_name, LIFNAMSIZ); lifr.lifr_ifinfo.lir_reachtime = pi->pi_ReachableTime; if (ioctl(pi->pi_sock, SIOCSLIFLNKINFO, (char *)&lifr) < 0) { logperror_pi(pi, @@ -1386,12 +1408,12 @@ prefix_modify_flags(struct prefix *pr, uint64_t onflags, uint64_t offflags) (void) strncpy(lifr.lifr_name, pr->pr_name, sizeof (lifr.lifr_name)); lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; if (ioctl(pi->pi_sock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { - logperror_pr(pr, "prefix_modify_flags: SIOCGLIFFLAGS"); - logmsg(LOG_ERR, "prefix_modify_flags(%s, %s) old 0x%llx " - "on 0x%llx off 0x%llx\n", - pr->pr_physical->pi_name, - pr->pr_name, - pr->pr_flags, onflags, offflags); + if (errno != ENXIO) { + logperror_pr(pr, "prefix_modify_flags: SIOCGLIFFLAGS"); + logmsg(LOG_ERR, "prefix_modify_flags(%s, %s) old 0x%llx" + " on 0x%llx off 0x%llx\n", pr->pr_physical->pi_name, + pr->pr_name, pr->pr_flags, onflags, offflags); + } return (-1); } old_flags = lifr.lifr_flags; @@ -1399,12 +1421,13 @@ prefix_modify_flags(struct prefix *pr, uint64_t onflags, uint64_t offflags) lifr.lifr_flags &= ~offflags; pr->pr_flags = lifr.lifr_flags; if (ioctl(pi->pi_sock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { - logperror_pr(pr, "prefix_modify_flags: SIOCSLIFFLAGS"); - logmsg(LOG_ERR, 
"prefix_modify_flags(%s, %s) old 0x%llx " - "new 0x%llx on 0x%llx off 0x%llx\n", - pr->pr_physical->pi_name, - pr->pr_name, - old_flags, lifr.lifr_flags, onflags, offflags); + if (errno != ENXIO) { + logperror_pr(pr, "prefix_modify_flags: SIOCSLIFFLAGS"); + logmsg(LOG_ERR, "prefix_modify_flags(%s, %s) old 0x%llx" + " new 0x%llx on 0x%llx off 0x%llx\n", + pr->pr_physical->pi_name, pr->pr_name, + old_flags, lifr.lifr_flags, onflags, offflags); + } return (-1); } return (0); @@ -1540,7 +1563,8 @@ prefix_update_k(struct prefix *pr) /* Remove logical interface based on pr_name */ lifr.lifr_addr.ss_family = AF_UNSPEC; - if (ioctl(pi->pi_sock, SIOCLIFREMOVEIF, (char *)&lifr) < 0) { + if (ioctl(pi->pi_sock, SIOCLIFREMOVEIF, (char *)&lifr) < 0 && + errno != ENXIO) { logperror_pr(pr, "prefix_update_k: SIOCLIFREMOVEIF"); } pr->pr_kernel_state = 0; @@ -1865,36 +1889,6 @@ prefix_print(struct prefix *pr) } /* - * Does the address formed by pr->pr_prefix and pi->pi_token match - * pr->pr_address. It does not match if a failover has happened - * earlier (done by in.mpathd) from a different pi. Should not - * be called for onlink prefixes. - */ -boolean_t -prefix_token_match(struct phyint *pi, struct prefix *pr, uint64_t flags) -{ - int i; - in6_addr_t addr, *token; - - if (flags & IFF_TEMPORARY) - token = &pi->pi_tmp_token; - else - token = &pi->pi_token; - for (i = 0; i < 16; i++) { - /* - * prefix_create ensures that pr_prefix has all-zero - * bits after prefixlen. - */ - addr.s6_addr[i] = pr->pr_prefix.s6_addr[i] | token->s6_addr[i]; - } - if (IN6_ARE_ADDR_EQUAL(&pr->pr_address, &addr)) { - return (_B_TRUE); - } else { - return (_B_FALSE); - } -} - -/* * Lookup advertisement prefix structure that matches the prefix and * prefix length. * Assumes that the bits after prefixlen might not be zero. 
@@ -2305,8 +2299,7 @@ phyint_print_all(void) } void -phyint_cleanup(pi) - struct phyint *pi; +phyint_cleanup(struct phyint *pi) { pi->pi_state = 0; pi->pi_kernel_state = 0; diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h index 409600a402..dfc5414d5d 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h +++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _NDPD_TABLES_H #define _NDPD_TABLES_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -58,9 +56,7 @@ struct phyint { char pi_name[LIFNAMSIZ]; /* Used to identify it */ int pi_sock; /* For sending and receiving */ struct in6_addr pi_ifaddr; /* Local address */ - uint_t pi_flags; /* IFF_* flags */ - uint_t pi_hdw_addr_len; - uchar_t pi_hdw_addr[ND_MAX_HDW_LEN]; + uint64_t pi_flags; /* IFF_* flags */ uint_t pi_mtu; /* From SIOCGLIFMTU */ struct in6_addr pi_token; uint_t pi_token_length; @@ -256,6 +252,7 @@ extern int phyint_init_from_k(struct phyint *pi); extern void phyint_delete(struct phyint *pi); extern uint_t phyint_timer(struct phyint *pi, uint_t elapsed); extern void phyint_print_all(void); +extern int phyint_get_lla(struct phyint *pi, struct lifreq *lifrp); extern void phyint_reach_random(struct phyint *pi, boolean_t set_needed); extern void phyint_cleanup(struct phyint *pi); @@ -280,8 +277,6 @@ extern void prefix_update_k(struct prefix *pr); extern uint_t prefix_timer(struct prefix *pr, uint_t elapsed); extern uint_t adv_prefix_timer(struct adv_prefix *adv_pr, uint_t elapsed); -extern boolean_t prefix_token_match(struct phyint *pi, - struct prefix *pr, uint64_t flags); extern struct prefix *prefix_lookup_addr(struct phyint *pi, struct in6_addr prefix); diff --git 
a/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c b/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c index 15db1b7539..b76341e303 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c +++ b/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c @@ -1,3 +1,7 @@ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ /* -*- Mode: C; tab-width: 4 -*- * * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved. @@ -130,8 +134,6 @@ First checkin */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mDNSUNP.h" #include "mDNSDebug.h" @@ -398,13 +400,11 @@ select_src_ifi_info_solaris(int sockfd, int numifs, continue; /* * Avoid address if any of the following flags are set: - * IFF_NOFAILOVER: IPMP test address for use only by in.mpathd * IFF_NOXMIT: no packets transmitted over interface * IFF_NOLOCAL: no address * IFF_PRIVATE: is not advertised */ - if (ifflags & (IFF_NOFAILOVER | IFF_NOXMIT - | IFF_NOLOCAL | IFF_PRIVATE)) + if (ifflags & (IFF_NOXMIT | IFF_NOLOCAL | IFF_PRIVATE)) continue; if (*best_lifr != NULL) { diff --git a/usr/src/cmd/cmd-inet/usr.sbin/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/Makefile index d91d113347..e29c1765ec 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/Makefile +++ b/usr/src/cmd/cmd-inet/usr.sbin/Makefile @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# @@ -65,12 +65,13 @@ K5TELNETOBJS= in.telnetd.o SRCS= $(PROGSRCS) $(OTHERSRC) SUBDIRS= bootconfchk htable ifconfig in.ftpd in.rdisc in.routed \ - in.talkd inetadm inetconv ipqosconf kssl/kssladm kssl/ksslcfg \ - ping routeadm snoop sppptun traceroute wificonfig ipsecutils + in.talkd inetadm inetconv ipmpstat ipqosconf ipsecutils \ + kssl/kssladm kssl/ksslcfg ping routeadm snoop sppptun \ + traceroute wificonfig MSGSUBDIRS= bootconfchk htable ifconfig in.ftpd in.routed in.talkd inetadm \ - inetconv ipqosconf kssl/ksslcfg routeadm sppptun snoop \ - wificonfig ipsecutils + inetconv ipmpstat ipqosconf ipsecutils kssl/ksslcfg routeadm \ + sppptun snoop wificonfig # As programs get lint-clean, add them here and to the 'lint' target. # Eventually this hack should go away, and all in PROG should be @@ -83,7 +84,8 @@ LINTCLEAN= 6to4relay arp in.rlogind in.rshd in.telnetd in.tftpd \ # with SUBDIRS. Also (sigh) deal with the commented-out build lines # for the lint rule. LINTSUBDIRS= bootconfchk in.rdisc in.routed in.talkd inetadm inetconv \ - ipqosconf ping routeadm sppptun traceroute wificonfig ipsecutils + ipmpstat ipqosconf ipsecutils ping routeadm sppptun traceroute \ + wificonfig # And as programs are verified not to attempt to write into constants, # -xstrconst should be used to ensure they stay that way. 
CONSTCLEAN= @@ -144,6 +146,8 @@ LDLIBS += $(K5LIBS) $(TSNETPROG) := LDLIBS += -ltsnet in.rarpd := LDLIBS += -linetutil -ldlpi +if_mpadm := LDLIBS += -linetutil -lipmp +if_mpadm.po := XGETFLAGS += -a route := CPPFLAGS += -DNDEBUG ndd := LDLIBS += -ldladm gettable in.comsat := LDFLAGS += $(MAPFILE.NGB:%=-M%) @@ -245,7 +249,7 @@ lint: $(LINTSUBDIRS) -I$(SRC)/lib/gss_mechs/mech_krb5/include \ -I$(SRC)/lib/pam_modules/krb5 \ in.telnetd.c $(LDLIBS) -lbsm -lpam -lsocket -lnsl - $(LINT.c) if_mpadm.c $(LDLIBS) -lsocket -lnsl -lipmp + $(LINT.c) if_mpadm.c $(LDLIBS) -lsocket -lnsl -lipmp -linetutil $(LINT.c) ipaddrsel.c $(LDLIBS) -lsocket -lnsl $(LINT.c) route.c $(LDLIBS) -lsocket -lnsl -ltsnet $(LINT.c) syncinit.c $(LDLIBS) -ldlpi diff --git a/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c b/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c index d4874135fd..7c5d73c796 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,660 +19,250 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <unistd.h> -#include <stdlib.h> +#include <errno.h> +#include <ipmp_admin.h> +#include <libinetutil.h> +#include <locale.h> +#include <net/if.h> +#include <stdarg.h> #include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> #include <sys/socket.h> -#include <netinet/in.h> -#include <netinet/tcp.h> #include <sys/sockio.h> -#include <net/if.h> -#include <errno.h> -#include <strings.h> -#include <ipmp_mpathd.h> -#include <libintl.h> +#include <sys/types.h> -static int if_down(int ifsock, struct lifreq *lifr); -static int if_up(int ifsock, struct lifreq *lifr); -static void send_cmd(int cmd, char *ifname); -static int connect_to_mpathd(sa_family_t family); -static void do_offline(char *ifname); -static void undo_offline(char *ifname); -static boolean_t offline_set(char *ifname); +typedef void offline_func_t(const char *, ipmp_handle_t); -#define IF_SEPARATOR ':' -#define MAX_RETRIES 3 +static const char *progname; +static int sioc4fd, sioc6fd; +static offline_func_t do_offline, undo_offline; +static boolean_t set_lifflags(const char *, uint64_t); +static boolean_t is_offline(const char *); +static void warn(const char *, ...); +static void die(const char *, ...); static void usage() { - (void) fprintf(stderr, "Usage : if_mpadm [-d | -r] <interface_name>\n"); + (void) fprintf(stderr, "Usage: %s [-d | -r] <interface>\n", progname); + exit(1); } -static void -print_mpathd_error_msg(uint32_t error) +static const char * +mpadm_errmsg(uint32_t error) { switch (error) { - case MPATHD_MIN_RED_ERROR: - (void) fprintf(stderr, gettext( - "Offline failed as there is no other functional " - "interface available in the multipathing group " - "for failing over the network access.\n")); - break; - - case MPATHD_FAILBACK_PARTIAL: - (void) fprintf(stderr, gettext( - "Offline cannot be undone because multipathing " - "configuration is not consistent across all the " - "interfaces in the 
group.\n")); - break; - + case IPMP_EUNKIF: + return ("not a physical interface or not in an IPMP group"); + case IPMP_EMINRED: + return ("no other functioning interfaces are in its IPMP " + "group"); default: - /* - * We shouldn't get here. All errors should have a - * meaningful error message, as shown in the above - * cases. If we get here, someone has made a mistake. - */ - (void) fprintf(stderr, gettext( - "Operation returned an unrecognized error: %u\n"), - error); - break; + return (ipmp_errmsg(error)); } } int main(int argc, char **argv) { - char *ifname; - int cmd = 0; + int retval; + ipmp_handle_t handle; + offline_func_t *ofuncp = NULL; + const char *ifname; int c; -#if !defined(TEXT_DOMAIN) -#define TEXT_DOMAIN "SYS_TEST" -#endif + if ((progname = strrchr(argv[0], '/')) != NULL) + progname++; + else + progname = argv[0]; + + (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); while ((c = getopt(argc, argv, "d:r:")) != EOF) { switch (c) { case 'd': ifname = optarg; - cmd = MI_OFFLINE; - if (offline_set(ifname)) { - (void) fprintf(stderr, gettext("Interface " - "already offlined\n")); - exit(1); - } + ofuncp = do_offline; break; case 'r': ifname = optarg; - cmd = MI_UNDO_OFFLINE; - if (!offline_set(ifname)) { - (void) fprintf(stderr, gettext("Interface not " - "offlined\n")); - exit(1); - } + ofuncp = undo_offline; break; default : usage(); - exit(1); } } - if (cmd == 0) { + if (ofuncp == NULL) usage(); - exit(1); - } /* - * Send the command to in.mpathd which is generic to - * both the commands. send_cmd returns only if there - * is no error. + * Create the global V4 and V6 socket ioctl descriptors. 
*/ - send_cmd(cmd, ifname); - if (cmd == MI_OFFLINE) { - do_offline(ifname); - } else { - undo_offline(ifname); - } + sioc4fd = socket(AF_INET, SOCK_DGRAM, 0); + sioc6fd = socket(AF_INET6, SOCK_DGRAM, 0); + if (sioc4fd == -1 || sioc6fd == -1) + die("cannot create sockets"); - return (0); -} + if ((retval = ipmp_open(&handle)) != IPMP_SUCCESS) + die("cannot create ipmp handle: %s\n", ipmp_errmsg(retval)); -/* - * Is IFF_OFFLINE set ? - * Returns B_FALSE on failure and B_TRUE on success. - */ -boolean_t -offline_set(char *ifname) -{ - struct lifreq lifr; - int s4; - int s6; - int ret; - - s4 = socket(AF_INET, SOCK_DGRAM, 0); - if (s4 < 0) { - perror("socket"); - exit(1); - } - s6 = socket(AF_INET6, SOCK_DGRAM, 0); - if (s6 < 0) { - perror("socket"); - exit(1); - } - (void) strncpy(lifr.lifr_name, ifname, sizeof (lifr.lifr_name)); - ret = ioctl(s4, SIOCGLIFFLAGS, (caddr_t)&lifr); - if (ret < 0) { - if (errno != ENXIO) { - perror("ioctl: SIOCGLIFFLAGS"); - exit(1); - } - ret = ioctl(s6, SIOCGLIFFLAGS, (caddr_t)&lifr); - if (ret < 0) { - perror("ioctl: SIOCGLIFFLAGS"); - exit(1); - } - } - (void) close(s4); - (void) close(s6); - if (lifr.lifr_flags & IFF_OFFLINE) - return (B_TRUE); - else - return (B_FALSE); + (*ofuncp)(ifname, handle); + + ipmp_close(handle); + (void) close(sioc4fd); + (void) close(sioc6fd); + + return (EXIT_SUCCESS); } /* - * Sends the command to in.mpathd. If not successful, prints - * an error message and exits. + * Checks whether IFF_OFFLINE is set on `ifname'. 
*/ -void -send_cmd(int cmd, char *ifname) +boolean_t +is_offline(const char *ifname) { - struct mi_offline mio; - struct mi_undo_offline miu; - struct mi_result me; - int ret; - int cmd_len; - int i; - int s; - - for (i = 0; i < MAX_RETRIES; i++) { - s = connect_to_mpathd(AF_INET); - if (s == -1) { - s = connect_to_mpathd(AF_INET6); - if (s == -1) { - (void) fprintf(stderr, gettext("Cannot " - "establish communication with " - "in.mpathd.\n")); - exit(1); - } - } - switch (cmd) { - case MI_OFFLINE : - cmd_len = sizeof (struct mi_offline); - bzero(&mio, cmd_len); - mio.mio_command = cmd; - (void) strncpy(mio.mio_ifname, ifname, LIFNAMSIZ); - mio.mio_min_redundancy = 1; - ret = write(s, &mio, cmd_len); - if (ret != cmd_len) { - /* errno is set only when ret is -1 */ - if (ret == -1) - perror("write"); - (void) fprintf(stderr, gettext("Failed to " - "successfully send command to " - "in.mpathd.\n")); - exit(1); - } - break; - case MI_UNDO_OFFLINE: - cmd_len = sizeof (struct mi_undo_offline); - bzero(&miu, cmd_len); - miu.miu_command = cmd; - (void) strncpy(miu.miu_ifname, ifname, LIFNAMSIZ); - ret = write(s, &miu, cmd_len); - if (ret != cmd_len) { - /* errno is set only when ret is -1 */ - if (ret == -1) - perror("write"); - (void) fprintf(stderr, gettext("Failed to " - "successfully send command to " - "in.mpathd.\n")); - exit(1); - } - break; - default : - (void) fprintf(stderr, "Unknown command \n"); - exit(1); - } + struct lifreq lifr = { 0 }; - /* Read the result from mpathd */ - ret = read(s, &me, sizeof (me)); - if (ret != sizeof (me)) { - /* errno is set only when ret is -1 */ - if (ret == -1) - perror("read"); - (void) fprintf(stderr, gettext("Failed to successfully " - "read result from in.mpathd.\n")); - exit(1); + (void) strlcpy(lifr.lifr_name, ifname, sizeof (lifr.lifr_name)); + if (ioctl(sioc4fd, SIOCGLIFFLAGS, &lifr) == -1) { + if (errno != ENXIO || + ioctl(sioc6fd, SIOCGLIFFLAGS, &lifr) == -1) { + die("cannot get interface flags on %s", ifname); } - if 
(me.me_mpathd_error == 0) { - if (i != 0) { - /* - * We retried at least once. Tell the user - * that things succeeded now. - */ - (void) fprintf(stderr, - gettext("Retry Successful.\n")); - } - return; /* Successful */ - } - - if (me.me_mpathd_error == MPATHD_SYS_ERROR) { - if (me.me_sys_error == EAGAIN) { - (void) close(s); - (void) sleep(1); - (void) fprintf(stderr, - gettext("Retrying ...\n")); - continue; /* Retry */ - } - errno = me.me_sys_error; - perror("if_mpadm"); - } else { - print_mpathd_error_msg(me.me_mpathd_error); - } - exit(1); } - /* - * We come here only if we retry the operation multiple - * times and did not succeed. Let the user try it again - * later. - */ - (void) fprintf(stderr, - gettext("Device busy. Retry the operation later.\n")); - exit(1); + + return ((lifr.lifr_flags & IFF_OFFLINE) != 0); } static void -do_offline(char *ifname) +do_offline(const char *ifname, ipmp_handle_t handle) { - struct lifreq lifr; - struct lifreq *lifcr; - struct lifnum lifn; - struct lifconf lifc; - char *buf; - int numifs; - int n; - char pi_name[LIFNAMSIZ + 1]; - char *cp; - int ifsock_v4; - int ifsock_v6; - int af; - int ret; + ifaddrlistx_t *ifaddrp, *ifaddrs; + int retval; + + if (is_offline(ifname)) + die("interface %s is already offline\n", ifname); + + if ((retval = ipmp_offline(handle, ifname, 1)) != IPMP_SUCCESS) + die("cannot offline %s: %s\n", ifname, mpadm_errmsg(retval)); /* - * Verify whether IFF_OFFLINE is not set as a sanity check. - */ - if (!offline_set(ifname)) { - (void) fprintf(stderr, gettext("Operation failed : in.mpathd " - "has not set IFF_OFFLINE on %s\n"), ifname); - exit(1); - } - /* - * Get both the sockets as we may need to bring both - * IPv4 and IPv6 interfaces down. 
- */ - ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); - if (ifsock_v4 < 0) { - perror("socket"); - exit(1); - } - ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); - if (ifsock_v6 < 0) { - perror("socket"); - exit(1); - } - /* - * Get all the logicals for "ifname" and mark them down. - * There is no easy way of doing this. We get all the - * interfaces in the system using SICGLIFCONF and mark the - * ones matching the name down. + * Get all the up addresses for `ifname' and bring them down. */ - lifn.lifn_family = AF_UNSPEC; - lifn.lifn_flags = 0; - if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { - perror("ioctl : SIOCGLIFNUM"); - exit(1); - } - numifs = lifn.lifn_count; - - buf = calloc(numifs, sizeof (struct lifreq)); - if (buf == NULL) { - perror("calloc"); - exit(1); - } + if (ifaddrlistx(ifname, IFF_UP, 0, &ifaddrs) == -1) + die("cannot get addresses on %s", ifname); - lifc.lifc_family = AF_UNSPEC; - lifc.lifc_flags = 0; - lifc.lifc_len = numifs * sizeof (struct lifreq); - lifc.lifc_buf = buf; + for (ifaddrp = ifaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!(ifaddrp->ia_flags & IFF_OFFLINE)) + warn("IFF_OFFLINE vanished on %s\n", ifaddrp->ia_name); - if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { - perror("ioctl : SIOCGLIFCONF"); - exit(1); + if (!set_lifflags(ifaddrp->ia_name, + ifaddrp->ia_flags & ~IFF_UP)) + warn("cannot bring down address on %s\n", + ifaddrp->ia_name); } - lifcr = (struct lifreq *)lifc.lifc_req; - for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifcr++) { - af = lifcr->lifr_addr.ss_family; - (void) strncpy(pi_name, lifcr->lifr_name, - sizeof (pi_name)); - pi_name[sizeof (pi_name) - 1] = '\0'; - if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) - *cp = '\0'; - if (strcmp(pi_name, ifname) == 0) { - /* It matches the interface name that was offlined */ - (void) strncpy(lifr.lifr_name, lifcr->lifr_name, - sizeof (lifr.lifr_name)); - if (af == AF_INET) - ret = if_down(ifsock_v4, &lifr); - else - ret = 
if_down(ifsock_v6, &lifr); - if (ret != 0) { - (void) fprintf(stderr, gettext("Bringing down " - "the interfaces failed.\n")); - exit(1); - } - } - } + ifaddrlistx_free(ifaddrs); } static void -undo_offline(char *ifname) +undo_offline(const char *ifname, ipmp_handle_t handle) { - struct lifreq lifr; - struct lifreq *lifcr; - struct lifnum lifn; - struct lifconf lifc; - char *buf; - int numifs; - int n; - char pi_name[LIFNAMSIZ + 1]; - char *cp; - int ifsock_v4; - int ifsock_v6; - int af; - int ret; + ifaddrlistx_t *ifaddrp, *ifaddrs; + int retval; + + if (!is_offline(ifname)) + die("interface %s is not offline\n", ifname); /* - * Verify whether IFF_OFFLINE is set as a sanity check. - */ - if (offline_set(ifname)) { - (void) fprintf(stderr, gettext("Operation failed : in.mpathd " - "has not cleared IFF_OFFLINE on %s\n"), ifname); - exit(1); - } - /* - * Get both the sockets as we may need to bring both - * IPv4 and IPv6 interfaces UP. - */ - ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); - if (ifsock_v4 < 0) { - perror("socket"); - exit(1); - } - ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); - if (ifsock_v6 < 0) { - perror("socket"); - exit(1); - } - /* - * Get all the logicals for "ifname" and mark them up. - * There is no easy way of doing this. We get all the - * interfaces in the system using SICGLIFCONF and mark the - * ones matching the name up. + * Get all the down addresses for `ifname' and bring them up. 
*/ - lifn.lifn_family = AF_UNSPEC; - lifn.lifn_flags = 0; - if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { - perror("ioctl : SIOCGLIFNUM"); - exit(1); - } - numifs = lifn.lifn_count; - - buf = calloc(numifs, sizeof (struct lifreq)); - if (buf == NULL) { - perror("calloc"); - exit(1); - } + if (ifaddrlistx(ifname, 0, IFF_UP, &ifaddrs) == -1) + die("cannot get addresses for %s", ifname); - lifc.lifc_family = AF_UNSPEC; - lifc.lifc_flags = 0; - lifc.lifc_len = numifs * sizeof (struct lifreq); - lifc.lifc_buf = buf; + for (ifaddrp = ifaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!(ifaddrp->ia_flags & IFF_OFFLINE)) + warn("IFF_OFFLINE vanished on %s\n", ifaddrp->ia_name); - if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { - perror("ioctl : SIOCGLIFCONF"); - exit(1); + if (!set_lifflags(ifaddrp->ia_name, ifaddrp->ia_flags | IFF_UP)) + warn("cannot bring up address on %s\n", + ifaddrp->ia_name); } - lifcr = (struct lifreq *)lifc.lifc_req; - for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifcr++) { - af = lifcr->lifr_addr.ss_family; - (void) strncpy(pi_name, lifcr->lifr_name, - sizeof (pi_name)); - pi_name[sizeof (pi_name) - 1] = '\0'; - if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) - *cp = '\0'; - - if (strcmp(pi_name, ifname) == 0) { - /* It matches the interface name that was offlined */ - (void) strncpy(lifr.lifr_name, lifcr->lifr_name, - sizeof (lifr.lifr_name)); - if (af == AF_INET) - ret = if_up(ifsock_v4, &lifr); - else - ret = if_up(ifsock_v6, &lifr); - if (ret != 0) { - (void) fprintf(stderr, gettext("Bringing up " - "the interfaces failed.\n")); - exit(1); - } - } - } -} + ifaddrlistx_free(ifaddrs); -/* - * Returns -1 on failure. Returns the socket file descriptor on - * success. 
- */ -static int -connect_to_mpathd(sa_family_t family) -{ - int s; - struct sockaddr_storage ss; - struct sockaddr_in *sin = (struct sockaddr_in *)&ss; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss; - struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; - int addrlen; - int ret; - int on; - - s = socket(family, SOCK_STREAM, 0); - if (s < 0) { - perror("socket"); - return (-1); - } - bzero((char *)&ss, sizeof (ss)); - ss.ss_family = family; /* - * Need to bind to a privileged port. For non-root, this - * will fail. in.mpathd verifies that only commands coming - * from privileged ports succeed so that the ordinary user - * can't issue offline commands. + * Undo the offline. */ - on = 1; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - perror("setsockopt : TCP_ANONPRIVBIND"); - exit(1); - } - switch (family) { - case AF_INET: - sin->sin_port = 0; - sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - addrlen = sizeof (struct sockaddr_in); - break; - case AF_INET6: - sin6->sin6_port = 0; - sin6->sin6_addr = loopback_addr; - addrlen = sizeof (struct sockaddr_in6); - break; - } - ret = bind(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - perror("bind"); - return (-1); - } - switch (family) { - case AF_INET: - sin->sin_port = htons(MPATHD_PORT); - break; - case AF_INET6: - sin6->sin6_port = htons(MPATHD_PORT); - break; + if ((retval = ipmp_undo_offline(handle, ifname)) != IPMP_SUCCESS) { + die("cannot undo-offline %s: %s\n", ifname, + mpadm_errmsg(retval)); } - ret = connect(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - perror("connect"); - return (-1); - } - on = 0; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - perror("setsockopt : TCP_ANONPRIVBIND"); - return (-1); - } - return (s); + + /* + * Verify whether IFF_OFFLINE is set as a sanity check. 
+ */ + if (is_offline(ifname)) + warn("in.mpathd has not cleared IFF_OFFLINE on %s\n", ifname); } /* - * Bring down the interface specified by the name lifr->lifr_name. - * - * Returns -1 on failure. Returns 0 on success. + * Change `lifname' to have `flags' set. Returns B_TRUE on success. */ -static int -if_down(int ifsock, struct lifreq *lifr) +static boolean_t +set_lifflags(const char *lifname, uint64_t flags) { - int ret; + struct lifreq lifr = { 0 }; + int fd = (flags & IFF_IPV4) ? sioc4fd : sioc6fd; - ret = ioctl(ifsock, SIOCGLIFFLAGS, (caddr_t)lifr); - if (ret < 0) { - perror("ioctl: SIOCGLIFFLAGS"); - return (-1); - } + (void) strlcpy(lifr.lifr_name, lifname, LIFNAMSIZ); + lifr.lifr_flags = flags; - /* IFF_OFFLINE was set to start with. Is it still there ? */ - if (!(lifr->lifr_flags & (IFF_OFFLINE))) { - (void) fprintf(stderr, gettext("IFF_OFFLINE disappeared on " - "%s\n"), lifr->lifr_name); - return (-1); - } - lifr->lifr_flags &= ~IFF_UP; - ret = ioctl(ifsock, SIOCSLIFFLAGS, (caddr_t)lifr); - if (ret < 0) { - perror("ioctl: SIOCSLIFFLAGS"); - return (-1); - } - return (0); + return (ioctl(fd, SIOCSLIFFLAGS, &lifr) >= 0); } -/* - * Bring up the interface specified by the name lifr->lifr_name. - * - * Returns -1 on failure. Returns 0 on success. - */ -static int -if_up(int ifsock, struct lifreq *lifr) +/* PRINTFLIKE1 */ +static void +die(const char *format, ...) 
{ - int ret; - boolean_t zeroaddr = B_FALSE; - struct sockaddr_in *addr; - - ret = ioctl(ifsock, SIOCGLIFADDR, lifr); - if (ret < 0) { - perror("ioctl: SIOCGLIFADDR"); - return (-1); - } + va_list alist; + char *errstr = strerror(errno); - addr = (struct sockaddr_in *)&lifr->lifr_addr; - switch (addr->sin_family) { - case AF_INET: - zeroaddr = (addr->sin_addr.s_addr == INADDR_ANY); - break; + format = gettext(format); + (void) fprintf(stderr, gettext("%s: fatal: "), progname); - case AF_INET6: - zeroaddr = IN6_IS_ADDR_UNSPECIFIED( - &((struct sockaddr_in6 *)addr)->sin6_addr); - break; + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); - default: - break; - } + if (strchr(format, '\n') == NULL) + (void) fprintf(stderr, ": %s\n", errstr); - ret = ioctl(ifsock, SIOCGLIFFLAGS, lifr); - if (ret < 0) { - perror("ioctl: SIOCGLIFFLAGS"); - return (-1); - } - /* - * Don't affect the state of addresses that failed back. - * - * XXX Link local addresses that are not marked IFF_NOFAILOVER - * will not be brought up. Link local addresses never failover. - * When the interface was offlined, we brought the link local - * address down. We will not bring it up now if IFF_NOFAILOVER - * is not marked. We check for IFF_NOFAILOVER below so that - * we want to maintain the state of all other addresses as it - * was before offline. Normally link local addresses are marked - * IFF_NOFAILOVER and hence this is not an issue. These can - * be fixed in future with RCM and it is beyond the scope - * of if_mpadm to maintain state and do this correctly. - */ - if (!(lifr->lifr_flags & IFF_NOFAILOVER)) - return (0); + exit(EXIT_FAILURE); +} - /* - * When a data address associated with the physical interface itself - * is failed over (e.g., qfe0, rather than qfe0:1), the kernel must - * fill the ipif data structure for qfe0 with a placeholder entry (the - * "replacement ipif"). 
Replacement ipif's cannot be brought IFF_UP - * (nor would it make any sense to do so), so we must be careful to - * skip them; thankfully they can be easily identified since they - * all have a zeroed address. - */ - if (zeroaddr) - return (0); - - /* IFF_OFFLINE was not set to start with. Is it there ? */ - if (lifr->lifr_flags & IFF_OFFLINE) { - (void) fprintf(stderr, - gettext("IFF_OFFLINE set wrongly on %s\n"), - lifr->lifr_name); - return (-1); - } - lifr->lifr_flags |= IFF_UP; - ret = ioctl(ifsock, SIOCSLIFFLAGS, lifr); - if (ret < 0) { - perror("ioctl: SIOCSLIFFLAGS"); - return (-1); - } - return (0); +/* PRINTFLIKE1 */ +static void +warn(const char *format, ...) +{ + va_list alist; + char *errstr = strerror(errno); + + format = gettext(format); + (void) fprintf(stderr, gettext("%s: warning: "), progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + if (strchr(format, '\n') == NULL) + (void) fprintf(stderr, ": %s\n", errstr); } diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile index 69e91758ea..e99f2945a7 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile @@ -19,10 +19,9 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# PROG = ifconfig ROOTFS_PROG = $(PROG) @@ -38,7 +37,7 @@ COMMONSRCS= $(CMDINETCOMMONDIR)/$(COMMONOBJS:%.o=%.c) SRCS= $(LOCALSRCS) $(COMMONSRCS) CPPFLAGS += -I$(CMDINETCOMMONDIR) -I$(SRC)/common/net/dhcp -LDLIBS += -ldhcpagent -linetcfg -ldlpi -ldladm +LDLIBS += -ldhcpagent -ldlpi -linetutil -linetcfg -lipmp -ldladm LINTFLAGS += -m ROOTUSRSBINLINKS = $(PROG:%=$(ROOTUSRSBIN)/%) diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h index c993baeb02..4aa1aa0ed7 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -11,13 +11,12 @@ #ifndef _DEFS_H #define _DEFS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif #include <errno.h> +#include <limits.h> #include <unistd.h> #include <stdlib.h> #include <stdio.h> @@ -54,7 +53,10 @@ extern "C" { #include <assert.h> #include <ipmp_mpathd.h> +#include <ipmp_admin.h> #include <inetcfg.h> +#include <libinetutil.h> +#include <alloca.h> #ifdef __cplusplus } diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c index f49fca249c..d5517a4700 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* @@ -23,6 +23,7 @@ #define TUN_NAME "tun" #define ATUN_NAME "atun" #define TUN6TO4_NAME "6to4tun" +#define IPMPSTUB (void *)-1 typedef struct if_flags { uint64_t iff_value; @@ -67,7 +68,20 @@ static if_flags_t if_flags_tbl[] = { { IFF_TEMPORARY, "TEMPORARY" }, { IFF_FIXEDMTU, "FIXEDMTU" }, { IFF_VIRTUAL, "VIRTUAL" }, - { IFF_DUPLICATE, "DUPLICATE" } + { IFF_DUPLICATE, "DUPLICATE" }, + { IFF_IPMP, "IPMP"} +}; + +typedef struct { + const char *ia_app; + uint64_t ia_flag; + uint_t ia_tries; +} if_appflags_t; + +static const if_appflags_t if_appflags_tbl[] = { + { "dhcpagent(1M)", IFF_DHCPRUNNING, 1 }, + { "in.ndpd(1M)", IFF_ADDRCONF, 3 }, + { NULL, 0, 0 } }; static struct lifreq lifr; @@ -75,7 +89,6 @@ static struct lifreq lifr; static char name[LIFNAMSIZ]; /* foreach interface saved name */ static char origname[LIFNAMSIZ]; -static char savedname[LIFNAMSIZ]; /* For addif */ static int setaddr; /* @@ -89,20 +102,7 @@ static int setaddr; #define NO_ESP_AALG 256 #define NO_ESP_EALG 256 -/* - * iface_t - * used by setifether to create a list of interfaces to mark - * down-up when changing the ethernet address of an interface - */ -typedef struct iface { - struct lifreq lifr; - struct iface *next; /* pointer to the next list element */ -} iface_t; - -static iface_t *logifs = NULL; /* list of logical interfaces */ -static iface_t *phyif = NULL; /* physical interface */ - -int s; +int s, s4, s6; int af = AF_INET; /* default address family */ int debug = 0; int all = 0; /* setifdhcp() needs to know this */ @@ -113,6 +113,7 @@ int v4compat = 0; /* Compatible printing format */ * Function prototypes for command functions. 
*/ static int addif(char *arg, int64_t param); +static int inetipmp(char *arg, int64_t param); static int inetplumb(char *arg, int64_t param); static int inetunplumb(char *arg, int64_t param); static int removeif(char *arg, int64_t param); @@ -141,7 +142,7 @@ static int modinsert(char *arg, int64_t param); static int modremove(char *arg, int64_t param); static int setifgroupname(char *arg, int64_t param); static int configinfo(char *arg, int64_t param); -static void print_config_flags(uint64_t flags); +static void print_config_flags(int af, uint64_t flags); static void print_flags(uint64_t flags); static void print_ifether(char *ifname); static int set_tun_encap_limit(char *arg, int64_t param); @@ -150,6 +151,7 @@ static int set_tun_hop_limit(char *arg, int64_t param); static int setzone(char *arg, int64_t param); static int setallzones(char *arg, int64_t param); static int setifsrc(char *arg, int64_t param); +static int lifnum(const char *ifname); /* * Address family specific function prototypes. 
@@ -179,19 +181,22 @@ static int settaddr(char *, int (*)(icfg_handle_t, static void status(void); static void ifstatus(const char *); static void usage(void); -static int strioctl(int s, int cmd, char *buf, int buflen); +static int strioctl(int s, int cmd, void *buf, int buflen); static int setifdhcp(const char *caller, const char *ifname, int argc, char *argv[]); static int ip_domux2fd(int *, int *, int *, int *, int *); static int ip_plink(int, int, int, int, int); static int modop(char *arg, char op); -static void selectifs(int argc, char *argv[], int af, - struct lifreq *lifrp); -static int updownifs(iface_t *ifs, int up); static int find_all_global_interfaces(struct lifconf *lifcp, char **buf, int64_t lifc_flags); static int find_all_zone_interfaces(struct lifconf *lifcp, char **buf, int64_t lifc_flags); +static int create_ipmp(const char *grname, int af, const char *ifname, + boolean_t implicit); +static int create_ipmp_peer(int af, const char *ifname); +static void start_ipmp_daemon(void); +static boolean_t ifaddr_up(ifaddrlistx_t *ifaddrp); +static boolean_t ifaddr_down(ifaddrlistx_t *ifaddrp); #define max(a, b) ((a) < (b) ? (b) : (a)) @@ -251,6 +256,7 @@ struct cmd { { "index", NEXTARG, setifindex, 0, AF_ANY }, { "broadcast", NEXTARG, setifbroadaddr, 0, AF_INET }, { "auto-revarp", 0, setifrevarp, 1, AF_INET }, + { "ipmp", 0, inetipmp, 1, AF_ANY }, { "plumb", 0, inetplumb, 1, AF_ANY }, { "unplumb", 0, inetunplumb, 0, AF_ANY }, { "subnet", NEXTARG, setifsubnet, 0, AF_ANY }, @@ -297,22 +303,30 @@ struct cmd { typedef struct if_config_cmd { uint64_t iff_flag; + int iff_af; char *iff_name; } if_config_cmd_t; +/* + * NOTE: print_config_flags() processes this table in order, so we put "up" + * last so that we can be sure "-failover" will take effect first. Otherwise, + * IPMP test addresses will erroneously migrate to the IPMP interface. 
+ */ static if_config_cmd_t if_config_cmd_tbl[] = { - { IFF_UP, "up" }, - { IFF_NOTRAILERS, "-trailers" }, - { IFF_PRIVATE, "private" }, - { IFF_NOXMIT, "-xmit" }, - { IFF_ANYCAST, "anycast" }, - { IFF_NOLOCAL, "-local" }, - { IFF_DEPRECATED, "deprecated" }, - { IFF_NOFAILOVER, "-failover" }, - { IFF_STANDBY, "standby" }, - { IFF_FAILED, "failed" }, - { IFF_PREFERRED, "preferred" }, - { 0, 0 }, + { IFF_NOTRAILERS, AF_UNSPEC, "-trailers" }, + { IFF_PRIVATE, AF_UNSPEC, "private" }, + { IFF_NOXMIT, AF_UNSPEC, "-xmit" }, + { IFF_ANYCAST, AF_INET6, "anycast" }, + { IFF_NOLOCAL, AF_UNSPEC, "-local" }, + { IFF_DEPRECATED, AF_UNSPEC, "deprecated" }, + { IFF_NOFAILOVER, AF_UNSPEC, "-failover" }, + { IFF_STANDBY, AF_UNSPEC, "standby" }, + { IFF_FAILED, AF_UNSPEC, "failed" }, + { IFF_PREFERRED, AF_UNSPEC, "preferred" }, + { IFF_NONUD, AF_INET6, "-nud" }, + { IFF_NOARP, AF_INET, "-arp" }, + { IFF_UP, AF_UNSPEC, "up" }, + { 0, 0, NULL }, }; typedef struct ni { @@ -345,10 +359,11 @@ struct afswtch *afp; /* the address family being set or asked about */ int main(int argc, char *argv[]) { - /* Include IFF_NOXMIT, IFF_TEMPORARY and all zone interfaces */ - int64_t lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; + int64_t lifc_flags; char *default_ip_str; + lifc_flags = LIFC_NOXMIT|LIFC_TEMPORARY|LIFC_ALLZONES|LIFC_UNDER_IPMP; + if (argc < 2) { usage(); exit(1); @@ -388,9 +403,10 @@ main(int argc, char *argv[]) } s = socket(SOCKET_AF(af), SOCK_DGRAM, 0); - if (s < 0) { + s4 = socket(AF_INET, SOCK_DGRAM, 0); + s6 = socket(AF_INET6, SOCK_DGRAM, 0); + if (s == -1 || s4 == -1 || s6 == -1) Perror0_exit("socket"); - } /* * Special interface names is any combination of these flags. 
@@ -1441,39 +1457,38 @@ setifdstaddr(char *addr, int64_t param) static int setifflags(char *val, int64_t value) { - int phyintlen, origphyintlen; + struct lifreq lifrl; /* local lifreq struct */ + boolean_t bringup = _B_FALSE; (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0) Perror0_exit("setifflags: SIOCGLIFFLAGS"); - if (value == IFF_NOFAILOVER) { - /* - * Fail if '-failover' is set after a prior addif created the - * alias on a different interface. This can happen when the - * interface is part of an IPMP group. - */ - phyintlen = strcspn(name, ":"); - origphyintlen = strcspn(origname, ":"); - if (phyintlen != origphyintlen || - strncmp(name, origname, phyintlen) != 0) { - (void) fprintf(stderr, "ifconfig: can't set -failover " - "on failed/standby/offlined interface %s\n", - origname); - exit(1); - } - } - if (value < 0) { value = -value; + + if ((value & IFF_NOFAILOVER) && (lifr.lifr_flags & IFF_UP)) { + /* + * The kernel does not allow administratively up test + * addresses to be converted to data addresses. Bring + * the address down first, then bring it up after it's + * been converted to a data address. + */ + lifr.lifr_flags &= ~IFF_UP; + (void) ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr); + bringup = _B_TRUE; + } + lifr.lifr_flags &= ~value; - if ((value & IFF_UP) && (lifr.lifr_flags & IFF_DUPLICATE)) { + if ((value & (IFF_UP | IFF_NOFAILOVER)) && + (lifr.lifr_flags & IFF_DUPLICATE)) { /* * If the user is trying to mark an interface with a - * duplicate address as "down," then fetch the address - * and set it. This will cause IP to clear the - * IFF_DUPLICATE flag and stop the automatic recovery - * timer. + * duplicate address as "down," or convert a duplicate + * test address to a data address, then fetch the + * address and set it. This will cause IP to clear + * the IFF_DUPLICATE flag and stop the automatic + * recovery timer. 
*/ value = lifr.lifr_flags; if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) >= 0) @@ -1483,10 +1498,48 @@ setifflags(char *val, int64_t value) } else { lifr.lifr_flags |= value; } + + /* + * If we're about to bring up an underlying physical IPv6 interface in + * an IPMP group, ensure the IPv6 IPMP interface is also up. This is + * for backward compatibility with legacy configurations in which + * there are no explicit hostname files for IPMP interfaces. (For + * IPv4, this is automatically handled by the kernel when migrating + * the underlying interface's data address to the IPMP interface.) + */ + (void) strlcpy(lifrl.lifr_name, name, LIFNAMSIZ); + + if (lifnum(lifr.lifr_name) == 0 && + (lifr.lifr_flags & (IFF_UP|IFF_IPV6)) == (IFF_UP|IFF_IPV6) && + ioctl(s, SIOCGLIFGROUPNAME, &lifrl) == 0 && + lifrl.lifr_groupname[0] != '\0') { + lifgroupinfo_t lifgr; + + (void) strlcpy(lifgr.gi_grname, lifrl.lifr_groupname, + LIFGRNAMSIZ); + if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == -1) + Perror0_exit("setifflags: SIOCGLIFGROUPINFO"); + + (void) strlcpy(lifrl.lifr_name, lifgr.gi_grifname, LIFNAMSIZ); + if (ioctl(s, SIOCGLIFFLAGS, &lifrl) == -1) + Perror0_exit("setifflags: SIOCGLIFFLAGS"); + if (!(lifrl.lifr_flags & IFF_UP)) { + lifrl.lifr_flags |= IFF_UP; + if (ioctl(s, SIOCSLIFFLAGS, &lifrl) == -1) + Perror0_exit("setifflags: SIOCSLIFFLAGS"); + } + } + (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); - if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) { + if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) Perror0_exit("setifflags: SIOCSLIFFLAGS"); + + if (bringup) { + lifr.lifr_flags |= IFF_UP; + if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) + Perror0_exit("setifflags: SIOCSLIFFLAGS IFF_UP"); } + return (0); } @@ -1524,12 +1577,21 @@ setifindex(char *val, int64_t param) } /* ARGSUSED */ +static void +notifycb(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg) +{ +} + +/* ARGSUSED */ static int setifether(char *addr, int64_t param) { - uchar_t *ea; - iface_t 
*current; - int maclen; + uchar_t *hwaddr; + int hwaddrlen; + int retval; + ifaddrlistx_t *ifaddrp, *ifaddrs = NULL; + dlpi_handle_t dh; + dlpi_notifyid_t id; if (addr == NULL) { ifstatus(name); @@ -1537,9 +1599,6 @@ setifether(char *addr, int64_t param) return (0); } - phyif = NULL; - logifs = NULL; - /* * if the IP interface in the arguments is a logical * interface, exit with an error now. @@ -1550,79 +1609,68 @@ setifether(char *addr, int64_t param) exit(1); } - ea = _link_aton(addr, &maclen); - if (ea == NULL) { - if (maclen == -1) + if ((hwaddr = _link_aton(addr, &hwaddrlen)) == NULL) { + if (hwaddrlen == -1) (void) fprintf(stderr, - "ifconfig: %s: bad address\n", addr); + "ifconfig: %s: bad address\n", hwaddr); else (void) fprintf(stderr, "ifconfig: malloc() failed\n"); exit(1); } - (void) strncpy(savedname, name, sizeof (savedname)); + if ((retval = dlpi_open(name, &dh, 0)) != DLPI_SUCCESS) + Perrdlpi_exit("cannot dlpi_open() link", name, retval); - /* - * Call selectifs only for the IP interfaces that are ipv4. 
- * offflags == IFF_IPV6 because you should not change the - * Ethernet address of an ipv6 interface - */ - foreachinterface(selectifs, 0, (char **)NULL, 0, 0, IFF_IPV6, 0); + if ((retval = dlpi_bind(dh, DLPI_ANY_SAP, NULL)) != DLPI_SUCCESS) + Perrdlpi_exit("cannot dlpi_bind() link", name, retval); - /* If physical interface not found, exit now */ - if (phyif == NULL) { - (void) fprintf(stderr, - "ifconfig: interface %s not found\n", savedname); - exit(1); - } - - /* Restore */ - (void) strncpy(name, savedname, sizeof (name)); - (void) strncpy(origname, savedname, sizeof (origname)); - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); - - /* - * close and reopen the socket - * we don't know which type of socket we have now - */ - (void) close(s); - s = socket(SOCKET_AF(AF_UNSPEC), SOCK_DGRAM, 0); - if (s < 0) { - Perror0_exit("socket"); - } - - /* - * mark down the logical interfaces first, - * and then the physical interface - */ - if (updownifs(logifs, 0) < 0 || updownifs(phyif, 0) < 0) { - Perror0_exit("mark down interface failed"); + retval = dlpi_enabnotify(dh, DL_NOTE_PHYS_ADDR, notifycb, NULL, &id); + if (retval == DLPI_SUCCESS) { + (void) dlpi_disabnotify(dh, id, NULL); + } else { + /* + * This link does not support DL_NOTE_PHYS_ADDR: bring down + * all of the addresses to flush the old hardware address + * information out of IP. + * + * NOTE: Skipping this when DL_NOTE_PHYS_ADDR is supported is + * more than an optimization: in.mpathd will set IFF_OFFLINE + * if it's notified and the new address is a duplicate of + * another in the group -- but the flags manipulation in + * ifaddr_{down,up}() cannot be atomic and thus might clobber + * IFF_OFFLINE, confusing in.mpathd. 
+ */ + if (ifaddrlistx(name, IFF_UP, 0, &ifaddrs) == -1) + Perror2_exit(name, "cannot get address list"); + + ifaddrp = ifaddrs; + for (; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!ifaddr_down(ifaddrp)) { + Perror2_exit(ifaddrp->ia_name, + "cannot bring down"); + } + } } /* - * Change the physical address + * Change the hardware address. */ - if (dlpi_set_address(savedname, ea, maclen) == -1) { + retval = dlpi_set_physaddr(dh, DL_CURR_PHYS_ADDR, hwaddr, hwaddrlen); + if (retval != DLPI_SUCCESS) { (void) fprintf(stderr, - "ifconfig: failed setting mac address on %s\n", - savedname); + "ifconfig: failed setting mac address on %s\n", name); } + dlpi_close(dh); /* - * if any interfaces were marked down before changing the - * ethernet address, put them up again. - * First the physical interface, then the logical ones. + * If any addresses were brought down before changing the hardware + * address, bring them up again. */ - if (updownifs(phyif, 1) < 0 || updownifs(logifs, 1) < 0) { - Perror0_exit("mark down interface failed"); - } - - /* Free the memory allocated by selectifs */ - free(phyif); - for (current = logifs; current != NULL; current = logifs) { - logifs = logifs->next; - free(current); + for (ifaddrp = ifaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!ifaddr_up(ifaddrp)) + Perror2_exit(ifaddrp->ia_name, "cannot bring up"); } + ifaddrlistx_free(ifaddrs); return (0); } @@ -1655,8 +1703,8 @@ print_ifether(char *ifname) } (void) close(fd); - /* Virtual interfaces don't have MAC addresses */ - if (lifr.lifr_flags & IFF_VIRTUAL) + /* VNI and IPMP interfaces don't have MAC addresses */ + if (lifr.lifr_flags & (IFF_VIRTUAL|IFF_IPMP)) return; /* @@ -1685,104 +1733,6 @@ print_ifether(char *ifname) } /* - * static void selectifs(int argc, char *argv[], int af, struct lifreq *rp) - * - * Called inside setifether() to create a list of interfaces to - * mark down/up when changing the Ethernet address. 
- * If the current interface is the physical interface passed - * as an argument to ifconfig, update phyif. - * If the current interface is a logical interface associated - * to the physical interface, add it to the logifs list. - */ -/* ARGSUSED */ -static void -selectifs(int argc, char *argv[], int af, struct lifreq *rp) -{ - char *colonp; - int length; - iface_t *current; - - /* - * savedname= name of the IP interface to which you want to - * change ethernet address - * name= name of the current IP interface - */ - colonp = strchr(name, ':'); - if (colonp == NULL) - length = max(strlen(savedname), strlen(name)); - else - length = max(strlen(savedname), colonp - name); - if (strncmp(savedname, name, length) == 0) { - (void) strcpy(lifr.lifr_name, name); - if (ioctl(s, SIOCGLIFFLAGS, &lifr) < 0) { - Perror0("selectifs: SIOCGLIFFLAGS"); - return; - } - - if ((current = malloc(sizeof (iface_t))) == NULL) { - Perror0_exit("selectifs: malloc failed\n"); - } - - if (colonp == NULL) { - /* this is the physical interface */ - phyif = current; - bcopy(&lifr, &phyif->lifr, sizeof (struct lifreq)); - phyif->next = NULL; - } else { - /* this is a logical interface */ - bcopy(&lifr, ¤t->lifr, sizeof (struct lifreq)); - current->next = logifs; - logifs = current; - } - } -} - -/* - * static int updownifs(iface_t *ifs, int up) - * - * It takes in input a list of IP interfaces (ifs) - * and a flag (up). - * It marks each interface in the list down (up = 0) - * or up (up > 0). This is done ONLY if the IP - * interface was originally up. 
- * - * Return values: - * 0 = everything OK - * -1 = problem - */ -static int -updownifs(iface_t *ifs, int up) -{ - iface_t *current; - int ret = 0; - int save_errno; - char savename[LIFNAMSIZ]; - uint64_t orig_flags; - - for (current = ifs; current != NULL; current = current->next) { - if (current->lifr.lifr_flags & IFF_UP) { - orig_flags = current->lifr.lifr_flags; - if (!up) - current->lifr.lifr_flags &= ~IFF_UP; - if (ioctl(s, SIOCSLIFFLAGS, ¤t->lifr) < 0) { - save_errno = errno; - (void) strcpy(savename, - current->lifr.lifr_name); - ret = -1; - } - if (!up) /* restore the original flags */ - current->lifr.lifr_flags = orig_flags; - } - } - - if (ret == -1) { - (void) strcpy(lifr.lifr_name, savename); - errno = save_errno; - } - return (ret); -} - -/* * static int find_all_global_interfaces(struct lifconf *lifcp, char **buf, * int64_t lifc_flags) * @@ -2109,130 +2059,217 @@ setiftoken(char *addr, int64_t param) return (0); } -/* - * Return value: 0 on success, -1 on failure. - */ -static int -connect_to_mpathd(int family) -{ - int s; - struct sockaddr_storage ss; - struct sockaddr_in *sin = (struct sockaddr_in *)&ss; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss; - struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; - int addrlen; - int ret; - int on; - - s = socket(family, SOCK_STREAM, 0); - if (s < 0) { - Perror0_exit("connect_to_mpathd: socket"); - } - (void) bzero((char *)&ss, sizeof (ss)); - ss.ss_family = family; - /* - * Need to bind to a privileged port. For non-root, this - * will fail. 
in.mpathd verifies that only commands coming - * from privileged ports succeed so that ordinary users - * can't connect and start talking to in.mpathd - */ - on = 1; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - Perror0_exit("connect_to_mpathd: setsockopt"); - } - switch (family) { - case AF_INET: - sin->sin_port = 0; - sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - addrlen = sizeof (struct sockaddr_in); - break; - case AF_INET6: - sin6->sin6_port = 0; - sin6->sin6_addr = loopback_addr; - addrlen = sizeof (struct sockaddr_in6); - break; - } - ret = bind(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - (void) close(s); - return (-1); - } - - switch (family) { - case AF_INET: - sin->sin_port = htons(MPATHD_PORT); - break; - case AF_INET6: - sin6->sin6_port = htons(MPATHD_PORT); - break; - } - ret = connect(s, (struct sockaddr *)&ss, addrlen); - (void) close(s); - return (ret); -} - /* ARGSUSED */ static int -setifgroupname(char *grpname, int64_t param) +setifgroupname(char *grname, int64_t param) { + lifgroupinfo_t lifgr; + struct lifreq lifrl; + ifaddrlistx_t *ifaddrp, *nextifaddrp; + ifaddrlistx_t *ifaddrs = NULL, *downaddrs = NULL; + int af; + if (debug) { (void) printf("Setting groupname %s on interface %s\n", - grpname, name); - } - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); - (void) strncpy(lifr.lifr_groupname, grpname, - sizeof (lifr.lifr_groupname)); - if (ioctl(s, SIOCSLIFGROUPNAME, (caddr_t)&lifr) < 0) { - Perror0_exit("setifgroupname: SIOCSLIFGROUPNAME"); + grname, name); } - /* - * If the SUNW_NO_MPATHD environment variable is set then don't - * bother starting up in.mpathd. See PSARC/2002/249 for the - * depressing details on this bit of stupidity. 
- */ - if (getenv("SUNW_NO_MPATHD") != NULL) { - return (0); + (void) strlcpy(lifrl.lifr_name, name, LIFNAMSIZ); + (void) strlcpy(lifrl.lifr_groupname, grname, LIFGRNAMSIZ); + + while (ioctl(s, SIOCSLIFGROUPNAME, &lifrl) == -1) { + switch (errno) { + case ENOENT: + /* + * The group doesn't yet exist; create it and repeat. + */ + af = afp->af_af; + if (create_ipmp(grname, af, NULL, _B_TRUE) == -1) { + if (errno == EEXIST) + continue; + + Perror2(grname, "cannot create IPMP group"); + goto fail; + } + continue; + + case EALREADY: + /* + * The interface is already in another group; must + * remove existing membership first. + */ + lifrl.lifr_groupname[0] = '\0'; + if (ioctl(s, SIOCSLIFGROUPNAME, &lifrl) == -1) { + Perror2(name, "cannot remove existing " + "IPMP group membership"); + goto fail; + } + (void) strlcpy(lifrl.lifr_groupname, grname, + LIFGRNAMSIZ); + continue; + + case EAFNOSUPPORT: + /* + * The group exists, but it's not configured with the + * address families the interface needs. Since only + * two address families are currently supported, just + * configure the "other" address family. Note that we + * may race with group deletion or creation by another + * process (ENOENT or EEXIST); in such cases we repeat + * our original SIOCSLIFGROUPNAME. + */ + (void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ); + if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == -1) { + if (errno == ENOENT) + continue; + + Perror2(grname, "SIOCGLIFGROUPINFO"); + goto fail; + } + + af = lifgr.gi_v4 ? AF_INET6 : AF_INET; + if (create_ipmp(grname, af, lifgr.gi_grifname, + _B_TRUE) == -1) { + if (errno == EEXIST) + continue; + + Perror2(grname, "cannot configure IPMP group"); + goto fail; + } + continue; + + case EADDRINUSE: + /* + * Some addresses are in-use (or under control of DAD). + * Bring them down and retry the group join operation. + * We will bring them back up after the interface has + * been placed in the group. 
+ */ + if (ifaddrlistx(lifrl.lifr_name, IFF_UP|IFF_DUPLICATE, + 0, &ifaddrs) == -1) { + Perror2(grname, "cannot get address list"); + goto fail; + } + + ifaddrp = ifaddrs; + for (; ifaddrp != NULL; ifaddrp = nextifaddrp) { + if (!ifaddr_down(ifaddrp)) { + ifaddrs = ifaddrp; + goto fail; + } + nextifaddrp = ifaddrp->ia_next; + ifaddrp->ia_next = downaddrs; + downaddrs = ifaddrp; + } + ifaddrs = NULL; + continue; + + case EADDRNOTAVAIL: { + /* + * Some data addresses are under application control. + * For some of these (e.g., ADDRCONF), the application + * should remove the address, in which case we retry a + * few times (since the application's action is not + * atomic with respect to us) before bailing out and + * informing the user. + */ + int ntries, nappaddr = 0; + const if_appflags_t *iap = if_appflags_tbl; + + for (; iap->ia_app != NULL; iap++) { + ntries = 0; +again: + if (ifaddrlistx(lifrl.lifr_name, iap->ia_flag, + IFF_NOFAILOVER, &ifaddrs) == -1) { + (void) fprintf(stderr, "ifconfig: %s: " + "cannot get data addresses managed " + "by %s\n", lifrl.lifr_name, + iap->ia_app); + goto fail; + } + + if (ifaddrs == NULL) + continue; + + ifaddrlistx_free(ifaddrs); + ifaddrs = NULL; + + if (++ntries < iap->ia_tries) { + (void) poll(NULL, 0, 100); + goto again; + } + + (void) fprintf(stderr, "ifconfig: cannot join " + "IPMP group: %s has data addresses managed " + "by %s\n", lifrl.lifr_name, iap->ia_app); + nappaddr++; + } + if (nappaddr > 0) + goto fail; + continue; + } + default: + Perror2(name, "SIOCSLIFGROUPNAME"); + goto fail; + } } /* - * Try to connect to in.mpathd using IPv4. If we succeed, - * we conclude that in.mpathd is running, and quit. + * If there were addresses that we had to bring down, it's time to + * bring them up again. As part of bringing them up, the kernel will + * automatically move them to the new IPMP interface. 
*/ - if (connect_to_mpathd(AF_INET) == 0) { - /* connect succeeded, mpathd is already running */ - return (0); + for (ifaddrp = downaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!ifaddr_up(ifaddrp) && errno != ENXIO) { + (void) fprintf(stderr, "ifconfig: cannot bring back up " + "%s: %s\n", ifaddrp->ia_name, strerror(errno)); + } } + ifaddrlistx_free(downaddrs); + return (0); +fail: /* - * Try to connect to in.mpathd using IPv6. If we succeed, - * we conclude that in.mpathd is running, and quit. + * Attempt to bring back up any interfaces that we downed. */ - if (connect_to_mpathd(AF_INET6) == 0) { - /* connect succeeded, mpathd is already running */ - return (0); + for (ifaddrp = downaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!ifaddr_up(ifaddrp) && errno != ENXIO) { + (void) fprintf(stderr, "ifconfig: cannot bring back up " + "%s: %s\n", ifaddrp->ia_name, strerror(errno)); + } } + ifaddrlistx_free(downaddrs); + ifaddrlistx_free(ifaddrs); /* - * in.mpathd may not be running. Start it now. If it is already - * running, in.mpathd will take care of handling multiple incarnations - * of itself. ifconfig only tries to optimize performance by not - * starting another incarnation of in.mpathd. + * We'd return -1, but foreachinterface() doesn't propagate the error + * into the exit status, so we're forced to explicitly exit(). 
*/ - switch (fork()) { + exit(1); + /* NOTREACHED */ +} - case -1: - Perror0_exit("setifgroupname: fork"); - /* NOTREACHED */ - case 0: - (void) execl(MPATHD_PATH, MPATHD_PATH, NULL); - _exit(1); - /* NOTREACHED */ - default: - return (0); +static boolean_t +modcheck(const char *ifname) +{ + (void) strlcpy(lifr.lifr_name, ifname, sizeof (lifr.lifr_name)); + + if (ioctl(s, SIOCGLIFFLAGS, &lifr) < 0) { + Perror0("SIOCGLIFFLAGS"); + return (_B_FALSE); } -} + if (lifr.lifr_flags & IFF_IPMP) { + (void) fprintf(stderr, "ifconfig: %s: module operations not" + " supported on IPMP interfaces\n", ifname); + return (_B_FALSE); + } + if (lifr.lifr_flags & IFF_VIRTUAL) { + (void) fprintf(stderr, "ifconfig: %s: module operations not" + " supported on virtual IP interfaces\n", ifname); + return (_B_FALSE); + } + return (_B_TRUE); +} /* * To list all the modules above a given network interface. @@ -2250,7 +2287,13 @@ modlist(char *null, int64_t param) struct str_list strlist; int orig_arpid; - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + /* + * We'd return -1, but foreachinterface() doesn't propagate the error + * into the exit status, so we're forced to explicitly exit(). + */ + if (!modcheck(name)) + exit(1); + if (ip_domux2fd(&muxfd, &muxid_fd, &ipfd_lowstr, &arpfd_lowstr, &orig_arpid) < 0) { return (-1); @@ -2354,8 +2397,8 @@ open_arp_on_udp(char *udp_dev_name) * Return: * -1 if operation fails, 0 otherwise. * - * Please see the big block comment above plumb_one_device() - * for the logic of the PLINK/PUNLINK + * Please see the big block comment above ifplumb() for the logic of the + * PLINK/PUNLINK */ static int ip_domux2fd(int *muxfd, int *muxid_fd, int *ipfd_lowstr, int *arpfd_lowstr, @@ -2467,8 +2510,8 @@ ip_domux2fd(int *muxfd, int *muxid_fd, int *ipfd_lowstr, int *arpfd_lowstr, * Return: * -1 if operation fails, 0 otherwise. 
* - * Please see the big block comment above plumb_one_device() - * for the logic of the PLINK/PUNLINK + * Please see the big block comment above ifplumb() for the logic of the + * PLINK/PUNLINK */ static int ip_plink(int muxfd, int muxid_fd, int ipfd_lowstr, int arpfd_lowstr, @@ -2530,7 +2573,12 @@ modop(char *arg, char op) char *arg_str; int orig_arpid; - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + /* + * We'd return -1, but foreachinterface() doesn't propagate the error + * into the exit status, so we're forced to explicitly exit(). + */ + if (!modcheck(name)) + exit(1); /* Need to save the original string for -a option. */ if ((arg_str = malloc(strlen(arg) + 1)) == NULL) { @@ -3067,13 +3115,14 @@ status(void) static int configinfo(char *null, int64_t param) { + char *cp; struct afswtch *p = afp; uint64_t flags; - char phydevname[LIFNAMSIZ]; + char lifname[LIFNAMSIZ]; char if_usesrc_name[LIFNAMSIZ]; - char *cp; (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0) { Perror0_exit("status: SIOCGLIFFLAGS"); } @@ -3084,13 +3133,13 @@ configinfo(char *null, int64_t param) name, flags, p != NULL ? p->af_af : -1); } - /* remove LIF component */ - (void) strncpy(phydevname, name, sizeof (phydevname)); - cp = strchr(phydevname, ':'); - if (cp) { - *cp = 0; - } - phydevname[sizeof (phydevname) - 1] = '\0'; + /* + * Build the interface name to print (we can't directly use `name' + * because one cannot "plumb" ":0" interfaces). 
+ */ + (void) strlcpy(lifname, name, LIFNAMSIZ); + if ((cp = strchr(lifname, ':')) != NULL && atoi(cp + 1) == 0) + *cp = '\0'; /* * if the interface is IPv4 @@ -3105,7 +3154,7 @@ configinfo(char *null, int64_t param) if (v4compat) flags &= ~IFF_IPV4; - (void) printf("%s inet plumb", phydevname); + (void) printf("%s inet plumb", lifname); } else if (flags & IFF_IPV6) { /* * else if the interface is IPv6 @@ -3117,7 +3166,7 @@ configinfo(char *null, int64_t param) if (v4compat) return (-1); - (void) printf("%s inet6 plumb", phydevname); + (void) printf("%s inet6 plumb", lifname); } (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); @@ -3131,8 +3180,8 @@ configinfo(char *null, int64_t param) ioctl(s, SIOCGLIFMTU, (caddr_t)&lifr) >= 0) (void) printf(" mtu %d", lifr.lifr_metric); - /* don't print index when in compatibility mode */ - if (!v4compat) { + /* Index only applies to the zeroth interface */ + if (lifnum(name) == 0) { if (ioctl(s, SIOCGLIFINDEX, (caddr_t)&lifr) >= 0) (void) printf(" index %d", lifr.lifr_index); } @@ -3162,7 +3211,6 @@ configinfo(char *null, int64_t param) } (void) printf("\n"); - return (0); } @@ -3398,15 +3446,11 @@ in_status(int force, uint64_t flags) inet_ntoa(sin->sin_addr)); } } - /* If there is a groupname, print it for lun 0 alone */ + /* If there is a groupname, print it for only the physical interface */ if (strchr(name, ':') == NULL) { - (void) memset(lifr.lifr_groupname, 0, - sizeof (lifr.lifr_groupname)); - if (ioctl(s, SIOCGLIFGROUPNAME, (caddr_t)&lifr) >= 0) { - if (strlen(lifr.lifr_groupname) > 0) { - (void) printf("\n\tgroupname %s", - lifr.lifr_groupname); - } + if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) >= 0 && + lifr.lifr_groupname[0] != '\0') { + (void) printf("\n\tgroupname %s", lifr.lifr_groupname); } } (void) putchar('\n'); @@ -3550,11 +3594,7 @@ in_configinfo(int force, uint64_t flags) Perror0_exit("in_configinfo: SIOCGLIFADDR"); } sin = (struct sockaddr_in *)&lifr.lifr_addr; - if (strchr(name, ':') != NULL) { - 
(void) printf(" addif %s ", inet_ntoa(sin->sin_addr)); - } else { - (void) printf(" set %s ", inet_ntoa(sin->sin_addr)); - } + (void) printf(" set %s ", inet_ntoa(sin->sin_addr)); laddr = sin; } @@ -3614,8 +3654,8 @@ in_configinfo(int force, uint64_t flags) } } - /* If there is a groupname, print it for only the physical interface */ - if (strchr(name, ':') == NULL) { + /* If there is a groupname, print it for only the zeroth interface */ + if (lifnum(name) == 0) { if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) >= 0 && lifr.lifr_groupname[0] != '\0') { (void) printf(" group %s ", lifr.lifr_groupname); @@ -3623,12 +3663,7 @@ in_configinfo(int force, uint64_t flags) } /* Print flags to configure */ - print_config_flags(flags); - - /* IFF_NOARP applies to AF_INET only */ - if (flags & IFF_NOARP) { - (void) printf("-arp "); - } + print_config_flags(AF_INET, flags); } static void @@ -3657,17 +3692,9 @@ in6_configinfo(int force, uint64_t flags) Perror0_exit("in6_configinfo: SIOCGLIFADDR"); } sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr; - if (strchr(name, ':') != NULL) { - (void) printf(" addif %s/%d ", - inet_ntop(AF_INET6, (void *)&sin6->sin6_addr, - abuf, sizeof (abuf)), - lifr.lifr_addrlen); - } else { - (void) printf(" set %s/%d ", - inet_ntop(AF_INET6, (void *)&sin6->sin6_addr, - abuf, sizeof (abuf)), - lifr.lifr_addrlen); - } + (void) printf(" set %s/%d ", + inet_ntop(AF_INET6, &sin6->sin6_addr, abuf, sizeof (abuf)), + lifr.lifr_addrlen); laddr6 = sin6; } (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); @@ -3720,8 +3747,8 @@ in6_configinfo(int force, uint64_t flags) lifr.lifr_addrlen); } - /* If there is a groupname, print it for only the physical interface */ - if (strchr(name, ':') == NULL) { + /* If there is a groupname, print it for only the zeroth interface */ + if (lifnum(name) == 0) { if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) >= 0 && lifr.lifr_groupname[0] != '\0') { (void) printf(" group %s ", lifr.lifr_groupname); @@ -3729,12 +3756,7 @@ 
in6_configinfo(int force, uint64_t flags) } /* Print flags to configure */ - print_config_flags(flags); - - /* IFF_NONUD applies to AF_INET6 only */ - if (flags & IFF_NONUD) { - (void) printf("-nud "); - } + print_config_flags(AF_INET6, flags); } /* @@ -3768,31 +3790,41 @@ in6_configinfo(int force, uint64_t flags) * compatibility for other utilities like atmifconfig etc. In this case * the utility must use SIOCSLIFMUXID. */ -static void -plumb_one_device(int af) +static int +ifplumb(const char *linkname, const char *ifname, boolean_t genppa, int af) { int arp_muxid = -1, ip_muxid; int mux_fd, ip_fd, arp_fd; int retval; - uint_t ppa; char *udp_dev_name; - char provider[DLPI_LINKNAME_MAX]; + uint64_t flags; + uint_t dlpi_flags; dlpi_handle_t dh_arp, dh_ip; /* - * We use DLPI_NOATTACH because the ip module will do the attach - * itself for DLPI style-2 devices. + * Always dlpi_open() with DLPI_NOATTACH because the IP and ARP module + * will do the attach themselves for DLPI style-2 links. */ - retval = dlpi_open(name, &dh_ip, DLPI_NOATTACH); - if (retval != DLPI_SUCCESS) - Perrdlpi_exit("cannot open link", name, retval); + dlpi_flags = DLPI_NOATTACH; - if ((retval = dlpi_parselink(name, provider, &ppa)) != DLPI_SUCCESS) - Perrdlpi_exit("dlpi_parselink", name, retval); + /* + * If `linkname' is the special token IPMPSTUB, then this is a request + * to create an IPMP interface atop /dev/ipmpstub0. (We can't simply + * pass "ipmpstub0" as `linkname' since an admin *could* have a normal + * vanity-named link named "ipmpstub0" that they'd like to plumb.) 
+ */ + if (linkname == IPMPSTUB) { + linkname = "ipmpstub0"; + dlpi_flags |= DLPI_DEVONLY; + } + + retval = dlpi_open(linkname, &dh_ip, dlpi_flags); + if (retval != DLPI_SUCCESS) + Perrdlpi_exit("cannot open link", linkname, retval); if (debug) { - (void) printf("ifconfig: plumb_one_device: provider %s," - " ppa %u\n", provider, ppa); + (void) printf("ifconfig: ifplumb: link %s, ifname %s, " + "genppa %u\n", linkname, ifname, genppa); } ip_fd = dlpi_fd(dh_ip); @@ -3812,29 +3844,106 @@ plumb_one_device(int af) Perror2_exit("I_PUSH", ARP_MOD_NAME); /* - * Set IFF_IPV4/IFF_IPV6 flags. - * At this point in time the kernel also allows an - * override of the CANTCHANGE flags. + * Prepare to set IFF_IPV4/IFF_IPV6 flags as part of SIOCSLIFNAME. + * (At this point in time the kernel also allows an override of the + * IFF_CANTCHANGE flags.) */ lifr.lifr_name[0] = '\0'; if (ioctl(ip_fd, SIOCGLIFFLAGS, (char *)&lifr) == -1) - Perror0_exit("plumb_one_device: SIOCGLIFFLAGS"); + Perror0_exit("ifplumb: SIOCGLIFFLAGS"); - /* Set the name string and the IFF_IPV* flag */ if (af == AF_INET6) { - lifr.lifr_flags |= IFF_IPV6; - lifr.lifr_flags &= ~(IFF_BROADCAST | IFF_IPV4); + flags = lifr.lifr_flags | IFF_IPV6; + flags &= ~(IFF_BROADCAST | IFF_IPV4); } else { - lifr.lifr_flags |= IFF_IPV4; - lifr.lifr_flags &= ~IFF_IPV6; + flags = lifr.lifr_flags | IFF_IPV4; + flags &= ~IFF_IPV6; } - /* record the device and module names as interface name */ - lifr.lifr_ppa = ppa; - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + /* + * Set the interface name. If we've been asked to generate the PPA, + * then find the lowest available PPA (only currently used for IPMP + * interfaces). Otherwise, use the interface name as-is. + */ + if (genppa) { + int ppa; + + /* + * We'd like to just set lifr_ppa to UINT_MAX and have the + * kernel pick a PPA. Unfortunately, that would mishandle + * two cases: + * + * 1. 
If the PPA is available but the groupname is taken + * (e.g., the "ipmp2" IP interface name is available + * but the "ipmp2" groupname is taken) then the + * auto-assignment by the kernel will fail. + * + * 2. If we're creating (e.g.) an IPv6-only IPMP + * interface, and there's already an IPv4-only IPMP + * interface, the kernel will allow us to accidentally + * reuse the IPv6 IPMP interface name (since + * SIOCSLIFNAME uniqueness is per-interface-type). + * This will cause administrative confusion. + * + * Thus, we instead take a brute-force approach of checking + * whether the IPv4 or IPv6 name is already in-use before + * attempting the SIOCSLIFNAME. As per (1) above, the + * SIOCSLIFNAME may still fail, in which case we just proceed + * to the next one. If this approach becomes too slow, we + * can add a new SIOC* to handle this case in the kernel. + */ + for (ppa = 0; ppa < UINT_MAX; ppa++) { + (void) snprintf(lifr.lifr_name, LIFNAMSIZ, "%s%d", + ifname, ppa); + + if (ioctl(s4, SIOCGLIFFLAGS, &lifr) != -1 || + errno != ENXIO) + continue; + + if (ioctl(s6, SIOCGLIFFLAGS, &lifr) != -1 || + errno != ENXIO) + continue; + + lifr.lifr_ppa = ppa; + lifr.lifr_flags = flags; + retval = ioctl(ip_fd, SIOCSLIFNAME, &lifr); + if (retval != -1 || errno != EEXIST) + break; + } + } else { + ifspec_t ifsp; + + /* + * The interface name could have come from the command-line; + * check it. + */ + if (!ifparse_ifspec(ifname, &ifsp) || ifsp.ifsp_lunvalid) + Perror2_exit("invalid IP interface name", ifname); + + /* + * Before we call SIOCSLIFNAME, ensure that the IPMP group + * interface for this address family exists. Otherwise, the + * kernel will kick the interface out of the group when we do + * the SIOCSLIFNAME. + * + * Example: suppose bge0 is plumbed for IPv4 and in group "a". + * If we're now plumbing bge0 for IPv6, but the IPMP group + * interface for "a" is not plumbed for IPv6, the SIOCSLIFNAME + * will kick bge0 out of group "a", which is undesired. 
+ */ + if (create_ipmp_peer(af, ifname) == -1) { + (void) fprintf(stderr, "ifconfig: warning: cannot " + "create %s IPMP group; %s will be removed from " + "group\n", af == AF_INET ? "IPv4" : "IPv6", ifname); + } - /* set the interface name */ - if (ioctl(ip_fd, SIOCSLIFNAME, (char *)&lifr) == -1) { + lifr.lifr_ppa = ifsp.ifsp_ppa; + lifr.lifr_flags = flags; + (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); + retval = ioctl(ip_fd, SIOCSLIFNAME, &lifr); + } + + if (retval == -1) { if (errno != EEXIST) Perror0_exit("SIOCSLIFNAME for ip"); /* @@ -3847,15 +3956,15 @@ plumb_one_device(int af) * called for EEXIST. */ Perror0("SIOCSLIFNAME for ip"); - return; + return (-1); } /* Get the full set of existing flags for this stream */ if (ioctl(ip_fd, SIOCGLIFFLAGS, (char *)&lifr) == -1) - Perror0_exit("plumb_one_device: SIOCFLIFFLAGS"); + Perror0_exit("ifplumb: SIOCGLIFFLAGS"); if (debug) { - (void) printf("ifconfig: plumb_one_device: %s got flags:\n", + (void) printf("ifconfig: ifplumb: %s got flags:\n", lifr.lifr_name); print_flags(lifr.lifr_flags); (void) putchar('\n'); @@ -3890,7 +3999,7 @@ plumb_one_device(int af) if ((ip_muxid = ioctl(mux_fd, I_PLINK, ip_fd)) == -1) Perror0_exit("I_PLINK for ip"); (void) close(mux_fd); - return; + return (lifr.lifr_ppa); } /* @@ -3901,15 +4010,11 @@ plumb_one_device(int af) * only on the interface stream, not on the ARP stream. */ if (debug) - (void) printf("ifconfig: plumb_one_device: ifname: %s\n", name); + (void) printf("ifconfig: ifplumb: interface %s", ifname); - /* - * We use DLPI_NOATTACH because the arp module will do the attach - * itself for DLPI style-2 devices. 
- */ - retval = dlpi_open(name, &dh_arp, DLPI_NOATTACH); + retval = dlpi_open(linkname, &dh_arp, dlpi_flags); if (retval != DLPI_SUCCESS) - Perrdlpi_exit("cannot open link", name, retval); + Perrdlpi_exit("cannot open link", linkname, retval); arp_fd = dlpi_fd(dh_arp); if (ioctl(arp_fd, I_PUSH, ARP_MOD_NAME) == -1) @@ -3919,16 +4024,13 @@ plumb_one_device(int af) * Tell ARP the name and unit number for this interface. * Note that arp has no support for transparent ioctls. */ - if (strioctl(arp_fd, SIOCSLIFNAME, (char *)&lifr, - sizeof (lifr)) == -1) { + if (strioctl(arp_fd, SIOCSLIFNAME, &lifr, sizeof (lifr)) == -1) { if (errno != EEXIST) Perror0_exit("SIOCSLIFNAME for arp"); Perror0("SIOCSLIFNAME for arp"); - dlpi_close(dh_arp); - dlpi_close(dh_ip); - (void) close(mux_fd); - return; + goto out; } + /* * PLINK the IP and ARP streams so that ifconfig can exit * without tearing down the stream. @@ -3942,12 +4044,13 @@ plumb_one_device(int af) if (debug) (void) printf("arp muxid = %d\n", arp_muxid); +out: dlpi_close(dh_ip); dlpi_close(dh_arp); (void) close(mux_fd); + return (lifr.lifr_ppa); } - /* * If this is a physical interface then remove it. * If it is a logical interface name use SIOCLIFREMOVEIF to @@ -3965,6 +4068,7 @@ inetunplumb(char *arg, int64_t param) uint64_t flags; boolean_t changed_arp_muxid = _B_FALSE; int save_errno; + boolean_t v6 = (afp->af_af == AF_INET6); strptr = strchr(name, ':'); if (strptr != NULL || strcmp(name, LOOPBACK_IF) == 0) { @@ -3986,7 +4090,7 @@ inetunplumb(char *arg, int64_t param) * We used /dev/udp or udp6 to set up the mux. So we have to use * the same now for PUNLINK also. 
*/ - if (afp->af_af == AF_INET6) + if (v6) udp_dev_name = UDP6_DEV_NAME; else udp_dev_name = UDP_DEV_NAME; @@ -4002,6 +4106,50 @@ inetunplumb(char *arg, int64_t param) Perror0_exit("unplumb: SIOCGLIFFLAGS"); } flags = lifr.lifr_flags; + + if (flags & IFF_IPMP) { + lifgroupinfo_t lifgr; + ifaddrlistx_t *ifaddrs, *ifaddrp; + + /* + * The kernel will fail the I_PUNLINK if the group still has + * members, but check now to provide a better error message. + */ + if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) == -1) + Perror0_exit("unplumb: SIOCGLIFGROUPNAME"); + + (void) strlcpy(lifgr.gi_grname, lifr.lifr_groupname, + LIFGRNAMSIZ); + if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == -1) + Perror0_exit("unplumb: SIOCGLIFGROUPINFO"); + + if ((v6 && lifgr.gi_nv6 != 0) || (!v6 && lifgr.gi_nv4 != 0)) { + (void) fprintf(stderr, "ifconfig: %s: cannot unplumb:" + " IPMP group is not empty\n", name); + exit(1); + } + + /* + * The kernel will fail the I_PUNLINK if the IPMP interface + * has administratively up addresses; bring 'em down. + */ + if (ifaddrlistx(name, IFF_UP|IFF_DUPLICATE, 0, &ifaddrs) == -1) + Perror2_exit(name, "cannot get address list"); + + ifaddrp = ifaddrs; + for (; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (((ifaddrp->ia_flags & IFF_IPV6) && !v6) || + (!(ifaddrp->ia_flags & IFF_IPV6) && v6)) + continue; + + if (!ifaddr_down(ifaddrp)) { + Perror2_exit(ifaddrp->ia_name, + "cannot bring down"); + } + } + ifaddrlistx_free(ifaddrs); + } + if (ioctl(muxid_fd, SIOCGLIFMUXID, (caddr_t)&lifr) < 0) { Perror0_exit("unplumb: SIOCGLIFMUXID"); } @@ -4098,12 +4246,6 @@ inetplumb(char *arg, int64_t param) Perror2_exit("plumb: SIOCLIFADDIF", name); } } - /* - * IP can create the new logical interface on a different - * physical interface in the same IPMP group. Take the new - * interface into account for further operations. 
- */ - (void) strncpy(name, lifr.lifr_name, sizeof (name)); return (0); } @@ -4131,10 +4273,229 @@ inetplumb(char *arg, int64_t param) if (debug) (void) printf("inetplumb: %s af %d\n", name, afp->af_af); - plumb_one_device(afp->af_af); + (void) ifplumb(name, name, _B_FALSE, afp->af_af); + return (0); +} + +/* ARGSUSED */ +static int +inetipmp(char *arg, int64_t param) +{ + int retval; + + /* + * Treat e.g. "ifconfig ipmp0:2 ipmp" as "ifconfig ipmp0:2 plumb". + * Otherwise, try to create the requested IPMP interface. + */ + if (strchr(name, ':') != NULL) + retval = inetplumb(arg, param); + else + retval = create_ipmp(name, afp->af_af, name, _B_FALSE); + + /* + * We'd return -1, but foreachinterface() doesn't propagate the error + * into the exit status, so we're forced to explicitly exit(). + */ + if (retval == -1) + exit(1); return (0); } +/* + * Create an IPMP group `grname' with address family `af'. If `ifname' is + * non-NULL, it specifies the interface name to use. Otherwise, use the name + * ipmpN, where N corresponds to the lowest available integer. If `implicit' + * is set, then the group is being created as a side-effect of placing an + * underlying interface in a group. Also start in.mpathd if necessary. + */ +static int +create_ipmp(const char *grname, int af, const char *ifname, boolean_t implicit) +{ + int ppa; + static int ipmp_daemon_started; + + if (debug) { + (void) printf("create_ipmp: ifname %s grname %s af %d\n", + ifname != NULL ? 
ifname : "NULL", grname, af); + } + + if (ifname != NULL) + ppa = ifplumb(IPMPSTUB, ifname, _B_FALSE, af); + else + ppa = ifplumb(IPMPSTUB, "ipmp", _B_TRUE, af); + + if (ppa == -1) { + Perror2(grname, "cannot create IPMP interface"); + return (-1); + } + + if (ifname != NULL) + (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); + else + (void) snprintf(lifr.lifr_name, LIFNAMSIZ, "ipmp%d", ppa); + + /* + * To preserve backward-compatibility, always bring up the link-local + * address for implicitly-created IPv6 IPMP interfaces. + */ + if (implicit && af == AF_INET6) { + if (ioctl(s6, SIOCGLIFFLAGS, &lifr) == 0) { + lifr.lifr_flags |= IFF_UP; + (void) ioctl(s6, SIOCSLIFFLAGS, &lifr); + } + } + + /* + * If the caller requested a different group name, issue a + * SIOCSLIFGROUPNAME on the new IPMP interface. + */ + if (strcmp(lifr.lifr_name, grname) != 0) { + (void) strlcpy(lifr.lifr_groupname, grname, LIFGRNAMSIZ); + if (ioctl(s, SIOCSLIFGROUPNAME, &lifr) == -1) { + Perror0("SIOCSLIFGROUPNAME"); + return (-1); + } + } + + /* + * If we haven't done so yet, ensure in.mpathd is started. + */ + if (ipmp_daemon_started++ == 0) + start_ipmp_daemon(); + + return (0); +} + +/* + * Check if `ifname' is plumbed and in an IPMP group on its "other" address + * family. If so, create a matching IPMP group for address family `af'. + */ +static int +create_ipmp_peer(int af, const char *ifname) +{ + int fd; + lifgroupinfo_t lifgr; + + assert(af == AF_INET || af == AF_INET6); + + /* + * Get the socket for the "other" address family. + */ + fd = (af == AF_INET) ? s6 : s4; + + (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) != 0) + return (0); + + (void) strlcpy(lifgr.gi_grname, lifr.lifr_groupname, LIFGRNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPINFO, &lifgr) != 0) + return (0); + + /* + * If `ifname' *is* the IPMP group interface, or if the relevant + * address family is already configured, then there's nothing to do. 
+ */ + if (strcmp(lifgr.gi_grifname, ifname) == 0 || + (af == AF_INET && lifgr.gi_v4) || (af == AF_INET6 && lifgr.gi_v6)) + return (0); + + return (create_ipmp(lifgr.gi_grname, af, lifgr.gi_grifname, _B_TRUE)); +} + +/* + * Start in.mpathd if it's not already running. + */ +static void +start_ipmp_daemon(void) +{ + int retval; + ipmp_handle_t ipmp_handle; + + /* + * Ping in.mpathd to see if it's running already. + */ + if ((retval = ipmp_open(&ipmp_handle)) != IPMP_SUCCESS) { + (void) fprintf(stderr, "ifconfig: cannot create IPMP handle: " + "%s\n", ipmp_errmsg(retval)); + return; + } + + retval = ipmp_ping_daemon(ipmp_handle); + ipmp_close(ipmp_handle); + + switch (retval) { + case IPMP_ENOMPATHD: + break; + case IPMP_SUCCESS: + return; + default: + (void) fprintf(stderr, "ifconfig: cannot ping in.mpathd: %s\n", + ipmp_errmsg(retval)); + break; + } + + /* + * Start in.mpathd. Note that in.mpathd will handle multiple + * incarnations (ipmp_ping_daemon() is just an optimization) so we + * don't need to worry about racing with another ifconfig process. + */ + switch (fork()) { + case -1: + Perror0_exit("start_ipmp_daemon: fork"); + /* NOTREACHED */ + case 0: + (void) execl(MPATHD_PATH, MPATHD_PATH, NULL); + _exit(1); + /* NOTREACHED */ + default: + break; + } +} + +/* + * Bring the address named by `ifaddrp' up or down. Doesn't trust any mutable + * values in ia_flags since they may be stale. + */ +static boolean_t +ifaddr_op(ifaddrlistx_t *ifaddrp, boolean_t up) +{ + struct lifreq lifrl; /* Local lifreq struct */ + int fd = (ifaddrp->ia_flags & IFF_IPV4) ? s4 : s6; + + (void) memset(&lifrl, 0, sizeof (lifrl)); + (void) strlcpy(lifrl.lifr_name, ifaddrp->ia_name, LIFNAMSIZ); + if (ioctl(fd, SIOCGLIFFLAGS, &lifrl) == -1) + return (_B_FALSE); + + if (up) { + lifrl.lifr_flags |= IFF_UP; + } else { + /* + * If we've been asked to bring down an IFF_DUPLICATE address, + * then get the address and set it. 
This will cause IP to + * clear IFF_DUPLICATE and stop the automatic recovery timer. + */ + if (lifrl.lifr_flags & IFF_DUPLICATE) { + return (ioctl(fd, SIOCGLIFADDR, &lifrl) != -1 && + ioctl(fd, SIOCSLIFADDR, &lifrl) != -1); + } + lifrl.lifr_flags &= ~IFF_UP; + } + return (ioctl(fd, SIOCSLIFFLAGS, &lifrl) == 0); +} + +static boolean_t +ifaddr_up(ifaddrlistx_t *ifaddrp) +{ + return (ifaddr_op(ifaddrp, _B_TRUE)); +} + +static boolean_t +ifaddr_down(ifaddrlistx_t *ifaddrp) +{ + return (ifaddr_op(ifaddrp, _B_FALSE)); +} + void Perror0(const char *cmd) { @@ -4404,14 +4765,14 @@ print_flags(uint64_t flags) } static void -print_config_flags(uint64_t flags) +print_config_flags(int af, uint64_t flags) { - int cnt, i; + if_config_cmd_t *cmdp; - cnt = sizeof (if_config_cmd_tbl) / sizeof (if_config_cmd_t); - for (i = 0; i < cnt; i++) { - if (flags & if_config_cmd_tbl[i].iff_flag) { - (void) printf("%s ", if_config_cmd_tbl[i].iff_name); + for (cmdp = if_config_cmd_tbl; cmdp->iff_flag != 0; cmdp++) { + if ((flags & cmdp->iff_flag) && + (cmdp->iff_af == AF_UNSPEC || cmdp->iff_af == af)) { + (void) printf("%s ", cmdp->iff_name); } } } @@ -4454,7 +4815,18 @@ in_getmask(struct sockaddr_in *saddr, boolean_t addr_set) } static int -strioctl(int s, int cmd, char *buf, int buflen) +lifnum(const char *ifname) +{ + const char *cp; + + if ((cp = strchr(ifname, ':')) == NULL) + return (0); + else + return (atoi(cp + 1)); +} + +static int +strioctl(int s, int cmd, void *buf, int buflen) { struct strioctl ioc; @@ -4681,6 +5053,7 @@ usage(void) "\t[ modlist ]\n" "\t[ modinsert <module_name@position> ]\n" "\t[ modremove <module_name@position> ]\n" + "\t[ ipmp ]\n" "\t[ group <groupname>] | [ group \"\"]\n" "\t[ deprecated | -deprecated ]\n" "\t[ standby | -standby ]\n" diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h index 0ac600001f..f11f4d0a94 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h +++ 
b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -11,8 +11,6 @@ #ifndef _IFCONFIG_H #define _IFCONFIG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -39,7 +37,6 @@ extern void Perrdlpi_exit(const char *, const char *, int); extern int doifrevarp(const char *, struct sockaddr_in *); -extern int dlpi_set_address(const char *, uchar_t *, uint_t); extern void dlpi_print_address(const char *); #ifdef __cplusplus diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c index 725c8b24c3..aba4794942 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c @@ -19,14 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "defs.h" #include "ifconfig.h" #include <sys/types.h> @@ -88,6 +86,7 @@ doifrevarp(const char *linkname, struct sockaddr_in *laddr) /* don't try to revarp if we know it won't work */ if ((lifr.lifr_flags & IFF_LOOPBACK) || (lifr.lifr_flags & IFF_NOARP) || + (lifr.lifr_flags & IFF_IPMP) || (lifr.lifr_flags & IFF_POINTOPOINT)) { (void) close(s); return (0); @@ -326,28 +325,6 @@ rarp_recv(dlpi_handle_t dh, struct arphdr *ans, size_t msglen, return (DLPI_ETIMEDOUT); } -int -dlpi_set_address(const char *linkname, uchar_t *physaddr, uint_t physaddrlen) -{ - int retval; - dlpi_handle_t dh; - - if ((retval = dlpi_open(linkname, &dh, 0)) != DLPI_SUCCESS) { - Perrdlpi("dlpi_open failed", linkname, retval); - return (-1); - } - - if ((retval = dlpi_set_physaddr(dh, DL_CURR_PHYS_ADDR, physaddr, - physaddrlen)) != DLPI_SUCCESS) { - Perrdlpi("dlpi_set_physaddr failed", linkname, retval); - dlpi_close(dh); - return (-1); - } - - dlpi_close(dh); - return (0); -} - void dlpi_print_address(const char *linkname) { diff --git a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h index 900b5841ed..5cca3ecb2e 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h +++ b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Copyright (c) 1983, 1988, 1993 @@ -414,16 +414,9 @@ struct interface { (IS_REMOTE|IS_PASSIVE)) /* - * Is an IP interface up? Because of the way IPMP uses deprecated - * interfaces, we need to check more than the IFF_UP and IFF_RUNNING - * interface flags here. Basically, we do not want to use IFF_DEPRECATED - * interfaces unless they are also IFF_STANDBY and not IFF_INACTIVE. + * Is an IP interface up? 
*/ -#define IFF_GOOD (IFF_UP|IFF_RUNNING) -#define IS_IFF_UP(f) \ - ((((f) & (IFF_GOOD|IFF_DEPRECATED)) == IFF_GOOD) || \ - (((f) & (IFF_GOOD|IFF_INACTIVE|IFF_STANDBY)) == \ - (IFF_GOOD|IFF_STANDBY))) +#define IS_IFF_UP(f) (((f) & (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING)) /* * This defines interfaces that we should not use for advertising or diff --git a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c index 79ae02e703..a3a26ac2cb 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Copyright (c) 1983, 1988, 1993 @@ -36,8 +36,6 @@ * $FreeBSD: src/sbin/routed/trace.c,v 1.6 2000/08/11 08:24:38 sheldonh Exp $ */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "defs.h" #include "pathnames.h" #include <signal.h> @@ -566,6 +564,7 @@ static struct bits if_bits[] = { { IFF_TEMPORARY, 0, "TEMPORARY" }, { IFF_FIXEDMTU, 0, "FIXEDMTU" }, { IFF_VIRTUAL, 0, "VIRTUAL"}, + { IFF_IPMP, 0, "IPMP"}, { 0, 0, NULL} }; @@ -898,8 +897,8 @@ trace_upslot(struct rt_entry *rt, print_rts(rts, 0, 0, rts->rts_gate != new->rts_gate, rts->rts_tag != new->rts_tag, - rts != rt->rt_spares || AGE_RT(rt->rt_state, - rts->rts_origin, rt->rt_ifp)); + rts != rt->rt_spares || + AGE_RT(rt->rt_state, rts->rts_origin, rt->rt_ifp)); (void) fprintf(ftrace, "\n %19s%-16s ", "", (new->rts_gate != rts->rts_gate ? 
@@ -1173,10 +1172,9 @@ trace_rip(const char *dir1, const char *dir2, if (NA->a_type == RIP_AUTH_PW && n == msg->rip_nets) { (void) fprintf(ftrace, "\tPassword" - " Authentication:" - " \"%s\"\n", + " Authentication: \"%s\"\n", qstring(NA->au.au_pw, - RIP_AUTH_PW_LEN)); + RIP_AUTH_PW_LEN)); continue; } @@ -1186,13 +1184,12 @@ trace_rip(const char *dir1, const char *dir2, "\tMD5 Auth" " pkt_len=%d KeyID=%u" " auth_len=%d" - " seqno=%#lx" - " rsvd=%#x,%#x\n", + " seqno=%#x" + " rsvd=%#hx,%#hx\n", ntohs(NA->au.a_md5.md5_pkt_len), NA->au.a_md5.md5_keyid, NA->au.a_md5.md5_auth_len, - (unsigned long)ntohl(NA->au.a_md5. - md5_seqno), + ntohl(NA->au.a_md5.md5_seqno), ntohs(NA->au.a_md5.rsvd[0]), ntohs(NA->au.a_md5.rsvd[1])); continue; @@ -1217,14 +1214,12 @@ trace_rip(const char *dir1, const char *dir2, inet_ntoa(tmp_mask)); } else if (msg->rip_vers == RIPv1) { (void) fprintf(ftrace, "\t%-18s ", - addrname(n->n_dst, - ntohl(n->n_mask), - n->n_mask == 0 ? 2 : 1)); + addrname(n->n_dst, ntohl(n->n_mask), + n->n_mask == 0 ? 2 : 1)); } else { (void) fprintf(ftrace, "\t%-18s ", - addrname(n->n_dst, - ntohl(n->n_mask), - n->n_mask == 0 ? 2 : 0)); + addrname(n->n_dst, ntohl(n->n_mask), + n->n_mask == 0 ? 2 : 0)); } (void) fprintf(ftrace, "metric=%-2lu ", (unsigned long)ntohl(n->n_metric)); @@ -1242,8 +1237,8 @@ trace_rip(const char *dir1, const char *dir2, break; case RIPCMD_TRACEON: - (void) fprintf(ftrace, "\tfile=\"%.*s\"\n", size-4, - msg->rip_tracefile); + (void) fprintf(ftrace, "\tfile=\"%.*s\"\n", size - 4, + msg->rip_tracefile); break; case RIPCMD_TRACEOFF: diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/Makefile new file mode 100644 index 0000000000..a256cf5f49 --- /dev/null +++ b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/Makefile @@ -0,0 +1,48 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +PROG = ipmpstat +ROOTFS_PROG = $(PROG) +ROOTUSRSBINLINKS = $(PROG:%=$(ROOTUSRSBIN)/%) + +include $(SRC)/cmd/Makefile.cmd + +C99MODE = $(C99_ENABLE) +LDLIBS += -lipmp -lsocket -lsysevent -lnvpair +XGETFLAGS += -a -x $(PROG).xcl + +.KEEP_STATE: + +all: $(PROG) + +install: all $(ROOTSBINPROG) $(ROOTUSRSBINLINKS) + +clean: + +lint: lint_PROG + +$(ROOTUSRSBINLINKS): + -$(RM) $@; $(SYMLINK) ../../sbin/$(@F) $@ + +include $(SRC)/cmd/Makefile.targ diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.c b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.c new file mode 100644 index 0000000000..4620c34a24 --- /dev/null +++ b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.c @@ -0,0 +1,1498 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <alloca.h> +#include <arpa/inet.h> +#include <assert.h> +#include <errno.h> +#include <ipmp_admin.h> +#include <ipmp_query.h> +#include <libintl.h> +#include <libnvpair.h> +#include <libsysevent.h> +#include <locale.h> +#include <netdb.h> +#include <signal.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/ipmp.h> +#include <sys/sysmacros.h> +#include <sys/termios.h> +#include <sys/types.h> + +/* + * ipmpstat -- display IPMP subsystem status. + * + * This utility makes extensive use of libipmp and IPMP sysevents to gather + * and pretty-print the status of the IPMP subsystem. All output formats + * except for -p (probe) use libipmp to create a point-in-time snapshot of the + * IPMP subsystem (unless the test-special -L flag is used), and then output + * the contents of that snapshot in a user-specified manner. Because the + * output format and requested fields aren't known until run-time, three sets + * of function pointers and two core data structures are used. Specifically: + * + * * The ipmpstat_walker_t function pointers (walk_*) iterate through + * all instances of a given IPMP object (group, interface, or address). + * At most one ipmpstat_walker_t is used per ipmpstat invocation. + * Since target information is included with the interface information, + * both -i and -t use the interface walker (walk_if()). 
+ * + * * The ipmpstat_sfunc_t function pointers (sfunc_*) obtain a given + * value for a given IPMP object. Each ipmpstat_sfunc_t is passed a + * buffer to write its result into, the buffer's size, and an + * ipmpstat_sfunc_arg_t state structure. The state structure consists + * of a pointer to the IPMP object to obtain information from + * (sa_data), and an open libipmp handle (sa_ih) which can be used to + * do additional libipmp queries, if necessary (e.g., because the + * object does not have all of the needed information). + * + * * The ipmpstat_field_t structure provides the list of supported fields + * for a given output format, along with output formatting information + * (e.g., field width), and a pointer to an ipmpstat_sfunc_t function + * that can obtain the value for a given IPMP object. For a given + * ipmpstat output format, there's a corresponding array of + * ipmpstat_field_t structures. Thus, one ipmpstat_field_t array is + * used per ipmpstat invocation. + * + * * The ipmpstat_ofmt_t provides an ordered list of the requested + * ipmpstat_field_t's (e.g., via -o) for a given ipmpstat invocation. + * It is built at runtime from the command-line arguments. This + * structure (and a given IPMP object) is used by ofmt_output() to + * output a single line of information about that IPMP object. + * + * * The ipmpstat_cbfunc_t function pointers (*_cbfunc) are called back + * by the walkers. They are used both internally to implement nested + * walks, and by the ipmpstat output logic to provide the glue between + * the IPMP object walkers and the ofmt_output() logic. Usually, a + * single line is output for each IPMP object, and thus ofmt_output() + * can be directly invoked (see info_output_cbfunc()). However, if + * multiple lines need to be output, then a more complex cbfunc is + * needed (see targinfo_output_cbfunc()). At most one cbfunc is used + * per ipmpstat invocation. 
+ */ + +/* + * Data type used by the sfunc callbacks to obtain the requested information + * from the agreed-upon object. + */ +typedef struct ipmpstat_sfunc_arg { + ipmp_handle_t sa_ih; + void *sa_data; +} ipmpstat_sfunc_arg_t; + +typedef void ipmpstat_sfunc_t(ipmpstat_sfunc_arg_t *, char *, uint_t); + +/* + * Data type that describes how to output a field; used by ofmt_output*(). + */ +typedef struct ipmpstat_field { + const char *f_name; /* field name */ + uint_t f_width; /* output width */ + ipmpstat_sfunc_t *f_sfunc; /* value->string function */ +} ipmpstat_field_t; + +/* + * Data type that specifies the output field order; used by ofmt_output*() + */ +typedef struct ipmpstat_ofmt { + const ipmpstat_field_t *o_field; /* current field info */ + struct ipmpstat_ofmt *o_next; /* next field */ +} ipmpstat_ofmt_t; + +/* + * Function pointers used to iterate through IPMP objects. + */ +typedef void ipmpstat_cbfunc_t(ipmp_handle_t, void *, void *); +typedef void ipmpstat_walker_t(ipmp_handle_t, ipmpstat_cbfunc_t *, void *); + +/* + * Data type used to implement nested walks. + */ +typedef struct ipmpstat_walkdata { + ipmpstat_cbfunc_t *iw_func; /* caller-specified callback */ + void *iw_funcarg; /* caller-specified arg */ +} ipmpstat_walkdata_t; + +/* + * Data type used by enum2str() to map an enumerated value to a string. + */ +typedef struct ipmpstat_enum { + const char *e_name; /* string */ + int e_val; /* value */ +} ipmpstat_enum_t; + +/* + * Data type used to pass state between probe_output() and probe_event(). + */ +typedef struct ipmpstat_probe_state { + ipmp_handle_t ps_ih; /* open IPMP handle */ + ipmpstat_ofmt_t *ps_ofmt; /* requested ofmt string */ +} ipmpstat_probe_state_t; + +/* + * Options that modify the output mode; more than one may be lit. + */ +typedef enum { + IPMPSTAT_OPT_NUMERIC = 0x1, + IPMPSTAT_OPT_PARSABLE = 0x2 +} ipmpstat_opt_t; + +/* + * Indices for the FLAGS field of the `-i' output format. 
+ */ +enum { + IPMPSTAT_IFLAG_INDEX, IPMPSTAT_SFLAG_INDEX, IPMPSTAT_M4FLAG_INDEX, + IPMPSTAT_BFLAG_INDEX, IPMPSTAT_M6FLAG_INDEX, IPMPSTAT_DFLAG_INDEX, + IPMPSTAT_HFLAG_INDEX, IPMPSTAT_NUM_FLAGS +}; + +#define IPMPSTAT_NCOL 80 +#define NS2FLOATMS(ns) ((float)(ns) / (NANOSEC / MILLISEC)) +#define MS2FLOATSEC(ms) ((float)(ms) / 1000) + +static const char *progname; +static hrtime_t probe_output_start; +static struct winsize winsize; +static ipmpstat_opt_t opt; +static ipmpstat_enum_t addr_state[], group_state[], if_state[], if_link[]; +static ipmpstat_enum_t if_probe[], targ_mode[]; +static ipmpstat_field_t addr_fields[], group_fields[], if_fields[]; +static ipmpstat_field_t probe_fields[], targ_fields[]; +static ipmpstat_cbfunc_t walk_addr_cbfunc, walk_if_cbfunc; +static ipmpstat_cbfunc_t info_output_cbfunc, targinfo_output_cbfunc; +static ipmpstat_walker_t walk_addr, walk_if, walk_group; + +static int probe_event(sysevent_t *, void *); +static void probe_output(ipmp_handle_t, ipmpstat_ofmt_t *); +static ipmpstat_field_t *field_find(ipmpstat_field_t *, const char *); +static ipmpstat_ofmt_t *ofmt_create(const char *, ipmpstat_field_t []); +static void ofmt_output(const ipmpstat_ofmt_t *, ipmp_handle_t, void *); +static void ofmt_destroy(ipmpstat_ofmt_t *); +static void enum2str(const ipmpstat_enum_t *, int, char *, uint_t); +static void sockaddr2str(const struct sockaddr_storage *, char *, uint_t); +static void sighandler(int); +static void usage(void); +static void die(const char *, ...); +static void die_ipmperr(int, const char *, ...); +static void warn(const char *, ...); +static void warn_ipmperr(int, const char *, ...); + +int +main(int argc, char **argv) +{ + int c; + int err; + const char *ofields = NULL; + ipmp_handle_t ih; + ipmp_qcontext_t qcontext = IPMP_QCONTEXT_SNAP; + ipmpstat_ofmt_t *ofmt; + ipmpstat_field_t *fields = NULL; + ipmpstat_cbfunc_t *cbfunc; + ipmpstat_walker_t *walker; + + if ((progname = strrchr(argv[0], '/')) == NULL) + progname = 
argv[0]; + else + progname++; + + (void) setlocale(LC_ALL, ""); + (void) textdomain(TEXT_DOMAIN); + + while ((c = getopt(argc, argv, "nLPo:agipt")) != EOF) { + if (fields != NULL && strchr("agipt", c) != NULL) + die("only one output format may be specified\n"); + + switch (c) { + case 'n': + opt |= IPMPSTAT_OPT_NUMERIC; + break; + case 'L': + /* Undocumented option: for testing use ONLY */ + qcontext = IPMP_QCONTEXT_LIVE; + break; + case 'P': + opt |= IPMPSTAT_OPT_PARSABLE; + break; + case 'o': + ofields = optarg; + break; + case 'a': + walker = walk_addr; + cbfunc = info_output_cbfunc; + fields = addr_fields; + break; + case 'g': + walker = walk_group; + cbfunc = info_output_cbfunc; + fields = group_fields; + break; + case 'i': + walker = walk_if; + cbfunc = info_output_cbfunc; + fields = if_fields; + break; + case 'p': + fields = probe_fields; + break; + case 't': + walker = walk_if; + cbfunc = targinfo_output_cbfunc; + fields = targ_fields; + break; + default: + usage(); + break; + } + } + + if (argc > optind || fields == NULL) + usage(); + + if (opt & IPMPSTAT_OPT_PARSABLE) { + if (ofields == NULL) { + die("output field list (-o) required in parsable " + "output mode\n"); + } else if (strcasecmp(ofields, "all") == 0) { + die("\"all\" not allowed in parsable output mode\n"); + } + } + + /* + * Obtain the window size and monitor changes to the size. This data + * is used to redisplay the output headers when necessary. + */ + (void) sigset(SIGWINCH, sighandler); + sighandler(SIGWINCH); + + if ((err = ipmp_open(&ih)) != IPMP_SUCCESS) + die_ipmperr(err, "cannot create IPMP handle"); + + if (ipmp_ping_daemon(ih) != IPMP_SUCCESS) + die("cannot contact in.mpathd(1M) -- is IPMP in use?\n"); + + /* + * Create the ofmt linked list that will eventually be passed to + * to ofmt_output() to output the fields. + */ + ofmt = ofmt_create(ofields, fields); + + /* + * If we've been asked to display probes, then call the probe output + * function. 
Otherwise, snapshot IPMP state (or use live state) and + * invoke the specified walker with the specified callback function. + */ + if (fields == probe_fields) { + probe_output(ih, ofmt); + } else { + if ((err = ipmp_setqcontext(ih, qcontext)) != IPMP_SUCCESS) { + if (qcontext == IPMP_QCONTEXT_SNAP) + die_ipmperr(err, "cannot snapshot IPMP state"); + else + die_ipmperr(err, "cannot use live IPMP state"); + } + (*walker)(ih, cbfunc, ofmt); + } + + ofmt_destroy(ofmt); + ipmp_close(ih); + + return (EXIT_SUCCESS); +} + +/* + * Walks all IPMP groups on the system and invokes `cbfunc' on each, passing + * it `ih', the ipmp_groupinfo_t pointer, and `arg'. + */ +static void +walk_group(ipmp_handle_t ih, ipmpstat_cbfunc_t *cbfunc, void *arg) +{ + int err; + uint_t i; + ipmp_groupinfo_t *grinfop; + ipmp_grouplist_t *grlistp; + + if ((err = ipmp_getgrouplist(ih, &grlistp)) != IPMP_SUCCESS) + die_ipmperr(err, "cannot get IPMP group list"); + + for (i = 0; i < grlistp->gl_ngroup; i++) { + err = ipmp_getgroupinfo(ih, grlistp->gl_groups[i], &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for group `%s'", + grlistp->gl_groups[i]); + continue; + } + (*cbfunc)(ih, grinfop, arg); + ipmp_freegroupinfo(grinfop); + } + + ipmp_freegrouplist(grlistp); +} + +/* + * Walks all IPMP interfaces on the system and invokes `cbfunc' on each, + * passing it `ih', the ipmp_ifinfo_t pointer, and `arg'. + */ +static void +walk_if(ipmp_handle_t ih, ipmpstat_cbfunc_t *cbfunc, void *arg) +{ + ipmpstat_walkdata_t iw = { cbfunc, arg }; + + walk_group(ih, walk_if_cbfunc, &iw); +} + +/* + * Walks all IPMP data addresses on the system and invokes `cbfunc' on each. + * passing it `ih', the ipmp_addrinfo_t pointer, and `arg'. + */ +static void +walk_addr(ipmp_handle_t ih, ipmpstat_cbfunc_t *cbfunc, void *arg) +{ + ipmpstat_walkdata_t iw = { cbfunc, arg }; + + walk_group(ih, walk_addr_cbfunc, &iw); +} + +/* + * Nested walker callback function for walk_if(). 
+ */ +static void +walk_if_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + int err; + uint_t i; + ipmp_groupinfo_t *grinfop = infop; + ipmp_ifinfo_t *ifinfop; + ipmp_iflist_t *iflistp = grinfop->gr_iflistp; + ipmpstat_walkdata_t *iwp = arg; + + for (i = 0; i < iflistp->il_nif; i++) { + err = ipmp_getifinfo(ih, iflistp->il_ifs[i], &ifinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for interface `%s'", + iflistp->il_ifs[i]); + continue; + } + (*iwp->iw_func)(ih, ifinfop, iwp->iw_funcarg); + ipmp_freeifinfo(ifinfop); + } +} + +/* + * Nested walker callback function for walk_addr(). + */ +static void +walk_addr_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + int err; + uint_t i; + ipmp_groupinfo_t *grinfop = infop; + ipmp_addrinfo_t *adinfop; + ipmp_addrlist_t *adlistp = grinfop->gr_adlistp; + ipmpstat_walkdata_t *iwp = arg; + char addr[INET6_ADDRSTRLEN]; + struct sockaddr_storage *addrp; + + for (i = 0; i < adlistp->al_naddr; i++) { + addrp = &adlistp->al_addrs[i]; + err = ipmp_getaddrinfo(ih, grinfop->gr_name, addrp, &adinfop); + if (err != IPMP_SUCCESS) { + sockaddr2str(addrp, addr, sizeof (addr)); + warn_ipmperr(err, "cannot get info for `%s'", addr); + continue; + } + (*iwp->iw_func)(ih, adinfop, iwp->iw_funcarg); + ipmp_freeaddrinfo(adinfop); + } +} + +static void +sfunc_nvwarn(const char *nvname, char *buf, uint_t bufsize) +{ + warn("cannot retrieve %s\n", nvname); + (void) strlcpy(buf, "?", bufsize); +} + +static void +sfunc_addr_address(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_addrinfo_t *adinfop = arg->sa_data; + + sockaddr2str(&adinfop->ad_addr, buf, bufsize); +} + +static void +sfunc_addr_group(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + ipmp_addrinfo_t *adinfop = arg->sa_data; + ipmp_groupinfo_t *grinfop; + + err = ipmp_getgroupinfo(arg->sa_ih, adinfop->ad_group, &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for group `%s'", + 
adinfop->ad_group); + (void) strlcpy(buf, "?", bufsize); + return; + } + (void) strlcpy(buf, grinfop->gr_ifname, bufsize); + ipmp_freegroupinfo(grinfop); +} + +static void +sfunc_addr_state(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_addrinfo_t *adinfop = arg->sa_data; + + enum2str(addr_state, adinfop->ad_state, buf, bufsize); +} + +static void +sfunc_addr_inbound(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_addrinfo_t *adinfop = arg->sa_data; + + (void) strlcpy(buf, adinfop->ad_binding, bufsize); +} + +static void +sfunc_addr_outbound(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + uint_t i, nactive = 0; + ipmp_ifinfo_t *ifinfop; + ipmp_iflist_t *iflistp; + ipmp_addrinfo_t *adinfop = arg->sa_data; + ipmp_groupinfo_t *grinfop; + + if (adinfop->ad_state == IPMP_ADDR_DOWN) + return; + + /* + * If there's no inbound interface for this address, there can't + * be any outbound traffic. + */ + if (adinfop->ad_binding[0] == '\0') + return; + + /* + * The address can use any active interface in the group, so + * obtain all of those. 
+ */ + err = ipmp_getgroupinfo(arg->sa_ih, adinfop->ad_group, &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for group `%s'", + adinfop->ad_group); + (void) strlcpy(buf, "?", bufsize); + return; + } + + iflistp = grinfop->gr_iflistp; + for (i = 0; i < iflistp->il_nif; i++) { + err = ipmp_getifinfo(arg->sa_ih, iflistp->il_ifs[i], &ifinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for interface `%s'", + iflistp->il_ifs[i]); + continue; + } + + if (ifinfop->if_flags & IPMP_IFFLAG_ACTIVE) { + if (nactive++ != 0) + (void) strlcat(buf, " ", bufsize); + (void) strlcat(buf, ifinfop->if_name, bufsize); + } + ipmp_freeifinfo(ifinfop); + } + ipmp_freegroupinfo(grinfop); +} + +static void +sfunc_group_name(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_groupinfo_t *grinfop = arg->sa_data; + + (void) strlcpy(buf, grinfop->gr_name, bufsize); +} + +static void +sfunc_group_ifname(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_groupinfo_t *grinfop = arg->sa_data; + + (void) strlcpy(buf, grinfop->gr_ifname, bufsize); +} + +static void +sfunc_group_state(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_groupinfo_t *grinfop = arg->sa_data; + + enum2str(group_state, grinfop->gr_state, buf, bufsize); +} + +static void +sfunc_group_fdt(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_groupinfo_t *grinfop = arg->sa_data; + + if (grinfop->gr_fdt == 0) + return; + + (void) snprintf(buf, bufsize, "%.2fs", MS2FLOATSEC(grinfop->gr_fdt)); +} + +static void +sfunc_group_interfaces(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + uint_t i; + char *active, *inactive, *unusable; + uint_t nactive = 0, ninactive = 0, nunusable = 0; + ipmp_groupinfo_t *grinfop = arg->sa_data; + ipmp_iflist_t *iflistp = grinfop->gr_iflistp; + ipmp_ifinfo_t *ifinfop; + + active = alloca(bufsize); + active[0] = '\0'; + inactive = alloca(bufsize); + inactive[0] = '\0'; + unusable = 
alloca(bufsize); + unusable[0] = '\0'; + + for (i = 0; i < iflistp->il_nif; i++) { + err = ipmp_getifinfo(arg->sa_ih, iflistp->il_ifs[i], &ifinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for interface `%s'", + iflistp->il_ifs[i]); + continue; + } + + if (ifinfop->if_flags & IPMP_IFFLAG_ACTIVE) { + if (nactive++ != 0) + (void) strlcat(active, " ", bufsize); + (void) strlcat(active, ifinfop->if_name, bufsize); + } else if (ifinfop->if_flags & IPMP_IFFLAG_INACTIVE) { + if (ninactive++ != 0) + (void) strlcat(inactive, " ", bufsize); + (void) strlcat(inactive, ifinfop->if_name, bufsize); + } else { + if (nunusable++ != 0) + (void) strlcat(unusable, " ", bufsize); + (void) strlcat(unusable, ifinfop->if_name, bufsize); + } + + ipmp_freeifinfo(ifinfop); + } + + (void) strlcpy(buf, active, bufsize); + + if (ninactive > 0) { + if (nactive != 0) + (void) strlcat(buf, " ", bufsize); + + (void) strlcat(buf, "(", bufsize); + (void) strlcat(buf, inactive, bufsize); + (void) strlcat(buf, ")", bufsize); + } + + if (nunusable > 0) { + if (nactive + ninactive != 0) + (void) strlcat(buf, " ", bufsize); + + (void) strlcat(buf, "[", bufsize); + (void) strlcat(buf, unusable, bufsize); + (void) strlcat(buf, "]", bufsize); + } +} + +static void +sfunc_if_name(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + (void) strlcpy(buf, ifinfop->if_name, bufsize); +} + +static void +sfunc_if_active(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + if (ifinfop->if_flags & IPMP_IFFLAG_ACTIVE) + (void) strlcpy(buf, "yes", bufsize); + else + (void) strlcpy(buf, "no", bufsize); +} + +static void +sfunc_if_group(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + ipmp_ifinfo_t *ifinfop = arg->sa_data; + ipmp_groupinfo_t *grinfop; + + err = ipmp_getgroupinfo(arg->sa_ih, ifinfop->if_group, &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot 
get info for group `%s'", + ifinfop->if_group); + (void) strlcpy(buf, "?", bufsize); + return; + } + + (void) strlcpy(buf, grinfop->gr_ifname, bufsize); + ipmp_freegroupinfo(grinfop); +} + +static void +sfunc_if_flags(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + ipmp_ifinfo_t *ifinfop = arg->sa_data; + ipmp_groupinfo_t *grinfop; + + assert(bufsize > IPMPSTAT_NUM_FLAGS); + + (void) memset(buf, '-', IPMPSTAT_NUM_FLAGS); + buf[IPMPSTAT_NUM_FLAGS] = '\0'; + + if (ifinfop->if_type == IPMP_IF_STANDBY) + buf[IPMPSTAT_SFLAG_INDEX] = 's'; + + if (ifinfop->if_flags & IPMP_IFFLAG_INACTIVE) + buf[IPMPSTAT_IFLAG_INDEX] = 'i'; + + if (ifinfop->if_flags & IPMP_IFFLAG_DOWN) + buf[IPMPSTAT_DFLAG_INDEX] = 'd'; + + if (ifinfop->if_flags & IPMP_IFFLAG_HWADDRDUP) + buf[IPMPSTAT_HFLAG_INDEX] = 'h'; + + err = ipmp_getgroupinfo(arg->sa_ih, ifinfop->if_group, &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get broadcast/multicast info for " + "group `%s'", ifinfop->if_group); + return; + } + + if (strcmp(grinfop->gr_m4ifname, ifinfop->if_name) == 0) + buf[IPMPSTAT_M4FLAG_INDEX] = 'm'; + + if (strcmp(grinfop->gr_m6ifname, ifinfop->if_name) == 0) + buf[IPMPSTAT_M6FLAG_INDEX] = 'M'; + + if (strcmp(grinfop->gr_bcifname, ifinfop->if_name) == 0) + buf[IPMPSTAT_BFLAG_INDEX] = 'b'; + + ipmp_freegroupinfo(grinfop); +} + +static void +sfunc_if_link(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + enum2str(if_link, ifinfop->if_linkstate, buf, bufsize); +} + +static void +sfunc_if_probe(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + enum2str(if_probe, ifinfop->if_probestate, buf, bufsize); +} + +static void +sfunc_if_state(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + enum2str(if_state, ifinfop->if_state, buf, bufsize); +} + +static void +sfunc_probe_id(ipmpstat_sfunc_arg_t *arg, char *buf, 
uint_t bufsize) +{ + uint32_t probe_id; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_uint32(nvl, IPMP_PROBE_ID, &probe_id) != 0) { + sfunc_nvwarn("IPMP_PROBE_ID", buf, bufsize); + return; + } + + (void) snprintf(buf, bufsize, "%u", probe_id); +} + +static void +sfunc_probe_ifname(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + char *ifname; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_string(nvl, IPMP_IF_NAME, &ifname) != 0) { + sfunc_nvwarn("IPMP_IF_NAME", buf, bufsize); + return; + } + + (void) strlcpy(buf, ifname, bufsize); +} + +static void +sfunc_probe_time(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + hrtime_t start; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_START_TIME, &start) != 0) { + sfunc_nvwarn("IPMP_PROBE_START_TIME", buf, bufsize); + return; + } + + (void) snprintf(buf, bufsize, "%.2fs", + (float)(start - probe_output_start) / NANOSEC); +} + +static void +sfunc_probe_target(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + uint_t nelem; + struct sockaddr_storage *target; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_byte_array(nvl, IPMP_PROBE_TARGET, + (uchar_t **)&target, &nelem) != 0) { + sfunc_nvwarn("IPMP_PROBE_TARGET", buf, bufsize); + return; + } + + sockaddr2str(target, buf, bufsize); +} + +static void +sfunc_probe_rtt(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + hrtime_t start, ackproc; + nvlist_t *nvl = arg->sa_data; + uint32_t state; + + if (nvlist_lookup_uint32(nvl, IPMP_PROBE_STATE, &state) != 0) { + sfunc_nvwarn("IPMP_PROBE_STATE", buf, bufsize); + return; + } + + if (state != IPMP_PROBE_ACKED) + return; + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_START_TIME, &start) != 0) { + sfunc_nvwarn("IPMP_PROBE_START_TIME", buf, bufsize); + return; + } + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_ACKPROC_TIME, &ackproc) != 0) { + sfunc_nvwarn("IPMP_PROBE_ACKPROC_TIME", buf, bufsize); + return; + } + + (void) snprintf(buf, bufsize, "%.2fms", 
NS2FLOATMS(ackproc - start)); +} + +static void +sfunc_probe_netrtt(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + hrtime_t sent, ackrecv; + nvlist_t *nvl = arg->sa_data; + uint32_t state; + + if (nvlist_lookup_uint32(nvl, IPMP_PROBE_STATE, &state) != 0) { + sfunc_nvwarn("IPMP_PROBE_STATE", buf, bufsize); + return; + } + + if (state != IPMP_PROBE_ACKED) + return; + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_SENT_TIME, &sent) != 0) { + sfunc_nvwarn("IPMP_PROBE_SENT_TIME", buf, bufsize); + return; + } + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_ACKRECV_TIME, &ackrecv) != 0) { + sfunc_nvwarn("IPMP_PROBE_ACKRECV_TIME", buf, bufsize); + return; + } + + (void) snprintf(buf, bufsize, "%.2fms", NS2FLOATMS(ackrecv - sent)); +} + +static void +sfunc_probe_rttavg(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int64_t rttavg; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_int64(nvl, IPMP_PROBE_TARGET_RTTAVG, &rttavg) != 0) { + sfunc_nvwarn("IPMP_PROBE_TARGET_RTTAVG", buf, bufsize); + return; + } + + if (rttavg != 0) + (void) snprintf(buf, bufsize, "%.2fms", NS2FLOATMS(rttavg)); +} + +static void +sfunc_probe_rttdev(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int64_t rttdev; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_int64(nvl, IPMP_PROBE_TARGET_RTTDEV, &rttdev) != 0) { + sfunc_nvwarn("IPMP_PROBE_TARGET_RTTDEV", buf, bufsize); + return; + } + + if (rttdev != 0) + (void) snprintf(buf, bufsize, "%.2fms", NS2FLOATMS(rttdev)); +} + +/* ARGSUSED */ +static void +probe_enabled_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + uint_t *nenabledp = arg; + ipmp_ifinfo_t *ifinfop = infop; + + if (ifinfop->if_probestate != IPMP_PROBE_DISABLED) + (*nenabledp)++; +} + +static void +probe_output(ipmp_handle_t ih, ipmpstat_ofmt_t *ofmt) +{ + char sub[MAX_SUBID_LEN]; + evchan_t *evch; + ipmpstat_probe_state_t ps = { ih, ofmt }; + uint_t nenabled = 0; + + /* + * Check if any interfaces are enabled for probe-based failure + * 
detection. If not, immediately fail. + */ + walk_if(ih, probe_enabled_cbfunc, &nenabled); + if (nenabled == 0) + die("probe-based failure detection is disabled\n"); + + probe_output_start = gethrtime(); + + /* + * Unfortunately, until 4791900 is fixed, only privileged processes + * can bind and thus receive sysevents. + */ + errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &evch, EVCH_CREAT); + if (errno != 0) { + if (errno == EPERM) + die("insufficient privileges for -p\n"); + die("sysevent_evc_bind to channel %s failed", IPMP_EVENT_CHAN); + } + + /* + * The subscriber must be unique in order for sysevent_evc_subscribe() + * to succeed, so combine our name and pid. + */ + (void) snprintf(sub, sizeof (sub), "%d-%s", getpid(), progname); + + errno = sysevent_evc_subscribe(evch, sub, EC_IPMP, probe_event, &ps, 0); + if (errno != 0) + die("sysevent_evc_subscribe for class %s failed", EC_IPMP); + + for (;;) + (void) pause(); +} + +static int +probe_event(sysevent_t *ev, void *arg) +{ + nvlist_t *nvl; + uint32_t state; + uint32_t version; + ipmpstat_probe_state_t *psp = arg; + + if (strcmp(sysevent_get_subclass_name(ev), ESC_IPMP_PROBE_STATE) != 0) + return (0); + + if (sysevent_get_attr_list(ev, &nvl) != 0) { + warn("sysevent_get_attr_list failed; dropping event"); + return (0); + } + + if (nvlist_lookup_uint32(nvl, IPMP_EVENT_VERSION, &version) != 0) { + warn("dropped event with no IPMP_EVENT_VERSION\n"); + goto out; + } + + if (version != IPMP_EVENT_CUR_VERSION) { + warn("dropped event with unsupported IPMP_EVENT_VERSION %d\n", + version); + goto out; + } + + if (nvlist_lookup_uint32(nvl, IPMP_PROBE_STATE, &state) != 0) { + warn("dropped event with no IPMP_PROBE_STATE\n"); + goto out; + } + + if (state == IPMP_PROBE_ACKED || state == IPMP_PROBE_LOST) + ofmt_output(psp->ps_ofmt, psp->ps_ih, nvl); +out: + nvlist_free(nvl); + return (0); +} + +static void +sfunc_targ_ifname(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_targinfo_t *targinfop = arg->sa_data; + 
+ (void) strlcpy(buf, targinfop->it_name, bufsize); +} + +static void +sfunc_targ_mode(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_targinfo_t *targinfop = arg->sa_data; + + enum2str(targ_mode, targinfop->it_targmode, buf, bufsize); +} + +static void +sfunc_targ_testaddr(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_targinfo_t *targinfop = arg->sa_data; + + if (targinfop->it_targmode != IPMP_TARG_DISABLED) + sockaddr2str(&targinfop->it_testaddr, buf, bufsize); +} + +static void +sfunc_targ_targets(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + uint_t i; + char *targname = alloca(bufsize); + ipmp_targinfo_t *targinfop = arg->sa_data; + ipmp_addrlist_t *targlistp = targinfop->it_targlistp; + + for (i = 0; i < targlistp->al_naddr; i++) { + sockaddr2str(&targlistp->al_addrs[i], targname, bufsize); + (void) strlcat(buf, targname, bufsize); + if ((i + 1) < targlistp->al_naddr) + (void) strlcat(buf, " ", bufsize); + } +} + +static void +info_output_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + ofmt_output(arg, ih, infop); +} + +static void +targinfo_output_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + ipmp_ifinfo_t *ifinfop = infop; + ipmp_if_targmode_t targmode4 = ifinfop->if_targinfo4.it_targmode; + ipmp_if_targmode_t targmode6 = ifinfop->if_targinfo6.it_targmode; + + /* + * Usually, either IPv4 or IPv6 probing will be enabled, but the admin + * may enable both. If only one is enabled, omit the other one so as + * to not encourage the admin to enable both. If neither is enabled, + * we still print one just so the admin can see a MODE of "disabled". + */ + if (targmode4 != IPMP_TARG_DISABLED || targmode6 == IPMP_TARG_DISABLED) + ofmt_output(arg, ih, &ifinfop->if_targinfo4); + if (targmode6 != IPMP_TARG_DISABLED) + ofmt_output(arg, ih, &ifinfop->if_targinfo6); +} + +/* + * Creates an ipmpstat_ofmt_t field list from the comma-separated list of + * user-specified fields passed via `ofields'. 
The table of known fields + * (and their attributes) is passed via `fields'. + */ +static ipmpstat_ofmt_t * +ofmt_create(const char *ofields, ipmpstat_field_t fields[]) +{ + char *token, *lasts, *ofields_dup; + const char *fieldname; + ipmpstat_ofmt_t *ofmt, *ofmt_head = NULL, *ofmt_tail; + ipmpstat_field_t *fieldp; + uint_t cols = 0; + + /* + * If "-o" was omitted or "-o all" was specified, build a list of + * field names. If "-o" was omitted, stop building the list when + * we run out of columns. + */ + if (ofields == NULL || strcasecmp(ofields, "all") == 0) { + for (fieldp = fields; fieldp->f_name != NULL; fieldp++) { + cols += fieldp->f_width; + if (ofields == NULL && cols > IPMPSTAT_NCOL) + break; + + if ((ofmt = calloc(sizeof (*ofmt), 1)) == NULL) + die("cannot allocate output format list"); + + ofmt->o_field = fieldp; + if (ofmt_head == NULL) { + ofmt_head = ofmt; + ofmt_tail = ofmt; + } else { + ofmt_tail->o_next = ofmt; + ofmt_tail = ofmt; + } + } + return (ofmt_head); + } + + if ((ofields_dup = strdup(ofields)) == NULL) + die("cannot allocate output format list"); + + token = ofields_dup; + while ((fieldname = strtok_r(token, ",", &lasts)) != NULL) { + token = NULL; + + if ((fieldp = field_find(fields, fieldname)) == NULL) { + /* + * Since machine parsers are unlikely to be able to + * gracefully handle missing fields, die if we're in + * parsable mode. Otherwise, just print a warning. 
+ */ + if (opt & IPMPSTAT_OPT_PARSABLE) + die("unknown output field `%s'\n", fieldname); + + warn("ignoring unknown output field `%s'\n", fieldname); + continue; + } + + if ((ofmt = calloc(sizeof (*ofmt), 1)) == NULL) + die("cannot allocate output format list"); + + ofmt->o_field = fieldp; + if (ofmt_head == NULL) { + ofmt_head = ofmt; + ofmt_tail = ofmt; + } else { + ofmt_tail->o_next = ofmt; + ofmt_tail = ofmt; + } + } + + free(ofields_dup); + if (ofmt_head == NULL) + die("no valid output fields specified\n"); + + return (ofmt_head); +} + +/* + * Destroys the provided `ofmt' field list. + */ +static void +ofmt_destroy(ipmpstat_ofmt_t *ofmt) +{ + ipmpstat_ofmt_t *ofmt_next; + + for (; ofmt != NULL; ofmt = ofmt_next) { + ofmt_next = ofmt->o_next; + free(ofmt); + } +} + +/* + * Outputs a header for the fields named by `ofmt'. + */ +static void +ofmt_output_header(const ipmpstat_ofmt_t *ofmt) +{ + const ipmpstat_field_t *fieldp; + + for (; ofmt != NULL; ofmt = ofmt->o_next) { + fieldp = ofmt->o_field; + + if (ofmt->o_next == NULL) + (void) printf("%s", fieldp->f_name); + else + (void) printf("%-*s", fieldp->f_width, fieldp->f_name); + } + (void) printf("\n"); +} + +/* + * Outputs one row of values for the fields named by `ofmt'. The values to + * output are obtained through the `ofmt' function pointers, which are + * indirectly passed the `ih' and `arg' structures for state; see the block + * comment at the start of this file for details. + */ +static void +ofmt_output(const ipmpstat_ofmt_t *ofmt, ipmp_handle_t ih, void *arg) +{ + int i; + char buf[1024]; + boolean_t escsep; + static int nrow; + const char *value; + uint_t width, valwidth; + uint_t compress, overflow = 0; + const ipmpstat_field_t *fieldp; + ipmpstat_sfunc_arg_t sfunc_arg; + + /* + * For each screenful of data, display the header. 
+ */ + if ((nrow++ % winsize.ws_row) == 0 && !(opt & IPMPSTAT_OPT_PARSABLE)) { + ofmt_output_header(ofmt); + nrow++; + } + + /* + * Check if we'll be displaying multiple fields per line, and thus + * need to escape the field separator. + */ + escsep = (ofmt != NULL && ofmt->o_next != NULL); + + for (; ofmt != NULL; ofmt = ofmt->o_next) { + fieldp = ofmt->o_field; + + sfunc_arg.sa_ih = ih; + sfunc_arg.sa_data = arg; + + buf[0] = '\0'; + (*fieldp->f_sfunc)(&sfunc_arg, buf, sizeof (buf)); + + if (opt & IPMPSTAT_OPT_PARSABLE) { + for (i = 0; buf[i] != '\0'; i++) { + if (escsep && (buf[i] == ':' || buf[i] == '\\')) + (void) putchar('\\'); + (void) putchar(buf[i]); + } + if (ofmt->o_next != NULL) + (void) putchar(':'); + } else { + value = (buf[0] == '\0') ? "--" : buf; + + /* + * To avoid needless line-wraps, for the last field, + * don't include any trailing whitespace. + */ + if (ofmt->o_next == NULL) { + (void) printf("%s", value); + continue; + } + + /* + * For other fields, grow the width as necessary to + * ensure the value completely fits. However, if + * there's unused whitespace in subsequent fields, + * then "compress" that whitespace to attempt to get + * the columns to line up again. + */ + width = fieldp->f_width; + valwidth = strlen(value); + + if (valwidth + overflow >= width) { + overflow += valwidth - width + 1; + (void) printf("%s ", value); + continue; + } + + if (overflow > 0) { + compress = MIN(overflow, width - valwidth); + overflow -= compress; + width -= compress; + } + (void) printf("%-*s", width, value); + } + } + (void) printf("\n"); + + /* + * In case stdout has been redirected to e.g. a pipe, flush stdout so + * that commands can act on our output immediately. + */ + (void) fflush(stdout); +} + +/* + * Searches the `fields' array for a field matching `fieldname'. Returns + * a pointer to that field on success, or NULL on failure. 
+ */ +static ipmpstat_field_t * +field_find(ipmpstat_field_t *fields, const char *fieldname) +{ + ipmpstat_field_t *fieldp; + + for (fieldp = fields; fieldp->f_name != NULL; fieldp++) { + if (strcasecmp(fieldp->f_name, fieldname) == 0) + return (fieldp); + } + return (NULL); +} + +/* + * Uses `enums' to map `enumval' to a string, and stores at most `bufsize' + * bytes of that string into `buf'. + */ +static void +enum2str(const ipmpstat_enum_t *enums, int enumval, char *buf, uint_t bufsize) +{ + const ipmpstat_enum_t *enump; + + for (enump = enums; enump->e_name != NULL; enump++) { + if (enump->e_val == enumval) { + (void) strlcpy(buf, enump->e_name, bufsize); + return; + } + } + (void) snprintf(buf, bufsize, "<%d>", enumval); +} + +/* + * Stores the stringified value of the sockaddr_storage pointed to by `ssp' + * into at most `bufsize' bytes of `buf'. + */ +static void +sockaddr2str(const struct sockaddr_storage *ssp, char *buf, uint_t bufsize) +{ + int flags = NI_NOFQDN; + socklen_t socklen; + struct sockaddr *sp = (struct sockaddr *)ssp; + + /* + * Sadly, getnameinfo() does not allow the socklen to be oversized for + * a given family -- so we must determine the exact size to pass to it. 
+ */ + switch (ssp->ss_family) { + case AF_INET: + socklen = sizeof (struct sockaddr_in); + break; + case AF_INET6: + socklen = sizeof (struct sockaddr_in6); + break; + default: + (void) strlcpy(buf, "?", bufsize); + return; + } + + if (opt & IPMPSTAT_OPT_NUMERIC) + flags |= NI_NUMERICHOST; + + (void) getnameinfo(sp, socklen, buf, bufsize, NULL, 0, flags); +} + +static void +sighandler(int sig) +{ + assert(sig == SIGWINCH); + + if (ioctl(1, TIOCGWINSZ, &winsize) == -1 || + winsize.ws_col == 0 || winsize.ws_row == 0) { + winsize.ws_col = 80; + winsize.ws_row = 24; + } +} + +static void +usage(void) +{ + const char *argstr = gettext("[-n] [-o <field> [-P]] -a|-g|-i|-p|-t"); + + (void) fprintf(stderr, gettext("usage: %s %s\n"), progname, argstr); + exit(EXIT_FAILURE); +} + +/* PRINTFLIKE1 */ +static void +warn(const char *format, ...) +{ + va_list alist; + int error = errno; + + format = gettext(format); + (void) fprintf(stderr, gettext("%s: warning: "), progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + if (strchr(format, '\n') == NULL) + (void) fprintf(stderr, ": %s\n", strerror(error)); +} + +/* PRINTFLIKE2 */ +static void +warn_ipmperr(int ipmperr, const char *format, ...) +{ + va_list alist; + + format = gettext(format); + (void) fprintf(stderr, gettext("%s: warning: "), progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + (void) fprintf(stderr, ": %s\n", ipmp_errmsg(ipmperr)); +} + +/* PRINTFLIKE1 */ +static void +die(const char *format, ...) +{ + va_list alist; + int error = errno; + + format = gettext(format); + (void) fprintf(stderr, "%s: ", progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + if (strchr(format, '\n') == NULL) + (void) fprintf(stderr, ": %s\n", strerror(error)); + + exit(EXIT_FAILURE); +} + +/* PRINTFLIKE2 */ +static void +die_ipmperr(int ipmperr, const char *format, ...) 
+{ + va_list alist; + + format = gettext(format); + (void) fprintf(stderr, "%s: ", progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + (void) fprintf(stderr, ": %s\n", ipmp_errmsg(ipmperr)); + + exit(EXIT_FAILURE); +} + +static ipmpstat_field_t addr_fields[] = { + { "ADDRESS", 26, sfunc_addr_address }, + { "STATE", 7, sfunc_addr_state }, + { "GROUP", 12, sfunc_addr_group }, + { "INBOUND", 12, sfunc_addr_inbound }, + { "OUTBOUND", 23, sfunc_addr_outbound }, + { NULL, 0, NULL } +}; + +static ipmpstat_field_t group_fields[] = { + { "GROUP", 12, sfunc_group_ifname }, + { "GROUPNAME", 12, sfunc_group_name }, + { "STATE", 10, sfunc_group_state }, + { "FDT", 10, sfunc_group_fdt }, + { "INTERFACES", 30, sfunc_group_interfaces }, + { NULL, 0, NULL } +}; + +static ipmpstat_field_t if_fields[] = { + { "INTERFACE", 12, sfunc_if_name }, + { "ACTIVE", 8, sfunc_if_active }, + { "GROUP", 12, sfunc_if_group }, + { "FLAGS", 10, sfunc_if_flags }, + { "LINK", 10, sfunc_if_link }, + { "PROBE", 10, sfunc_if_probe }, + { "STATE", 10, sfunc_if_state }, + { NULL, 0, NULL } +}; + +static ipmpstat_field_t probe_fields[] = { + { "TIME", 10, sfunc_probe_time }, + { "INTERFACE", 12, sfunc_probe_ifname }, + { "PROBE", 7, sfunc_probe_id }, + { "NETRTT", 10, sfunc_probe_netrtt }, + { "RTT", 10, sfunc_probe_rtt }, + { "RTTAVG", 10, sfunc_probe_rttavg }, + { "TARGET", 20, sfunc_probe_target }, + { "RTTDEV", 10, sfunc_probe_rttdev }, + { NULL, 0, NULL } +}; + +static ipmpstat_field_t targ_fields[] = { + { "INTERFACE", 12, sfunc_targ_ifname }, + { "MODE", 10, sfunc_targ_mode }, + { "TESTADDR", 20, sfunc_targ_testaddr }, + { "TARGETS", 38, sfunc_targ_targets }, + { NULL, 0, NULL } +}; + +static ipmpstat_enum_t addr_state[] = { + { "up", IPMP_ADDR_UP }, + { "down", IPMP_ADDR_DOWN }, + { NULL, 0 } +}; + +static ipmpstat_enum_t group_state[] = { + { "ok", IPMP_GROUP_OK }, + { "failed", IPMP_GROUP_FAILED }, + { "degraded", IPMP_GROUP_DEGRADED }, + { NULL, 0 
} +}; + +static ipmpstat_enum_t if_link[] = { + { "up", IPMP_LINK_UP }, + { "down", IPMP_LINK_DOWN }, + { "unknown", IPMP_LINK_UNKNOWN }, + { NULL, 0 } +}; + +static ipmpstat_enum_t if_probe[] = { + { "ok", IPMP_PROBE_OK }, + { "failed", IPMP_PROBE_FAILED }, + { "unknown", IPMP_PROBE_UNKNOWN }, + { "disabled", IPMP_PROBE_DISABLED }, + { NULL, 0 } +}; + +static ipmpstat_enum_t if_state[] = { + { "ok", IPMP_IF_OK }, + { "failed", IPMP_IF_FAILED }, + { "unknown", IPMP_IF_UNKNOWN }, + { "offline", IPMP_IF_OFFLINE }, + { NULL, 0 } +}; + +static ipmpstat_enum_t targ_mode[] = { + { "disabled", IPMP_TARG_DISABLED }, + { "routes", IPMP_TARG_ROUTES }, + { "multicast", IPMP_TARG_MULTICAST }, + { NULL, 0 } +}; diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.xcl b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.xcl new file mode 100644 index 0000000000..e2398aaf64 --- /dev/null +++ b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.xcl @@ -0,0 +1,106 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +msgid " " +msgid "%-*s" +msgid "%.2fms" +msgid "%.2fs" +msgid "%d-%s" +msgid "%s" +msgid "%s " +msgid "%s: " +msgid "%u" +msgid "(" +msgid ")" +msgid "," +msgid "--" +msgid ": %s\n" +msgid "?" +msgid "[" +msgid "]" +msgid "<%d>" +msgid "\n" +msgid "ACTIVE" +msgid "ADDRESS" +msgid "EC_ipmp" +msgid "ESC_ipmp_probe_state" +msgid "FDT" +msgid "FLAGS" +msgid "GROUP" +msgid "GROUPNAME" +msgid "INBOUND" +msgid "INTERFACE" +msgid "INTERFACES" +msgid "IPMP_IF_NAME" +msgid "IPMP_PROBE_ACKPROC_TIME" +msgid "IPMP_PROBE_ACKRECV_TIME" +msgid "IPMP_PROBE_ID" +msgid "IPMP_PROBE_SENT_TIME" +msgid "IPMP_PROBE_START_TIME" +msgid "IPMP_PROBE_STATE" +msgid "IPMP_PROBE_TARGET" +msgid "IPMP_PROBE_TARGET_RTTAVG" +msgid "IPMP_PROBE_TARGET_RTTDEV" +msgid "LINK" +msgid "MODE" +msgid "NETRTT" +msgid "OUTBOUND" +msgid "PROBE" +msgid "RTT" +msgid "RTTAVG" +msgid "RTTDEV" +msgid "STATE" +msgid "TARGET" +msgid "TARGETS" +msgid "TESTADDR" +msgid "TIME" +msgid "agipt" +msgid "all" +msgid "bufsize > IPMPSTAT_NUM_FLAGS" +msgid "com.sun:ipmp:events" +msgid "degraded" +msgid "disabled" +msgid "down" +msgid "failed" +msgid "ipmp_event_version" +msgid "ipmp_if_name" +msgid "ipmp_probe_ackproc_time" +msgid "ipmp_probe_ackrecv_time" +msgid "ipmp_probe_id" +msgid "ipmp_probe_sent_time" +msgid "ipmp_probe_start_time" +msgid "ipmp_probe_state" +msgid "ipmp_probe_target" +msgid "ipmp_probe_target_rttavg" +msgid "ipmp_probe_target_rttdev" +msgid "ipmpstat.c" +msgid "multicast" +msgid "nLPo:agipt" +msgid "no" +msgid "offline" +msgid "ok" +msgid "routes" +msgid "sig == SIGWINCH" +msgid "unknown" +msgid "up" +msgid "yes" diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types b/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types index bb15199492..e42bc626d8 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types +++ b/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types @@ -1,13 +1,12 @@ # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. # Use is subject to license terms. # # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -23,15 +22,12 @@ # CDDL HEADER END # -#pragma ident "%Z%%M% %I% %E% SMI" - fmt_version 1.0 mod_version 1.0 #PERM_CLASS default filter name string -filter if_groupname string filter user user filter projid int32 filter if_name ifname diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c index 17891ffc78..2a4ff60d57 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c @@ -18,7 +18,7 @@ * * CDDL HEADER END * - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,8 +37,6 @@ * contributors. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdio.h> #include <strings.h> #include <errno.h> @@ -243,7 +241,7 @@ main(int argc, char *argv[]) ushort_t udp_src_port6; /* used to identify replies */ uint_t flowinfo = 0; uint_t class = 0; - char tmp_buf[INET6_ADDRSTRLEN]; + char abuf[INET6_ADDRSTRLEN]; int c; int i; boolean_t has_sys_ip_config; @@ -671,24 +669,18 @@ main(int argc, char *argv[]) Printf("PING %s: %d data bytes\n", targethost, datalen); } else { if (ai_dst->ai_family == AF_INET) { - Printf("PING %s (%s): %d data bytes\n", - targethost, - inet_ntop(AF_INET, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - &((struct sockaddr_in *) - ai_dst->ai_addr)->sin_addr, - tmp_buf, sizeof (tmp_buf)), - datalen); + (void) inet_ntop(AF_INET, + &((struct sockaddr_in *)(void *) + ai_dst->ai_addr)->sin_addr, + abuf, sizeof (abuf)); } else { - Printf("PING %s (%s): %d data bytes\n", - targethost, - inet_ntop(AF_INET6, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - &((struct sockaddr_in6 *) - ai_dst->ai_addr)->sin6_addr, - tmp_buf, sizeof (tmp_buf)), - datalen); + (void) inet_ntop(AF_INET6, + &((struct sockaddr_in6 *)(void *) + ai_dst->ai_addr)->sin6_addr, + abuf, sizeof (abuf)); } + Printf("PING %s (%s): %d data bytes\n", + targethost, abuf, datalen); } } @@ -1074,12 +1066,12 @@ select_all_src_addrs(union any_in_addr **src_addr_list, struct addrinfo *ai, int num_dst = 1; int i; - if (probe_all) - for (aip = ai; aip->ai_next != NULL; - aip = aip->ai_next, num_dst++); + if (probe_all) { + for (aip = ai; aip->ai_next != NULL; aip = aip->ai_next) + num_dst++; + } - list = (union any_in_addr *) - calloc((size_t)num_dst, sizeof (union any_in_addr)); + list = calloc((size_t)num_dst, sizeof (union any_in_addr)); if (list == NULL) { Fprintf(stderr, "%s: calloc: %s\n", progname, strerror(errno)); exit(EXIT_FAILURE); @@ -1472,7 +1464,7 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, int i; /* pull out the interface list */ - num_ifs = ifaddrlist(&al, family, errbuf); 
+ num_ifs = ifaddrlist(&al, family, LIFC_UNDER_IPMP, errbuf); if (num_ifs == -1) { Fprintf(stderr, "%s: %s\n", progname, errbuf); exit(EXIT_FAILURE); @@ -1699,8 +1691,8 @@ send_scheduled_probe() } else { Printf("no answer from %s(%s)\n", targethost, inet_ntop(current_targetaddr->family, - &current_targetaddr->dst_addr, - tmp_buf, sizeof (tmp_buf))); + &current_targetaddr->dst_addr, + tmp_buf, sizeof (tmp_buf))); } } /* @@ -1736,9 +1728,8 @@ send_scheduled_probe() * Each time we move to a new targetaddr, which has * a different target IP address, we update this field. */ - current_targetaddr->starting_seq_num = - use_udp ? dest_port : - (ntransmitted % (MAX_ICMP_SEQ + 1)); + current_targetaddr->starting_seq_num = use_udp ? + dest_port : (ntransmitted % (MAX_ICMP_SEQ + 1)); } } diff --git a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c index f062247997..e5b23fa126 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -139,7 +139,7 @@ check_device(dlpi_handle_t *dhp, char **devicep) if (ioctl(s, SIOCGIFFLAGS, (char *)ifr) < 0) pr_err("ioctl SIOCGIFFLAGS"); if ((ifr->ifr_flags & - (IFF_VIRTUAL|IFF_LOOPBACK|IFF_UP| + (IFF_VIRTUAL|IFF_IPMP|IFF_UP| IFF_RUNNING)) == (IFF_UP|IFF_RUNNING)) break; } diff --git a/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c b/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c index adc6a932b0..cae75df60d 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms.
*/ @@ -27,8 +27,6 @@ * @(#)$Header: traceroute.c,v 1.49 97/06/13 02:30:23 leres Exp $ (LBL) */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/param.h> #include <sys/file.h> #include <sys/ioctl.h> @@ -707,7 +705,7 @@ get_hostinfo(char *host, int family, struct addrinfo **aipp) struct addrinfo hints, *ai; struct in6_addr addr6; struct in_addr addr; - char temp_buf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ + char abuf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ int rc; /* @@ -720,11 +718,10 @@ get_hostinfo(char *host, int family, struct addrinfo **aipp) IN6_V4MAPPED_TO_INADDR(&addr6, &addr); /* convert it back to a string */ - (void) inet_ntop(AF_INET, (void *)&addr, temp_buf, - sizeof (temp_buf)); + (void) inet_ntop(AF_INET, &addr, abuf, sizeof (abuf)); /* now the host is an IPv4 address */ - (void) strcpy(host, temp_buf); + (void) strcpy(host, abuf); /* * If it's a mapped address, we convert it into IPv4 @@ -826,15 +823,19 @@ set_src_addr(struct pr_set *pr, struct ifaddrlist **alp) struct sockaddr_in6 *sin6_from = (struct sockaddr_in6 *)pr->from; struct addrinfo *aip; char errbuf[ERRBUFSIZE]; - char temp_buf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ + char abuf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ int num_ifs; /* all the interfaces */ int num_src_ifs; /* exclude loopback and down */ int i; + uint_t ifaddrflags = 0; source = source_input; + if (device != NULL) + ifaddrflags |= LIFC_UNDER_IPMP; + /* get the interface address list */ - num_ifs = ifaddrlist(&al, pr->family, errbuf); + num_ifs = ifaddrlist(&al, pr->family, ifaddrflags, errbuf); if (num_ifs < 0) { Fprintf(stderr, "%s: ifaddrlist: %s\n", prog, errbuf); exit(EXIT_FAILURE); @@ -881,26 +882,20 @@ set_src_addr(struct pr_set *pr, struct ifaddrlist **alp) if (pr->family == AF_INET) ap = (union any_in_addr *) /* LINTED E_BAD_PTR_CAST_ALIGN */ - &((struct sockaddr_in *) - aip->ai_addr)->sin_addr; + &((struct sockaddr_in *)aip->ai_addr)->sin_addr; else ap = (union any_in_addr *) /* LINTED 
E_BAD_PTR_CAST_ALIGN */ - &((struct sockaddr_in6 *) - aip->ai_addr)->sin6_addr; + &((struct sockaddr_in6 *)aip->ai_addr)->sin6_addr; /* * LBNL bug fixed: used to accept any src address */ tmp2_al = find_ifaddr(al, num_ifs, ap, pr->family); - if (tmp2_al == NULL) { - Fprintf(stderr, - "%s: %s is not a local %s address\n", - prog, inet_ntop(pr->family, ap, - temp_buf, sizeof (temp_buf)), - pr->name); - + (void) inet_ntop(pr->family, ap, abuf, sizeof (abuf)); + Fprintf(stderr, "%s: %s is not a local %s address\n", + prog, abuf, pr->name); free(al); freeaddrinfo(aip); return (0); @@ -928,13 +923,11 @@ set_src_addr(struct pr_set *pr, struct ifaddrlist **alp) set_sin(pr->from, ap, pr->family); if (aip->ai_next != NULL) { - Fprintf(stderr, - "%s: Warning: %s has multiple " - "addresses; using %s\n", - prog, source, - inet_ntop(pr->family, - (const void *)pr->from_sin_addr, - temp_buf, sizeof (temp_buf))); + (void) inet_ntop(pr->family, pr->from_sin_addr, + abuf, sizeof (abuf)); + Fprintf(stderr, "%s: Warning: %s has multiple " + "addresses; using %s\n", prog, source, + abuf); } } else { /* -i and -s used */ /* @@ -1484,7 +1477,7 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr, uchar_t code; /* icmp code */ int reply; int seq = 0; - char temp_buf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ + char abuf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ int longjmp_return; /* return value from longjump */ struct ip *ip = (struct ip *)packet; boolean_t got_there = _B_FALSE; /* we hit the destination */ @@ -1535,13 +1528,11 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr, if (dev_name == NULL) dev_name = "?"; + (void) inet_ntop(pr->family, pr->from_sin_addr, abuf, + sizeof (abuf)); Fprintf(stderr, "%s: Warning: Multiple interfaces found;" - " using %s @ %s\n", - prog, inet_ntop(pr->family, - (const void *)pr->from_sin_addr, - temp_buf, sizeof (temp_buf)), - dev_name); + " using %s @ %s\n", prog, abuf, dev_name); } } 
@@ -1558,8 +1549,7 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr, Fprintf(stderr, "%s to %s", prog, hostname); } else { Fprintf(stderr, "%s to %s (%s)", prog, hostname, - inet_ntop(pr->family, (const void *)ip_addr, temp_buf, - sizeof (temp_buf))); + inet_ntop(pr->family, ip_addr, abuf, sizeof (abuf))); } if (source) @@ -1700,9 +1690,8 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr, } if (pr->family == AF_INET6) { - intp = - (int *)find_ancillary_data(&in_msg, - IPPROTO_IPV6, IPV6_HOPLIMIT); + intp = find_ancillary_data(&in_msg, + IPPROTO_IPV6, IPV6_HOPLIMIT); if (intp == NULL) { Fprintf(stderr, "%s: can't find " @@ -2188,10 +2177,11 @@ static void usage(void) { Fprintf(stderr, "Usage: %s [-adFIlnSvx] [-A address_family] " -"[-c traffic_class] \n" -"\t[-f first_hop] [-g gateway [-g gateway ...]| -r] [-i iface]\n" -"\t[-L flow_label] [-m max_hop] [-P pause_sec] [-p port] [-Q max_timeout]\n" -"\t[-q nqueries] [-s src_addr] [-t tos] [-w wait_time] host [packetlen]\n", - prog); + "[-c traffic_class]\n" + "\t[-f first_hop] [-g gateway [-g gateway ...]| -r] [-i iface]\n" + "\t[-L flow_label] [-m max_hop] [-P pause_sec] [-p port] " + "[-Q max_timeout]\n" + "\t[-q nqueries] [-s src_addr] [-t tos] [-w wait_time] host " + "[packetlen]\n", prog); exit(EXIT_FAILURE); } diff --git a/usr/src/cmd/devfsadm/misc_link.c b/usr/src/cmd/devfsadm/misc_link.c index c72be6be37..44756c3e98 100644 --- a/usr/src/cmd/devfsadm/misc_link.c +++ b/usr/src/cmd/devfsadm/misc_link.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -104,7 +104,7 @@ static devfsadm_create_t misc_cbt[] = { "(^ip$)|(^tcp$)|(^udp$)|(^icmp$)|(^sctp$)|" "(^ip6$)|(^tcp6$)|(^udp6$)|(^icmp6$)|(^sctp6$)|" "(^rts$)|(^arp$)|(^ipsecah$)|(^ipsecesp$)|(^keysock$)|(^spdsock$)|" - "(^nca$)|(^rds$)|(^sdp$)|(^ipnet$)", + "(^nca$)|(^rds$)|(^sdp$)|(^ipnet$)|(^dlpistub$)", TYPE_EXACT | DRV_RE, ILEVEL_1, minor_name }, { "pseudo", "ddi_pseudo", diff --git a/usr/src/cmd/mdb/common/modules/ip/ip.c b/usr/src/cmd/mdb/common/modules/ip/ip.c index f2dadd5261..f064b58d83 100644 --- a/usr/src/cmd/mdb/common/modules/ip/ip.c +++ b/usr/src/cmd/mdb/common/modules/ip/ip.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stropts.h> #include <sys/stream.h> @@ -524,8 +522,7 @@ ire_format(uintptr_t addr, const void *ire_arg, void *ire_cb_arg) static const mdb_bitmask_t mmasks[] = { { "CONDEMNED", IRE_MARK_CONDEMNED, IRE_MARK_CONDEMNED }, - { "NORECV", IRE_MARK_NORECV, IRE_MARK_NORECV }, - { "HIDDEN", IRE_MARK_HIDDEN, IRE_MARK_HIDDEN }, + { "TESTHIDDEN", IRE_MARK_TESTHIDDEN, IRE_MARK_TESTHIDDEN }, { "NOADD", IRE_MARK_NOADD, IRE_MARK_NOADD }, { "TEMPORARY", IRE_MARK_TEMPORARY, IRE_MARK_TEMPORARY }, { "USESRC", IRE_MARK_USESRC_CHECK, IRE_MARK_USESRC_CHECK }, diff --git a/usr/src/cmd/rcm_daemon/Makefile.com b/usr/src/cmd/rcm_daemon/Makefile.com index 365371c45c..dbe3c1f1d1 100644 --- a/usr/src/cmd/rcm_daemon/Makefile.com +++ b/usr/src/cmd/rcm_daemon/Makefile.com @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# @@ -124,7 +124,7 @@ SUNW_network_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_vlan_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_vnic_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_aggr_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm -SUNW_ip_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil -ldladm +SUNW_ip_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil -ldladm -lipmp SUNW_ip_anon_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil LDLIBS += -lgen -lelf -lrcm -lnvpair -ldevinfo -lnsl -lsocket diff --git a/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c b/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c index be9a31f952..6e1fe1bf39 100644 --- a/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c +++ b/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * RCM module to prevent plumbed IP addresses from being removed. 
*/ @@ -177,7 +175,7 @@ ip_anon_register(rcm_handle_t *hdl) if (_cladm(CL_INITIALIZE, CL_GET_BOOTFLAG, &bootflags) != 0) { rcm_log_message(RCM_ERROR, - gettext("unable to check cluster status\n")); + gettext("unable to check cluster status\n")); (void) mutex_unlock(&ip_list_lock); return (RCM_FAILURE); } @@ -199,7 +197,7 @@ ip_anon_register(rcm_handle_t *hdl) else { if ((exclude_addrs.cladm_netaddrs_array = malloc(sizeof (cladm_netaddr_entry_t) * - (num_exclude_addrs))) == NULL) { + (num_exclude_addrs))) == NULL) { rcm_log_message(RCM_ERROR, gettext("out of memory\n")); (void) mutex_unlock(&ip_list_lock); @@ -274,7 +272,7 @@ ip_anon_register(rcm_handle_t *hdl) rcm_log_message(RCM_DEBUG, "ip_anon: obtaining list of IPv4 addresses.\n"); - num_ifs = ifaddrlist(&al, AF_INET, errbuf); + num_ifs = ifaddrlist(&al, AF_INET, LIFC_UNDER_IPMP, errbuf); if (num_ifs == -1) { rcm_log_message(RCM_ERROR, gettext("cannot get IPv4 address list errno=%d (%s)\n"), @@ -286,7 +284,7 @@ ip_anon_register(rcm_handle_t *hdl) rcm_log_message(RCM_DEBUG, "ip_anon: obtaining list of IPv6 addresses.\n"); - num_ifs6 = ifaddrlist(&al6, AF_INET6, errbuf); + num_ifs6 = ifaddrlist(&al6, AF_INET6, LIFC_UNDER_IPMP, errbuf); if (num_ifs6 == -1) { rcm_log_message(RCM_ERROR, gettext("cannot get IPv6 address list errno=%d (%s)\n"), @@ -392,7 +390,7 @@ ip_anon_register(rcm_handle_t *hdl) * currently know about it. */ if (!(tentry->flags & IP_FLAG_CL) && - !(tentry->flags & IP_FLAG_REG)) { + !(tentry->flags & IP_FLAG_REG)) { tentry->flags |= IP_FLAG_REG; rcm_log_message(RCM_DEBUG, "ip_anon: registering interest in %s\n", diff --git a/usr/src/cmd/rcm_daemon/common/ip_rcm.c b/usr/src/cmd/rcm_daemon/common/ip_rcm.c index f62b3dfc19..24be0cafeb 100644 --- a/usr/src/cmd/rcm_daemon/common/ip_rcm.c +++ b/usr/src/cmd/rcm_daemon/common/ip_rcm.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ @@ -38,23 +38,22 @@ #include <errno.h> #include <fcntl.h> #include <sys/types.h> +#include <sys/wait.h> #include <sys/stat.h> #include <sys/socket.h> #include <sys/sockio.h> #include <net/if.h> #include <netinet/in.h> -#include <netinet/tcp.h> #include <arpa/inet.h> #include <stropts.h> #include <strings.h> -#include <libdevinfo.h> -#include <sys/systeminfo.h> -#include <netdb.h> +#include <sys/sysmacros.h> #include <inet/ip.h> #include <libinetutil.h> #include <libdllink.h> +#include <libgen.h> +#include <ipmp_admin.h> -#include <ipmp_mpathd.h> #include "rcm_module.h" /* @@ -75,42 +74,19 @@ #define RCM_LINK_RESOURCE_MAX (13 + LINKID_STR_WIDTH) #define RCM_STR_SUNW_IP "SUNW_ip/" /* IP address export prefix */ -#define RCM_SIZE_SUNW_IP 9 /* strlen("SUNW_ip/") + 1 */ -/* ifconfig(1M) */ -#define USR_SBIN_IFCONFIG "/usr/sbin/ifconfig" /* ifconfig command */ -#define CFGFILE_FMT_IPV4 "/etc/hostname." /* IPV4 config file */ -#define CFGFILE_FMT_IPV6 "/etc/hostname6." /* IPV6 config file */ +#define SBIN_IFCONFIG "/sbin/ifconfig" /* ifconfig command */ +#define SBIN_IFPARSE "/sbin/ifparse" /* ifparse command */ +#define DHCPFILE_FMT "/etc/dhcp.%s" /* DHCP config file */ +#define CFGFILE_FMT_IPV4 "/etc/hostname.%s" /* IPV4 config file */ +#define CFGFILE_FMT_IPV6 "/etc/hostname6.%s" /* IPV6 config file */ #define CFG_CMDS_STD " netmask + broadcast + up" /* Normal config string */ -#define CONFIG_AF_INET 0x1 /* Post-configure IPv4 */ -#define CONFIG_AF_INET6 0x2 /* Post-configure IPv6 */ -#define MAXLINE 1024 /* Max. line length */ -#define MAXARGS 512 /* Max. 
args in ifconfig cmd */ - -/* Physical interface flags mask */ -#define RCM_PIF_FLAGS (IFF_OFFLINE | IFF_INACTIVE | IFF_FAILED | \ - IFF_STANDBY) +#define CFG_DHCP_CMD "dhcp wait 0" /* command to start DHCP */ /* Some useful macros */ -#ifndef MAX -#define MAX(a, b) (((a) > (b))?(a):(b)) -#endif /* MAX */ - -#ifndef ISSPACE #define ISSPACE(c) ((c) == ' ' || (c) == '\t') -#endif - -#ifndef ISEOL #define ISEOL(c) ((c) == '\n' || (c) == '\r' || (c) == '\0') -#endif - -#ifndef STREQ #define STREQ(a, b) (*(a) == *(b) && strcmp((a), (b)) == 0) -#endif - -#ifndef ADDSPACE -#define ADDSPACE(a) ((void) strcat((a), " ")) -#endif /* Interface Cache state flags */ #define CACHE_IF_STALE 0x1 /* stale cached data */ @@ -125,48 +101,20 @@ /* RCM IPMP Module specific property definitions */ #define RCM_IPMP_MIN_REDUNDANCY 1 /* default min. redundancy */ -/* in.mpathd(1M) specifics */ -#define MPATHD_MAX_RETRIES 5 /* Max. offline retries */ - /* Stream module operations */ #define MOD_INSERT 0 /* Insert a mid-stream module */ #define MOD_REMOVE 1 /* Remove a mid-stream module */ #define MOD_CHECK 2 /* Check mid-stream module safety */ /* - * in.mpathd(1M) message passing formats - */ -typedef struct mpathd_cmd { - uint32_t cmd_command; /* message command */ - char cmd_ifname[LIFNAMSIZ]; /* this interface name */ - char cmd_movetoif[LIFNAMSIZ]; /* move to interface */ - uint32_t cmd_min_red; /* min. 
redundancy */ -/* Message passing values for MI_SETOINDEX */ -#define from_lifname cmd_ifname /* current logical interface */ -#define to_pifname cmd_movetoif /* new physical interface */ -#define addr_family cmd_min_red /* address family */ -} mpathd_cmd_t; - -/* This is needed since mpathd checks message size for offline */ -typedef struct mpathd_unoffline { - uint32_t cmd_command; /* offline / undo offline */ - char cmd_ifname[LIFNAMSIZ]; /* this interface name */ -} mpathd_unoffline_t; - -typedef struct mpathd_response { - uint32_t resp_sys_errno; /* system errno */ - uint32_t resp_mpathd_err; /* mpathd error information */ -} mpathd_response_t; - -/* * IP module data types */ /* Physical interface representation */ typedef struct ip_pif { - char pi_ifname[LIFNAMSIZ+1]; /* interface name */ - char pi_grpname[LIFNAMSIZ+1]; /* IPMP group name */ - struct ip_lif *pi_lifs; /* ptr to logical interfaces */ + char pi_ifname[LIFNAMSIZ]; /* interface name */ + char pi_grname[LIFGRNAMSIZ]; /* IPMP group name */ + struct ip_lif *pi_lifs; /* ptr to logical interfaces */ } ip_pif_t; /* Logical interface representation */ @@ -239,7 +187,7 @@ static void free_node(ip_cache_t *); static void cache_insert(ip_cache_t *); static char *ip_usage(ip_cache_t *); static int update_pif(rcm_handle_t *, int, int, struct lifreq *); -static int ip_ipmp_offline(ip_cache_t *, ip_cache_t *); +static int ip_ipmp_offline(ip_cache_t *); static int ip_ipmp_undo_offline(ip_cache_t *); static int if_cfginfo(ip_cache_t *, uint_t); static int if_unplumb(ip_cache_t *); @@ -247,9 +195,6 @@ static int if_replumb(ip_cache_t *); static void ip_log_err(ip_cache_t *, char **, char *); static char *get_link_resource(const char *); static void clr_cfg_state(ip_pif_t *); -static uint64_t if_get_flags(ip_pif_t *); -static int mpathd_send_cmd(mpathd_cmd_t *); -static int connect_to_mpathd(int); static int modop(char *, char *, int, char); static int get_modlist(char *, ip_lif_t *); static int ip_domux2fd(int *, 
int *, int *, struct lifreq *); @@ -262,15 +207,13 @@ static char **ip_get_addrlist(ip_cache_t *); static void ip_free_addrlist(char **); static void ip_consumer_notify(rcm_handle_t *, datalink_id_t, char **, uint_t, rcm_info_t **); +static boolean_t ip_addrstr(ip_lif_t *, char *, size_t); static int if_configure(datalink_id_t); -static int isgrouped(char *); -static int if_ipmp_config(char *, int, int); -static int if_mpathd_configure(char *, char *, int, int); -static char *get_mpathd_dest(char *, int); -static int if_getcount(int); -static void tokenize(char *, char **, char *, int *); - +static boolean_t isgrouped(const char *); +static int if_config_inst(const char *, FILE *, int, boolean_t); +static uint_t ntok(const char *cp); +static boolean_t ifconfig(const char *, const char *, const char *, boolean_t); /* Module-Private data */ static struct rcm_mod_ops ip_ops = @@ -429,9 +372,9 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, { ip_cache_t *node; ip_pif_t *pif; - int detachable = 0; - int nofailover = 0; - int ipmp = 0; + boolean_t detachable = B_FALSE; + boolean_t ipmp; + int retval; rcm_log_message(RCM_TRACE1, "IP: offline(%s)\n", rsrc); @@ -455,25 +398,17 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, pif = node->ip_pif; /* Establish default detachability criteria */ - if (flags & RCM_FORCE) { - detachable++; - } + if (flags & RCM_FORCE) + detachable = B_TRUE; - /* Check if the interface is an IPMP grouped interface */ - if (strcmp(pif->pi_grpname, "")) { - ipmp++; - } - - if (if_get_flags(pif) & IFF_NOFAILOVER) { - nofailover++; - } + /* Check if the interface is under IPMP */ + ipmp = (pif->pi_grname[0] != '\0'); /* - * Even if the interface is not in an IPMP group, it's possible that - * it's still okay to offline it as long as there are higher-level - * failover mechanisms for the addresses it owns (e.g., clustering). - * In this case, ip_offlinelist() will return RCM_SUCCESS, and we - * charge on. 
+ * Even if the interface is not under IPMP, it's possible that it's + * still okay to offline it as long as there are higher-level failover + * mechanisms for the addresses it owns (e.g., clustering). In this + * case, ip_offlinelist() will return RCM_SUCCESS, and we charge on. */ if (!ipmp && !detachable) { /* Inform consumers of IP addresses being offlined */ @@ -489,17 +424,6 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, } } - /* - * Cannot remove an IPMP interface if IFF_NOFAILOVER is set. - */ - if (ipmp && nofailover) { - /* Interface is part of an IPMP group, and cannot failover */ - ip_log_err(node, errorp, "Failover disabled"); - errno = EBUSY; - (void) mutex_unlock(&cache_lock); - return (RCM_FAILURE); - } - /* Check if it's a query */ if (flags & RCM_QUERY) { rcm_log_message(RCM_TRACE1, "IP: offline query success(%s)\n", @@ -534,38 +458,32 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, } /* - * This an IPMP interface that can be failed over. - * Request in.mpathd(1M) to failover the physical interface. + * This is an IPMP interface that can be offlined. + * Request in.mpathd(1M) to offline the physical interface. */ + if ((retval = ip_ipmp_offline(node)) != IPMP_SUCCESS) + ip_log_err(node, errorp, "in.mpathd offline failed"); - /* Failover to "any", let mpathd determine best failover candidate */ - if (ip_ipmp_offline(node, NULL) < 0) { - ip_log_err(node, errorp, "in.mpathd failover failed"); + if (retval == IPMP_EMINRED && !detachable) { /* - * Odds are that in.mpathd(1M) could not offline the device - * because it was the last interface in the group. However, - * it's possible that it's still okay to offline it as long as - * there are higher-level failover mechanisms for the - * addresses it owns (e.g., clustering). In this case, - * ip_offlinelist() will return RCM_SUCCESS, and we charge on. 
- * - * TODO: change ip_ipmp_offline() to return the actual failure - * from in.mpathd so that we can verify that it did indeed - * fail with IPMP_EMINRED. + * in.mpathd(1M) could not offline the device because it was + * the last interface in the group. However, it's possible + * that it's still okay to offline it as long as there are + * higher-level failover mechanisms for the addresses it owns + * (e.g., clustering). In this case, ip_offlinelist() will + * return RCM_SUCCESS, and we charge on. */ - if (!detachable) { - /* Inform consumers of IP addresses being offlined */ - if (ip_offlinelist(hd, node, errorp, flags, - depend_info) == RCM_SUCCESS) { - rcm_log_message(RCM_DEBUG, - "IP: consumers agree on detach"); - } else { - ip_log_err(node, errorp, - "Device consumers prohibit offline"); - (void) mutex_unlock(&cache_lock); - errno = EBUSY; - return (RCM_FAILURE); - } + /* Inform consumers of IP addresses being offlined */ + if (ip_offlinelist(hd, node, errorp, flags, + depend_info) == RCM_SUCCESS) { + rcm_log_message(RCM_DEBUG, + "IP: consumers agree on detach"); + } else { + ip_log_err(node, errorp, + "Device consumers prohibit offline"); + (void) mutex_unlock(&cache_lock); + errno = EBUSY; + return (RCM_FAILURE); } } @@ -574,8 +492,8 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, _("IP: Unplumb failed (%s)\n"), pif->pi_ifname); - /* Request mpathd to undo the offline */ - if (ip_ipmp_undo_offline(node) < 0) { + /* Request in.mpathd to undo the offline */ + if (ip_ipmp_undo_offline(node) != IPMP_SUCCESS) { ip_log_err(node, errorp, "Undo offline failed"); (void) mutex_unlock(&cache_lock); return (RCM_FAILURE); @@ -862,18 +780,16 @@ static char * ip_usage(ip_cache_t *node) { ip_lif_t *lif; - int numifs; - char *buf; - char *linkidstr; + uint_t numup; + char *sep, *buf, *linkidstr; datalink_id_t linkid; - const char *fmt; - char *sep; + const char *msg; char link[MAXLINKNAMELEN]; char addrstr[INET6_ADDRSTRLEN]; char errmsg[DLADM_STRSIZE]; 
dladm_status_t status; - int offline = 0; - size_t bufsz; + boolean_t offline, ipmp; + size_t bufsz = 0; rcm_log_message(RCM_TRACE2, "IP: usage(%s)\n", node->ip_resource); @@ -904,76 +820,53 @@ ip_usage(ip_cache_t *node) /* TRANSLATION_NOTE: separator used between IP addresses */ sep = _(", "); - numifs = 0; - for (lif = node->ip_pif->pi_lifs; lif != NULL; lif = lif->li_next) { - if (lif->li_ifflags & IFF_UP) { - numifs++; - } - } + numup = 0; + for (lif = node->ip_pif->pi_lifs; lif != NULL; lif = lif->li_next) + if (lif->li_ifflags & IFF_UP) + numup++; - if (node->ip_cachestate & CACHE_IF_OFFLINED) { - offline++; - } + ipmp = (node->ip_pif->pi_grname[0] != '\0'); + offline = ((node->ip_cachestate & CACHE_IF_OFFLINED) != 0); - if (!offline && numifs) { - fmt = _("%1$s hosts IP addresses: "); - } else if (offline) { - fmt = _("%1$s offlined"); + if (offline) { + msg = _("offlined"); + } else if (numup == 0) { + msg = _("plumbed but down"); } else { - fmt = _("%1$s plumbed but down"); + if (ipmp) { + msg = _("providing connectivity for IPMP group "); + bufsz += LIFGRNAMSIZ; + } else { + msg = _("hosts IP addresses: "); + bufsz += (numup * (INET6_ADDRSTRLEN + strlen(sep))); + } } - /* space for addresses and separators, plus message */ - bufsz = ((numifs * (INET6_ADDRSTRLEN + strlen(sep))) + - strlen(fmt) + strlen(link) + 1); + bufsz += strlen(link) + strlen(msg) + 1; if ((buf = malloc(bufsz)) == NULL) { rcm_log_message(RCM_ERROR, _("IP: usage(%s) malloc failure(%s)\n"), node->ip_resource, strerror(errno)); return (NULL); } - bzero(buf, bufsz); - (void) sprintf(buf, fmt, link); - - if (offline || (numifs == 0)) { /* Nothing else to do */ - rcm_log_message(RCM_TRACE2, "IP: usage (%s) info = %s\n", - node->ip_resource, buf); - - return (buf); - } - - for (lif = node->ip_pif->pi_lifs; lif != NULL; lif = lif->li_next) { + (void) snprintf(buf, bufsz, "%s: %s", link, msg); - void *addr; - int af; - - if (!(lif->li_ifflags & IFF_UP)) { - /* ignore interfaces not up */ - 
continue; - } - af = lif->li_addr.family; - if (af == AF_INET6) { - addr = &lif->li_addr.ip6.sin6_addr; - } else if (af == AF_INET) { - addr = &lif->li_addr.ip4.sin_addr; + if (!offline && numup > 0) { + if (ipmp) { + (void) strlcat(buf, node->ip_pif->pi_grname, bufsz); } else { - rcm_log_message(RCM_DEBUG, - "IP: unknown addr family %d, assuming AF_INET\n", - af); - af = AF_INET; - addr = &lif->li_addr.ip4.sin_addr; - } - if (inet_ntop(af, addr, addrstr, INET6_ADDRSTRLEN) == NULL) { - rcm_log_message(RCM_ERROR, - _("IP: inet_ntop: %s\n"), strerror(errno)); - continue; - } - rcm_log_message(RCM_DEBUG, "IP addr := %s\n", addrstr); + lif = node->ip_pif->pi_lifs; + for (; lif != NULL; lif = lif->li_next) { + if (!(lif->li_ifflags & IFF_UP)) + continue; + + if (!ip_addrstr(lif, addrstr, sizeof (addrstr))) + continue; - (void) strcat(buf, addrstr); - numifs--; - if (numifs > 0) { - (void) strcat(buf, ", "); + (void) strlcat(buf, addrstr, bufsz); + if (--numup > 0) + (void) strlcat(buf, sep, bufsz); + } } } @@ -983,6 +876,32 @@ ip_usage(ip_cache_t *node) return (buf); } +static boolean_t +ip_addrstr(ip_lif_t *lif, char *addrstr, size_t addrsize) +{ + int af = lif->li_addr.family; + void *addr; + + if (af == AF_INET6) { + addr = &lif->li_addr.ip6.sin6_addr; + } else if (af == AF_INET) { + addr = &lif->li_addr.ip4.sin_addr; + } else { + rcm_log_message(RCM_DEBUG, + "IP: unknown addr family %d, assuming AF_INET\n", af); + af = AF_INET; + addr = &lif->li_addr.ip4.sin_addr; + } + if (inet_ntop(af, addr, addrstr, addrsize) == NULL) { + rcm_log_message(RCM_ERROR, + _("IP: inet_ntop: %s\n"), strerror(errno)); + return (B_FALSE); + } + + rcm_log_message(RCM_DEBUG, "IP addr := %s\n", addrstr); + return (B_TRUE); +} + /* * Cache management routines, all cache management functions should be * be called with cache_lock held. 
@@ -1121,11 +1040,13 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr) ifnumber = ifspec.ifsp_lun; /* Get the interface flags */ - (void) strcpy(lifreq.lifr_name, lifr->lifr_name); + (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ); if (ioctl(sock, SIOCGLIFFLAGS, (char *)&lifreq) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFFLAGS(%s): %s\n"), - pif.pi_ifname, strerror(errno)); + if (errno != ENXIO) { + rcm_log_message(RCM_ERROR, + _("IP: SIOCGLIFFLAGS(%s): %s\n"), + lifreq.lifr_name, strerror(errno)); + } return (-1); } (void) memcpy(&ifflags, &lifreq.lifr_flags, sizeof (ifflags)); @@ -1135,12 +1056,13 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr) * - IFF_VIRTUAL: e.g., loopback and vni * - IFF_POINTOPOINT: e.g., sppp and ip.tun * - !IFF_MULTICAST: e.g., ip.6to4tun + * - IFF_IPMP: IPMP meta-interfaces * * Note: The !IFF_MULTICAST check can be removed once iptun is * implemented as a datalink. */ if (!(ifflags & IFF_MULTICAST) || - (ifflags & (IFF_POINTOPOINT | IFF_VIRTUAL))) { + (ifflags & (IFF_POINTOPOINT | IFF_VIRTUAL | IFF_IPMP))) { rcm_log_message(RCM_TRACE3, "IP: if ignored (%s)\n", pif.pi_ifname); return (0); @@ -1148,23 +1070,26 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr) /* Get the interface group name for this interface */ if (ioctl(sock, SIOCGLIFGROUPNAME, (char *)&lifreq) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFGROUPNAME(%s): %s\n"), - lifreq.lifr_name, strerror(errno)); + if (errno != ENXIO) { + rcm_log_message(RCM_ERROR, + _("IP: SIOCGLIFGROUPNAME(%s): %s\n"), + lifreq.lifr_name, strerror(errno)); + } return (-1); } /* copy the group name */ - (void) memcpy(&pif.pi_grpname, &lifreq.lifr_groupname, - sizeof (pif.pi_grpname)); - pif.pi_grpname[sizeof (pif.pi_grpname) - 1] = '\0'; + (void) strlcpy(pif.pi_grname, lifreq.lifr_groupname, + sizeof (pif.pi_grname)); /* Get the interface address for this interface */ if (ioctl(sock, SIOCGLIFADDR, (char 
*)&lifreq) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFADDR(%s): %s\n"), - lifreq.lifr_name, strerror(errno)); - return (-1); + if (errno != ENXIO) { + rcm_log_message(RCM_ERROR, + _("IP: SIOCGLIFADDR(%s): %s\n"), + lifreq.lifr_name, strerror(errno)); + return (-1); + } } (void) memcpy(&ifaddr, &lifreq.lifr_addr, sizeof (ifaddr)); @@ -1241,9 +1166,9 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr) sizeof (pif.pi_ifname)); } - /* save pif properties */ - (void) memcpy(&probepif->pi_grpname, &pif.pi_grpname, - sizeof (pif.pi_grpname)); + /* save the group name */ + (void) strlcpy(probepif->pi_grname, pif.pi_grname, + sizeof (pif.pi_grname)); /* add lif, if this is a lif and it is not in cache */ if (!lif_listed) { @@ -1304,7 +1229,7 @@ update_ipifs(rcm_handle_t *hd, int af) } lifn.lifn_family = af; - lifn.lifn_flags = 0; + lifn.lifn_flags = LIFC_UNDER_IPMP; if (ioctl(sock, SIOCGLIFNUM, (char *)&lifn) < 0) { rcm_log_message(RCM_ERROR, _("IP: SIOCLGIFNUM failed: %s\n"), @@ -1321,7 +1246,7 @@ update_ipifs(rcm_handle_t *hd, int af) } lifc.lifc_family = af; - lifc.lifc_flags = 0; + lifc.lifc_flags = LIFC_UNDER_IPMP; lifc.lifc_len = sizeof (struct lifreq) * lifn.lifn_count; lifc.lifc_buf = buf; @@ -1480,39 +1405,33 @@ static void ip_log_err(ip_cache_t *node, char **errorp, char *errmsg) { char *ifname = NULL; - int len; + int size; const char *errfmt; - char *error; + char *error = NULL; if ((node != NULL) && (node->ip_pif != NULL) && (node->ip_pif->pi_ifname != NULL)) { ifname = node->ip_pif->pi_ifname; } - if (errorp != NULL) - *errorp = NULL; - if (ifname == NULL) { rcm_log_message(RCM_ERROR, _("IP: %s\n"), errmsg); errfmt = _("IP: %s"); - len = strlen(errfmt) + strlen(errmsg) + 1; - if (error = (char *)calloc(1, len)) { - (void) sprintf(error, errfmt, errmsg); - } + size = strlen(errfmt) + strlen(errmsg) + 1; + if (errorp != NULL && (error = malloc(size)) != NULL) + (void) snprintf(error, size, errfmt, errmsg); } else { 
rcm_log_message(RCM_ERROR, _("IP: %s(%s)\n"), errmsg, ifname); errfmt = _("IP: %s(%s)"); - len = strlen(errfmt) + strlen(errmsg) + strlen(ifname) + 1; - if (error = (char *)calloc(1, len)) { - (void) sprintf(error, errfmt, errmsg, ifname); - } + size = strlen(errfmt) + strlen(errmsg) + strlen(ifname) + 1; + if (errorp != NULL && (error = malloc(size)) != NULL) + (void) snprintf(error, size, errfmt, errmsg, ifname); } if (errorp != NULL) *errorp = error; } - /* * if_cfginfo() - Save off the config info for all interfaces */ @@ -1538,7 +1457,7 @@ if_cfginfo(ip_cache_t *node, uint_t force) rcm_log_message(RCM_ERROR, _("IP: get modlist error (%s) %s\n"), pif->pi_ifname, strerror(errno)); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } @@ -1551,7 +1470,7 @@ if_cfginfo(ip_cache_t *node, uint_t force) rcm_log_message(RCM_ERROR, _("IP: module %s@%d\n"), lif->li_modules[i], i); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } } @@ -1595,11 +1514,11 @@ if_cfginfo(ip_cache_t *node, uint_t force) /* Save reconfiguration information */ if (lif->li_ifflags & IFF_IPV4) { (void) snprintf(syscmd, sizeof (syscmd), - "%s %s:%d configinfo\n", USR_SBIN_IFCONFIG, + "%s %s:%d configinfo\n", SBIN_IFCONFIG, pif->pi_ifname, lif->li_ifnum); } else if (lif->li_ifflags & IFF_IPV6) { (void) snprintf(syscmd, sizeof (syscmd), - "%s %s:%d inet6 configinfo\n", USR_SBIN_IFCONFIG, + "%s %s:%d inet6 configinfo\n", SBIN_IFCONFIG, pif->pi_ifname, lif->li_ifnum); } rcm_log_message(RCM_TRACE2, "IP: %s\n", syscmd); @@ -1609,7 +1528,7 @@ if_cfginfo(ip_cache_t *node, uint_t force) rcm_log_message(RCM_ERROR, _("IP: ifconfig configinfo error (%s:%d) %s\n"), pif->pi_ifname, lif->li_ifnum, strerror(errno)); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } bzero(buf, MAX_RECONFIG_SIZE); @@ -1619,20 +1538,18 @@ if_cfginfo(ip_cache_t *node, uint_t force) _("IP: ifconfig configinfo error (%s:%d) %s\n"), pif->pi_ifname, lif->li_ifnum, strerror(errno)); (void) 
pclose(fp); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } (void) pclose(fp); - lif->li_reconfig = malloc(strlen(buf)+1); - if (lif->li_reconfig == NULL) { + if ((lif->li_reconfig = strdup(buf)) == NULL) { rcm_log_message(RCM_ERROR, _("IP: malloc error (%s) %s\n"), pif->pi_ifname, strerror(errno)); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } - (void) strcpy(lif->li_reconfig, buf); rcm_log_message(RCM_DEBUG, "IP: if_cfginfo: reconfig string(%s:%d) = %s\n", pif->pi_ifname, lif->li_ifnum, lif->li_reconfig); @@ -1654,57 +1571,37 @@ static int if_unplumb(ip_cache_t *node) { ip_lif_t *lif; - ip_pif_t *pif; - int ipv4 = 0, ipv6 = 0; - char syscmd[MAX_RECONFIG_SIZE + LIFNAMSIZ]; + ip_pif_t *pif = node->ip_pif; + boolean_t ipv4 = B_FALSE; + boolean_t ipv6 = B_FALSE; rcm_log_message(RCM_TRACE2, "IP: if_unplumb(%s)\n", node->ip_resource); - pif = node->ip_pif; - lif = pif->pi_lifs; - - while (lif != NULL) { + for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) { if (lif->li_ifflags & IFF_IPV4) { - ipv4++; + ipv4 = B_TRUE; } else if (lif->li_ifflags & IFF_IPV6) { - ipv6++; + ipv6 = B_TRUE; } else { /* Unlikely case */ rcm_log_message(RCM_DEBUG, "IP: Unplumb ignored (%s:%d)\n", pif->pi_ifname, lif->li_ifnum); - lif = lif->li_next; - continue; } - lif = lif->li_next; } - /* Unplumb the physical interface */ - if (ipv4) { - rcm_log_message(RCM_TRACE2, - "IP: if_unplumb: ifconfig %s unplumb\n", pif->pi_ifname); - (void) snprintf(syscmd, sizeof (syscmd), "%s %s unplumb\n", - USR_SBIN_IFCONFIG, pif->pi_ifname); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot unplumb (%s) %s\n"), - pif->pi_ifname, strerror(errno)); - return (-1); - } + if (ipv4 && !ifconfig(pif->pi_ifname, "inet", "unplumb", B_FALSE)) { + rcm_log_message(RCM_ERROR, _("IP: Cannot unplumb (%s) %s\n"), + pif->pi_ifname, strerror(errno)); + return (-1); } - if (ipv6) { - rcm_log_message(RCM_TRACE2, - "IP: if_unplumb: ifconfig %s inet6 
unplumb\n", - pif->pi_ifname); - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s inet6 unplumb\n", USR_SBIN_IFCONFIG, pif->pi_ifname); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot unplumb (%s) %s\n"), - pif->pi_ifname, strerror(errno)); - return (-1); - } + + if (ipv6 && !ifconfig(pif->pi_ifname, "inet6", "unplumb", B_FALSE)) { + rcm_log_message(RCM_ERROR, _("IP: Cannot unplumb (%s) %s\n"), + pif->pi_ifname, strerror(errno)); + return (-1); } + rcm_log_message(RCM_TRACE2, "IP: if_unplumb(%s) success\n", node->ip_resource); @@ -1723,8 +1620,11 @@ if_replumb(ip_cache_t *node) ip_lif_t *lif; ip_pif_t *pif; int i; - char syscmd[LIFNAMSIZ+MAXPATHLEN]; /* must be big enough */ - int max_ipv4 = 0, max_ipv6 = 0; + boolean_t success, ipmp; + const char *fstr; + char lifname[LIFNAMSIZ]; + char buf[MAX_RECONFIG_SIZE]; + int max_lifnum = 0; rcm_log_message(RCM_TRACE2, "IP: if_replumb(%s)\n", node->ip_resource); @@ -1738,100 +1638,103 @@ if_replumb(ip_cache_t *node) */ pif = node->ip_pif; - lif = pif->pi_lifs; + ipmp = (node->ip_pif->pi_grname[0] != '\0'); /* * Make a first pass to plumb in physical interfaces and get a count * of the max logical interfaces */ - while (lif != NULL) { + for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) { + max_lifnum = MAX(lif->li_ifnum, max_lifnum); if (lif->li_ifflags & IFF_IPV4) { - if (lif->li_ifnum > max_ipv4) { - max_ipv4 = lif->li_ifnum; - } + fstr = "inet"; } else if (lif->li_ifflags & IFF_IPV6) { - if (lif->li_ifnum > max_ipv6) { - max_ipv6 = lif->li_ifnum; - } + fstr = "inet6"; } else { /* Unlikely case */ rcm_log_message(RCM_DEBUG, "IP: Re-plumb ignored (%s:%d)\n", pif->pi_ifname, lif->li_ifnum); - lif = lif->li_next; continue; } - if (lif->li_ifnum == 0) { /* physical interface instance */ - if ((lif->li_ifflags & IFF_NOFAILOVER) || - (strcmp(pif->pi_grpname, "") == 0)) { - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s\n", USR_SBIN_IFCONFIG, - lif->li_reconfig); - } else if 
(lif->li_ifflags & IFF_IPV4) { - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s inet plumb group %s\n", - USR_SBIN_IFCONFIG, - pif->pi_ifname, pif->pi_grpname); - } else if (lif->li_ifflags & IFF_IPV6) { - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s inet6 plumb group %s\n", - USR_SBIN_IFCONFIG, - pif->pi_ifname, pif->pi_grpname); - } + /* ignore logical interface instances */ + if (lif->li_ifnum != 0) + continue; + + if ((lif->li_ifflags & IFF_NOFAILOVER) || !ipmp) { + success = ifconfig("", "", lif->li_reconfig, B_FALSE); + } else { + (void) snprintf(buf, sizeof (buf), "plumb group %s", + pif->pi_grname); + success = ifconfig(pif->pi_ifname, fstr, buf, B_FALSE); + } + + if (!success) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot plumb (%s) %s\n"), pif->pi_ifname, + strerror(errno)); + return (-1); + } + + /* + * Restart DHCP if necessary. + */ + if ((lif->li_ifflags & IFF_DHCPRUNNING) && + !ifconfig(pif->pi_ifname, fstr, CFG_DHCP_CMD, B_FALSE)) { + rcm_log_message(RCM_ERROR, _("IP: Cannot start DHCP " + "(%s) %s\n"), pif->pi_ifname, strerror(errno)); + return (-1); + } + rcm_log_message(RCM_TRACE2, + "IP: if_replumb: Modcnt = %d\n", lif->li_modcnt); + /* modinsert modules in order, ignore driver(last) */ + for (i = 0; i < (lif->li_modcnt - 1); i++) { rcm_log_message(RCM_TRACE2, - "IP: if_replumb: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { + "IP: modinsert: Pos = %d Mod = %s\n", + i, lif->li_modules[i]); + if (modop(pif->pi_ifname, lif->li_modules[i], i, + MOD_INSERT) == -1) { rcm_log_message(RCM_ERROR, - _("IP: Cannot plumb (%s) %s\n"), - pif->pi_ifname, strerror(errno)); + _("IP: modinsert error(%s)\n"), + pif->pi_ifname); return (-1); } - - rcm_log_message(RCM_TRACE2, - "IP: if_replumb: Modcnt = %d\n", lif->li_modcnt); - /* modinsert modules in order, ignore driver(last) */ - for (i = 0; i < (lif->li_modcnt - 1); i++) { - rcm_log_message(RCM_TRACE2, - "IP: modinsert: Pos = %d Mod = %s\n", - i, lif->li_modules[i]); - if (modop(pif->pi_ifname, 
lif->li_modules[i], i, - MOD_INSERT) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: modinsert error(%s)\n"), - pif->pi_ifname); - return (-1); - } - } } - - lif = lif->li_next; } /* Now, add all the logical interfaces in the correct order */ - for (i = 1; i <= MAX(max_ipv6, max_ipv4); i++) { + for (i = 1; i <= max_lifnum; i++) { + (void) snprintf(lifname, LIFNAMSIZ, "%s:%d", pif->pi_ifname, i); + /* reset lif through every iteration */ - lif = pif->pi_lifs; - while (lif != NULL) { - if (((lif->li_ifflags & IFF_NOFAILOVER) || - (strcmp(pif->pi_grpname, "") == 0)) && - (lif->li_ifnum == i)) { - /* Plumb in the logical interface */ - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s\n", USR_SBIN_IFCONFIG, - lif->li_reconfig); - rcm_log_message(RCM_TRACE2, - "IP: if_replumb: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot addif (%s:%d) " - "%s\n"), - pif->pi_ifname, i, strerror(errno)); - return (-1); - } + for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) { + /* + * Process entries in order. If the interface is + * using IPMP, only process test addresses. + */ + if (lif->li_ifnum != i || + (ipmp && !(lif->li_ifflags & IFF_NOFAILOVER))) + continue; + + if (!ifconfig("", "", lif->li_reconfig, B_FALSE)) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot addif (%s) %s\n"), lifname, + strerror(errno)); + return (-1); + } + + /* + * Restart DHCP if necessary. + */ + if ((lif->li_ifflags & IFF_DHCPRUNNING) && + !ifconfig(lifname, fstr, CFG_DHCP_CMD, B_FALSE)) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot start DHCP (%s) %s\n"), + lifname, strerror(errno)); + return (-1); } - lif = lif->li_next; } } @@ -1865,71 +1768,64 @@ clr_cfg_state(ip_pif_t *pif) } /* - * ip_ipmp_offline() - Failover from if_from to if_to using a - * minimum redudancy of min_red. This uses IPMPs - * "offline" mechanism to achieve the failover. + * Attempt to offline ip_cache_t `node'; returns an IPMP error code. 
*/ static int -ip_ipmp_offline(ip_cache_t *if_from, ip_cache_t *if_to) +ip_ipmp_offline(ip_cache_t *node) { - mpathd_cmd_t mpdcmd; - - if ((if_from == NULL) || (if_from->ip_pif == NULL) || - (if_from->ip_pif->pi_ifname == NULL)) { - return (-1); - } + int retval; + ipmp_handle_t handle; rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_offline\n"); - mpdcmd.cmd_command = MI_OFFLINE; - (void) strcpy(mpdcmd.cmd_ifname, if_from->ip_pif->pi_ifname); - - if ((if_to != NULL) && (if_to->ip_pif != NULL) && - (if_to->ip_pif->pi_ifname != NULL)) { - rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_offline (%s)->(%s)\n", - if_from->ip_pif->pi_ifname, if_to->ip_pif->pi_ifname); - (void) strncpy(mpdcmd.cmd_movetoif, if_to->ip_pif->pi_ifname, - sizeof (mpdcmd.cmd_movetoif)); - mpdcmd.cmd_movetoif[sizeof (mpdcmd.cmd_movetoif) - 1] = '\0'; - } else { - rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_offline (%s)->(any)\n", - if_from->ip_pif->pi_ifname); - (void) strcpy(mpdcmd.cmd_movetoif, ""); /* signifies any */ + if ((retval = ipmp_open(&handle)) != IPMP_SUCCESS) { + rcm_log_message(RCM_ERROR, + _("IP: cannot create ipmp handle: %s\n"), + ipmp_errmsg(retval)); + return (retval); } - mpdcmd.cmd_min_red = if_from->ip_ifred; - if (mpathd_send_cmd(&mpdcmd) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd offline error: %s\n"), - strerror(errno)); - return (-1); + retval = ipmp_offline(handle, node->ip_pif->pi_ifname, node->ip_ifred); + if (retval != IPMP_SUCCESS) { + rcm_log_message(RCM_ERROR, _("IP: ipmp_offline error: %s\n"), + ipmp_errmsg(retval)); + } else { + rcm_log_message(RCM_TRACE1, "IP: ipmp_offline success\n"); } - rcm_log_message(RCM_TRACE1, "IP: ipmp offline success\n"); - return (0); + ipmp_close(handle); + return (retval); } /* - * ip_ipmp_undo_offline() - Undo prior offline of the interface. - * This uses IPMPs "undo offline" feature. + * Attempt to undo the offline ip_cache_t `node'; returns an IPMP error code. 
*/ static int ip_ipmp_undo_offline(ip_cache_t *node) { - mpathd_cmd_t mpdcmd; + int retval; + ipmp_handle_t handle; - mpdcmd.cmd_command = MI_UNDO_OFFLINE; - (void) strcpy(mpdcmd.cmd_ifname, node->ip_pif->pi_ifname); + rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_undo_offline\n"); - if (mpathd_send_cmd(&mpdcmd) < 0) { + if ((retval = ipmp_open(&handle)) != IPMP_SUCCESS) { rcm_log_message(RCM_ERROR, - _("IP: mpathd error: %s\n"), - strerror(errno)); - return (-1); + _("IP: cannot create ipmp handle: %s\n"), + ipmp_errmsg(retval)); + return (retval); } - rcm_log_message(RCM_TRACE1, "IP: ipmp undo offline success\n"); - return (0); + retval = ipmp_undo_offline(handle, node->ip_pif->pi_ifname); + if (retval != IPMP_SUCCESS) { + rcm_log_message(RCM_ERROR, + _("IP: ipmp_undo_offline error: %s\n"), + ipmp_errmsg(retval)); + } else { + rcm_log_message(RCM_TRACE1, "IP: ipmp_undo_offline success\n"); + } + + ipmp_close(handle); + return (retval); } /* @@ -1946,10 +1842,9 @@ get_link_resource(const char *link) char *resource; dladm_status_t status; - if ((status = dladm_name2info(dld_handle, link, &linkid, &flags, NULL, - NULL)) != DLADM_STATUS_OK) { + status = dladm_name2info(dld_handle, link, &linkid, &flags, NULL, NULL); + if (status != DLADM_STATUS_OK) goto fail; - } if (!(flags & DLADM_OPT_ACTIVE)) { status = DLADM_STATUS_FAILED; @@ -1976,243 +1871,6 @@ fail: } /* - * if_get_flags() - Return the cached physical interface flags - * Call with cache_lock held - */ -static uint64_t -if_get_flags(ip_pif_t *pif) -{ - ip_lif_t *lif; - - for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) { - if (lif->li_ifnum == 0) { - return (lif->li_ifflags & RCM_PIF_FLAGS); - } - } - return (0); -} - -/* - * mpathd_send_cmd() - Sends the command to in.mpathd. 
- */ -static int -mpathd_send_cmd(mpathd_cmd_t *mpd) -{ - mpathd_unoffline_t mpc; - struct mpathd_response mpr; - int i; - int s; - - rcm_log_message(RCM_TRACE1, "IP: mpathd_send_cmd \n"); - - for (i = 0; i < MPATHD_MAX_RETRIES; i++) { - s = connect_to_mpathd(AF_INET); - if (s == -1) { - s = connect_to_mpathd(AF_INET6); - if (s == -1) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot talk to mpathd\n")); - return (-1); - } - } - switch (mpd->cmd_command) { - case MI_OFFLINE : - rcm_log_message(RCM_TRACE1, "IP: MI_OFFLINE: " - "(%s)->(%s) redundancy = %d\n", mpd->cmd_ifname, - mpd->cmd_movetoif, mpd->cmd_min_red); - - if (write(s, mpd, sizeof (mpathd_cmd_t)) != - sizeof (mpathd_cmd_t)) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd write: %s\n"), - strerror(errno)); - (void) close(s); - return (-1); - } - break; - - case MI_SETOINDEX : - rcm_log_message(RCM_TRACE1, "IP: MI_SETOINDEX: " - "(%s)->(%s) family = %d\n", mpd->from_lifname, - mpd->to_pifname, mpd->addr_family); - - if (write(s, mpd, sizeof (mpathd_cmd_t)) != - sizeof (mpathd_cmd_t)) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd write: %s\n"), - strerror(errno)); - (void) close(s); - return (-1); - } - break; - - case MI_UNDO_OFFLINE: - /* mpathd checks for exact size of the message */ - mpc.cmd_command = mpd->cmd_command; - (void) strcpy(mpc.cmd_ifname, mpd->cmd_ifname); - - rcm_log_message(RCM_TRACE1, "IP: MI_UNDO_OFFLINE: " - "(%s)\n", mpd->cmd_ifname); - - if (write(s, &mpc, sizeof (mpathd_unoffline_t)) != - sizeof (mpathd_unoffline_t)) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd write: %s\n"), - strerror(errno)); - (void) close(s); - return (-1); - } - break; - default : - rcm_log_message(RCM_ERROR, - _("IP: unsupported mpathd command\n")); - (void) close(s); - return (-1); - } - - bzero(&mpr, sizeof (struct mpathd_response)); - /* Read the result from mpathd */ - if (read(s, &mpr, sizeof (struct mpathd_response)) != - sizeof (struct mpathd_response)) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd 
read : %s\n"), strerror(errno)); - (void) close(s); - return (-1); - } - - (void) close(s); - if (mpr.resp_mpathd_err == 0) { - rcm_log_message(RCM_TRACE1, - "IP: mpathd_send_cmd success\n"); - return (0); /* Successful */ - } - - if (mpr.resp_mpathd_err == MPATHD_SYS_ERROR) { - if (mpr.resp_sys_errno == EAGAIN) { - (void) sleep(1); - rcm_log_message(RCM_DEBUG, - "IP: mpathd retrying\n"); - continue; /* Retry */ - } - errno = mpr.resp_sys_errno; - rcm_log_message(RCM_WARNING, - _("IP: mpathd_send_cmd error: %s\n"), - strerror(errno)); - } else if (mpr.resp_mpathd_err == MPATHD_MIN_RED_ERROR) { - errno = EIO; - rcm_log_message(RCM_ERROR, _("IP: in.mpathd(1M): " - "Minimum redundancy not met\n")); - } else { - rcm_log_message(RCM_ERROR, - _("IP: mpathd_send_cmd error\n")); - } - /* retry */ - } - - rcm_log_message(RCM_ERROR, - _("IP: mpathd_send_cmd failed %d retries\n"), MPATHD_MAX_RETRIES); - return (-1); -} - -/* - * Returns -1 on failure. Returns the socket file descriptor on - * success. - */ -static int -connect_to_mpathd(int family) -{ - int s; - struct sockaddr_storage ss; - struct sockaddr_in *sin = (struct sockaddr_in *)&ss; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss; - struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; - int addrlen; - int ret; - int on; - - rcm_log_message(RCM_TRACE1, "IP: connect_to_mpathd\n"); - - s = socket(family, SOCK_STREAM, 0); - if (s < 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd socket: %s\n"), strerror(errno)); - return (-1); - } - bzero((char *)&ss, sizeof (ss)); - ss.ss_family = family; - /* - * Need to bind to a privelged port. For non-root, this - * will fail. in.mpathd verifies that only commands coming - * from priveleged ports succeed so that the ordinary user - * can't issue offline commands. 
- */ - on = 1; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd setsockopt: TCP_ANONPRIVBIND: %s\n"), - strerror(errno)); - return (-1); - } - switch (family) { - case AF_INET: - sin->sin_port = 0; - sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - addrlen = sizeof (struct sockaddr_in); - break; - case AF_INET6: - sin6->sin6_port = 0; - sin6->sin6_addr = loopback_addr; - addrlen = sizeof (struct sockaddr_in6); - break; - } - ret = bind(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd bind: %s\n"), strerror(errno)); - return (-1); - } - switch (family) { - case AF_INET: - sin->sin_port = htons(MPATHD_PORT); - break; - case AF_INET6: - sin6->sin6_port = htons(MPATHD_PORT); - break; - } - ret = connect(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - if (errno == ECONNREFUSED) { - /* in.mpathd is not running, start it */ - if (rcm_exec_cmd(MPATHD_PATH) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd exec: %s\n"), - strerror(errno)); - return (-1); - } - ret = connect(s, (struct sockaddr *)&ss, addrlen); - } - if (ret != 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd connect: %s\n"), strerror(errno)); - return (-1); - } - } - on = 0; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd setsockopt TCP_ANONPRIVBIND: %s\n"), - strerror(errno)); - return (-1); - } - - rcm_log_message(RCM_TRACE1, "IP: connect_to_mpathd success\n"); - - return (s); -} - -/* * modop() - Remove/insert a module */ static int @@ -2239,12 +1897,10 @@ modop(char *name, char *arg, int pos, char op) if (op == MOD_REMOVE) { (void) snprintf(syscmd, sizeof (syscmd), - "%s %s modremove %s@%d\n", USR_SBIN_IFCONFIG, name, arg, - pos); + "%s %s modremove %s@%d\n", SBIN_IFCONFIG, name, arg, pos); } else if (op == MOD_INSERT) { (void) snprintf(syscmd, sizeof (syscmd), - "%s %s modinsert %s@%d\n", 
USR_SBIN_IFCONFIG, name, arg, - pos); + "%s %s modinsert %s@%d\n", SBIN_IFCONFIG, name, arg, pos); } else { rcm_log_message(RCM_ERROR, _("IP: modop(%s): unknown operation\n"), name); @@ -2277,11 +1933,11 @@ get_modlist(char *name, ip_lif_t *lif) int i; int num_mods; struct lifreq lifr; - struct str_list strlist; + struct str_list strlist = { 0 }; rcm_log_message(RCM_TRACE1, "IP: getmodlist(%s)\n", name); - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + (void) strlcpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); lifr.lifr_flags = lif->li_ifflags; if (ip_domux2fd(&mux_fd, &muxid_fd, &fd, &lifr) < 0) { rcm_log_message(RCM_ERROR, _("IP: ip_domux2fd(%s)\n"), name); @@ -2292,39 +1948,34 @@ get_modlist(char *name, ip_lif_t *lif) rcm_log_message(RCM_ERROR, _("IP: get_modlist(%s): I_LIST(%s) \n"), name, strerror(errno)); - (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); - return (-1); + goto fail; } strlist.sl_nmods = num_mods; strlist.sl_modlist = malloc(sizeof (struct str_mlist) * num_mods); - if (strlist.sl_modlist == NULL) { rcm_log_message(RCM_ERROR, _("IP: get_modlist(%s): %s\n"), name, strerror(errno)); - (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); - return (-1); + goto fail; } if (ioctl(fd, I_LIST, (caddr_t)&strlist) < 0) { rcm_log_message(RCM_ERROR, _("IP: get_modlist(%s): I_LIST error: %s\n"), name, strerror(errno)); - (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); - return (-1); + goto fail; } for (i = 0; i < strlist.sl_nmods; i++) { - lif->li_modules[i] = - malloc(strlen(strlist.sl_modlist[i].l_name)+1); + lif->li_modules[i] = strdup(strlist.sl_modlist[i].l_name); if (lif->li_modules[i] == NULL) { rcm_log_message(RCM_ERROR, _("IP: get_modlist(%s): %s\n"), name, strerror(errno)); - (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); - return (-1); + while (i > 0) + free(lif->li_modules[--i]); + goto fail; } - (void) strcpy(lif->li_modules[i], strlist.sl_modlist[i].l_name); } lif->li_modcnt = strlist.sl_nmods; @@ -2332,6 +1983,10 @@ get_modlist(char 
*name, ip_lif_t *lif) rcm_log_message(RCM_TRACE1, "IP: getmodlist(%s) success\n", name); return (ip_plink(mux_fd, muxid_fd, fd, &lifr)); +fail: + free(strlist.sl_modlist); + (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); + return (-1); } /* @@ -2436,6 +2091,7 @@ ip_plink(int mux_fd, int muxid_fd, int fd, struct lifreq *lifr) * * Notify online to IP address consumers. */ +/*ARGSUSED*/ static int ip_onlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags, rcm_info_t **depend_info) @@ -2464,6 +2120,7 @@ ip_onlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags, * * Offline IP address consumers. */ +/*ARGSUSED*/ static int ip_offlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags, rcm_info_t **depend_info) @@ -2494,9 +2151,9 @@ ip_offlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags, } /* - * ip_get_addrlist() - Compile list of IP addresses hosted on this NIC (node) - * This routine malloc() required memeory for the list - * Returns list on success, NULL if failed + * ip_get_addrlist() - Get the list of IP addresses on this interface (node); + * This routine malloc()s required memory for the list. + * Returns the list on success, NULL on failure. * Call with cache_lock held. 
*/ static char ** @@ -2504,11 +2161,9 @@ ip_get_addrlist(ip_cache_t *node) { ip_lif_t *lif; char **addrlist = NULL; - int numifs; + int i, numifs; + size_t addrlistsize; char addrstr[INET6_ADDRSTRLEN]; - void *addr; - int af; - int i; rcm_log_message(RCM_TRACE2, "IP: ip_get_addrlist(%s)\n", node->ip_resource); @@ -2532,35 +2187,21 @@ ip_get_addrlist(ip_cache_t *node) for (lif = node->ip_pif->pi_lifs, i = 0; lif != NULL; lif = lif->li_next, i++) { - af = lif->li_addr.family; - if (af == AF_INET6) { - addr = &lif->li_addr.ip6.sin6_addr; - } else if (af == AF_INET) { - addr = &lif->li_addr.ip4.sin_addr; - } else { - rcm_log_message(RCM_DEBUG, - "IP: unknown addr family %d, assuming AF_INET\n", - af); - af = AF_INET; - addr = &lif->li_addr.ip4.sin_addr; - } - if (inet_ntop(af, addr, addrstr, INET6_ADDRSTRLEN) == NULL) { - rcm_log_message(RCM_ERROR, - _("IP: inet_ntop: %s\n"), strerror(errno)); + if (!ip_addrstr(lif, addrstr, sizeof (addrstr))) { ip_free_addrlist(addrlist); return (NULL); } - if ((addrlist[i] = malloc(strlen(addrstr) + RCM_SIZE_SUNW_IP)) - == NULL) { + addrlistsize = strlen(addrstr) + sizeof (RCM_STR_SUNW_IP); + if ((addrlist[i] = malloc(addrlistsize)) == NULL) { rcm_log_message(RCM_ERROR, _("IP: ip_get_addrlist(%s) malloc failure(%s)\n"), node->ip_resource, strerror(errno)); ip_free_addrlist(addrlist); return (NULL); } - (void) strcpy(addrlist[i], RCM_STR_SUNW_IP); /* SUNW_ip/ */ - (void) strcat(addrlist[i], addrstr); /* SUNW_ip/<address> */ + (void) snprintf(addrlist[i], addrlistsize, "%s%s", + RCM_STR_SUNW_IP, addrstr); rcm_log_message(RCM_DEBUG, "Anon Address: %s\n", addrlist[i]); } @@ -2611,16 +2252,13 @@ ip_consumer_notify(rcm_handle_t *hd, datalink_id_t linkid, char **errorp, return; } /* - * Inform anonymous consumers about IP addresses being - * onlined + * Inform anonymous consumers about IP addresses being onlined. 
*/ (void) ip_onlinelist(hd, node, errorp, flags, depend_info); (void) mutex_unlock(&cache_lock); rcm_log_message(RCM_TRACE2, "IP: ip_consumer_notify success\n"); - return; - } /* @@ -2632,20 +2270,18 @@ if_configure(datalink_id_t linkid) char ifinst[MAXLINKNAMELEN]; char cfgfile[MAXPATHLEN]; char cached_name[RCM_LINK_RESOURCE_MAX]; - struct stat statbuf; + FILE *hostfp, *host6fp; ip_cache_t *node; - int af = 0; - int ipmp = 0; + boolean_t ipmp = B_FALSE; assert(linkid != DATALINK_INVALID_LINKID); - rcm_log_message(RCM_TRACE1, _("IP: if_configure(%u)\n"), linkid); /* Check for the interface in the cache */ (void) snprintf(cached_name, sizeof (cached_name), "%s/%u", RCM_LINK_PREFIX, linkid); - /* Check if the interface is new or was previously offlined */ + /* Check if the interface is new or was not previously offlined */ (void) mutex_lock(&cache_lock); if (((node = cache_lookup(NULL, cached_name, CACHE_REFRESH)) != NULL) && (!(node->ip_cachestate & CACHE_IF_OFFLINED))) { @@ -2663,76 +2299,69 @@ if_configure(datalink_id_t linkid) return (-1); } - /* Scan IPv4 configuration first */ - (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV4, ifinst); - cfgfile[MAXPATHLEN - 1] = '\0'; - + /* + * Scan the IPv4 and IPv6 hostname files to see if (a) they exist + * and (b) if either one places the interface into an IPMP group. 
+ */ + (void) snprintf(cfgfile, MAXPATHLEN, CFGFILE_FMT_IPV4, ifinst); rcm_log_message(RCM_TRACE1, "IP: Scanning %s\n", cfgfile); - if (stat(cfgfile, &statbuf) == 0) { - af |= CONFIG_AF_INET; - if (isgrouped(cfgfile)) { - ipmp++; - } + if ((hostfp = fopen(cfgfile, "r")) != NULL) { + if (isgrouped(cfgfile)) + ipmp = B_TRUE; } - /* Scan IPv6 configuration details */ - (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV6, ifinst); - cfgfile[MAXPATHLEN - 1] = '\0'; + (void) snprintf(cfgfile, MAXPATHLEN, CFGFILE_FMT_IPV6, ifinst); rcm_log_message(RCM_TRACE1, "IP: Scanning %s\n", cfgfile); - if (stat(cfgfile, &statbuf) == 0) { - af |= CONFIG_AF_INET6; - if ((ipmp == 0) && isgrouped(cfgfile)) { - ipmp++; - } + if ((host6fp = fopen(cfgfile, "r")) != NULL) { + if (!ipmp && isgrouped(cfgfile)) + ipmp = B_TRUE; } - if (af & CONFIG_AF_INET) { - if (if_ipmp_config(ifinst, CONFIG_AF_INET, ipmp) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: IPv4 Post-attach failed (%s)\n"), ifinst); - return (-1); - } + /* + * Configure the interface according to its hostname files. 
+ */ + if (hostfp != NULL && + if_config_inst(ifinst, hostfp, AF_INET, ipmp) == -1) { + rcm_log_message(RCM_ERROR, + _("IP: IPv4 Post-attach failed (%s)\n"), ifinst); + goto fail; } - if (af & CONFIG_AF_INET6) { - if (if_ipmp_config(ifinst, CONFIG_AF_INET6, ipmp) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: IPv6 Post-attach failed(%s)\n"), ifinst); - return (-1); - } + if (host6fp != NULL && + if_config_inst(ifinst, host6fp, AF_INET6, ipmp) == -1) { + rcm_log_message(RCM_ERROR, + _("IP: IPv6 Post-attach failed (%s)\n"), ifinst); + goto fail; } + (void) fclose(hostfp); + (void) fclose(host6fp); rcm_log_message(RCM_TRACE1, "IP: if_configure(%s) success\n", ifinst); - return (0); - +fail: + (void) fclose(hostfp); + (void) fclose(host6fp); + return (-1); } /* - * isgrouped() - Scans the given config file to see if this is a grouped - * interface - * Returns non-zero if true; 0 if false + * isgrouped() - Scans the given config file to see if this interface is + * using IPMP. Returns B_TRUE or B_FALSE. 
*/ -static int -isgrouped(char *cfgfile) +static boolean_t +isgrouped(const char *cfgfile) { FILE *fp; struct stat statb; - char *buf = NULL; - char *tokens[MAXARGS]; /* token pointers */ - char tspace[MAXLINE]; /* token space */ - int ntok; - int group = 0; - - if (cfgfile == NULL) - return (0); + char *nlp, *line, *token, *lasts, *buf; + boolean_t grouped = B_FALSE; rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s)\n", cfgfile); if (stat(cfgfile, &statb) != 0) { rcm_log_message(RCM_TRACE1, _("IP: No config file(%s)\n"), cfgfile); - return (0); + return (B_FALSE); } /* @@ -2744,609 +2373,284 @@ isgrouped(char *cfgfile) if (statb.st_size <= 1) { rcm_log_message(RCM_TRACE1, _("IP: Empty config file(%s)\n"), cfgfile); - return (0); + return (B_FALSE); } if ((fp = fopen(cfgfile, "r")) == NULL) { rcm_log_message(RCM_ERROR, _("IP: Cannot open configuration file(%s): %s\n"), cfgfile, strerror(errno)); - return (0); + return (B_FALSE); } - if ((buf = calloc(1, statb.st_size)) == NULL) { + if ((buf = malloc(statb.st_size)) == NULL) { rcm_log_message(RCM_ERROR, - _("IP: calloc failure(%s): %s\n"), cfgfile, + _("IP: malloc failure(%s): %s\n"), cfgfile, strerror(errno)); - (void) fclose(fp); - return (0); + goto out; } while (fgets(buf, statb.st_size, fp) != NULL) { - if (*buf == '\0') - continue; - - tokenize(buf, tokens, tspace, &ntok); - while (ntok) { - if (STREQ("group", tokens[ntok - 1])) { - if (tokens[ntok] != NULL) { - group++; - } + if ((nlp = strrchr(buf, '\n')) != NULL) + *nlp = '\0'; + + line = buf; + while ((token = strtok_r(line, " \t", &lasts)) != NULL) { + line = NULL; + if (STREQ("group", token) && + strtok_r(NULL, " \t", &lasts) != NULL) { + grouped = B_TRUE; + goto out; } - ntok--; } } - +out: free(buf); - (void) fclose(fp); - if (group <= 0) { - rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s) non-grouped\n", - cfgfile); - return (0); - } else { - rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s) grouped\n", - cfgfile); - return (1); - } -} + 
rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s): %d\n", cfgfile, + grouped); + return (grouped); +} /* - * if_ipmp_config() - Configure an interface instance as specified by the + * if_config_inst() - Configure an interface instance as specified by the * address family af and if it is grouped (ipmp). */ static int -if_ipmp_config(char *ifinst, int af, int ipmp) +if_config_inst(const char *ifinst, FILE *hfp, int af, boolean_t ipmp) { - char cfgfile[MAXPATHLEN]; /* configuration file */ - FILE *fp; + FILE *ifparsefp; struct stat statb; - char *buf; - char *tokens[MAXARGS]; /* list of config attributes */ - char tspace[MAXLINE]; /* token space */ - char syscmd[MAX_RECONFIG_SIZE + MAXPATHLEN + 1]; - char grpcmd[MAX_RECONFIG_SIZE + MAXPATHLEN + 1]; - char fstr[8]; /* address family string inet or inet6 */ - int nofailover = 0; - int newattach = 0; - int cmdvalid = 0; - int ntok; - int n; - int stdif = 0; - - if (ifinst == NULL) - return (0); + char *buf = NULL; + char *ifparsebuf = NULL; + uint_t ifparsebufsize; + const char *fstr; /* address family string */ + boolean_t stdif = B_FALSE; - rcm_log_message(RCM_TRACE1, "IP: if_ipmp_config(%s) ipmp = %d\n", + rcm_log_message(RCM_TRACE1, "IP: if_config_inst(%s) ipmp = %d\n", ifinst, ipmp); - if (af & CONFIG_AF_INET) { - (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV4, - ifinst); - (void) strcpy(fstr, "inet"); - } else if (af & CONFIG_AF_INET6) { - (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV6, - ifinst); - (void) strcpy(fstr, "inet6"); - } else { - return (0); /* nothing to do */ - } - - cfgfile[MAXPATHLEN - 1] = '\0'; - grpcmd[0] = '\0'; - - if (stat(cfgfile, &statb) != 0) { - rcm_log_message(RCM_TRACE1, - "IP: No config file(%s)\n", ifinst); - return (0); + if (fstat(fileno(hfp), &statb) != 0) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot fstat file(%s)\n"), ifinst); + goto fail; } - /* Config file exists, plumb in the physical interface */ - if (af & CONFIG_AF_INET6) { - if 
(if_getcount(AF_INET6) == 0) { - /* - * Configure software loopback driver if this is the - * first IPv6 interface plumbed - */ - newattach++; - (void) snprintf(syscmd, sizeof (syscmd), - "%s lo0 %s plumb ::1 up", USR_SBIN_IFCONFIG, fstr); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot plumb (%s) %s\n"), - ifinst, strerror(errno)); - return (-1); - } - } - (void) snprintf(syscmd, sizeof (syscmd), "%s %s %s plumb up", - USR_SBIN_IFCONFIG, ifinst, fstr); - } else { - (void) snprintf(syscmd, sizeof (syscmd), "%s %s %s plumb ", - USR_SBIN_IFCONFIG, ifinst, fstr); - if (if_getcount(AF_INET) == 0) { - newattach++; - } + switch (af) { + case AF_INET: + fstr = "inet"; + break; + case AF_INET6: + fstr = "inet6"; + break; + default: + assert(0); } - rcm_log_message(RCM_TRACE1, "IP: Exec: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot plumb (%s) %s\n"), ifinst, strerror(errno)); - return (-1); - } + /* + * The hostname file exists; plumb the physical interface. + */ + if (!ifconfig(ifinst, fstr, "plumb", B_FALSE)) + goto fail; - /* Check if config file is empty, if so, nothing else to do */ - if (statb.st_size == 0) { + /* Skip static configuration if the hostname file is empty */ + if (statb.st_size <= 1) { rcm_log_message(RCM_TRACE1, - "IP: Zero size config file(%s)\n", ifinst); - return (0); + _("IP: Zero size hostname file(%s)\n"), ifinst); + goto configured; } - if ((fp = fopen(cfgfile, "r")) == NULL) { + if (fseek(hfp, 0, SEEK_SET) == -1) { rcm_log_message(RCM_ERROR, - _("IP: Open error(%s): %s\n"), cfgfile, strerror(errno)); - return (-1); + _("IP: Cannot rewind hostname file(%s): %s\n"), ifinst, + strerror(errno)); + goto fail; } + /* + * Allocate the worst-case single-line buffer sizes. A bit skanky, + * but since hostname files are small, this should suffice. 
+ */ if ((buf = calloc(1, statb.st_size)) == NULL) { rcm_log_message(RCM_ERROR, _("IP: calloc(%s): %s\n"), ifinst, strerror(errno)); - (void) fclose(fp); - return (-1); + goto fail; } - /* a single line with one token implies a classical if */ - if (fgets(buf, statb.st_size, fp) != NULL) { - tokenize(buf, tokens, tspace, &ntok); - if (ntok == 1) { - rcm_log_message(RCM_TRACE1, "IP: Standard interface\n"); - stdif++; - } - } - if (fseek(fp, 0L, SEEK_SET) == -1) { - rcm_log_message(RCM_ERROR, _("IP: fseek: %s\n"), - strerror(errno)); - return (-1); + ifparsebufsize = statb.st_size + sizeof (SBIN_IFPARSE " -s inet6 "); + if ((ifparsebuf = calloc(1, ifparsebufsize)) == NULL) { + rcm_log_message(RCM_ERROR, + _("IP: calloc(%s): %s\n"), ifinst, strerror(errno)); + goto fail; } /* - * Process the config command - * This loop also handles multiple logical interfaces that may - * be configured on a single line + * For IPv4, determine whether the hostname file consists of a single + * line. We need to handle these specially since they should + * automatically be suffixed with "netmask + broadcast + up". */ - while (fgets(buf, statb.st_size, fp) != NULL) { - nofailover = 0; - cmdvalid = 0; + if (af == AF_INET && + fgets(buf, statb.st_size, hfp) != NULL && + fgets(buf, statb.st_size, hfp) == NULL) { + rcm_log_message(RCM_TRACE1, "IP: one-line hostname file\n"); + stdif = B_TRUE; + } - if (*buf == '\0') - continue; + if (fseek(hfp, 0L, SEEK_SET) == -1) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot rewind hostname file(%s): %s\n"), ifinst, + strerror(errno)); + goto fail; + } - tokenize(buf, tokens, tspace, &ntok); - if (ntok <= 0) + /* + * Loop through the file one line at a time and feed it to ifconfig. + * If the interface is using IPMP, then we use /sbin/ifparse -s to + * weed out all of the data addresses, since those are already on the + * IPMP meta-interface. 
+ */ + while (fgets(buf, statb.st_size, hfp) != NULL) { + if (ntok(buf) == 0) continue; - /* Reset the config command */ - (void) snprintf(syscmd, sizeof (syscmd), "%s %s %s ", - USR_SBIN_IFCONFIG, ifinst, fstr); - - /* No parsing if this is first interface of its kind */ - if (newattach) { - (void) strcat(syscmd, buf); - /* Classic if */ - if ((af & CONFIG_AF_INET) && (stdif == 1)) { - (void) strcat(syscmd, CFG_CMDS_STD); - } - rcm_log_message(RCM_TRACE1, "IP: New: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Error: %s (%s): %s\n"), - syscmd, ifinst, strerror(errno)); - } + if (!ipmp) { + (void) ifconfig(ifinst, fstr, buf, stdif); continue; } - /* Parse the tokens to determine nature of the interface */ - for (n = 0; n < ntok; n++) { - /* Handle pathological failover cases */ - if (STREQ("-failover", tokens[n])) - nofailover++; - if (STREQ("failover", tokens[n])) - nofailover--; - - /* group attribute requires special processing */ - if (STREQ("group", tokens[n])) { - if (tokens[n + 1] != NULL) { - (void) snprintf(grpcmd, sizeof (grpcmd), - "%s %s %s %s %s", USR_SBIN_IFCONFIG, - ifinst, fstr, - tokens[n], tokens[n + 1]); - n++; /* skip next token */ - continue; - } - } - - /* Execute buffered command ? */ - if (STREQ("set", tokens[n]) || - STREQ("addif", tokens[n]) || - STREQ("removeif", tokens[n]) || - (n == (ntok -1))) { - - /* config command complete ? */ - if (n == (ntok -1)) { - ADDSPACE(syscmd); - (void) strcat(syscmd, tokens[n]); - cmdvalid++; - } - - if (!cmdvalid) { - ADDSPACE(syscmd); - (void) strcat(syscmd, tokens[n]); - cmdvalid++; - continue; - } - /* Classic if ? 
*/ - if ((af & CONFIG_AF_INET) && (stdif == 1)) { - (void) strcat(syscmd, CFG_CMDS_STD); - } - - if (nofailover > 0) { - rcm_log_message(RCM_TRACE1, - "IP: Interim exec: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: %s fail(%s): %s\n"), - syscmd, ifinst, - strerror(errno)); - } - } else { - /* Have mpathd configure the address */ - if (if_mpathd_configure(syscmd, ifinst, - af, ipmp) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: %s fail(%s): %s\n"), - syscmd, ifinst, - strerror(errno)); - } - } - - /* Reset config command */ - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s %s ", USR_SBIN_IFCONFIG, ifinst, - fstr); - nofailover = 0; - cmdvalid = 0; - } - /* - * Note: No explicit command validation is required - * since ifconfig to does it for us - */ - ADDSPACE(syscmd); - (void) strcat(syscmd, tokens[n]); - cmdvalid++; - } - } - - free(buf); - (void) fclose(fp); - - /* - * The group name needs to be set after all the test/nofailover - * addresses have been configured. Otherwise, if IPMP detects that the - * interface is failed, the addresses will be moved to a working - * interface before the '-failover' flag can be set. 
- */ - if (grpcmd[0] != '\0') { - rcm_log_message(RCM_TRACE1, "IP: set group name: %s\n", grpcmd); - if (rcm_exec_cmd(grpcmd) != 0) { - rcm_log_message(RCM_ERROR, _("IP: %s fail(%s): %s\n"), - grpcmd, ifinst, strerror(errno)); + (void) snprintf(ifparsebuf, ifparsebufsize, SBIN_IFPARSE + " -s %s %s", fstr, buf); + if ((ifparsefp = popen(ifparsebuf, "r")) == NULL) { + rcm_log_message(RCM_ERROR, + _("IP: cannot configure %s: popen \"%s\" " + "failed: %s\n"), ifinst, buf, strerror(errno)); + goto fail; } - } - rcm_log_message(RCM_TRACE1, "IP: if_ipmp_config(%s) success\n", ifinst); - - return (0); -} - -/* - * if_mpathd_configure() - Determine configuration disposition of the interface - */ -static int -if_mpathd_configure(char *syscmd, char *ifinst, int af, int ipmp) -{ - char *tokens[MAXARGS]; - char tspace[MAXLINE]; - int ntok; - char *addr; - char *from_lifname; - mpathd_cmd_t mpdcmd; - int n; - - rcm_log_message(RCM_TRACE1, "IP: if_mpathd_configure(%s): %s\n", - ifinst, syscmd); - - tokenize(syscmd, tokens, tspace, &ntok); - if (ntok <= 0) - return (0); - - addr = tokens[3]; /* by default, third token is valid address */ - for (n = 0; n < ntok; n++) { - if (STREQ("set", tokens[n]) || - STREQ("addif", tokens[n])) { - addr = tokens[n+1]; - if (addr == NULL) { /* invalid format */ - return (-1); - } else - break; + while (fgets(buf, statb.st_size, ifparsefp) != NULL) { + if (ntok(buf) > 0) + (void) ifconfig(ifinst, fstr, buf, stdif); } - } - /* Check std. 
commands or no failed over address */ - if (STREQ("removeif", addr) || STREQ("group", addr) || - ((from_lifname = get_mpathd_dest(addr, af)) == NULL)) { - rcm_log_message(RCM_TRACE1, - "IP: No failed-over host, exec %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { + if (pclose(ifparsefp) == -1) { rcm_log_message(RCM_ERROR, - _("IP: %s failed(%s): %s\n"), - syscmd, ifinst, strerror(errno)); - return (-1); + _("IP: cannot configure %s: pclose \"%s\" " + "failed: %s\n"), ifinst, buf, strerror(errno)); + goto fail; } - return (0); - } - - /* Check for non-IPMP failover scenarios */ - if ((ipmp <= 0) && (from_lifname != NULL)) { - /* Address already hosted on another NIC, return */ - rcm_log_message(RCM_TRACE1, - "IP: Non-IPMP failed-over host(%s): %s\n", - ifinst, addr); - return (0); } +configured: /* - * Valid failed-over host; have mpathd set the original index + * Bring up the interface (it may already be up) + * + * Technically, since the boot scripts only unconditionally bring up + * IPv6 interfaces, we should only unconditionally bring up IPv6 here. + * However, if we don't bring up IPv4, and a legacy IPMP configuration + * without test addresses is being used, we will never bring the + * interface up even though we would've at boot. One fix is to check + * if the IPv4 hostname file contains data addresses that we would've + * brought up, but there's no simple way to do that. Given that it's + * rare to have persistent IP configuration for an interface that + * leaves it down, we cheap out and always bring it up for IPMP. 
*/ - mpdcmd.cmd_command = MI_SETOINDEX; - (void) strcpy(mpdcmd.from_lifname, from_lifname); - (void) strcpy(mpdcmd.to_pifname, ifinst); - if (af & CONFIG_AF_INET6) { - mpdcmd.addr_family = AF_INET6; - } else { - mpdcmd.addr_family = AF_INET; - } - - /* Send command to in.mpathd(1M) */ - rcm_log_message(RCM_TRACE1, - "IP: Attempting setoindex from (%s) to (%s) ....\n", - from_lifname, ifinst); - - if (mpathd_send_cmd(&mpdcmd) < 0) { - rcm_log_message(RCM_TRACE1, - "IP: mpathd set original index unsuccessful: %s\n", - strerror(errno)); - return (-1); - } - - rcm_log_message(RCM_TRACE1, - "IP: setoindex success (%s) to (%s)\n", - from_lifname, ifinst); - - return (0); -} - -/* - * get_mpathd_dest() - Return current destination for lif; caller is - * responsible to free memory allocated for address - */ -static char * -get_mpathd_dest(char *addr, int family) -{ - int sock; - char *buf; - struct lifnum lifn; - struct lifconf lifc; - struct lifreq *lifrp; - sa_family_t af = AF_INET; /* IPv4 by default */ - int i; - struct lifreq lifreq; - struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; - struct hostent *hp; - char *ifname = NULL; - char *prefix = NULL; - char addrstr[INET6_ADDRSTRLEN]; - char ifaddr[INET6_ADDRSTRLEN]; - int err; - - if (addr == NULL) { - return (NULL); - } - - rcm_log_message(RCM_TRACE2, "IP: get_mpathd_dest(%s)\n", addr); - - if (family & CONFIG_AF_INET6) { - af = AF_INET6; - } else { - af = AF_INET; - } - - if ((sock = socket(af, SOCK_DGRAM, 0)) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: failure opening %s socket: %s\n"), - af == AF_INET6 ? 
"IPv6" : "IPv4", strerror(errno)); - return (NULL); - } - - lifn.lifn_family = af; - lifn.lifn_flags = 0; - if (ioctl(sock, SIOCGLIFNUM, (char *)&lifn) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCLGIFNUM failed: %s\n"), - strerror(errno)); - (void) close(sock); - return (NULL); - } - - if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) { - rcm_log_message(RCM_ERROR, _("IP: calloc: %s\n"), - strerror(errno)); - (void) close(sock); - return (NULL); - } - - lifc.lifc_family = af; - lifc.lifc_flags = 0; - lifc.lifc_len = sizeof (struct lifreq) * lifn.lifn_count; - lifc.lifc_buf = buf; - - if (ioctl(sock, SIOCGLIFCONF, (char *)&lifc) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFCONF failed: %s\n"), - strerror(errno)); - free(buf); - (void) close(sock); - return (NULL); - } + if ((af == AF_INET6 || ipmp) && !ifconfig(ifinst, fstr, "up", B_FALSE)) + goto fail; - /* Filter out prefix address from netmask */ - (void) strcpy(ifaddr, addr); - if ((prefix = strchr(ifaddr, '/')) != NULL) { - *prefix = '\0'; /* We care about the address part only */ - } + /* + * For IPv4, if a DHCP configuration file exists, have DHCP configure + * the interface. As with the boot scripts, this is done after the + * hostname files are processed so that configuration in those files + * (such as IPMP group names) will be applied first. 
+ */ + if (af == AF_INET) { + char dhcpfile[MAXPATHLEN]; + char *dhcpbuf; + off_t i, dhcpsize; - /* Check for aliases */ - hp = getipnodebyname(ifaddr, af, AI_DEFAULT, &err); - if (hp) { - if (inet_ntop(af, (void *)hp->h_addr_list[0], - ifaddr, sizeof (ifaddr)) == NULL) { - /* Restore original address and use it */ - (void) strcpy(ifaddr, addr); - if ((prefix = strchr(ifaddr, '/')) != NULL) { - *prefix = '\0'; - } - } - freehostent(hp); - } - rcm_log_message(RCM_TRACE2, "IP: ifaddr(%s) = %s\n", addr, ifaddr); + (void) snprintf(dhcpfile, MAXPATHLEN, DHCPFILE_FMT, ifinst); + if (stat(dhcpfile, &statb) == -1) + goto out; - /* now search the interfaces */ - lifrp = lifc.lifc_req; - for (i = 0; i < lifn.lifn_count; i++, lifrp++) { - (void) strcpy(lifreq.lifr_name, lifrp->lifr_name); - /* Get the interface address for this interface */ - if (ioctl(sock, SIOCGLIFADDR, (char *)&lifreq) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFADDR: %s\n"), strerror(errno)); - free(buf); - (void) close(sock); - return (NULL); - } - - if (af == AF_INET6) { - sin6 = (struct sockaddr_in6 *)&lifreq.lifr_addr; - if (inet_ntop(AF_INET6, (void *)&sin6->sin6_addr, - addrstr, sizeof (addrstr)) == NULL) { - continue; - } - } else { - sin = (struct sockaddr_in *)&lifreq.lifr_addr; - if (inet_ntop(AF_INET, (void *)&sin->sin_addr, - addrstr, sizeof (addrstr)) == NULL) { - continue; - } + if ((dhcpbuf = copylist(dhcpfile, &dhcpsize)) == NULL) { + rcm_log_message(RCM_ERROR, _("IP: cannot read " + "(%s): %s\n"), dhcpfile, strerror(errno)); + goto fail; } - if (STREQ(addrstr, ifaddr)) { - /* Allocate memory to hold interface name */ - if ((ifname = (char *)malloc(LIFNAMSIZ)) == NULL) { - rcm_log_message(RCM_ERROR, - _("IP: malloc: %s\n"), strerror(errno)); - free(buf); - (void) close(sock); - return (NULL); - } - - /* Copy the interface name */ - /* - * (void) memcpy(ifname, lifrp->lifr_name, - * sizeof (ifname)); - * ifname[sizeof (ifname) - 1] = '\0'; - */ - (void) strcpy(ifname, 
lifrp->lifr_name); - break; + /* + * The copylist() API converts \n's to \0's, but we want them + * to be spaces. + */ + if (dhcpsize > 0) { + for (i = 0; i < dhcpsize; i++) + if (dhcpbuf[i] == '\0') + dhcpbuf[i] = ' '; + dhcpbuf[dhcpsize - 1] = '\0'; } + (void) ifconfig(ifinst, CFG_DHCP_CMD, dhcpbuf, B_FALSE); + free(dhcpbuf); } - - (void) close(sock); +out: + free(ifparsebuf); free(buf); - - if (ifname == NULL) - rcm_log_message(RCM_TRACE2, "IP: get_mpathd_dest(%s): none\n", - addr); - else - rcm_log_message(RCM_TRACE2, "IP: get_mpathd_dest(%s): %s\n", - addr, ifname); - - return (ifname); -} - -static int -if_getcount(int af) -{ - int sock; - struct lifnum lifn; - - rcm_log_message(RCM_TRACE1, "IP: if_getcount\n"); - - if ((sock = socket(af, SOCK_DGRAM, 0)) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: failure opening %s socket: %s\n"), - af == AF_INET6 ? "IPv6" : "IPv4", strerror(errno)); - return (-1); - } - - lifn.lifn_family = af; - lifn.lifn_flags = 0; - if (ioctl(sock, SIOCGLIFNUM, (char *)&lifn) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCLGIFNUM failed: %s\n"), - strerror(errno)); - (void) close(sock); - return (-1); - } - (void) close(sock); - - rcm_log_message(RCM_TRACE1, "IP: if_getcount success: %d\n", - lifn.lifn_count); - - return (lifn.lifn_count); + rcm_log_message(RCM_TRACE1, "IP: if_config_inst(%s) success\n", ifinst); + return (0); +fail: + free(ifparsebuf); + free(buf); + rcm_log_message(RCM_ERROR, "IP: if_config_inst(%s) failure\n", ifinst); + return (-1); } /* - * tokenize() - turn a command line into tokens; caller is responsible to - * provide enough memory to hold all tokens + * ntok() - count the number of tokens in the provided buffer. 
*/ -static void -tokenize(char *line, char **tokens, char *tspace, int *ntok) +static uint_t +ntok(const char *cp) { - char *cp; - char *sp; + uint_t ntok = 0; - sp = tspace; - cp = line; - for (*ntok = 0; *ntok < MAXARGS; (*ntok)++) { - tokens[*ntok] = sp; + for (;;) { while (ISSPACE(*cp)) cp++; + if (ISEOL(*cp)) break; + do { - *sp++ = *cp++; + cp++; } while (!ISSPACE(*cp) && !ISEOL(*cp)); - *sp++ = '\0'; + ntok++; + } + return (ntok); +} + +static boolean_t +ifconfig(const char *ifinst, const char *fstr, const char *buf, boolean_t stdif) +{ + char syscmd[MAX_RECONFIG_SIZE + MAXPATHLEN + 1]; + int status; + + (void) snprintf(syscmd, sizeof (syscmd), SBIN_IFCONFIG " %s %s %s", + ifinst, fstr, buf); + + if (stdif) + (void) strlcat(syscmd, CFG_CMDS_STD, sizeof (syscmd)); + + rcm_log_message(RCM_TRACE1, "IP: Exec: %s\n", syscmd); + if ((status = rcm_exec_cmd(syscmd)) != 0) { + if (WIFEXITED(status)) { + rcm_log_message(RCM_ERROR, _("IP: \"%s\" failed with " + "exit status %d\n"), syscmd, WEXITSTATUS(status)); + } else { + rcm_log_message(RCM_ERROR, _("IP: Error: %s: %s\n"), + syscmd, strerror(errno)); + } + return (B_FALSE); } + return (B_TRUE); } diff --git a/usr/src/cmd/svc/milestone/net-init b/usr/src/cmd/svc/milestone/net-init index 26b295dce9..7f0804af67 100644 --- a/usr/src/cmd/svc/milestone/net-init +++ b/usr/src/cmd/svc/milestone/net-init @@ -20,11 +20,9 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # This is the second phase of TCP/IP configuration. The first part is # run by the svc:/network/physical service and includes configuring the # interfaces and setting the machine's hostname. 
The svc:/network/initial @@ -52,10 +50,11 @@ if [ -f /etc/inet/ipaddrsel.conf ]; then fi # -# Now that /usr is mounted, see if in.mpathd needs to be started by firing it -# up in "adopt" mode; if there are no interfaces it needs to manage, it will -# automatically exit. Note that it may already be running if we're not -# executing as part of system boot. +# If explicit IPMP groups are being used, in.mpathd will already be started. +# However, if TRACK_INTERFACES_ONLY_WITH_GROUPS=no and no explicit IPMP +# groups have been configured, then it still needs to be started. So, fire +# it up in "adopt" mode; if there are no interfaces it needs to manage, it +# will automatically exit. # /usr/bin/pgrep -x -u 0 -z `smf_zonename` in.mpathd >/dev/null 2>&1 || \ /usr/lib/inet/in.mpathd -a diff --git a/usr/src/cmd/svc/milestone/net-loopback b/usr/src/cmd/svc/milestone/net-loopback index 3bd5a0f525..d07afd4ada 100644 --- a/usr/src/cmd/svc/milestone/net-loopback +++ b/usr/src/cmd/svc/milestone/net-loopback @@ -20,10 +20,9 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" . /lib/svc/share/smf_include.sh @@ -36,14 +35,6 @@ smf_configure_ip || exit $SMF_EXIT_OK # -# Cause ifconfig to not automatically start in.mpathd when IPMP groups are -# configured. This is not strictly necessary but makes it so that in.mpathd -# will always be started explicitly from /lib/svc/method/net-init (the -# svc:/network/initial service), when we're sure that /usr is mounted. -# -SUNW_NO_MPATHD=; export SUNW_NO_MPATHD - -# # Before any interfaces are configured, we need to set the system # default IP forwarding behavior. 
This will be the setting for # interfaces that don't modify the per-interface setting with the diff --git a/usr/src/cmd/svc/milestone/net-physical b/usr/src/cmd/svc/milestone/net-physical index 8530806768..bc74c2a206 100644 --- a/usr/src/cmd/svc/milestone/net-physical +++ b/usr/src/cmd/svc/milestone/net-physical @@ -20,7 +20,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T. @@ -38,22 +38,9 @@ # smf_configure_ip || exit $SMF_EXIT_OK -# Print warnings to console -warn_failed_ifs() { - echo "Failed to $1 interface(s): $2" >/dev/msglog -} - # Make sure that the libraries essential to this stage of booting can be found. LD_LIBRARY_PATH=/lib; export LD_LIBRARY_PATH -# -# Cause ifconfig to not automatically start in.mpathd when IPMP groups are -# configured. This is not strictly necessary but makes it so that in.mpathd -# will always be started explicitly from /etc/init.d/inetinit, when we're -# sure that /usr is mounted. 
-# -SUNW_NO_MPATHD=; export SUNW_NO_MPATHD - smf_netstrategy if smf_is_globalzone; then @@ -127,13 +114,18 @@ if [ "$interface_names" != "/etc/hostname.*[0-9]" ]; then IFS="$ORIGIFS" while [ $# -ge 2 ]; do shift - if [ $# -gt 1 -a "$2" != "/etc/hostname" ]; then - while [ $# -gt 1 -a "$1" != "/etc/hostname" ]; do - shift - done - else - inet_list="$inet_list $1" + intf_name=$1 + while [ $# -gt 1 -a "$2" != "/etc/hostname" ]; do + intf_name="$intf_name.$2" shift + done + shift + + read one rest < /etc/hostname.$intf_name + if [ "$one" = ipmp ]; then + ipmp_list="$ipmp_list $intf_name" + else + inet_list="$inet_list $intf_name" fi done fi @@ -151,17 +143,38 @@ if [ "$interface_names" != "/etc/hostname6.*[0-9]" ]; then IFS="$ORIGIFS" while [ $# -ge 2 ]; do shift - if [ $# -gt 1 -a "$2" != "/etc/hostname6" ]; then - while [ $# -gt 1 -a "$1" != "/etc/hostname6" ]; do - shift - done - else - inet6_list="$inet6_list $1" + intf_name=$1 + while [ $# -gt 1 -a "$2" != "/etc/hostname6" ]; do + intf_name="$intf_name.$2" shift + done + shift + + read one rest < /etc/hostname6.$intf_name + if [ "$one" = ipmp ]; then + ipmp6_list="$ipmp6_list $intf_name" + else + inet6_list="$inet6_list $intf_name" fi done fi +# +# Create all of the IPv4 IPMP interfaces. +# +if [ -n "$ipmp_list" ]; then + set -- $ipmp_list + while [ $# -gt 0 ]; do + if /sbin/ifconfig $1 ipmp; then + ipmp_created="$ipmp_created $1" + else + ipmp_failed="$ipmp_failed $1" + fi + shift + done + [ -n "$ipmp_failed" ] && warn_failed_ifs "create IPv4 IPMP" \ + "$ipmp_failed" +fi # # Step through the IPv4 interface list and try to plumb every interface. 
@@ -178,7 +191,7 @@ if [ -n "$inet_list" ]; then fi shift done - [ -n "$inet_failed" ] && warn_failed_ifs "plumb IPv4" $inet_failed + [ -n "$inet_failed" ] && warn_failed_ifs "plumb IPv4" "$inet_failed" fi # Run autoconf to connect to a WLAN if the interface is a wireless one @@ -209,7 +222,24 @@ if [ -n "$inet6_list" ]; then fi shift done - [ -n "$inet6_failed" ] && warn_failed_ifs "plumb IPv6" $inet6_failed + [ -n "$inet6_failed" ] && warn_failed_ifs "plumb IPv6" "$inet6_failed" +fi + +# +# Create all of the IPv6 IPMP interfaces. +# +if [ -n "$ipmp6_list" ]; then + set -- $ipmp6_list + while [ $# -gt 0 ]; do + if /sbin/ifconfig $1 inet6 ipmp; then + ipmp6_created="$ipmp6_created $1" + else + ipmp6_failed="$ipmp6_failed $1" + fi + shift + done + [ -n "$ipmp6_failed" ] && warn_failed_ifs "create IPv6 IPMP" \ + "$ipmp6_failed" fi if smf_is_globalzone; then @@ -224,49 +254,24 @@ if smf_is_globalzone; then fi # -# Process the /etc/hostname.* files of plumbed IPv4 interfaces. If an -# /etc/hostname file is not present or is empty, the ifconfig auto-dhcp -# / auto-revarp command will attempt to set the address, later. +# Process the /etc/hostname[6].* files for IPMP interfaces. Processing these +# before non-IPMP interfaces avoids accidental implicit IPMP group creation. +# +[ -n "$ipmp_created" ] && if_configure inet "IPMP" $ipmp_created +[ -n "$ipmp6_created" ] && if_configure inet6 "IPMP" $ipmp6_created + # -# If /etc/hostname.lo0 exists the loop below will do additional -# configuration of lo0. +# Process the /etc/hostname[6].* files for non-IPMP interfaces. # -if [ -n "$inet_plumbed" ]; then - i4s_fail= - echo "configuring IPv4 interfaces:\c" - set -- $inet_plumbed - while [ $# -gt 0 ]; do - inet_process_hostname /sbin/ifconfig $1 inet \ - </etc/hostname.$1 >/dev/null - [ $? != 0 ] && i4s_fail="$i4s_fail $1" - echo " $1\c" - shift - done - echo "." 
- [ -n "$i4s_fail" ] && warn_failed_ifs "configure IPv4" $i4s_fail -fi +[ -n "$inet_plumbed" ] && if_configure inet "" $inet_plumbed +[ -n "$inet6_plumbed" ] && if_configure inet6 "" $inet6_plumbed # -# Process the /etc/hostname6.* files of plumbed IPv6 interfaces. After -# processing the hostname6 file, bring the interface up. If -# /etc/hostname6.lo0 exists the loop below will do additional -# configuration of lo0. +# For the IPv4 and IPv6 interfaces that failed to plumb, find (or create) +# IPMP meta-interfaces to host their data addresses. # -if [ -n "$inet6_plumbed" ]; then - i6_fail= - echo "configuring IPv6 interfaces:\c" - set -- $inet6_plumbed - while [ $# -gt 0 ]; do - inet6_process_hostname /sbin/ifconfig $1 inet6 \ - </etc/hostname6.$1 >/dev/null && - /sbin/ifconfig $1 inet6 up - [ $? != 0 ] && i6_fail="$i6_fail $1" - echo " $1\c" - shift - done - echo "." - [ -n "$i6_fail" ] && warn_failed_ifs "configure IPv6" $i6_fail -fi +[ -n "$inet_failed" ] && move_addresses inet +[ -n "$inet6_failed" ] && move_addresses inet6 # Run DHCP if requested. Skip boot-configured interface. interface_names="`echo /etc/dhcp.*[0-9] 2>/dev/null`" @@ -326,7 +331,7 @@ if [ "$interface_names" != '/etc/dhcp.*[0-9]' ]; then done IFS="$ORIGIFS" unset ORIGIFS - [ -n "$i4d_fail" ] && warn_failed_ifs "configure IPv4 DHCP" $i4d_fail + [ -n "$i4d_fail" ] && warn_failed_ifs "configure IPv4 DHCP" "$i4d_fail" fi # In order to avoid bringing up the interfaces that have @@ -338,14 +343,6 @@ if [ "$_INIT_NET_STRATEGY" = "rarp" -o -z "$hostname" ]; then fi # -# Process IPv4 and IPv6 interfaces that failed to plumb. Find an -# alternative interface to host the addresses. -# -[ -n "$inet_failed" ] && move_addresses inet - -[ -n "$inet6_failed" ] && move_addresses inet6 - -# # If the /etc/defaultrouter file exists, process it now so that the next # stage of booting will have access to NFS. 
# diff --git a/usr/src/cmd/svc/shell/net_include.sh b/usr/src/cmd/svc/shell/net_include.sh index 51c87a40a8..71dc6a8256 100644 --- a/usr/src/cmd/svc/shell/net_include.sh +++ b/usr/src/cmd/svc/shell/net_include.sh @@ -20,13 +20,18 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T. # All rights reserved. # +# Print warnings to console +warn_failed_ifs() { + echo "Failed to $1 interface(s):$2" >/dev/msglog +} + # # shcat file # Simulates cat in sh so it doesn't need to be on the root filesystem. @@ -41,20 +46,28 @@ shcat() { } # -# Inet_list, list of IPv4 interfaces. -# Inet_plumbed, list of plumbed IPv4 interfaces. -# Inet_failed, list of IPv4 interfaces that failed to plumb. -# Inet6_list, list of IPv6 interfaces. -# Inet6_plumbed, list of plumbed IPv6 interfaces. -# Inet6_failed, list of IPv6 interfaces that failed to plumb. +# inet_list list of IPv4 interfaces. +# inet6_list list of IPv6 interfaces. +# ipmp_list list of IPMP IPv4 interfaces. +# ipmp6_list list of IPMP IPv6 interfaces. +# inet_plumbed list of plumbed IPv4 interfaces. +# inet6_plumbed list of plumbed IPv6 interfaces. +# ipmp_created list of created IPMP IPv4 interfaces. +# ipmp6_created list of created IPMP IPv6 interfaces. +# inet_failed list of IPv4 interfaces that failed to plumb. +# inet6_failed list of IPv6 interfaces that failed to plumb. +# ipmp_failed list of IPMP IPv4 interfaces that failed to be created. +# ipmp6_failed list of IPMP IPv6 interfaces that failed to be created. # unset inet_list inet_plumbed inet_failed \ - inet6_list inet6_plumbed inet6_failed + inet6_list inet6_plumbed inet6_failed \ + ipmp_list ipmp_created ipmp_failed \ + ipmp6_list ipmp6_created ipmp6_failed + # # get_physical interface # -# Return physical interface corresponding to the given logical -# interface. 
+# Return physical interface corresponding to the given interface. # get_physical() { @@ -70,7 +83,7 @@ get_physical() # get_logical interface # # Return logical interface number. Zero will be returned -# if there is no explicit logical device number. +# if there is no explicit logical number. # get_logical() { @@ -89,19 +102,18 @@ get_logical() # # if_comp if1 if2 # -# Compare Interfaces. Do the physical interface names and logical interface +# Compare interfaces. Do the physical interface names and logical interface # numbers match? # if_comp() { - [ "`get_physical $1`" = "`get_physical $2`" ] && \ - [ `get_logical $1` -eq `get_logical $2` ] + physical_comp $1 $2 && [ `get_logical $1` -eq `get_logical $2` ] } - + # # physical_comp if1 if2 # -# Do the two devices share a physical interface? +# Do the two interfaces share a physical interface? # physical_comp() { @@ -129,19 +141,110 @@ in_list() } # -# get_group_from_hostname interface type +# get_inactive_ifname groupname +# +# Return the name of an inactive interface in `groupname', if one exists. +# +get_inactive_ifname() +{ + ORIGIFS="$IFS" + /sbin/ipmpstat -gP -o groupname,interfaces | + while IFS=: read groupname ifnames; do + # + # Skip other IPMP groups. + # + [ "$groupname" != "$1" ] && continue + + # + # Standby interfaces are always enclosed in ()'s, so look + # for the first interface name starting with a "(", and + # strip those off. + # + IFS=" " + for ifname in $ifnames; do + case "$ifname" in + '('*) IFS="()" + echo $ifname + IFS="$ORIGIFS" + return + ;; + *) ;; + esac + done + done + IFS="$ORIGIFS" +} + +# +# get_groupifname groupname +# +# Return the IPMP meta-interface name for the group, if it exists. 
+# +get_groupifname() +{ + /sbin/ipmpstat -gP -o groupname,group | while IFS=: read name ifname; do + if [ "$name" = "$1" ]; then + echo "$ifname" + return + fi + done +} + +# +# create_ipmp ifname groupname type +# +# Helper function for create_groupifname() that returns zero if it's able +# to create an IPMP interface of the specified type and place it in the +# specified group, or non-zero otherwise. +# +create_ipmp() +{ + /sbin/ifconfig $1 >/dev/null 2>&1 && return 1 + /sbin/ifconfig $1 inet6 >/dev/null 2>&1 && return 1 + /sbin/ifconfig $1 $3 ipmp group $2 2>/dev/null +} + +# +# create_groupifname groupname type +# +# Create an IPMP meta-interface name for the group. We only use this +# function if all of the interfaces in the group failed at boot and there +# were no /etc/hostname[6].<if> files for the IPMP meta-interface. +# +create_groupifname() +{ + # + # This is a horrible way to count from 0 to 999, but in sh and + # without necessarily having /usr mounted, what else can we do? + # + for a in "" 1 2 3 4 5 6 7 8 9; do + for b in 0 1 2 3 4 5 6 7 8 9; do + for c in 0 1 2 3 4 5 6 7 8 9; do + # strip leading zeroes + [ "$a" = "" ] && [ "$b" = 0 ] && b="" + if create_ipmp ipmp$a$b$c $1 $2; then + echo ipmp$a$b$c + return + fi + done + done + done +} + +# +# get_hostname_ipmpinfo interface type # -# Return all group settings from hostname file for a given interface. +# Return all requested IPMP keywords from hostname file for a given interface. # # Example: -# get_group_from_hostname hme0 inet +# get_hostname_ipmpinfo hme0 inet keyword [ keyword ... 
] # -get_group_from_hostname() +get_hostname_ipmpinfo() { case "$2" in - inet) file=/etc/hostname.$1 + inet) file=/etc/hostname.$1 ;; - inet6) file=/etc/hostname6.$1 + inet6) file=/etc/hostname6.$1 ;; *) return @@ -150,16 +253,21 @@ get_group_from_hostname() [ -r "$file" ] || return + type=$2 + shift 2 + # - # Read through the hostname file looking for group settings - # There may be several group settings in the file. It is up - # to the caller to pick the right one (i.e. the last one). + # Read through the hostname file looking for the specified + # keywords. Since there may be several keywords that cancel + # each other out, the caller must post-process as appropriate. # while read line; do [ -z "$line" ] && continue - /sbin/ifparse -s "$2" $line - done < "$file" | while read one two three; do - [ "$one" = "group" ] && echo "$two" + /sbin/ifparse -s "$type" $line + done < "$file" | while read one two; do + for keyword in "$@"; do + [ "$one" = "$keyword" ] && echo "$one $two" + done done } @@ -174,7 +282,6 @@ get_group_from_hostname() get_group_for_type() { physical=`get_physical $1` - type=$2 group="" @@ -183,184 +290,77 @@ get_group_for_type() # the reason for the second while loop. # shift 2 - while [ $# -gt 0 ]; do - if if_comp "$physical" $1; then - get_group_from_hostname $1 $type + for ifname in "$@"; do + if if_comp "$physical" $ifname; then + get_hostname_ipmpinfo $ifname $type group fi - shift done | while :; do - read next || { + read keyword grname || { echo "$group" break } - group="$next" + group="$grname" done } # -# get_group interface [ configured | failed ] -# -# If there is both an inet and inet6 version of an interface, the group -# could be set in either set of hostname files. -# -# Inet6 is configured after inet, so if the group is set in both -# sets of hostname files, the inet6 file wins. -# -# The "configured" argument should be used to get the group for -# an interface that has been plumbed into the stack and configured. 
Use -# the "failed" argument to get the group for an interface that failed to -# plumb. -# -get_group() -{ - group="" - - case "$2" in - configured) - group=`get_group_for_type $1 inet6 $inet6_plumbed` - ;; - failed) - group=`get_group_for_type $1 inet6 $inet6_list` - ;; - *) - return - ;; - esac - - if [ -z "$group" ]; then - if [ "$2" = configured ]; then - group=`get_group_for_type $1 inet $inet_plumbed` - else - group=`get_group_for_type $1 inet $inet_list` - fi - fi - - echo $group -} - -# -# get_standby_from_hostname interface type -# -# Return any "standby" or "-standby" flags in the hostname file. -# -# Example: -# get_standby_from_hostname hme0 inet6 -# -# -get_standby_from_hostname() -{ - case "$2" in - inet) file=/etc/hostname.$1 - ;; - inet6) file=/etc/hostname6.$1 - ;; - *) - return - ;; - esac - - [ -r "$file" ] || return - - # - # There may be several instances of the "standby" and - # "-standby" flags in the hostname file. It is up to - # the caller to pick the correct one. - # - while read line; do - [ -z "$line" ] && continue - /sbin/ifparse -s "$2" $line - done < "$file" | while read one two; do - [ "$one" = "standby" ] || [ "$one" = "-standby" ] \ - && echo "$one" - done -} - -# -# get_standby_for_type interface type plumbed_list +# get_standby_for_type interface type list # # Look through the set of hostname files associated with the same physical -# interface as "interface", and determine whether they would configure -# the interface as a standby interface. +# interface as "interface", and print the standby value ("standby", +# "-standby", or nothing). Only hostname files associated with the +# physical interface or logical interface zero can set this flag. # get_standby_for_type() { - physical=`get_physical $1` type=$2 - final="" - # - # The last "standby" or "-standby" flag is the one that counts, - # which is the reason for the second while loop. 
+ # The last setting of "standby" or "-standby" is the one that + # counts, which is the reason for the second while loop. # shift 2 - while [ $# -gt 0 ]; do - if [ "`get_physical $1`" = "$physical" ]; then - get_standby_from_hostname $1 $type + for ifname in "$@"; do + if if_comp "$physical" $ifname; then + get_hostname_ipmpinfo $ifname $type standby -standby fi - shift done | while :; do - read next || { - echo "$final" + read keyword || { + echo "$iftype" break } - final="$next" + iftype="$keyword" done } # -# is_standby interface +# get_group interface # -# Determine whether a configured interface is a standby interface. -# -# Both the inet and inet6 hostname file sets must be checked. -# If "standby" or "-standby" is set in the inet6 hostname file set, -# don't bother looking at the inet set. +# If there is both an inet and inet6 version of an interface, the group +# could be set in either set of hostname files. Since inet6 is configured +# after inet, if there's a setting in both files, inet6 wins. # -is_standby() +get_group() { - standby=`get_standby_for_type $1 inet6 $inet6_plumbed` - - if [ -z "$standby" ]; then - standby=`get_standby_for_type $1 inet $inet_plumbed` - fi - - # The return value is the value of the following test. - [ "$standby" = "standby" ] + group=`get_group_for_type $1 inet6 $inet6_list` + [ -z "$group" ] && group=`get_group_for_type $1 inet $inet_list` + echo $group } # -# get_alternate interface plumbed_list -# -# Look for a plumbed interface in the same group as "interface". -# A standby interface is preferred over a non-standby interface. +# is_standby interface # -# Example: -# get_alternate hme0 $inet_plumbed +# If there is both an inet and inet6 version of an interface, the +# "standby" or "-standby" flag could be set in either set of hostname +# files. Since inet6 is configured after inet, if there's a setting in +# both files, inet6 wins. 
# -get_alternate() +is_standby() { - mygroup=`get_group $1 failed` - [ -z "$mygroup" ] && return - - maybe="" - - shift - while [ $# -gt 0 ]; do - group=`get_group $1 configured` - if [ "$group" = "$mygroup" ]; then - if is_standby $1; then - get_physical $1 - return - else - [ -z "$maybe" ] && maybe=$1 - fi - fi - shift - done - - get_physical $maybe + standby=`get_standby_for_type $1 inet6 $inet6_list` + [ -z "$standby" ] && standby=`get_standby_for_type $1 inet $inet_list` + [ "$standby" = "standby" ] } # @@ -394,7 +394,7 @@ doDHCPhostname() # # If there is only line in an hostname file we assume it contains # the old style address which results in the interface being brought up -# and the netmask and broadcast address being set. +# and the netmask and broadcast address being set ($inet_oneline_epilogue). # # If there are multiple lines we assume the file contains a list of # commands to the processor with neither the implied bringing up of the @@ -403,6 +403,8 @@ doDHCPhostname() # Return non-zero if any command fails so that the caller may alert # users to errors in the configuration. # +inet_oneline_epilogue="netmask + broadcast + up" + inet_process_hostname() { if doDHCPhostname $2; then @@ -418,7 +420,7 @@ inet_process_hostname() ifcmds="" retval=0 - while read line; do + while read one rest; do if [ -n "$ifcmds" ]; then # # This handles the first N-1 @@ -427,7 +429,14 @@ inet_process_hostname() $* $ifcmds || retval=$? multiple_lines=true fi - ifcmds="$line" + + # + # Strip out the "ipmp" keyword if it's the + # first token, since it's used to control + # interface creation, not configuration. + # + [ "$one" = ipmp ] && one= + ifcmds="$one $rest" done # @@ -437,8 +446,8 @@ inet_process_hostname() # [ -z "$ifcmds" ] && return $retval if [ $multiple_lines = false ]; then - # The traditional single-line hostname file. - ifcmds="$ifcmds netmask + broadcast + up" + # The traditional one-line hostname file. 
+ ifcmds="$ifcmds $inet_oneline_epilogue" fi # @@ -470,7 +479,13 @@ inet_process_hostname() inet6_process_hostname() { retval=0 - while read ifcmds; do + while read one rest; do + # + # See comment in inet_process_hostname for details. + # + [ "$one" = ipmp ] && one= + ifcmds="$one $rest" + if [ -n "$ifcmds" ]; then $* $ifcmds || retval=$? fi @@ -479,10 +494,9 @@ inet6_process_hostname() } # -# Process interfaces that failed to plumb. Find an alternative -# interface to host the addresses. For IPv6, only static addresses -# defined in hostname6 files are moved, autoconfigured addresses are -# not moved. +# Process interfaces that failed to plumb. Find the IPMP meta-interface +# that should host the addresses. For IPv6, only static addresses defined +# in hostname6 files are moved, autoconfigured addresses are not moved. # # Example: # move_addresses inet6 @@ -491,35 +505,43 @@ move_addresses() { type="$1" eval "failed=\"\$${type}_failed\"" - eval "plumbed=\"\$${type}_plumbed\"" eval "list=\"\$${type}_list\"" - process_hostname="${type}_process_hostname" + process_func="${type}_process_hostname" processed="" if [ "$type" = inet ]; then - echo "moving addresses from failed IPv4 interfaces:\c" + typedesc="IPv4" zaddr="0.0.0.0" hostpfx="/etc/hostname" else - echo "moving addresses from failed IPv6 interfaces:\c" + typedesc="IPv6" zaddr="::" hostpfx="/etc/hostname6" fi - set -- $failed - while [ $# -gt 0 ]; do - in_list if_comp $1 $processed && { shift; continue; } - - alternate="`get_alternate $1 $plumbed`" - if [ -z "$alternate" ]; then - in_list physical_comp $1 $processed || { - echo " $1 (couldn't move, no" \ - "alternative interface)\c" - processed="$processed $1" + echo "Moving addresses from missing ${typedesc} interface(s):\c" \ + >/dev/msglog + + for ifname in $failed; do + in_list if_comp $ifname $processed && continue + + group=`get_group $ifname` + if [ -z "$group" ]; then + in_list physical_comp $ifname $processed || { + echo " $ifname (not moved -- not" \ 
+ "in an IPMP group)\c" >/dev/msglog + processed="$processed $ifname" } - shift continue fi + + # + # Lookup the IPMP meta-interface name. If one doesn't exist, + # create it. + # + grifname=`get_groupifname $group` + [ -z "$grifname" ] && grifname=`create_groupifname $group $type` + # # The hostname files are processed twice. In the first # pass, we are looking for all commands that apply @@ -528,7 +550,7 @@ move_addresses() # whether the address represents a failover address # or not until we've read all the files associated with the # interface. - + # # In the first pass through the hostname files, all # additional logical interface commands are removed. # The remaining commands are concatenated together and @@ -541,19 +563,18 @@ move_addresses() # the embedded "set" command set the address later. # /sbin/ifparse -f $type ` - for item in $list; do - if_comp $1 $item && \ - $process_hostname /sbin/ifparse \ - $type < $hostpfx.$item - done | while read three four; do - [ "$three" != addif ] && \ - echo "$three $four \c" - done` | while read one two; do - [ -z "$one" ] && continue - line="addif $zaddr $one $two" - /sbin/ifconfig $alternate $type \ - -standby $line >/dev/null - done + for item in $list; do + if_comp $ifname $item && $process_func \ + /sbin/ifparse $type < $hostpfx.$item + done | while read three four; do + [ "$three" != addif ] && echo "$three $four \c" + done` | while read one two; do + [ -z "$one" ] && continue + [ "$one $two" = "$inet_oneline_epilogue" ] && \ + continue + line="addif $zaddr $one $two" + /sbin/ifconfig $grifname $type $line >/dev/null + done # # In the second pass, look for the the "addif" commands @@ -561,22 +582,75 @@ move_addresses() # commands are not valid in logical interface hostname # files. 
# - if [ "$1" = "`get_physical $1`" ]; then - $process_hostname /sbin/ifparse -f $type \ - <$hostpfx.$1 | while read one two; do - [ "$one" = addif ] && \ - /sbin/ifconfig $alternate $type -standby \ - addif $two >/dev/null + if [ "$ifname" = "`get_physical $ifname`" ]; then + $process_func /sbin/ifparse -f $type < $hostpfx.$ifname \ + | while read one two; do + [ "$one" = addif ] && \ + /sbin/ifconfig $grifname $type \ + addif $two >/dev/null done fi - in_list physical_comp $1 $processed || { - echo " $1 (moved to $alternate)\c" - processed="$processed $1" + # + # Check if this was an active interface in the group. If so, + # activate another IP interface (if possible) + # + is_standby $ifname || inactive=`get_inactive_ifname $group` + [ -n "$inactive" ] && /sbin/ifconfig $inactive $type -standby + + in_list physical_comp $ifname $processed || { + processed="$processed $ifname" + echo " $ifname (moved to $grifname\c" > /dev/msglog + if [ -n "$inactive" ]; then + echo " and cleared 'standby' on\c" > /dev/msglog + echo " $inactive to compensate\c" > /dev/msglog + fi + echo ")\c" > /dev/msglog } + inactive="" + done + echo "." >/dev/msglog +} + +# +# if_configure type class interface_list +# +# Configure all of the interfaces of type `type' (e.g., "inet6") in +# `interface_list' according to their /etc/hostname[6].* files. `class' +# describes the class of interface (e.g., "IPMP"), as a diagnostic aid. +# For inet6 interfaces, the interface is also brought up. +# +if_configure() +{ + fail= + type=$1 + class=$2 + process_func=${type}_process_hostname + shift 2 + + if [ "$type" = inet ]; then + desc="IPv4" + hostpfx="/etc/hostname" + else + desc="IPv6" + hostpfx="/etc/hostname6" + fi + [ -n "$class" ] && desc="$class $desc" + + echo "configuring $desc interfaces:\c" + while [ $# -gt 0 ]; do + $process_func /sbin/ifconfig $1 $type < $hostpfx.$1 >/dev/null + if [ $? 
!= 0 ]; then + fail="$fail $1" + elif [ "$type" = inet6 ]; then + /sbin/ifconfig $1 inet6 up || fail="$fail $1" + fi + echo " $1\c" shift done echo "." + + [ -n "$fail" ] && warn_failed_ifs "configure $desc" "$fail" } # diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c index 46b2b5a958..dc90957dfa 100644 --- a/usr/src/cmd/truss/codes.c +++ b/usr/src/cmd/truss/codes.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -615,13 +615,10 @@ const struct ioc { { (uint_t)SIOCSIPSECONFIG, "SIOCSIPSECONFIG", NULL }, { (uint_t)SIOCDIPSECONFIG, "SIOCDIPSECONFIG", NULL }, { (uint_t)SIOCLIPSECONFIG, "SIOCLIPSECONFIG", NULL }, - { (uint_t)SIOCLIFFAILOVER, "SIOCLIFFAILOVER", "lifreq" }, - { (uint_t)SIOCLIFFAILBACK, "SIOCLIFFAILBACK", "lifreq" }, - { (uint_t)SIOCSIPMPFAILBACK, "SIOCSIPMPFAILBACK", NULL }, + { (uint_t)SIOCGLIFBINDING, "SIOCGLIFBINDING", "lifreq" }, { (uint_t)SIOCSLIFGROUPNAME, "SIOCSLIFGROUPNAME", "lifreq" }, { (uint_t)SIOCGLIFGROUPNAME, "SIOCGLIFGROUPNAME", "lifreq" }, - { (uint_t)SIOCGLIFOINDEX, "SIOCGLIFOINDEX", "lifreq" }, - { (uint_t)SIOCSLIFOINDEX, "SIOCSLIFOINDEX", "lifreq" }, + { (uint_t)SIOCGLIFGROUPINFO, "SIOCGLIFGROUPINFO", "lifgroupinfo" }, { (uint_t)SIOCGDSTINFO, "SIOCGDSTINFO", NULL }, { (uint_t)SIOCGIP6ADDRPOLICY, "SIOCGIP6ADDRPOLICY", NULL }, { (uint_t)SIOCSIP6ADDRPOLICY, "SIOCSIP6ADDRPOLICY", NULL }, diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c index edc610559d..8165f64f99 100644 --- a/usr/src/cmd/truss/print.c +++ b/usr/src/cmd/truss/print.c @@ -19,16 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ - -#pragma ident "%Z%%M% %I% %E% SMI" - #define _SYSCALL32 /* make 32-bit compat headers visible */ #include <stdio.h> @@ -73,6 +70,7 @@ #include <netinet/tcp.h> #include <netinet/udp.h> #include <netinet/sctp.h> +#include <net/route.h> #include <sys/utrap.h> #include <sys/lgrp_user.h> #include <sys/door.h> @@ -1749,6 +1747,8 @@ prt_sol(private_t *pri, int raw, long val) { if (val == SOL_SOCKET) { outstring(pri, "SOL_SOCKET"); + } else if (val == SOL_ROUTE) { + outstring(pri, "SOL_ROUTE"); } else { const struct protoent *p; struct protoent res; @@ -1826,6 +1826,18 @@ sol_optname(private_t *pri, long val) #undef CBSIZE } +const char * +route_optname(private_t *pri, long val) +{ + switch (val) { + case RT_AWARE: + return ("RT_AWARE"); + default: + (void) snprintf(pri->code_buf, sizeof (pri->code_buf), + "0x%lx", val); + return (pri->code_buf); + } +} const char * tcp_optname(private_t *pri, long val) @@ -1918,6 +1930,8 @@ prt_son(private_t *pri, int raw, long val) switch (pri->sys_args[1]) { case SOL_SOCKET: outstring(pri, sol_optname(pri, val)); break; + case SOL_ROUTE: outstring(pri, route_optname(pri, val)); + break; case IPPROTO_TCP: outstring(pri, tcp_optname(pri, val)); break; case IPPROTO_UDP: outstring(pri, udp_optname(pri, val)); diff --git a/usr/src/cmd/zoneadmd/vplat.c b/usr/src/cmd/zoneadmd/vplat.c index 72b6ce5c76..fb8f540cb5 100644 --- a/usr/src/cmd/zoneadmd/vplat.c +++ b/usr/src/cmd/zoneadmd/vplat.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -2397,6 +2397,7 @@ configure_one_interface(zlog_t *zlogp, zoneid_t zone_id, */ char buffer[INET6_ADDRSTRLEN]; void *addr; + const char *nomatch = "no matching subnet found in netmasks(4)"; if (af == AF_INET) addr = &((struct sockaddr_in *) @@ -2405,14 +2406,23 @@ configure_one_interface(zlog_t *zlogp, zoneid_t zone_id, addr = &((struct sockaddr_in6 *) (&lifr.lifr_addr))->sin6_addr; - /* Find out what netmask interface is going to be using */ + /* + * Find out what netmask the interface is going to be using. + * If we just brought up an IPMP data address on an underlying + * interface above, the address will have already migrated, so + * the SIOCGLIFNETMASK won't be able to find it (but we need + * to bring the address up to get the actual netmask). Just + * omit printing the actual netmask in this corner-case. + */ if (ioctl(s, SIOCGLIFNETMASK, (caddr_t)&lifr) < 0 || - inet_ntop(af, addr, buffer, sizeof (buffer)) == NULL) - goto bad; - zerror(zlogp, B_FALSE, - "WARNING: %s: no matching subnet found in netmasks(4) for " - "%s; using default of %s.", - lifr.lifr_name, addrstr4, buffer); + inet_ntop(af, addr, buffer, sizeof (buffer)) == NULL) { + zerror(zlogp, B_FALSE, "WARNING: %s; using default.", + nomatch); + } else { + zerror(zlogp, B_FALSE, + "WARNING: %s: %s: %s; using default of %s.", + lifr.lifr_name, nomatch, addrstr4, buffer); + } } /* diff --git a/usr/src/lib/brand/native/zone/platform.xml b/usr/src/lib/brand/native/zone/platform.xml index f7030ba0a1..69e86cefd2 100644 --- a/usr/src/lib/brand/native/zone/platform.xml +++ b/usr/src/lib/brand/native/zone/platform.xml @@ -20,7 +20,7 @@ CDDL HEADER END - Copyright 2008 Sun Microsystems, Inc. All rights reserved. + Copyright 2009 Sun Microsystems, Inc. All rights reserved. Use is subject to license terms. DO NOT EDIT THIS FILE. 
@@ -97,6 +97,7 @@ <device match="ipf" ip-type="exclusive" /> <device match="ipl" ip-type="exclusive" /> <device match="iplookup" ip-type="exclusive" /> + <device match="ipmpstub" ip-type="exclusive" /> <device match="ipnat" ip-type="exclusive" /> <device match="ipscan" ip-type="exclusive" /> <device match="ipsecah" ip-type="exclusive" /> diff --git a/usr/src/lib/brand/sn1/zone/platform.xml b/usr/src/lib/brand/sn1/zone/platform.xml index 1659d8851c..b3bb0d7962 100644 --- a/usr/src/lib/brand/sn1/zone/platform.xml +++ b/usr/src/lib/brand/sn1/zone/platform.xml @@ -20,7 +20,7 @@ CDDL HEADER END - Copyright 2008 Sun Microsystems, Inc. All rights reserved. + Copyright 2009 Sun Microsystems, Inc. All rights reserved. Use is subject to license terms. DO NOT EDIT THIS FILE. @@ -101,6 +101,7 @@ <device match="ipf" ip-type="exclusive" /> <device match="ipl" ip-type="exclusive" /> <device match="iplookup" ip-type="exclusive" /> + <device match="ipmpstub" ip-type="exclusive" /> <device match="ipnat" ip-type="exclusive" /> <device match="ipscan" ip-type="exclusive" /> <device match="ipsecah" ip-type="exclusive" /> diff --git a/usr/src/lib/libbsm/common/adt.c b/usr/src/lib/libbsm/common/adt.c index 23f78b6247..d9947622d4 100644 --- a/usr/src/lib/libbsm/common/adt.c +++ b/usr/src/lib/libbsm/common/adt.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -2137,7 +2137,7 @@ adt_get_local_address(int family, struct ifaddrlist *al) int ifal_count; int i; - if ((ifal_count = ifaddrlist(&ifal, family, errbuf)) <= 0) { + if ((ifal_count = ifaddrlist(&ifal, family, 0, errbuf)) <= 0) { int serrno = errno; (void) snprintf(msg, sizeof (msg), "adt_get_local_address " diff --git a/usr/src/lib/libdlpi/common/libdlpi.c b/usr/src/lib/libdlpi/common/libdlpi.c index 14c4451081..d546807342 100644 --- a/usr/src/lib/libdlpi/common/libdlpi.c +++ b/usr/src/lib/libdlpi/common/libdlpi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1109,7 +1109,7 @@ i_dlpi_open(const char *provider, int *fd, uint_t flags, boolean_t style1) /* open libdladm handle rather than taking it as input */ if (dladm_open(&handle) != DLADM_STATUS_OK) - return (DLPI_FAILURE); + goto fallback; if (dladm_dev2linkid(handle, device, &linkid) == DLADM_STATUS_OK) { @@ -1400,7 +1400,7 @@ i_dlpi_strgetmsg(dlpi_impl_t *dip, int msec, dlpi_msg_t *dlreplyp, void *databuf, size_t *datalenp, size_t *totdatalenp) { int retval; - int flags = 0; + int flags; int fd = dip->dli_fd; struct strbuf ctl, data; struct pollfd pfd; @@ -1437,16 +1437,17 @@ i_dlpi_strgetmsg(dlpi_impl_t *dip, int msec, dlpi_msg_t *dlreplyp, start = gethrtime() / (NANOSEC / MILLISEC); switch (poll(&pfd, 1, msec)) { - default: - if (pfd.revents & POLLHUP) - return (DL_SYSERR); - break; - case 0: - return (DLPI_ETIMEDOUT); - case -1: + default: + if (pfd.revents & POLLHUP) return (DL_SYSERR); + break; + case 0: + return (DLPI_ETIMEDOUT); + case -1: + return (DL_SYSERR); } + flags = 0; if ((retval = getmsg(fd, &ctl, &data, &flags)) < 0) return (DL_SYSERR); diff --git a/usr/src/lib/libinetcfg/common/inetcfg.c b/usr/src/lib/libinetcfg/common/inetcfg.c index 38beca5574..e1f09a881a 100644 --- a/usr/src/lib/libinetcfg/common/inetcfg.c +++ 
b/usr/src/lib/libinetcfg/common/inetcfg.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -743,7 +741,8 @@ icfg_set_flags(icfg_handle_t handle, uint64_t flags) struct lifreq lifr; uint64_t oflags; int ret; - int rtsock; + int rtsock = -1; + int aware = RTAW_UNDER_IPMP; (void) strlcpy(lifr.lifr_name, handle->ifh_interface.if_name, sizeof (lifr.lifr_name)); @@ -757,10 +756,16 @@ icfg_set_flags(icfg_handle_t handle, uint64_t flags) /* * Any time flags are changed on an interface that has IFF_UP set, * you'll get a routing socket message. We care about the status, - * though, only when the new flags are marked "up." + * though, only when the new flags are marked "up." Since we may be + * changing an IPMP test address, we enable RTAW_UNDER_IPMP. */ - rtsock = (flags & IFF_UP) ? - socket(PF_ROUTE, SOCK_RAW, ICFG_FAMILY(handle)) : -1; + if (flags & IFF_UP) { + rtsock = socket(PF_ROUTE, SOCK_RAW, ICFG_FAMILY(handle)); + if (rtsock != -1) { + (void) setsockopt(rtsock, SOL_ROUTE, RT_AWARE, &aware, + sizeof (aware)); + } + } lifr.lifr_flags = flags; if (ioctl(handle->ifh_sock, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) { @@ -993,7 +998,8 @@ icfg_set_addr(icfg_handle_t handle, const struct sockaddr *addr, struct lifreq lifr; uint64_t flags; int ret; - int rtsock; + int rtsock = -1; + int aware = RTAW_UNDER_IPMP; (void) memset(&lifr.lifr_addr, 0, sizeof (lifr.lifr_addr)); if ((ret = to_sockaddr_storage(ICFG_FAMILY(handle), addr, addrlen, @@ -1002,15 +1008,19 @@ icfg_set_addr(icfg_handle_t handle, const struct sockaddr *addr, } /* - * Need to do check on duplicate address detection results if the - * interface is up. + * Need to check duplicate address detection results if the address is + * up. 
Since this may be an IPMP test address, enable RTAW_UNDER_IPMP. */ - if ((ret = icfg_get_flags(handle, &flags)) != ICFG_SUCCESS) { + if ((ret = icfg_get_flags(handle, &flags)) != ICFG_SUCCESS) return (ret); - } - rtsock = (flags & IFF_UP) ? - socket(PF_ROUTE, SOCK_RAW, ICFG_FAMILY(handle)) : -1; + if (flags & IFF_UP) { + rtsock = socket(PF_ROUTE, SOCK_RAW, ICFG_FAMILY(handle)); + if (rtsock != -1) { + (void) setsockopt(rtsock, SOL_ROUTE, RT_AWARE, &aware, + sizeof (aware)); + } + } (void) strlcpy(lifr.lifr_name, handle->ifh_interface.if_name, sizeof (lifr.lifr_name)); diff --git a/usr/src/lib/libinetutil/Makefile.com b/usr/src/lib/libinetutil/Makefile.com index 810f24bd71..cd3a0d6e33 100644 --- a/usr/src/lib/libinetutil/Makefile.com +++ b/usr/src/lib/libinetutil/Makefile.com @@ -19,15 +19,13 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# -LIBRARY = libinetutil.a -VERS = .1 -OBJECTS = octet.o inetutil4.o ifspec.o ifaddrlist.o eh.o tq.o +LIBRARY = libinetutil.a +VERS = .1 +OBJECTS = octet.o inetutil.o ifspec.o ifaddrlist.o ifaddrlistx.o eh.o tq.o include ../../Makefile.lib @@ -38,9 +36,9 @@ LIBS = $(DYNLIB) $(LINTLIB) SRCDIR = ../common COMDIR = $(SRC)/common/net/dhcp -SRCS = $(COMDIR)/octet.c $(SRCDIR)/inetutil4.c \ +SRCS = $(COMDIR)/octet.c $(SRCDIR)/inetutil.c \ $(SRCDIR)/ifspec.c $(SRCDIR)/eh.c $(SRCDIR)/tq.c \ - $(SRCDIR)/ifaddrlist.c + $(SRCDIR)/ifaddrlist.c $(SRCDIR)/ifaddrlistx.c $(LINTLIB):= SRCS = $(SRCDIR)/$(LINTSRC) LDLIBS += -lsocket -lc diff --git a/usr/src/lib/libinetutil/common/ifaddrlist.c b/usr/src/lib/libinetutil/common/ifaddrlist.c index 383dc2afb0..fa67a0fc37 100644 --- a/usr/src/lib/libinetutil/common/ifaddrlist.c +++ b/usr/src/lib/libinetutil/common/ifaddrlist.c @@ -1,5 +1,5 @@ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,9 +38,6 @@ * @(#) $Header: ifaddrlist.c,v 1.2 97/04/22 13:31:05 leres Exp $ (LBL) */ -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <alloca.h> #include <errno.h> #include <libinetutil.h> #include <stdio.h> @@ -54,9 +51,9 @@ * See <libinetutil.h> for a description of the programming interface. */ int -ifaddrlist(struct ifaddrlist **ipaddrp, int family, char *errbuf) +ifaddrlist(struct ifaddrlist **ipaddrp, int family, uint_t flags, char *errbuf) { - struct ifaddrlist *ifaddrlist, *al; + struct ifaddrlist *ifaddrlist = NULL, *al = NULL; struct sockaddr_in *sin; struct sockaddr_in6 *sin6; struct lifconf lifc; @@ -64,31 +61,28 @@ ifaddrlist(struct ifaddrlist **ipaddrp, int family, char *errbuf) struct lifreq *lifrp; int i, count, nlifr; int fd; - const char *iocstr; + const char *opstr; + (void) memset(&lifc, 0, sizeof (lifc)); if (family != AF_INET && family != AF_INET6) { (void) strlcpy(errbuf, "invalid address family", ERRBUFSIZE); return (-1); } - fd = socket(family, SOCK_DGRAM, 0); - if (fd == -1) { - (void) snprintf(errbuf, ERRBUFSIZE, "socket: %s", - strerror(errno)); - return (-1); + if ((fd = socket(family, SOCK_DGRAM, 0)) == -1) { + opstr = "socket"; + goto fail; } /* * Get the number of network interfaces of type `family'. 
*/ lifn.lifn_family = family; - lifn.lifn_flags = 0; + lifn.lifn_flags = flags; again: if (ioctl(fd, SIOCGLIFNUM, &lifn) == -1) { - (void) snprintf(errbuf, ERRBUFSIZE, "SIOCGLIFNUM: %s", - strerror(errno)); - (void) close(fd); - return (-1); + opstr = "SIOCGLIFNUM"; + goto fail; } /* @@ -97,16 +91,17 @@ again: */ lifn.lifn_count += 4; + lifc.lifc_flags = flags; lifc.lifc_family = family; lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq); - lifc.lifc_buf = alloca(lifc.lifc_len); - lifc.lifc_flags = 0; + if ((lifc.lifc_buf = realloc(lifc.lifc_buf, lifc.lifc_len)) == NULL) { + opstr = "realloc"; + goto fail; + } if (ioctl(fd, SIOCGLIFCONF, &lifc) == -1) { - (void) snprintf(errbuf, ERRBUFSIZE, "SIOCGLIFCONF: %s", - strerror(errno)); - (void) close(fd); - return (-1); + opstr = "SIOCGLIFCONF"; + goto fail; } /* @@ -121,12 +116,9 @@ again: /* * Allocate the address list to return. */ - ifaddrlist = calloc(nlifr, sizeof (struct ifaddrlist)); - if (ifaddrlist == NULL) { - (void) snprintf(errbuf, ERRBUFSIZE, "calloc: %s", - strerror(errno)); - (void) close(fd); - return (-1); + if ((ifaddrlist = calloc(nlifr, sizeof (struct ifaddrlist))) == NULL) { + opstr = "calloc"; + goto fail; } /* @@ -142,7 +134,7 @@ again: if (ioctl(fd, SIOCGLIFFLAGS, lifrp) == -1) { if (errno == ENXIO) continue; - iocstr = "SIOCGLIFFLAGS"; + opstr = "SIOCGLIFFLAGS"; goto fail; } al->flags = lifrp->lifr_flags; @@ -150,7 +142,7 @@ again: if (ioctl(fd, SIOCGLIFINDEX, lifrp) == -1) { if (errno == ENXIO) continue; - iocstr = "SIOCGLIFINDEX"; + opstr = "SIOCGLIFINDEX"; goto fail; } al->index = lifrp->lifr_index; @@ -158,7 +150,7 @@ again: if (ioctl(fd, SIOCGLIFADDR, lifrp) == -1) { if (errno == ENXIO) continue; - iocstr = "SIOCGLIFADDR"; + opstr = "SIOCGLIFADDR"; goto fail; } @@ -174,6 +166,7 @@ again: } (void) close(fd); + free(lifc.lifc_buf); if (count == 0) { free(ifaddrlist); *ipaddrp = NULL; @@ -183,9 +176,14 @@ again: *ipaddrp = ifaddrlist; return (count); fail: - (void) snprintf(errbuf, 
ERRBUFSIZE, "%s: %s: %s", iocstr, al->device, - strerror(errno)); - + if (al == NULL) { + (void) snprintf(errbuf, ERRBUFSIZE, "%s: %s", opstr, + strerror(errno)); + } else { + (void) snprintf(errbuf, ERRBUFSIZE, "%s: %s: %s", opstr, + al->device, strerror(errno)); + } + free(lifc.lifc_buf); free(ifaddrlist); (void) close(fd); return (-1); diff --git a/usr/src/lib/libinetutil/common/ifaddrlistx.c b/usr/src/lib/libinetutil/common/ifaddrlistx.c new file mode 100644 index 0000000000..ce85c5521f --- /dev/null +++ b/usr/src/lib/libinetutil/common/ifaddrlistx.c @@ -0,0 +1,168 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <errno.h> +#include <libinetutil.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/socket.h> +#include <sys/sockio.h> + +/* + * Create a list of the addresses on physical interface `ifname' with at least + * one of the flags in `set' set and all of the flags in `clear' clear. + * Return the number of items in the list, or -1 on failure. 
+ */ +int +ifaddrlistx(const char *ifname, uint64_t set, uint64_t clear, + ifaddrlistx_t **ifaddrsp) +{ + struct lifconf lifc; + struct lifnum lifn; + struct lifreq *lifrp; + ifaddrlistx_t *ifaddrp, *ifaddrs = NULL; + int i, nlifr, naddr = 0; + char *cp; + uint_t flags; + int s4, s6 = -1; + boolean_t isv6; + int save_errno; + struct sockaddr_storage addr; + + (void) memset(&lifc, 0, sizeof (lifc)); + flags = LIFC_NOXMIT | LIFC_ALLZONES | LIFC_TEMPORARY | LIFC_UNDER_IPMP; + + /* + * We need both IPv4 and IPv6 sockets to query both IPv4 and IPv6 + * interfaces below. + */ + if ((s4 = socket(AF_INET, SOCK_DGRAM, 0)) == -1 || + (s6 = socket(AF_INET6, SOCK_DGRAM, 0)) == -1) { + goto fail; + } + + /* + * Get the number of network interfaces of any address family. + */ + lifn.lifn_family = AF_UNSPEC; + lifn.lifn_flags = flags; +again: + if (ioctl(s4, SIOCGLIFNUM, &lifn) == -1) + goto fail; + + /* + * Pad the interface count to detect when additional interfaces have + * been configured between SIOCGLIFNUM and SIOCGLIFCONF. + */ + lifn.lifn_count += 4; + + lifc.lifc_flags = flags; + lifc.lifc_family = AF_UNSPEC; + lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq); + if ((lifc.lifc_buf = realloc(lifc.lifc_buf, lifc.lifc_len)) == NULL) + goto fail; + + if (ioctl(s4, SIOCGLIFCONF, &lifc) == -1) + goto fail; + + /* + * If every lifr_req slot is taken, then additional interfaces must + * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF. + * Recalculate to make sure we didn't miss any interfaces. + */ + nlifr = lifc.lifc_len / sizeof (struct lifreq); + if (nlifr >= lifn.lifn_count) + goto again; + + /* + * Populate the ifaddrlistx by querying each matching interface. If a + * query ioctl returns ENXIO, then the interface must have been + * removed after the SIOCGLIFCONF completed -- so we just ignore it. 
+ */ + for (lifrp = lifc.lifc_req, i = 0; i < nlifr; i++, lifrp++) { + if ((cp = strchr(lifrp->lifr_name, ':')) != NULL) + *cp = '\0'; + + if (strcmp(lifrp->lifr_name, ifname) != 0) + continue; + + if (cp != NULL) + *cp = ':'; + + addr = lifrp->lifr_addr; + isv6 = addr.ss_family == AF_INET6; + if (ioctl(isv6 ? s6 : s4, SIOCGLIFFLAGS, lifrp) == -1) { + if (errno == ENXIO) + continue; + goto fail; + } + + if (set != 0 && ((lifrp->lifr_flags & set) == 0) || + (lifrp->lifr_flags & clear) != 0) + continue; + + /* + * We've got a match; allocate a new record. + */ + if ((ifaddrp = malloc(sizeof (ifaddrlistx_t))) == NULL) + goto fail; + + (void) strlcpy(ifaddrp->ia_name, lifrp->lifr_name, LIFNAMSIZ); + ifaddrp->ia_flags = lifrp->lifr_flags; + ifaddrp->ia_addr = addr; + ifaddrp->ia_next = ifaddrs; + ifaddrs = ifaddrp; + naddr++; + } + + (void) close(s4); + (void) close(s6); + free(lifc.lifc_buf); + *ifaddrsp = ifaddrs; + return (naddr); +fail: + save_errno = errno; + (void) close(s4); + (void) close(s6); + free(lifc.lifc_buf); + ifaddrlistx_free(ifaddrs); + errno = save_errno; + return (-1); +} + +/* + * Free the provided ifaddrlistx_t. + */ +void +ifaddrlistx_free(ifaddrlistx_t *ifaddrp) +{ + ifaddrlistx_t *next_ifaddrp; + + for (; ifaddrp != NULL; ifaddrp = next_ifaddrp) { + next_ifaddrp = ifaddrp->ia_next; + free(ifaddrp); + } +} diff --git a/usr/src/lib/libinetutil/common/inetutil4.c b/usr/src/lib/libinetutil/common/inetutil.c index ff5607e192..195d080b79 100644 --- a/usr/src/lib/libinetutil/common/inetutil4.c +++ b/usr/src/lib/libinetutil/common/inetutil.c @@ -18,13 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <unistd.h> #include <netinet/in.h> #include <libinetutil.h> @@ -32,7 +31,7 @@ extern int getnetmaskbyaddr(const struct in_addr, struct in_addr *); /* - * Generic internet (v4) functions. + * Internet utility functions. */ /* @@ -67,3 +66,32 @@ get_netmask4(const struct in_addr *n_addrp, struct in_addr *s_addrp) else s_addrp->s_addr = IN_CLASSE_NET; } + +/* + * Checks if the IP addresses `ssp1' and `ssp2' are equal. + */ +boolean_t +sockaddrcmp(const struct sockaddr_storage *ssp1, + const struct sockaddr_storage *ssp2) +{ + struct in_addr addr1, addr2; + const struct in6_addr *addr6p1, *addr6p2; + + if (ssp1->ss_family != ssp2->ss_family) + return (B_FALSE); + + if (ssp1 == ssp2) + return (B_TRUE); + + switch (ssp1->ss_family) { + case AF_INET: + addr1 = ((const struct sockaddr_in *)ssp1)->sin_addr; + addr2 = ((const struct sockaddr_in *)ssp2)->sin_addr; + return (addr1.s_addr == addr2.s_addr); + case AF_INET6: + addr6p1 = &((const struct sockaddr_in6 *)ssp1)->sin6_addr; + addr6p2 = &((const struct sockaddr_in6 *)ssp2)->sin6_addr; + return (IN6_ARE_ADDR_EQUAL(addr6p1, addr6p2)); + } + return (B_FALSE); +} diff --git a/usr/src/lib/libinetutil/common/libinetutil.h b/usr/src/lib/libinetutil/common/libinetutil.h index b21d54f56c..0bece07e07 100644 --- a/usr/src/lib/libinetutil/common/libinetutil.h +++ b/usr/src/lib/libinetutil/common/libinetutil.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -21,15 +20,13 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _LIBINETUTIL_H #define _LIBINETUTIL_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Contains SMI-private API for general Internet functionality */ @@ -59,11 +56,14 @@ typedef struct { extern boolean_t ifparse_ifspec(const char *, ifspec_t *); extern void get_netmask4(const struct in_addr *, struct in_addr *); +extern boolean_t sockaddrcmp(const struct sockaddr_storage *, + const struct sockaddr_storage *); /* * Extended version of the classic BSD ifaddrlist() interface: * - * int ifaddrlist(struct ifaddrlist **addrlistp, int af, char *errbuf); + * int ifaddrlist(struct ifaddrlist **addrlistp, int af, uint_t flags, + * char *errbuf); * * * addrlistp: Upon success, ifaddrlist() sets *addrlistp to a * dynamically-allocated array of addresses. @@ -71,6 +71,9 @@ extern void get_netmask4(const struct in_addr *, struct in_addr *); * * af: Either AF_INET to obtain IPv4 addresses, or AF_INET6 to * obtain IPv6 addresses. * + * * flags: LIFC_* flags that control the classes of interfaces that + * will be visible. + * * * errbuf: A caller-supplied buffer of ERRBUFSIZE. Upon failure, * provides the reason for the failure. * @@ -89,9 +92,43 @@ struct ifaddrlist { uint64_t flags; /* interface flags */ }; -#define ERRBUFSIZE 128 /* expected size of third argument */ +#define ERRBUFSIZE 128 /* expected size of fourth argument */ + +extern int ifaddrlist(struct ifaddrlist **, int, uint_t, char *); -extern int ifaddrlist(struct ifaddrlist **, int, char *); +/* + * Similar to ifaddrlist(), but returns a linked-list of addresses for a + * *specific* interface name, and allows specific address flags to be matched + * against. A linked list is used rather than an array so that information + * can grow over time without affecting binary compatibility. Also, leaves + * error-handling up to the caller. Returns the number of ifaddrlistx's + * chained through ifaddrp. 
+ * + * int ifaddrlistx(const char *ifname, uint64_t set, uint64_t clear, + * ifaddrlistx_t **ifaddrp); + * + * * ifname: Interface name to match against. + * + * * set: One or more flags that must be set on the address for + * it to be returned. + * + * * clear: Flags that must be clear on the address for it to be + * returned. + * + * * ifaddrp: Upon success, ifaddrlistx() sets *ifaddrp to the head + * of a dynamically-allocated linked list of ifaddrlistx structures. + * + * Once done, the caller must free `ifaddrp' by calling ifaddrlistx_free(). + */ +typedef struct ifaddrlistx { + struct ifaddrlistx *ia_next; + char ia_name[LIFNAMSIZ]; + uint64_t ia_flags; + struct sockaddr_storage ia_addr; +} ifaddrlistx_t; + +extern int ifaddrlistx(const char *, uint64_t, uint64_t, ifaddrlistx_t **); +extern void ifaddrlistx_free(ifaddrlistx_t *); /* * Timer queues diff --git a/usr/src/lib/libinetutil/common/mapfile-vers b/usr/src/lib/libinetutil/common/mapfile-vers index 51c168fcc4..c9a7829fdb 100644 --- a/usr/src/lib/libinetutil/common/mapfile-vers +++ b/usr/src/lib/libinetutil/common/mapfile-vers @@ -19,17 +19,17 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# SUNWprivate_1.1 { global: get_netmask4; hexascii_to_octet; ifaddrlist; + ifaddrlistx; + ifaddrlistx_free; ifparse_ifspec; iu_adjust_timer; iu_cancel_timer; @@ -48,6 +48,7 @@ SUNWprivate_1.1 { iu_tq_destroy; iu_unregister_event; octet_to_hexascii; + sockaddrcmp; local: *; }; diff --git a/usr/src/lib/libipmp/Makefile b/usr/src/lib/libipmp/Makefile index 188c49c073..5d52f304dc 100644 --- a/usr/src/lib/libipmp/Makefile +++ b/usr/src/lib/libipmp/Makefile @@ -19,15 +19,13 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "%Z%%M% %I% %E% SMI" -# include $(SRC)/lib/Makefile.lib -HDRS = ipmp.h ipmp_mpathd.h ipmp_query.h ipmp_query_impl.h +HDRS = ipmp.h ipmp_admin.h ipmp_mpathd.h ipmp_query.h ipmp_query_impl.h HDRDIR = common SUBDIRS = $(MACH) diff --git a/usr/src/lib/libipmp/Makefile.com b/usr/src/lib/libipmp/Makefile.com index bea02659a8..d3065ae37c 100644 --- a/usr/src/lib/libipmp/Makefile.com +++ b/usr/src/lib/libipmp/Makefile.com @@ -19,20 +19,19 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# LIBRARY = libipmp.a VERS = .1 -OBJECTS = ipmp_query.o ipmp_mpathd.o ipmp.o +OBJECTS = ipmp_admin.o ipmp_query.o ipmp_mpathd.o ipmp.o include ../../Makefile.lib +include ../../Makefile.rootfs LIBS = $(DYNLIB) $(LINTLIB) -LDLIBS += -lsocket -lc +LDLIBS += -linetutil -lsocket -lc SRCDIR = ../common $(LINTLIB):= SRCS = $(SRCDIR)/$(LINTSRC) diff --git a/usr/src/lib/libipmp/common/ipmp.c b/usr/src/lib/libipmp/common/ipmp.c index b9a7984889..cf9c3c7c3c 100644 --- a/usr/src/lib/libipmp/common/ipmp.c +++ b/usr/src/lib/libipmp/common/ipmp.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * IPMP general interfaces (PSARC/2002/615). */ @@ -34,6 +31,8 @@ #include <stdlib.h> #include <locale.h> #include <unistd.h> +#include <string.h> +#include <errno.h> #include "ipmp_impl.h" @@ -92,13 +91,15 @@ static char *errmsgs[IPMP_NERR] = { "operation failed", /* 1 IPMP_FAILURE */ "minimum failover redundancy not met", /* 2 IPMP_EMINRED */ "failback disabled", /* 3 IPMP_EFBDISABLED */ - "unable to completely fail back", /* 4 IPMP_EFBPARTIAL */ + "unknown IPMP data address", /* 4 IPMP_EUNKADDR */ "invalid argument", /* 5 IPMP_EINVAL */ "out of memory", /* 6 IPMP_ENOMEM */ "cannot contact in.mpathd", /* 7 IPMP_ENOMPATHD */ "unknown IPMP group", /* 8 IPMP_EUNKGROUP */ "interface is not using IPMP", /* 9 IPMP_EUNKIF */ - "unable to communicate with in.mpathd" /* 10 IPMP_EPROTO */ + "unable to communicate with in.mpathd", /* 10 IPMP_EPROTO */ + "interface has duplicate hardware address" + /* 11 IPMP_EHWADDRDUP */ }; /* @@ -110,5 +111,8 @@ ipmp_errmsg(int error) if (error >= IPMP_NERR || error < 0) return (dgettext(TEXT_DOMAIN, "<unknown error>")); + if (error == IPMP_FAILURE) + return (strerror(errno)); + return (dgettext(TEXT_DOMAIN, errmsgs[error])); } diff --git a/usr/src/lib/libipmp/common/ipmp.h b/usr/src/lib/libipmp/common/ipmp.h index 0112615a84..2ca0a9b2b9 100644 --- a/usr/src/lib/libipmp/common/ipmp.h +++ b/usr/src/lib/libipmp/common/ipmp.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
@@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IPMP_H #define _IPMP_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * General IPMP-related definitions and functions. * @@ -50,13 +47,14 @@ enum { IPMP_FAILURE, /* operation failed (check errno) */ IPMP_EMINRED, /* minimum failover redundancy not met */ IPMP_EFBDISABLED, /* failback disabled */ - IPMP_EFBPARTIAL, /* unable to completely fail back */ + IPMP_EUNKADDR, /* unknown IPMP data address */ IPMP_EINVAL, /* invalid argument */ IPMP_ENOMEM, /* out of memory */ IPMP_ENOMPATHD, /* cannot contact in.mpathd */ IPMP_EUNKGROUP, /* unknown IPMP group */ IPMP_EUNKIF, /* interface is not using IPMP */ IPMP_EPROTO, /* unable to communicate with in.mpathd */ + IPMP_EHWADDRDUP, /* interface has duplicate hardware address */ IPMP_NERR /* number of error codes */ }; diff --git a/usr/src/lib/libipmp/common/ipmp_admin.c b/usr/src/lib/libipmp/common/ipmp_admin.c new file mode 100644 index 0000000000..8a282f5286 --- /dev/null +++ b/usr/src/lib/libipmp/common/ipmp_admin.c @@ -0,0 +1,104 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * IPMP administrative interfaces (see PSARC/2007/272). + */ + +#include <assert.h> +#include <errno.h> +#include <string.h> +#include <unistd.h> +#include <sys/time.h> +#include <sys/types.h> + +#include "ipmp_impl.h" +#include "ipmp_mpathd.h" +#include "ipmp_admin.h" + +static int +ipmp_command(ipmp_handle_t handle, const void *req, uint_t reqsize) +{ + ipmp_state_t *statep = (ipmp_state_t *)handle; + mi_result_t result; + struct timeval end; + int save_errno; + int retval; + + if (gettimeofday(&end, NULL) == -1) + return (IPMP_FAILURE); + end.tv_sec += IPMP_REQTIMEOUT; + + assert(statep->st_fd == -1); + retval = ipmp_connect(&statep->st_fd); + if (retval != IPMP_SUCCESS) + return (retval); + + retval = ipmp_write(statep->st_fd, req, reqsize); + if (retval != IPMP_SUCCESS) + goto out; + + retval = ipmp_read(statep->st_fd, &result, sizeof (result), &end); + if (retval != IPMP_SUCCESS) + goto out; + + errno = result.me_sys_error; + retval = result.me_mpathd_error; +out: + save_errno = errno; + (void) close(statep->st_fd); + statep->st_fd = -1; + errno = save_errno; + return (retval); +} + +int +ipmp_offline(ipmp_handle_t handle, const char *ifname, uint_t minred) +{ + mi_offline_t mio; + + mio.mio_command = MI_OFFLINE; + mio.mio_min_redundancy = minred; + (void) strlcpy(mio.mio_ifname, ifname, LIFNAMSIZ); + return (ipmp_command(handle, &mio, sizeof (mio))); +} + +int +ipmp_undo_offline(ipmp_handle_t handle, const char *ifname) +{ + mi_undo_offline_t miu; + + miu.miu_command = MI_UNDO_OFFLINE; + (void) strlcpy(miu.miu_ifname, ifname, LIFNAMSIZ); + return (ipmp_command(handle, &miu, sizeof (miu))); +} + +int 
+ipmp_ping_daemon(ipmp_handle_t handle) +{ + mi_ping_t mip; + + mip.mip_command = MI_PING; + return (ipmp_command(handle, &mip, sizeof (mip))); +} diff --git a/usr/src/lib/libipmp/common/ipmp_admin.h b/usr/src/lib/libipmp/common/ipmp_admin.h new file mode 100644 index 0000000000..fa0986f7fa --- /dev/null +++ b/usr/src/lib/libipmp/common/ipmp_admin.h @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _IPMP_ADMIN_H +#define _IPMP_ADMIN_H + +#include <ipmp.h> +#include <sys/types.h> + +/* + * IPMP administrative interfaces. + * + * These interfaces may only be used within ON or after signing a contract + * with ON. For documentation, refer to PSARC/2007/272. 
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +extern int ipmp_offline(ipmp_handle_t, const char *, uint_t); +extern int ipmp_undo_offline(ipmp_handle_t, const char *); +extern int ipmp_ping_daemon(ipmp_handle_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _IPMP_ADMIN_H */ diff --git a/usr/src/lib/libipmp/common/ipmp_mpathd.c b/usr/src/lib/libipmp/common/ipmp_mpathd.c index ee1d35de33..e24de71017 100644 --- a/usr/src/lib/libipmp/common/ipmp_mpathd.c +++ b/usr/src/lib/libipmp/common/ipmp_mpathd.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -18,14 +17,11 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Low-level interfaces for communicating with in.mpathd(1M). * @@ -66,16 +62,16 @@ ipmp_connect(int *fdp) return (IPMP_FAILURE); /* - * Enable TCP_ANONPRIVBIND so the kernel will choose our source port. - * Since we're using loopback sockets, requiring use of privileged - * source ports is sufficient for security. + * If we have sufficient privilege, enable TCP_ANONPRIVBIND so the + * kernel will choose a privileged source port (since in.mpathd only + * accepts requests on loopback, this is sufficient for security). + * If not, drive on since MI_QUERY and MI_PING commands are allowed + * from non-privileged ports. 
*/ - if (setsockopt(fd, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) == -1) - goto fail; + (void) setsockopt(fd, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, sizeof (on)); /* - * Bind to a privileged port chosen by the kernel. + * Bind to a port chosen by the kernel. */ (void) memset(&sin, 0, sizeof (struct sockaddr_in)); sin.sin_port = htons(0); diff --git a/usr/src/lib/libipmp/common/ipmp_mpathd.h b/usr/src/lib/libipmp/common/ipmp_mpathd.h index 61ae71b78f..7df3b4fd92 100644 --- a/usr/src/lib/libipmp/common/ipmp_mpathd.h +++ b/usr/src/lib/libipmp/common/ipmp_mpathd.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -18,26 +17,17 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 1999-2002 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IPMP_MPATHD_H #define _IPMP_MPATHD_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Definitions for the messaging protocol between in.mpathd and libipmp. - * This interface is loosely documented in PSARC/2000/306. - * - * PLEASE NOTE: Although this interface is officially consolidation-private, - * we will be reclassifying it as project-private in the future, and - * transitioning any existing consumers to use higher-level libipmp routines. - * - * Put another way: treat this as if it was project-private! + * This interface is project-private to the IPMP subsystem. 
*/ #include <sys/types.h> @@ -49,33 +39,41 @@ extern "C" { #endif #define MPATHD_PORT 5999 -#define MPATHD_PATH "/usr/lib/inet/in.mpathd" +#define MPATHD_PATH "/lib/inet/in.mpathd" /* * Supported commands. */ enum { - MI_PING = 0, /* sanity test */ + MI_PING = 0, /* ping in.mpathd */ MI_OFFLINE = 1, /* offline the interface */ MI_UNDO_OFFLINE = 2, /* undo the offline */ - MI_SETOINDEX = 3, /* set original interface index */ - MI_QUERY = 4, /* query ipmp-related information */ + MI_QUERY = 3, /* query ipmp-related information */ MI_NCMD /* total number of commands */ }; /* * Types of information which can be requested and received (except for - * IPMP_IFLIST, which can only be received). + * IPMP_IFLIST and IPMP_ADDRLIST, which can only be received). */ typedef enum { IPMP_GROUPLIST = 1, IPMP_GROUPINFO = 2, IPMP_IFINFO = 3, IPMP_IFLIST = 4, - IPMP_SNAP = 5 + IPMP_SNAP = 5, + IPMP_ADDRLIST = 6, + IPMP_ADDRINFO = 7 } ipmp_infotype_t; /* + * Daemon ping request. + */ +typedef struct mi_ping { + uint32_t mip_command; +} mi_ping_t; + +/* * Interface offline request; `mio_ifname' is the interface to offline; * `mio_min_redundancy' is the minimum amount of usable interfaces after * offline that must exist for the operation to succeed. @@ -83,7 +81,6 @@ typedef enum { typedef struct mi_offline { uint32_t mio_command; char mio_ifname[LIFNAMSIZ]; - char mio_move_to_if[LIFNAMSIZ]; /* currently unused */ uint32_t mio_min_redundancy; } mi_offline_t; @@ -97,24 +94,12 @@ typedef struct mi_undo_offline { } mi_undo_offline_t; /* - * Set original interface index request: `mis_lifname' is the name of the - * logical interface that is having its index reset; `mis_new_pifname' is the - * name of the interface whose index will be associated with `mis_lifname'; - * `mis_iftype' is the interface type. 
- */ -typedef struct mi_setoindex { - uint32_t mis_command; - char mis_lifname[LIFNAMSIZ]; - char mis_new_pifname[LIFNAMSIZ]; - uint32_t mis_iftype; -} mi_setoindex_t; - -/* * Retrieve IPMP-related information: `miq_inforeq' is the type of information - * being request (see above for the list of types). If the request is for - * either IPMP_GROUPINFO or IPMP_IFINFO, then either `miq_grname' or - * `miq_ifname' should be set (respectively) to indicate the name of the - * group or interface to retrieve the information for. + * being request (see above for the list of types). If the request type is + * IPMP_GROUPINFO, then `miq_grname' indicates the group. If the request type + * is IPMP_IFINFO, then `miq_ifname' indicates the interface. If the request + * type is IPMP_ADDRINFO then `miq_grname' indicates the group and `miq_addr' + * indicates the address. */ typedef struct mi_query { uint32_t miq_command; @@ -123,6 +108,7 @@ typedef struct mi_query { char miqu_ifname[LIFNAMSIZ]; char miqu_grname[LIFGRNAMSIZ]; } miq_infodata; + struct sockaddr_storage miq_addr; } mi_query_t; #define miq_ifname miq_infodata.miqu_ifname #define miq_grname miq_infodata.miqu_grname @@ -132,10 +118,10 @@ typedef struct mi_query { * requirement for receiving any command. */ union mi_commands { - uint32_t mi_command; + uint32_t mi_command; + mi_ping_t mi_pcmd; mi_offline_t mi_ocmd; mi_undo_offline_t mi_ucmd; - mi_setoindex_t mi_scmd; mi_query_t mi_qcmd; }; @@ -147,18 +133,7 @@ typedef struct mi_result { uint32_t me_mpathd_error; /* Mpathd error */ } mi_result_t; -/* - * Legacy values for me_mpathd_error; the daemon now returns the IPMP - * error codes defined in <ipmp.h>, which are compatible with these error - * codes. These will be removed in the future. 
- */ -enum { - MPATHD_SUCCESS = 0, /* operation succeeded */ - MPATHD_SYS_ERROR = 1, /* check me_sys_error for the errno */ - MPATHD_MIN_RED_ERROR = 2, /* minimum redundancy not met */ - MPATHD_FAILBACK_DISABLED = 3, /* failback administratively disabled */ - MPATHD_FAILBACK_PARTIAL = 4 /* unable to completely failback */ -}; +#define IPMP_REQTIMEOUT 5 /* seconds */ extern int ipmp_connect(int *); extern int ipmp_read(int, void *, size_t, const struct timeval *); diff --git a/usr/src/lib/libipmp/common/ipmp_query.c b/usr/src/lib/libipmp/common/ipmp_query.c index 8a7dc7ee69..a0af2da578 100644 --- a/usr/src/lib/libipmp/common/ipmp_query.c +++ b/usr/src/lib/libipmp/common/ipmp_query.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -18,20 +17,18 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* - * IPMP query interfaces (PSARC/2002/615). + * IPMP query interfaces (see PSARC/2002/615 and PSARC/2007/272). 
*/ #include <assert.h> #include <errno.h> +#include <libinetutil.h> #include <string.h> #include <stdlib.h> #include <unistd.h> @@ -41,13 +38,19 @@ #include "ipmp_mpathd.h" #include "ipmp_query_impl.h" -#define IPMP_REQTIMEOUT 5 /* seconds */ - static ipmp_ifinfo_t *ipmp_ifinfo_clone(ipmp_ifinfo_t *); +static ipmp_addrinfo_t *ipmp_addrinfo_clone(ipmp_addrinfo_t *); +static ipmp_addrlist_t *ipmp_addrlist_clone(ipmp_addrlist_t *); static ipmp_grouplist_t *ipmp_grouplist_clone(ipmp_grouplist_t *); static ipmp_groupinfo_t *ipmp_groupinfo_clone(ipmp_groupinfo_t *); +static ipmp_iflist_t *ipmp_iflist_create(uint_t, char (*)[LIFNAMSIZ]); +static void ipmp_freeiflist(ipmp_iflist_t *); +static ipmp_addrlist_t *ipmp_addrlist_create(uint_t, struct sockaddr_storage *); +static void ipmp_freeaddrlist(ipmp_addrlist_t *); static ipmp_groupinfo_t *ipmp_snap_getgroupinfo(ipmp_snap_t *, const char *); static ipmp_ifinfo_t *ipmp_snap_getifinfo(ipmp_snap_t *, const char *); +static ipmp_addrinfo_t *ipmp_snap_getaddrinfo(ipmp_snap_t *, const char *, + struct sockaddr_storage *); static int ipmp_snap_take(ipmp_state_t *, ipmp_snap_t **); static boolean_t ipmp_checktlv(ipmp_infotype_t, size_t, void *); static int ipmp_querydone(ipmp_state_t *, int); @@ -62,7 +65,7 @@ static int ipmp_querydone(ipmp_state_t *, int); */ static int ipmp_sendquery(ipmp_state_t *statep, ipmp_infotype_t type, const char *name, - struct timeval *endtp) + const void *addr, struct timeval *endtp) { mi_query_t query; mi_result_t result; @@ -72,6 +75,11 @@ ipmp_sendquery(ipmp_state_t *statep, ipmp_infotype_t type, const char *name, query.miq_inforeq = type; switch (type) { + case IPMP_ADDRINFO: + (void) strlcpy(query.miq_grname, name, LIFGRNAMSIZ); + query.miq_addr = *(struct sockaddr_storage *)addr; + break; + case IPMP_GROUPINFO: (void) strlcpy(query.miq_grname, name, LIFGRNAMSIZ); break; @@ -138,6 +146,61 @@ ipmp_readinfo(ipmp_state_t *statep, ipmp_infotype_t infotype, void **infop, } /* + * Using `statep', read 
in the remaining IPMP group information TLVs from + * in.mpathd into `grinfop' before the current time becomes `endtp'. Returns + * an IPMP error code. On failure, `grinfop' will have its original contents. + */ +static int +ipmp_readgroupinfo_lists(ipmp_state_t *statep, ipmp_groupinfo_t *grinfop, + const struct timeval *endtp) +{ + int retval; + ipmp_iflist_t *iflistp; + ipmp_addrlist_t *adlistp; + + retval = ipmp_readinfo(statep, IPMP_IFLIST, (void **)&iflistp, endtp); + if (retval != IPMP_SUCCESS) + return (retval); + + retval = ipmp_readinfo(statep, IPMP_ADDRLIST, (void **)&adlistp, endtp); + if (retval != IPMP_SUCCESS) { + ipmp_freeiflist(iflistp); + return (retval); + } + + grinfop->gr_iflistp = iflistp; + grinfop->gr_adlistp = adlistp; + return (IPMP_SUCCESS); +} + +/* + * Using `statep', read in the remaining IPMP interface information TLVs from + * in.mpathd into `ifinfop' before the current time becomes `endtp'. Returns + * an IPMP error code. On failure, `ifinfop' will have its original contents. + */ +static int +ipmp_readifinfo_lists(ipmp_state_t *statep, ipmp_ifinfo_t *ifinfop, + const struct timeval *endtp) +{ + int retval; + ipmp_addrlist_t *tlist4p, *tlist6p; + + retval = ipmp_readinfo(statep, IPMP_ADDRLIST, (void **)&tlist4p, endtp); + if (retval != IPMP_SUCCESS) + return (retval); + + retval = ipmp_readinfo(statep, IPMP_ADDRLIST, (void **)&tlist6p, endtp); + if (retval != IPMP_SUCCESS) { + ipmp_freeaddrlist(tlist4p); + return (retval); + } + + ifinfop->if_targinfo4.it_targlistp = tlist4p; + ifinfop->if_targinfo6.it_targlistp = tlist6p; + return (IPMP_SUCCESS); +} + +/* * Complete the query operation started in ipmp_sendquery(). The interface is * designed to be easy to use in the `return' statement of a function, and * thus returns the passed in `retval' and preserves `errno'. @@ -169,7 +232,7 @@ ipmp_getgrouplist(ipmp_handle_t handle, ipmp_grouplist_t **grlistpp) return (*grlistpp != NULL ? 
IPMP_SUCCESS : IPMP_ENOMEM); } - retval = ipmp_sendquery(statep, IPMP_GROUPLIST, NULL, &end); + retval = ipmp_sendquery(statep, IPMP_GROUPLIST, NULL, NULL, &end); if (retval != IPMP_SUCCESS) return (retval); @@ -196,7 +259,6 @@ ipmp_getgroupinfo(ipmp_handle_t handle, const char *name, ipmp_groupinfo_t **grinfopp) { ipmp_state_t *statep = handle; - ipmp_iflist_t *iflistp; int retval; struct timeval end; ipmp_groupinfo_t *grinfop; @@ -210,7 +272,7 @@ ipmp_getgroupinfo(ipmp_handle_t handle, const char *name, return (*grinfopp != NULL ? IPMP_SUCCESS : IPMP_ENOMEM); } - retval = ipmp_sendquery(statep, IPMP_GROUPINFO, name, &end); + retval = ipmp_sendquery(statep, IPMP_GROUPINFO, name, NULL, &end); if (retval != IPMP_SUCCESS) return (retval); @@ -218,11 +280,9 @@ ipmp_getgroupinfo(ipmp_handle_t handle, const char *name, if (retval != IPMP_SUCCESS) return (ipmp_querydone(statep, retval)); - retval = ipmp_readinfo(statep, IPMP_IFLIST, (void **)&iflistp, &end); + retval = ipmp_readgroupinfo_lists(statep, *grinfopp, &end); if (retval != IPMP_SUCCESS) free(*grinfopp); - else - (*grinfopp)->gr_iflistp = iflistp; return (ipmp_querydone(statep, retval)); } @@ -233,7 +293,8 @@ ipmp_getgroupinfo(ipmp_handle_t handle, const char *name, void ipmp_freegroupinfo(ipmp_groupinfo_t *grinfop) { - free(grinfop->gr_iflistp); + ipmp_freeaddrlist(grinfop->gr_adlistp); + ipmp_freeiflist(grinfop->gr_iflistp); free(grinfop); } @@ -259,11 +320,18 @@ ipmp_getifinfo(ipmp_handle_t handle, const char *name, ipmp_ifinfo_t **ifinfopp) return (*ifinfopp != NULL ? 
IPMP_SUCCESS : IPMP_ENOMEM); } - retval = ipmp_sendquery(statep, IPMP_IFINFO, name, &end); + retval = ipmp_sendquery(statep, IPMP_IFINFO, name, NULL, &end); if (retval != IPMP_SUCCESS) return (retval); retval = ipmp_readinfo(statep, IPMP_IFINFO, (void **)ifinfopp, &end); + if (retval != IPMP_SUCCESS) + return (ipmp_querydone(statep, retval)); + + retval = ipmp_readifinfo_lists(statep, *ifinfopp, &end); + if (retval != IPMP_SUCCESS) + free(*ifinfopp); + return (ipmp_querydone(statep, retval)); } @@ -273,10 +341,52 @@ ipmp_getifinfo(ipmp_handle_t handle, const char *name, ipmp_ifinfo_t **ifinfopp) void ipmp_freeifinfo(ipmp_ifinfo_t *ifinfop) { + ipmp_freeaddrlist(ifinfop->if_targinfo4.it_targlistp); + ipmp_freeaddrlist(ifinfop->if_targinfo6.it_targlistp); free(ifinfop); } /* + * Using `handle', get the address information associated with address `addrp' + * on group `grname' and store the results in a dynamically allocated buffer + * pointed to by `*adinfopp'. Returns an IPMP error code. + */ +int +ipmp_getaddrinfo(ipmp_handle_t handle, const char *grname, + struct sockaddr_storage *addrp, ipmp_addrinfo_t **adinfopp) +{ + ipmp_state_t *statep = handle; + ipmp_addrinfo_t *adinfop; + int retval; + struct timeval end; + + if (statep->st_snap != NULL) { + adinfop = ipmp_snap_getaddrinfo(statep->st_snap, grname, addrp); + if (adinfop == NULL) + return (IPMP_EUNKADDR); + + *adinfopp = ipmp_addrinfo_clone(adinfop); + return (*adinfopp != NULL ? IPMP_SUCCESS : IPMP_ENOMEM); + } + + retval = ipmp_sendquery(statep, IPMP_ADDRINFO, grname, addrp, &end); + if (retval != IPMP_SUCCESS) + return (retval); + + retval = ipmp_readinfo(statep, IPMP_ADDRINFO, (void **)adinfopp, &end); + return (ipmp_querydone(statep, retval)); +} + +/* + * Free the address information pointed to by `adinfop'. + */ +void +ipmp_freeaddrinfo(ipmp_addrinfo_t *adinfop) +{ + free(adinfop); +} + +/* * Check if `buf' has a NUL byte in its first `bufsize' bytes. 
*/ static boolean_t @@ -300,12 +410,25 @@ ipmp_checktlv(ipmp_infotype_t type, size_t len, void *value) ipmp_ifinfo_t *ifinfop; ipmp_grouplist_t *grlistp; ipmp_groupinfo_t *grinfop; + ipmp_addrlist_t *adlistp; unsigned int i; switch (type) { + case IPMP_ADDRINFO: + if (len != sizeof (ipmp_addrinfo_t)) + return (B_FALSE); + break; + + case IPMP_ADDRLIST: + adlistp = (ipmp_addrlist_t *)value; + if (len < IPMP_ADDRLIST_SIZE(0) || + len < IPMP_ADDRLIST_SIZE(adlistp->al_naddr)) + return (B_FALSE); + break; + case IPMP_IFLIST: iflistp = (ipmp_iflist_t *)value; - if (len < IPMP_IFLIST_MINSIZE || + if (len < IPMP_IFLIST_SIZE(0) || len < IPMP_IFLIST_SIZE(iflistp->il_nif)) return (B_FALSE); @@ -326,7 +449,7 @@ ipmp_checktlv(ipmp_infotype_t type, size_t len, void *value) case IPMP_GROUPLIST: grlistp = (ipmp_grouplist_t *)value; - if (len < IPMP_GROUPLIST_MINSIZE || + if (len < IPMP_GROUPLIST_SIZE(0) || len < IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup)) return (B_FALSE); @@ -357,9 +480,8 @@ ipmp_checktlv(ipmp_infotype_t type, size_t len, void *value) } /* - * Create a group list with signature `sig' containing `ngroup' groups named - * by `groups'. Returns a pointer to the new group list on success, or NULL - * on failure. + * Create a group list; arguments match ipmp_grouplist_t fields. Returns a + * pointer to the new group list on success, or NULL on failure. */ ipmp_grouplist_t * ipmp_grouplist_create(uint64_t sig, unsigned int ngroup, @@ -392,13 +514,80 @@ ipmp_grouplist_clone(ipmp_grouplist_t *grlistp) } /* - * Create an interface information structure for interface `name' and - * associate `group', `state' and `type' with it. Returns a pointer to the - * interface information on success, or NULL on failure. + * Create target information; arguments match ipmp_targinfo_t fields. Returns + * a pointer to the new target info on success, or NULL on failure. 
+ */ +ipmp_targinfo_t * +ipmp_targinfo_create(const char *name, struct sockaddr_storage *testaddrp, + ipmp_if_targmode_t targmode, uint_t ntarg, struct sockaddr_storage *targs) +{ + ipmp_targinfo_t *targinfop; + + targinfop = malloc(sizeof (ipmp_targinfo_t)); + if (targinfop == NULL) + return (NULL); + + targinfop->it_testaddr = *testaddrp; + targinfop->it_targmode = targmode; + targinfop->it_targlistp = ipmp_addrlist_create(ntarg, targs); + if (targinfop->it_targlistp == NULL) { + ipmp_freetarginfo(targinfop); + return (NULL); + } + (void) strlcpy(targinfop->it_name, name, LIFNAMSIZ); + + return (targinfop); +} + +/* + * Free the target information pointed to by `targinfop'. + */ +void +ipmp_freetarginfo(ipmp_targinfo_t *targinfop) +{ + free(targinfop->it_targlistp); + free(targinfop); +} + +/* + * Create an interface list; arguments match ipmp_iflist_t fields. Returns a + * pointer to the new interface list on success, or NULL on failure. + */ +static ipmp_iflist_t * +ipmp_iflist_create(uint_t nif, char (*ifs)[LIFNAMSIZ]) +{ + unsigned int i; + ipmp_iflist_t *iflistp; + + iflistp = malloc(IPMP_IFLIST_SIZE(nif)); + if (iflistp == NULL) + return (NULL); + + iflistp->il_nif = nif; + for (i = 0; i < nif; i++) + (void) strlcpy(iflistp->il_ifs[i], ifs[i], LIFNAMSIZ); + + return (iflistp); +} + +/* + * Free the interface list pointed to by `iflistp'. + */ +static void +ipmp_freeiflist(ipmp_iflist_t *iflistp) +{ + free(iflistp); +} + +/* + * Create an interface; arguments match ipmp_ifinfo_t fields. Returns a + * pointer to the new interface on success, or NULL on failure. 
*/ ipmp_ifinfo_t * ipmp_ifinfo_create(const char *name, const char *group, ipmp_if_state_t state, - ipmp_if_type_t type) + ipmp_if_type_t type, ipmp_if_linkstate_t linkstate, + ipmp_if_probestate_t probestate, ipmp_if_flags_t flags, + ipmp_targinfo_t *targinfo4p, ipmp_targinfo_t *targinfo6p) { ipmp_ifinfo_t *ifinfop; @@ -408,8 +597,25 @@ ipmp_ifinfo_create(const char *name, const char *group, ipmp_if_state_t state, (void) strlcpy(ifinfop->if_name, name, LIFNAMSIZ); (void) strlcpy(ifinfop->if_group, group, LIFGRNAMSIZ); - ifinfop->if_state = state; - ifinfop->if_type = type; + + ifinfop->if_state = state; + ifinfop->if_type = type; + ifinfop->if_linkstate = linkstate; + ifinfop->if_probestate = probestate; + ifinfop->if_flags = flags; + ifinfop->if_targinfo4 = *targinfo4p; + ifinfop->if_targinfo6 = *targinfo6p; + + ifinfop->if_targinfo4.it_targlistp = + ipmp_addrlist_clone(targinfo4p->it_targlistp); + ifinfop->if_targinfo6.it_targlistp = + ipmp_addrlist_clone(targinfo6p->it_targlistp); + + if (ifinfop->if_targinfo4.it_targlistp == NULL || + ifinfop->if_targinfo6.it_targlistp == NULL) { + ipmp_freeifinfo(ifinfop); + return (NULL); + } return (ifinfop); } @@ -422,40 +628,41 @@ ipmp_ifinfo_t * ipmp_ifinfo_clone(ipmp_ifinfo_t *ifinfop) { return (ipmp_ifinfo_create(ifinfop->if_name, ifinfop->if_group, - ifinfop->if_state, ifinfop->if_type)); + ifinfop->if_state, ifinfop->if_type, ifinfop->if_linkstate, + ifinfop->if_probestate, ifinfop->if_flags, &ifinfop->if_targinfo4, + &ifinfop->if_targinfo6)); } /* - * Create a group named `name' with signature `sig', in state `state', and - * with the `nif' interfaces named by `ifs' as members. Returns a pointer + * Create a group; arguments match ipmp_groupinfo_t fields. Returns a pointer * to the new group on success, or NULL on failure. 
*/ ipmp_groupinfo_t * -ipmp_groupinfo_create(const char *name, uint64_t sig, ipmp_group_state_t state, - unsigned int nif, char (*ifs)[LIFNAMSIZ]) +ipmp_groupinfo_create(const char *name, uint64_t sig, uint_t fdt, + ipmp_group_state_t state, uint_t nif, char (*ifs)[LIFNAMSIZ], + const char *grifname, const char *m4ifname, const char *m6ifname, + const char *bcifname, uint_t naddr, struct sockaddr_storage *addrs) { ipmp_groupinfo_t *grinfop; - ipmp_iflist_t *iflistp; - unsigned int i; grinfop = malloc(sizeof (ipmp_groupinfo_t)); if (grinfop == NULL) return (NULL); - iflistp = malloc(IPMP_IFLIST_SIZE(nif)); - if (iflistp == NULL) { - free(grinfop); + grinfop->gr_sig = sig; + grinfop->gr_fdt = fdt; + grinfop->gr_state = state; + grinfop->gr_iflistp = ipmp_iflist_create(nif, ifs); + grinfop->gr_adlistp = ipmp_addrlist_create(naddr, addrs); + if (grinfop->gr_iflistp == NULL || grinfop->gr_adlistp == NULL) { + ipmp_freegroupinfo(grinfop); return (NULL); } - - grinfop->gr_sig = sig; - grinfop->gr_state = state; - grinfop->gr_iflistp = iflistp; (void) strlcpy(grinfop->gr_name, name, LIFGRNAMSIZ); - - iflistp->il_nif = nif; - for (i = 0; i < nif; i++) - (void) strlcpy(iflistp->il_ifs[i], ifs[i], LIFNAMSIZ); + (void) strlcpy(grinfop->gr_ifname, grifname, LIFNAMSIZ); + (void) strlcpy(grinfop->gr_m4ifname, m4ifname, LIFNAMSIZ); + (void) strlcpy(grinfop->gr_m6ifname, m6ifname, LIFNAMSIZ); + (void) strlcpy(grinfop->gr_bcifname, bcifname, LIFNAMSIZ); return (grinfop); } @@ -467,9 +674,86 @@ ipmp_groupinfo_create(const char *name, uint64_t sig, ipmp_group_state_t state, ipmp_groupinfo_t * ipmp_groupinfo_clone(ipmp_groupinfo_t *grinfop) { + ipmp_addrlist_t *adlistp = grinfop->gr_adlistp; + return (ipmp_groupinfo_create(grinfop->gr_name, grinfop->gr_sig, - grinfop->gr_state, grinfop->gr_iflistp->il_nif, - grinfop->gr_iflistp->il_ifs)); + grinfop->gr_fdt, grinfop->gr_state, grinfop->gr_iflistp->il_nif, + grinfop->gr_iflistp->il_ifs, grinfop->gr_ifname, + grinfop->gr_m4ifname, 
grinfop->gr_m6ifname, grinfop->gr_bcifname, + adlistp->al_naddr, adlistp->al_addrs)); +} + +/* + * Create an address list; arguments match ipmp_addrlist_t fields. Returns + * a pointer to the new address list on success, or NULL on failure. + */ +static ipmp_addrlist_t * +ipmp_addrlist_create(uint_t naddr, struct sockaddr_storage *addrs) +{ + unsigned int i; + ipmp_addrlist_t *adlistp; + + adlistp = malloc(IPMP_ADDRLIST_SIZE(naddr)); + if (adlistp == NULL) + return (NULL); + + adlistp->al_naddr = naddr; + for (i = 0; i < naddr; i++) + adlistp->al_addrs[i] = addrs[i]; + + return (adlistp); +} + +/* + * Clone the address list named by `adlistp'. Returns a pointer to the clone + * on success, or NULL on failure. + */ +static ipmp_addrlist_t * +ipmp_addrlist_clone(ipmp_addrlist_t *adlistp) +{ + return (ipmp_addrlist_create(adlistp->al_naddr, adlistp->al_addrs)); +} + +/* + * Free the address list pointed to by `adlistp'. + */ +static void +ipmp_freeaddrlist(ipmp_addrlist_t *adlistp) +{ + free(adlistp); +} + +/* + * Create an address; arguments match ipmp_addrinfo_t fields. Returns a + * pointer to the new address on success, or NULL on failure. + */ +ipmp_addrinfo_t * +ipmp_addrinfo_create(struct sockaddr_storage *addrp, ipmp_addr_state_t state, + const char *group, const char *binding) +{ + ipmp_addrinfo_t *adinfop; + + adinfop = malloc(sizeof (ipmp_addrinfo_t)); + if (adinfop == NULL) + return (NULL); + + adinfop->ad_addr = *addrp; + adinfop->ad_state = state; + (void) strlcpy(adinfop->ad_group, group, LIFGRNAMSIZ); + (void) strlcpy(adinfop->ad_binding, binding, LIFNAMSIZ); + + return (adinfop); +} + +/* + * Clone the address information named by `adinfop'. Returns a pointer to + * the clone on success, or NULL on failure. 
+ */ +ipmp_addrinfo_t * +ipmp_addrinfo_clone(ipmp_addrinfo_t *adinfop) +{ + return (ipmp_addrinfo_create(&adinfop->ad_addr, adinfop->ad_state, + adinfop->ad_group, adinfop->ad_binding)); } /* @@ -523,8 +807,10 @@ ipmp_snap_create(void) snap->sn_grlistp = NULL; snap->sn_grinfolistp = NULL; snap->sn_ifinfolistp = NULL; + snap->sn_adinfolistp = NULL; snap->sn_ngroup = 0; snap->sn_nif = 0; + snap->sn_naddr = 0; return (snap); } @@ -536,6 +822,7 @@ void ipmp_snap_free(ipmp_snap_t *snap) { ipmp_ifinfolist_t *iflp, *ifnext; + ipmp_addrinfolist_t *adlp, *adnext; ipmp_groupinfolist_t *grlp, *grnext; ipmp_freegrouplist(snap->sn_grlistp); @@ -552,6 +839,12 @@ ipmp_snap_free(ipmp_snap_t *snap) free(iflp); } + for (adlp = snap->sn_adinfolistp; adlp != NULL; adlp = adnext) { + adnext = adlp->adl_next; + ipmp_freeaddrinfo(adlp->adl_adinfop); + free(adlp); + } + free(snap); } @@ -612,6 +905,34 @@ ipmp_snap_addifinfo(ipmp_snap_t *snap, ipmp_ifinfo_t *ifinfop) } /* + * Add the address information in `adinfop' to the snapshot named by `snap'. + * Returns an IPMP error code. + */ +int +ipmp_snap_addaddrinfo(ipmp_snap_t *snap, ipmp_addrinfo_t *adinfop) +{ + ipmp_addrinfolist_t *adlp; + + /* + * Any duplicate addresses should've already been weeded by in.mpathd. + */ + if (ipmp_snap_getaddrinfo(snap, adinfop->ad_group, + &adinfop->ad_addr) != NULL) + return (IPMP_EPROTO); + + adlp = malloc(sizeof (ipmp_addrinfolist_t)); + if (adlp == NULL) + return (IPMP_ENOMEM); + + adlp->adl_adinfop = adinfop; + adlp->adl_next = snap->sn_adinfolistp; + snap->sn_adinfolistp = adlp; + snap->sn_naddr++; + + return (IPMP_SUCCESS); +} + +/* * Retrieve the information for the group `name' in snapshot `snap'. * Returns a pointer to the group information on success, or NULL on failure. */ @@ -647,6 +968,26 @@ ipmp_snap_getifinfo(ipmp_snap_t *snap, const char *name) } /* + * Retrieve the information for the address `addrp' on group `grname' in + * snapshot `snap'. 
Returns a pointer to the address information on success, + * or NULL on failure. + */ +static ipmp_addrinfo_t * +ipmp_snap_getaddrinfo(ipmp_snap_t *snap, const char *grname, + struct sockaddr_storage *addrp) +{ + ipmp_addrinfolist_t *adlp; + + for (adlp = snap->sn_adinfolistp; adlp != NULL; adlp = adlp->adl_next) { + if (strcmp(grname, adlp->adl_adinfop->ad_group) == 0 && + sockaddrcmp(addrp, &adlp->adl_adinfop->ad_addr)) + break; + } + + return (adlp != NULL ? adlp->adl_adinfop : NULL); +} + +/* * Using `statep', take a snapshot of the IPMP subsystem and if successful * return it in a dynamically allocated snapshot pointed to by `*snapp'. * Returns an IPMP error code. @@ -656,7 +997,6 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp) { ipmp_snap_t *snap, *osnap; ipmp_infotype_t type; - ipmp_iflist_t *iflistp; int retval; size_t len; void *infop; @@ -666,7 +1006,7 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp) if (snap == NULL) return (IPMP_ENOMEM); - retval = ipmp_sendquery(statep, IPMP_SNAP, NULL, &end); + retval = ipmp_sendquery(statep, IPMP_SNAP, NULL, NULL, &end); if (retval != IPMP_SUCCESS) { ipmp_snap_free(snap); return (retval); @@ -679,12 +1019,11 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp) } /* - * Using the information in the passed `osnap' snapshot, build up our - * own snapshot. If we receive more than one grouplist, or more than - * the expected number of interfaces or groups, then bail out. Note - * that there's only so much we can do to check that the information - * sent by in.mpathd makes sense. We know there will always be at - * least one TLV (IPMP_GROUPLIST). + * Using the information in the `osnap' snapshot, build up our own + * snapshot. We know there will always be at least one TLV (for + * IPMP_GROUPLIST). If we receive anything illogical (e.g., more than + * the expected number of interfaces), then bail out. However, to a + * large extent we have to trust the information sent by in.mpathd. 
*/ do { infop = NULL; @@ -711,7 +1050,32 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp) retval = IPMP_EPROTO; break; } + + /* + * Read in V4 and V6 targlist TLVs that follow. + */ + retval = ipmp_readifinfo_lists(statep, infop, &end); + if (retval != IPMP_SUCCESS) + break; + retval = ipmp_snap_addifinfo(snap, infop); + if (retval != IPMP_SUCCESS) { + ipmp_freeifinfo(infop); + infop = NULL; + } + break; + + case IPMP_ADDRINFO: + if (snap->sn_naddr == osnap->sn_naddr) { + retval = IPMP_EPROTO; + break; + } + + retval = ipmp_snap_addaddrinfo(snap, infop); + /* + * NOTE: since we didn't call ipmp_read*info_lists(), + * no need to use ipmp_freeaddrinfo() on failure. + */ break; case IPMP_GROUPINFO: @@ -721,18 +1085,17 @@ ipmp_snap_take(ipmp_state_t *statep, ipmp_snap_t **snapp) } /* - * An IPMP_IFLIST TLV always follows the - * IPMP_GROUPINFO TLV; read it in. + * Read in IPMP groupinfo list TLVs that follow. */ - retval = ipmp_readinfo(statep, IPMP_IFLIST, - (void **)&iflistp, &end); + retval = ipmp_readgroupinfo_lists(statep, infop, &end); if (retval != IPMP_SUCCESS) break; - ((ipmp_groupinfo_t *)infop)->gr_iflistp = iflistp; retval = ipmp_snap_addgroupinfo(snap, infop); - if (retval != IPMP_SUCCESS) - free(iflistp); + if (retval != IPMP_SUCCESS) { + ipmp_freegroupinfo(infop); + infop = NULL; + } break; default: @@ -747,7 +1110,8 @@ fail: return (ipmp_querydone(statep, retval)); } } while (snap->sn_grlistp == NULL || snap->sn_nif < osnap->sn_nif || - snap->sn_ngroup < osnap->sn_ngroup); + snap->sn_ngroup < osnap->sn_ngroup || + snap->sn_naddr < osnap->sn_naddr); free(osnap); *snapp = snap; diff --git a/usr/src/lib/libipmp/common/ipmp_query.h b/usr/src/lib/libipmp/common/ipmp_query.h index d92554887a..160f561dd2 100644 --- a/usr/src/lib/libipmp/common/ipmp_query.h +++ b/usr/src/lib/libipmp/common/ipmp_query.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, 
Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -18,17 +17,14 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IPMP_QUERY_H #define _IPMP_QUERY_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/socket.h> /* needed by <net/if.h> */ #include <net/if.h> /* for LIF*NAMSIZ */ @@ -38,7 +34,7 @@ * IPMP query interfaces. * * These interfaces may only be used within ON or after signing a contract - * with ON. For documentation, refer to PSARC/2002/615. + * with ON. For documentation, refer to PSARC/2002/615 and PSARC/2007/272. */ #ifdef __cplusplus @@ -46,6 +42,43 @@ extern "C" { #endif /* + * Assorted enumerations used in the data types described below. 
+ */ +typedef enum ipmp_if_probestate { + IPMP_PROBE_OK, /* probes detect no problems */ + IPMP_PROBE_FAILED, /* probes detect failure */ + IPMP_PROBE_UNKNOWN, /* probe detection unavailable */ + IPMP_PROBE_DISABLED /* probe detection disabled */ +} ipmp_if_probestate_t; + +typedef enum ipmp_if_linkstate { + IPMP_LINK_UP, /* link detects up */ + IPMP_LINK_DOWN, /* link detects down */ + IPMP_LINK_UNKNOWN /* link detection unavailable */ +} ipmp_if_linkstate_t; + +typedef enum ipmp_if_flags { + IPMP_IFFLAG_INACTIVE = 0x1, + IPMP_IFFLAG_HWADDRDUP = 0x2, + IPMP_IFFLAG_ACTIVE = 0x4, + IPMP_IFFLAG_DOWN = 0x8 +} ipmp_if_flags_t; + +typedef enum ipmp_addr_state { + IPMP_ADDR_UP, /* address is up */ + IPMP_ADDR_DOWN /* address is down */ +} ipmp_addr_state_t; + +typedef enum ipmp_if_targmode { + IPMP_TARG_DISABLED, /* use of targets is disabled */ + IPMP_TARG_ROUTES, /* route-learned targets */ + IPMP_TARG_MULTICAST /* multicast-learned targets */ +} ipmp_if_targmode_t; + +#define IPMP_LIST_SIZE(listtype, elsize, nel) \ + ((sizeof (ipmp_ ## listtype ## _t) - (elsize)) + ((nel) * (elsize))) + +/* * Data type describing a list of IPMP groups. */ typedef struct ipmp_grouplist { @@ -54,8 +87,8 @@ typedef struct ipmp_grouplist { char gl_groups[1][LIFGRNAMSIZ]; } ipmp_grouplist_t; -#define IPMP_GROUPLIST_MINSIZE (sizeof (ipmp_grouplist_t) - LIFGRNAMSIZ) -#define IPMP_GROUPLIST_SIZE(ngr) (IPMP_GROUPLIST_MINSIZE + (ngr) * LIFGRNAMSIZ) +#define IPMP_GROUPLIST_SIZE(ngr) \ + IPMP_LIST_SIZE(grouplist, LIFGRNAMSIZ, ngr) /* * Data type describing a list of interfaces. @@ -65,8 +98,19 @@ typedef struct ipmp_iflist { char il_ifs[1][LIFNAMSIZ]; } ipmp_iflist_t; -#define IPMP_IFLIST_MINSIZE (sizeof (ipmp_iflist_t) - LIFNAMSIZ) -#define IPMP_IFLIST_SIZE(nif) (IPMP_IFLIST_MINSIZE + (nif) * LIFNAMSIZ) +#define IPMP_IFLIST_SIZE(nif) \ + IPMP_LIST_SIZE(iflist, LIFNAMSIZ, nif) + +/* + * Data type describing a list of addresses. 
+ */ +typedef struct ipmp_addrlist { + unsigned int al_naddr; + struct sockaddr_storage al_addrs[1]; +} ipmp_addrlist_t; + +#define IPMP_ADDRLIST_SIZE(naddr) \ + IPMP_LIST_SIZE(addrlist, sizeof (struct sockaddr_storage), naddr) /* * Data type describing the state of an IPMP group. @@ -76,18 +120,49 @@ typedef struct ipmp_groupinfo { uint64_t gr_sig; ipmp_group_state_t gr_state; ipmp_iflist_t *gr_iflistp; + ipmp_addrlist_t *gr_adlistp; + char gr_ifname[LIFNAMSIZ]; + char gr_m4ifname[LIFNAMSIZ]; + char gr_m6ifname[LIFNAMSIZ]; + char gr_bcifname[LIFNAMSIZ]; + unsigned int gr_fdt; } ipmp_groupinfo_t; /* + * Data type describing IPMP target information for a particular interface. + */ +typedef struct ipmp_targinfo { + char it_name[LIFNAMSIZ]; + struct sockaddr_storage it_testaddr; + ipmp_if_targmode_t it_targmode; + ipmp_addrlist_t *it_targlistp; +} ipmp_targinfo_t; + +/* * Data type describing the IPMP-related state of an interface. */ typedef struct ipmp_ifinfo { - char if_name[LIFNAMSIZ]; - char if_group[LIFGRNAMSIZ]; - ipmp_if_state_t if_state; - ipmp_if_type_t if_type; + char if_name[LIFNAMSIZ]; + char if_group[LIFGRNAMSIZ]; + ipmp_if_state_t if_state; + ipmp_if_type_t if_type; + ipmp_if_linkstate_t if_linkstate; + ipmp_if_probestate_t if_probestate; + ipmp_if_flags_t if_flags; + ipmp_targinfo_t if_targinfo4; + ipmp_targinfo_t if_targinfo6; } ipmp_ifinfo_t; +/* + * Data type describing an IPMP data address. 
+ */ +typedef struct ipmp_addrinfo { + struct sockaddr_storage ad_addr; + ipmp_addr_state_t ad_state; + char ad_group[LIFGRNAMSIZ]; + char ad_binding[LIFNAMSIZ]; +} ipmp_addrinfo_t; + typedef enum { IPMP_QCONTEXT_LIVE, IPMP_QCONTEXT_SNAP @@ -100,6 +175,9 @@ extern int ipmp_getgroupinfo(ipmp_handle_t, const char *, ipmp_groupinfo_t **); extern void ipmp_freegroupinfo(ipmp_groupinfo_t *); extern int ipmp_getifinfo(ipmp_handle_t, const char *, ipmp_ifinfo_t **); extern void ipmp_freeifinfo(ipmp_ifinfo_t *); +extern int ipmp_getaddrinfo(ipmp_handle_t, const char *, + struct sockaddr_storage *, ipmp_addrinfo_t **); +extern void ipmp_freeaddrinfo(ipmp_addrinfo_t *); #ifdef __cplusplus } diff --git a/usr/src/lib/libipmp/common/ipmp_query_impl.h b/usr/src/lib/libipmp/common/ipmp_query_impl.h index 03ecb5cd84..6ac5c3ca27 100644 --- a/usr/src/lib/libipmp/common/ipmp_query_impl.h +++ b/usr/src/lib/libipmp/common/ipmp_query_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -18,17 +17,14 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IPMP_QUERY_IMPL_H #define _IPMP_QUERY_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <ipmp_query.h> /* @@ -58,14 +54,24 @@ typedef struct ipmp_ifinfolist { } ipmp_ifinfolist_t; /* + * List of ipmp_addrinfo_t structures. 
+ */ +typedef struct ipmp_addrinfolist { + struct ipmp_addrinfolist *adl_next; + ipmp_addrinfo_t *adl_adinfop; +} ipmp_addrinfolist_t; + +/* * Snapshot of IPMP state. */ typedef struct ipmp_snap { ipmp_grouplist_t *sn_grlistp; ipmp_groupinfolist_t *sn_grinfolistp; ipmp_ifinfolist_t *sn_ifinfolistp; + ipmp_addrinfolist_t *sn_adinfolistp; unsigned int sn_ngroup; unsigned int sn_nif; + unsigned int sn_naddr; } ipmp_snap_t; /* @@ -74,17 +80,28 @@ typedef struct ipmp_snap { extern ipmp_snap_t *ipmp_snap_create(void); extern void ipmp_snap_free(ipmp_snap_t *); extern int ipmp_snap_addifinfo(ipmp_snap_t *, ipmp_ifinfo_t *); +extern int ipmp_snap_addaddrinfo(ipmp_snap_t *, ipmp_addrinfo_t *); extern int ipmp_snap_addgroupinfo(ipmp_snap_t *, ipmp_groupinfo_t *); /* - * IPMP structure creation routines. + * IPMP structure creation/destruction routines. */ extern ipmp_ifinfo_t *ipmp_ifinfo_create(const char *, const char *, - ipmp_if_state_t, ipmp_if_type_t); -extern ipmp_groupinfo_t *ipmp_groupinfo_create(const char *, uint64_t, - ipmp_group_state_t, unsigned int, char (*)[LIFNAMSIZ]); + ipmp_if_state_t, ipmp_if_type_t, ipmp_if_linkstate_t, ipmp_if_probestate_t, + ipmp_if_flags_t, ipmp_targinfo_t *, ipmp_targinfo_t *); +extern ipmp_groupinfo_t *ipmp_groupinfo_create(const char *, uint64_t, uint_t, + ipmp_group_state_t, uint_t, char (*)[LIFNAMSIZ], const char *, + const char *, const char *, const char *, uint_t, + struct sockaddr_storage *); extern ipmp_grouplist_t *ipmp_grouplist_create(uint64_t, unsigned int, char (*)[LIFGRNAMSIZ]); +extern ipmp_addrinfo_t *ipmp_addrinfo_create(struct sockaddr_storage *, + ipmp_addr_state_t, const char *, const char *); +extern ipmp_targinfo_t *ipmp_targinfo_create(const char *, + struct sockaddr_storage *, ipmp_if_targmode_t, uint_t, + struct sockaddr_storage *); +extern void ipmp_freetarginfo(ipmp_targinfo_t *); + #ifdef __cplusplus } diff --git a/usr/src/lib/libipmp/common/llib-lipmp b/usr/src/lib/libipmp/common/llib-lipmp index 
a16011745a..a22eec5d66 100644 --- a/usr/src/lib/libipmp/common/llib-lipmp +++ b/usr/src/lib/libipmp/common/llib-lipmp @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* LINTLIBRARY */ /* PROTOLIB1 */ #include <ipmp.h> +#include <ipmp_admin.h> #include <ipmp_mpathd.h> #include <ipmp_query_impl.h> diff --git a/usr/src/lib/libipmp/common/mapfile-vers b/usr/src/lib/libipmp/common/mapfile-vers index a4052bfcd3..8c93248338 100644 --- a/usr/src/lib/libipmp/common/mapfile-vers +++ b/usr/src/lib/libipmp/common/mapfile-vers @@ -19,32 +19,39 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "%Z%%M% %I% %E% SMI" -# SUNWprivate_1.1 { global: + ipmp_addrinfo_create; ipmp_close; ipmp_errmsg; + ipmp_freeaddrinfo; ipmp_freegroupinfo; ipmp_freegrouplist; ipmp_freeifinfo; + ipmp_freetarginfo; + ipmp_getaddrinfo; ipmp_getgroupinfo; ipmp_getgrouplist; ipmp_getifinfo; ipmp_groupinfo_create; ipmp_grouplist_create; ipmp_ifinfo_create; + ipmp_offline; ipmp_open; + ipmp_ping_daemon; ipmp_read; ipmp_setqcontext; + ipmp_snap_addaddrinfo; ipmp_snap_addgroupinfo; ipmp_snap_addifinfo; ipmp_snap_create; ipmp_snap_free; + ipmp_targinfo_create; + ipmp_undo_offline; ipmp_write; ipmp_writetlv; local: diff --git a/usr/src/lib/libnsl/nss/netdir_inet_sundry.c b/usr/src/lib/libnsl/nss/netdir_inet_sundry.c index 742e7408b2..4e9473a8cf 100644 --- a/usr/src/lib/libnsl/nss/netdir_inet_sundry.c +++ b/usr/src/lib/libnsl/nss/netdir_inet_sundry.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -21,7 +20,7 @@ */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * lib/libnsl/nss/netdir_inet_sundry.c @@ -39,8 +38,6 @@ * Copied mostly from erstwhile lib/nametoaddr/tcpip/tcpip.c. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mt.h" #include <stdlib.h> #include <stdio.h> @@ -69,9 +66,6 @@ #include <syslog.h> #include <values.h> #include <limits.h> -#ifdef DEBUG -#include <stdio.h> -#endif #include <nss_dbdefs.h> #include "nss.h" @@ -151,8 +145,8 @@ __inet_taddr2uaddr(struct netconfig *tp, struct netbuf *addr) /* LINTED pointer cast */ sa6 = (struct sockaddr_in6 *)(addr->buf); myport = ntohs(sa6->sin6_port); - if (inet_ntop(AF_INET6, (void *)sa6->sin6_addr.s6_addr, - tmp, sizeof (tmp)) == 0) { + if (inet_ntop(AF_INET6, sa6->sin6_addr.s6_addr, tmp, + sizeof (tmp)) == NULL) { _nderror = ND_BADARG; return (NULL); } @@ -400,7 +394,7 @@ getifnum: continue; if_info[n_ifs].if_address = - ((struct sockaddr_in *)&lifr->lifr_addr)->sin_addr; + ((struct sockaddr_in *)&lifr->lifr_addr)->sin_addr; if (nss_ioctl(AF_INET, SIOCGLIFFLAGS, lifr) < 0) continue; @@ -413,7 +407,7 @@ getifnum: continue; if_info[n_ifs].if_netmask = - ((struct sockaddr_in *)&lifr->lifr_addr)->sin_addr; + ((struct sockaddr_in *)&lifr->lifr_addr)->sin_addr; n_ifs++; } free(buf); @@ -528,21 +522,12 @@ get_best_match(struct in_addr addr) if_addr = ntohl(ifn->if_address.s_addr); /* host order */ /* - * Checking if the interface selected is FAILED or DEPRECATED. - * In case IFF_FAILED or IFF_DEPRECATED flag for the interface - * is set, we move on to the next interface in the list. - * Refer IPMP(IP Multi Pathing) for more details. - */ - - if ((ifn->if_flags & (IFF_FAILED | IFF_DEPRECATED)) != 0) - continue; - - /* * set initial count to first bit set in netmask, with * zero being the number of the least significant bit. 
*/ - for (count = 0, mask = netmask; mask && ((mask & 1) == 0); - count++, mask >>= 1); + count = 0; + for (mask = netmask; mask && ((mask & 1) == 0); mask >>= 1) + count++; /* * Set limit so that we don't try to match prefixes shorter @@ -570,12 +555,6 @@ get_best_match(struct in_addr addr) * (2) the best partial subnet match * (3) the first non-loopback && non-PPP interface * (4) the first non-loopback interface (PPP is OK) - * - * While checking for condition (3) and (4), we also look - * if the interface we are returning is neither FAILED - * nor DEPRECATED. In case there are no interface - * available, which are neither FAILED nor DEPRECRATED, - * we return 0. */ found = FALSE; while (netmask && count < subnet_count) { @@ -607,8 +586,7 @@ get_best_match(struct in_addr addr) */ if (bestmatch == NULL) { for (ifn = if_info; ifn < (if_info + n_ifs); ifn++) { - if ((ifn->if_flags & (IFF_LOOPBACK | - IFF_FAILED | IFF_DEPRECATED)) == 0) { + if ((ifn->if_flags & IFF_LOOPBACK) == 0) { bestmatch = ifn; /* @@ -619,10 +597,6 @@ get_best_match(struct in_addr addr) * list... */ if ((ifn->if_flags & IFF_POINTOPOINT) == 0) { -#ifdef DEBUG - (void) printf("found !loopback && !non-PPP interface: %s\n", - inet_ntoa(ifn->if_address)); -#endif break; } } @@ -701,9 +675,9 @@ select_server_addr(union any_in_addr *dst_addr, int family, } /* open a UDP socket */ - if ((tmp_fd = _so_socket(family, SOCK_DGRAM, 0, - NULL, SOV_SOCKBSD)) < 0) { - syslog(LOG_ERR, "selsect_server_addr:connect failed\n"); + tmp_fd = _so_socket(family, SOCK_DGRAM, 0, NULL, SOV_SOCKBSD); + if (tmp_fd < 0) { + syslog(LOG_ERR, "select_server_addr: connect failed\n"); return (FALSE); } @@ -716,15 +690,16 @@ select_server_addr(union any_in_addr *dst_addr, int family, * message, as it'll try to send the probe packet out and will * receive ICMP unreachable. 
*/ - if (family == AF_INET) + if (family == AF_INET) { src_addr->addr.s_addr = INADDR_ANY; - else + } else { /* * Since in6addr_any is not in the scope * use the following hack */ (void) memset(src_addr->addr6.s6_addr, - 0, sizeof (struct in6_addr)); + 0, sizeof (struct in6_addr)); + } (void) close(tmp_fd); free(sock); return (FALSE); @@ -732,7 +707,7 @@ select_server_addr(union any_in_addr *dst_addr, int family, /* get the local sock info */ if (_so_getsockname(tmp_fd, sock, &sock_len, SOV_DEFAULT) < 0) { - syslog(LOG_ERR, "selsect_server_addr:getsockname failed\n"); + syslog(LOG_ERR, "select_server_addr: getsockname failed\n"); (void) close(tmp_fd); free(sock); return (FALSE); @@ -799,11 +774,6 @@ inet_netdir_mergeaddr(struct netconfig *tp, char *ruaddr, char *uaddr) clientaddr.s_addr = inet_addr(ruaddr); -#ifdef DEBUG - (void) printf("client's address is %s and %s\n", - ruaddr, inet_ntoa(clientaddr)); -#endif - /* We know cp is not NULL due to the check above */ *cp = '.'; /* Put the dot back in the IP addr */ @@ -895,28 +865,22 @@ inet_netdir_mergeaddr(struct netconfig *tp, char *ruaddr, char *uaddr) FALSE) return (NULL); server_addr.sin6_addr = out_addr.addr6; + } else { + (void) memcpy(&server_addr, &sa, sizeof (server_addr)); } - else - (void) memcpy(&server_addr, &sa, - sizeof (struct sockaddr_in6)); -#ifdef DEBUG - printf("%s\n", inet_ntop(af, out_addr.addr6.s6_addr, - tmp, sizeof (tmp))); -#endif - - if (inet_ntop(af, server_addr.sin6_addr.s6_addr, - tmp, sizeof (tmp)) == NULL) { + + if (inet_ntop(af, server_addr.sin6_addr.s6_addr, tmp, + sizeof (tmp)) == NULL) { _nderror = ND_NOHOST; return (NULL); } /* now extract the port info */ if ((dot = strrchr(uaddr, '.')) != 0) { + char *p = --dot; - char *p; - - p = --dot; - while (*p-- != '.'); + while (*p-- != '.') + ; p++; (void) strcat(tmp + strlen(tmp), p); _nderror = ND_OK; @@ -1051,7 +1015,7 @@ bindresvport(struct netconfig *nconf, int fd, struct netbuf *addr) * this, if the caller has set this option 
before calling * bindresvport(), it will be unset. Better be safe... */ - *optval = 0; + *optval = 0; resp.flags = 0; resp.opt.buf = (char *)reqbuf; resp.opt.maxlen = sizeof (reqbuf); diff --git a/usr/src/lib/libsocket/inet/interface_id.c b/usr/src/lib/libsocket/inet/interface_id.c index 2a512b025f..88854fe9da 100644 --- a/usr/src/lib/libsocket/inet/interface_id.c +++ b/usr/src/lib/libsocket/inet/interface_id.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdio.h> #include <ctype.h> #include <string.h> @@ -120,6 +117,9 @@ if_indextoname(uint32_t ifindex, char *ifname) int numifs; size_t bufsize; boolean_t found; + uint_t flags; + + flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES | LIFC_UNDER_IPMP; /* A interface index of 0 is invalid */ if (ifindex == 0) { @@ -137,14 +137,19 @@ if_indextoname(uint32_t ifindex, char *ifname) /* Prepare to send a SIOCGLIFNUM request message */ lifn.lifn_family = AF_UNSPEC; - lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; + lifn.lifn_flags = flags; if (ioctl(s, SIOCGLIFNUM, (char *)&lifn) < 0) { int save_err = errno; (void) close(s); errno = save_err; return (NULL); } - numifs = lifn.lifn_count; + + /* + * NOTE: "+ 10" sleaze mitigates new IP interfaces showing up between + * the SIOCGLIFNUM and the SIOCGLIFCONF. + */ + numifs = lifn.lifn_count + 10; /* * Provide enough buffer to obtain the interface @@ -161,7 +166,7 @@ if_indextoname(uint32_t ifindex, char *ifname) return (NULL); } lifc.lifc_family = AF_UNSPEC; - lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; + lifc.lifc_flags = flags; lifc.lifc_len = bufsize; lifc.lifc_buf = buf; if (ioctl(s, SIOCGLIFCONF, (char *)&lifc) < 0) { diff --git a/usr/src/lib/smbsrv/libsmbns/common/smbns_dyndns.c b/usr/src/lib/smbsrv/libsmbns/common/smbns_dyndns.c index dabc2e0929..62ebedf522 100644 --- a/usr/src/lib/smbsrv/libsmbns/common/smbns_dyndns.c +++ b/usr/src/lib/smbsrv/libsmbns/common/smbns_dyndns.c @@ -1936,7 +1936,7 @@ dyndns_update_core(char *fqdn) return (-1); do { - if (ni.ni_nic.nic_sysflags & (IFF_STANDBY | IFF_PRIVATE)) + if (ni.ni_nic.nic_sysflags & IFF_PRIVATE) continue; addr.s_addr = ni.ni_nic.nic_ip; @@ -2003,7 +2003,7 @@ dyndns_clear_rev_zone(char *fqdn) return (-1); do { - if (ni.ni_nic.nic_sysflags & (IFF_STANDBY | IFF_PRIVATE)) + if (ni.ni_nic.nic_sysflags & IFF_PRIVATE) continue; addr.s_addr = ni.ni_nic.nic_ip; diff --git 
a/usr/src/pkgdefs/SUNWarc/prototype_com b/usr/src/pkgdefs/SUNWarc/prototype_com index e9d6270d88..7e04f8b580 100644 --- a/usr/src/pkgdefs/SUNWarc/prototype_com +++ b/usr/src/pkgdefs/SUNWarc/prototype_com @@ -18,7 +18,7 @@ # # CDDL HEADER END # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -122,8 +122,6 @@ s none usr/lib/llib-lintl=../../lib/llib-lintl s none usr/lib/llib-lintl.ln=../../lib/llib-lintl.ln f none usr/lib/llib-lipmi 644 root bin f none usr/lib/llib-lipmi.ln 644 root bin -f none usr/lib/llib-lipmp 644 root bin -f none usr/lib/llib-lipmp.ln 644 root bin f none usr/lib/llib-lipp 644 root bin f none usr/lib/llib-lipp.ln 644 root bin s none usr/lib/llib-lkstat=../../lib/llib-lkstat diff --git a/usr/src/pkgdefs/SUNWarcr/prototype_com b/usr/src/pkgdefs/SUNWarcr/prototype_com index 6095ff7fe5..852330d742 100644 --- a/usr/src/pkgdefs/SUNWarcr/prototype_com +++ b/usr/src/pkgdefs/SUNWarcr/prototype_com @@ -18,7 +18,7 @@ # # CDDL HEADER END # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -80,6 +80,8 @@ f none lib/llib-lgen 644 root bin f none lib/llib-lgen.ln 644 root bin f none lib/llib-lintl 644 root bin f none lib/llib-lintl.ln 644 root bin +f none lib/llib-lipmp 644 root bin +f none lib/llib-lipmp.ln 644 root bin f none lib/llib-lkmf.ln 644 root bin f none lib/llib-lkmfberder.ln 644 root bin f none lib/llib-lkstat 644 root bin diff --git a/usr/src/pkgdefs/SUNWckr/prototype_com b/usr/src/pkgdefs/SUNWckr/prototype_com index ead3a7e5e8..989847d09d 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_com +++ b/usr/src/pkgdefs/SUNWckr/prototype_com @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# @@ -81,6 +81,7 @@ f none kernel/drv/crypto.conf 644 root sys f none kernel/drv/cryptoadm.conf 644 root sys f none kernel/drv/devinfo.conf 644 root sys f none kernel/drv/dld.conf 644 root sys +f none kernel/drv/dlpistub.conf 644 root sys f none kernel/drv/icmp.conf 644 root sys f none kernel/drv/icmp6.conf 644 root sys f none kernel/drv/ip.conf 644 root sys @@ -123,7 +124,6 @@ f none kernel/drv/tcp6.conf 644 root sys f none kernel/drv/tl.conf 644 root sys f none kernel/drv/udp.conf 644 root sys f none kernel/drv/udp6.conf 644 root sys -f none kernel/drv/vni.conf 644 root sys f none kernel/drv/vnic.conf 644 root sys f none kernel/drv/wc.conf 644 root sys d none kernel/exec 755 root sys diff --git a/usr/src/pkgdefs/SUNWckr/prototype_i386 b/usr/src/pkgdefs/SUNWckr/prototype_i386 index 421d760621..e2972713c6 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_i386 +++ b/usr/src/pkgdefs/SUNWckr/prototype_i386 @@ -86,6 +86,7 @@ f none kernel/drv/crypto 755 root sys f none kernel/drv/cryptoadm 755 root sys f none kernel/drv/devinfo 755 root sys f none kernel/drv/dld 755 root sys +f none kernel/drv/dlpistub 755 root sys f none kernel/drv/i8042 755 root sys f none kernel/drv/icmp 755 root sys f none kernel/drv/icmp6 755 root sys @@ -152,7 +153,6 @@ f none kernel/drv/ucode.conf 644 root sys f none kernel/drv/udp 755 root sys f none kernel/drv/udp6 755 root sys f none kernel/drv/vgatext 755 root sys -f none kernel/drv/vni 755 root sys f none kernel/drv/vnic 755 root sys f none kernel/drv/wc 755 root sys f none kernel/exec/elfexec 755 root sys @@ -308,6 +308,7 @@ f none kernel/drv/amd64/crypto 755 root sys f none kernel/drv/amd64/cryptoadm 755 root sys f none kernel/drv/amd64/devinfo 755 root sys f none kernel/drv/amd64/dld 755 root sys +f none kernel/drv/amd64/dlpistub 755 root sys f none kernel/drv/amd64/i8042 755 root sys f none kernel/drv/amd64/icmp 755 root sys f none kernel/drv/amd64/icmp6 755 root sys @@ -366,7 +367,6 @@ f none kernel/drv/amd64/ucode 755 root sys f none 
kernel/drv/amd64/udp 755 root sys f none kernel/drv/amd64/udp6 755 root sys f none kernel/drv/amd64/vgatext 755 root sys -f none kernel/drv/amd64/vni 755 root sys f none kernel/drv/amd64/vnic 755 root sys f none kernel/drv/amd64/wc 755 root sys d none kernel/exec/amd64 755 root sys diff --git a/usr/src/pkgdefs/SUNWckr/prototype_sparc b/usr/src/pkgdefs/SUNWckr/prototype_sparc index e81a86168e..a8f0b93be0 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_sparc +++ b/usr/src/pkgdefs/SUNWckr/prototype_sparc @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This required package information file contains a list of package contents. @@ -82,6 +82,7 @@ f none kernel/drv/sparcv9/cryptoadm 755 root sys f none kernel/drv/sparcv9/dad 755 root sys f none kernel/drv/sparcv9/devinfo 755 root sys f none kernel/drv/sparcv9/dld 755 root sys +f none kernel/drv/sparcv9/dlpistub 755 root sys f none kernel/drv/sparcv9/esp 755 root sys f none kernel/drv/sparcv9/i8042 755 root sys f none kernel/drv/sparcv9/icmp 755 root sys @@ -137,7 +138,6 @@ f none kernel/drv/sparcv9/ttymux 755 root sys f none kernel/drv/sparcv9/uata 755 root sys f none kernel/drv/sparcv9/udp 755 root sys f none kernel/drv/sparcv9/udp6 755 root sys -f none kernel/drv/sparcv9/vni 755 root sys f none kernel/drv/sparcv9/vnic 755 root sys f none kernel/drv/sparcv9/wc 755 root sys d none kernel/exec/sparcv9 755 root sys diff --git a/usr/src/pkgdefs/SUNWcsd/postinstall b/usr/src/pkgdefs/SUNWcsd/postinstall index b481a763ca..caa9bb3402 100644 --- a/usr/src/pkgdefs/SUNWcsd/postinstall +++ b/usr/src/pkgdefs/SUNWcsd/postinstall @@ -20,7 +20,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# @@ -48,6 +48,7 @@ prototype_com=' devices/pseudo/arp@0:arp dev/arp devices/pseudo/clone@0:ibd dev/ibd devices/pseudo/dld@0:ctl dev/dld +devices/pseudo/dlpistub@0:ipmpstub dev/ipmpstub devices/pseudo/icmp@0:icmp dev/icmp devices/pseudo/icmp@0:icmp dev/rawip devices/pseudo/icmp6@0:icmp6 dev/icmp6 diff --git a/usr/src/pkgdefs/SUNWcsl/prototype_com b/usr/src/pkgdefs/SUNWcsl/prototype_com index a856560c5e..d5918f5883 100644 --- a/usr/src/pkgdefs/SUNWcsl/prototype_com +++ b/usr/src/pkgdefs/SUNWcsl/prototype_com @@ -18,7 +18,7 @@ # # CDDL HEADER END # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This required package information file contains a list of package contents. @@ -144,8 +144,6 @@ s none usr/lib/libintl.so=../../lib/libintl.so.1 s none usr/lib/libintl.so.1=../../lib/libintl.so.1 f none usr/lib/libipmi.so.1 755 root bin s none usr/lib/libipmi.so=./libipmi.so.1 -s none usr/lib/libipmp.so=./libipmp.so.1 -f none usr/lib/libipmp.so.1 755 root bin s none usr/lib/libipp.so=./libipp.so.1 f none usr/lib/libipp.so.1 755 root bin f none usr/lib/libipsecutil.so.1 755 root bin diff --git a/usr/src/pkgdefs/SUNWcslr/prototype_com b/usr/src/pkgdefs/SUNWcslr/prototype_com index ed7059250a..71ebaff013 100644 --- a/usr/src/pkgdefs/SUNWcslr/prototype_com +++ b/usr/src/pkgdefs/SUNWcslr/prototype_com @@ -18,7 +18,7 @@ # # CDDL HEADER END # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This required package information file contains a list of package contents. 
@@ -95,6 +95,8 @@ f none lib/libinetcfg.so.1 755 root bin f none lib/libinetutil.so.1 755 root bin s none lib/libintl.so=libintl.so.1 f none lib/libintl.so.1 755 root bin +s none lib/libipmp.so=./libipmp.so.1 +f none lib/libipmp.so.1 755 root bin s none lib/libkmf.so=libkmf.so.1 f none lib/libkmf.so.1 755 root bin s none lib/libkmfberder.so=libkmfberder.so.1 diff --git a/usr/src/pkgdefs/SUNWcsr/prototype_com b/usr/src/pkgdefs/SUNWcsr/prototype_com index 02051a08ae..b60abe0f00 100644 --- a/usr/src/pkgdefs/SUNWcsr/prototype_com +++ b/usr/src/pkgdefs/SUNWcsr/prototype_com @@ -18,7 +18,7 @@ # # CDDL HEADER END # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This required package information file contains a list of package contents. @@ -332,6 +332,7 @@ d none lib 755 root bin d none lib/crypto 755 root bin f none lib/crypto/kcfd 555 root bin d none lib/inet 755 root bin +f none lib/inet/in.mpathd 555 root bin f none lib/inet/nwamd 555 root bin d none lib/svc 0755 root bin d none lib/svc/bin 0755 root bin @@ -404,7 +405,8 @@ f none sbin/fiocompress 555 root bin f none sbin/hostconfig 555 root bin f none sbin/ifconfig 555 root bin f none sbin/ifparse 555 root bin -s none sbin/in.mpathd=../usr/lib/inet/in.mpathd +s none sbin/in.mpathd=../lib/inet/in.mpathd +f none sbin/ipmpstat 555 root bin f none sbin/soconfig 555 root bin f none sbin/init 555 root sys s none sbin/jsh=sh diff --git a/usr/src/pkgdefs/SUNWcsu/prototype_com b/usr/src/pkgdefs/SUNWcsu/prototype_com index 6bb2772f1a..464da8254a 100644 --- a/usr/src/pkgdefs/SUNWcsu/prototype_com +++ b/usr/src/pkgdefs/SUNWcsu/prototype_com @@ -18,7 +18,7 @@ # # CDDL HEADER END # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This required package information file contains a list of package contents. 
@@ -649,7 +649,7 @@ d none usr/lib/inet/dhcp 755 root bin d none usr/lib/inet/dhcp/nsu 755 root bin d none usr/lib/inet/dhcp/svc 755 root bin f none usr/lib/inet/in.iked 555 root bin -f none usr/lib/inet/in.mpathd 555 root bin +s none usr/lib/inet/in.mpathd=../../../lib/inet/in.mpathd f none usr/lib/inet/inetd 555 root bin f none usr/lib/intrd 555 root bin f none usr/lib/isaexec 555 root bin @@ -865,6 +865,7 @@ s none usr/sbin/init=../../sbin/init f none usr/sbin/install 555 root bin f none usr/sbin/installboot 555 root sys f none usr/sbin/ipaddrsel 555 root bin +s none usr/sbin/ipmpstat=../../sbin/ipmpstat f none usr/sbin/ipsecalgs 555 root bin f none usr/sbin/ipsecconf 555 root bin f none usr/sbin/ipseckey 555 root bin diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com index 45536bf13e..555f28921c 100644 --- a/usr/src/pkgdefs/SUNWhea/prototype_com +++ b/usr/src/pkgdefs/SUNWhea/prototype_com @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This required package information file contains a list of package contents. @@ -270,6 +270,7 @@ f none usr/include/inet/tcp_stack.h 644 root bin f none usr/include/inet/wifi_ioctl.h 644 root bin f none usr/include/inttypes.h 644 root bin f none usr/include/ipmp.h 644 root bin +f none usr/include/ipmp_admin.h 644 root bin f none usr/include/ipmp_mpathd.h 644 root bin f none usr/include/ipmp_query.h 644 root bin d none usr/include/ipp 755 root bin diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh index 3ac332b45c..7fd4a7186b 100644 --- a/usr/src/tools/scripts/bfu.sh +++ b/usr/src/tools/scripts/bfu.sh @@ -21,7 +21,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# # Upgrade a machine from a cpio archive area in about 5 minutes. @@ -8060,10 +8060,14 @@ mondo_loop() { # The global zone needs to have its /dev/dld symlink created # during install so that processes can access it early in boot - # before devfsadm is run. + # before devfsadm is run. Likewise for /dev/ipmpstub. if [ ! -L $rootprefix/dev/dld ]; then ln -s ../devices/pseudo/dld@0:ctl $rootprefix/dev/dld fi + if [ ! -L $rootprefix/dev/ipmpstub ]; then + ln -s ../devices/pseudo/dlpistub@0:ipmpstub \ + $rootprefix/dev/ipmpstub + fi fi # Fix up audit permissions diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 5fcd81b433..448a0d712d 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -485,7 +485,7 @@ IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \ sctp_ioc.o sctp_bind.o sctp_notify.o sctp_asconf.o \ sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o -IP_OBJS += igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \ +IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \ ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \ ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \ @@ -1605,9 +1605,9 @@ IPF_OBJS += ip_fil_solaris.o fil.o solaris.o ip_state.o ip_frag.o ip_nat.o \ IBD_OBJS += ibd.o -SDP_OBJS += sdpddi.o +DLPISTUB_OBJS += dlpistub.o -VNI_OBJS += vni.o +SDP_OBJS += sdpddi.o CTF_OBJS += ctf_create.o ctf_decl.o ctf_error.o ctf_hash.o ctf_labels.o \ ctf_lookup.o ctf_open.o ctf_types.o ctf_util.o ctf_subr.o ctf_mod.o diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index 1cd82570c1..db550667da 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# # uts/common/Makefile.rules @@ -447,7 +447,7 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ip/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) -$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ipnet/%.c +$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ipnet/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -489,7 +489,7 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/sockmods/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) -$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/vni/%.c +$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/dlpistub/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1572,7 +1572,7 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/arp/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ip/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) -$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipnet/%.c +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipnet/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipf/%.c @@ -1599,10 +1599,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/tcp/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/nca/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) -$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/vni/%.c +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/dlpistub/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) - $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/common/inet/arp.h b/usr/src/uts/common/inet/arp.h index 0bca52e9ae..4351c91666 100644 --- a/usr/src/uts/common/inet/arp.h +++ b/usr/src/uts/common/inet/arp.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -28,6 +28,7 @@ #define _INET_ARP_H #include <sys/types.h> +#include <net/if.h> #ifdef __cplusplus extern "C" { @@ -64,6 +65,8 @@ extern "C" { */ #define AR_ARP_CLOSING (AR_IOCTL + 16) #define AR_ARP_EXTEND (AR_IOCTL + 17) +#define AR_IPMP_ACTIVATE (AR_IOCTL + 18) +#define AR_IPMP_DEACTIVATE (AR_IOCTL + 19) /* Both ace_flags and area_flags; must also modify arp.c in mdb */ #define ACE_F_PERMANENT 0x0001 @@ -182,6 +185,14 @@ typedef struct ar_mapping_add_s { /* the mask&proto_addr */ } arma_t; +/* Structure used to notify ARP of changes to IPMP group topology */ +typedef struct ar_ipmp_event_s { + uint32_t arie_cmd; + uint32_t arie_name_offset; + uint32_t arie_name_length; + char arie_grifname[LIFNAMSIZ]; +} arie_t; + /* Structure used to notify clients of interesting conditions. */ typedef struct ar_client_notify_s { uint32_t arcn_cmd; diff --git a/usr/src/uts/common/inet/arp/arp.c b/usr/src/uts/common/inet/arp/arp.c index 815dfd19d3..06c499ced9 100644 --- a/usr/src/uts/common/inet/arp/arp.c +++ b/usr/src/uts/common/inet/arp/arp.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -85,6 +85,30 @@ * talking to a given peer, then it doesn't matter if we have the right mapping * for that peer. It would be possible to send queries on aging entries that * are active, but this isn't done. + * + * IPMP Notes + * ---------- + * + * ARP is aware of IPMP. In particular, IP notifies ARP about all "active" + * (able to transmit data packets) interfaces in a given group via + * AR_IPMP_ACTIVATE and AR_IPMP_DEACTIVATE messages. 
These messages, combined + * with the "IPMP arl_t" that ARP creates over the IPMP DLPI stub driver, + * enable ARP to track all the arl_t's that are in the same group and thus + * ensure that ACEs are shared across each group and the arl_t that ARP + * chooses to transmit on for a given ACE is optimal. + * + * ARP relies on IP for hardware address updates. In particular, if the + * hardware address of an interface changes (DL_NOTE_PHYS_ADDR), then IP will + * bring the interface down and back up -- and as part of bringing it back + * up, will send messages to ARP that allow it to update the affected arl's + * with new hardware addresses. + * + * N.B.: One side-effect of this approach is that when an interface fails and + * then starts to repair, it will temporarily populate the ARP cache with + * addresses that are owned by it rather than the group's arl_t. To address + * this, we could add more messages (e.g., AR_IPMP_JOIN and AR_IPMP_LEAVE), + * but as the issue appears to be only cosmetic (redundant entries in the ARP + * cache during interface repair), we've kept things simple for now. */ /* @@ -134,6 +158,12 @@ typedef struct { #define ARH_FIXED_LEN 8 /* + * Macro used when creating ACEs to determine the arl that should own it. + */ +#define OWNING_ARL(arl) \ + ((arl)->arl_ipmp_arl != NULL ? (arl)->arl_ipmp_arl : arl) + +/* * MAC-specific intelligence. Shouldn't be needed, but the DL_INFO_ACK * doesn't quite do it for us.
*/ @@ -154,7 +184,7 @@ static int ar_ce_create(arl_t *arl, uint32_t proto, uchar_t *hw_addr, uint32_t hw_addr_len, uchar_t *proto_addr, uint32_t proto_addr_len, uchar_t *proto_mask, uchar_t *proto_extract_mask, uint32_t hw_extract_start, - uint32_t flags); + uchar_t *sender_addr, uint32_t flags); static void ar_ce_delete(ace_t *ace); static void ar_ce_delete_per_arl(ace_t *ace, void *arg); static ace_t **ar_ce_hash(arp_stack_t *as, uint32_t proto, @@ -167,6 +197,8 @@ static ace_t *ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp, ace_t *matchfn()); static ace_t *ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, uint32_t proto_addr_length); +static ace_t *ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, + uchar_t *proto_addr, uint32_t proto_addr_length); static boolean_t ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length); static void ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *), @@ -187,6 +219,8 @@ static int ar_interface_up(queue_t *q, mblk_t *mp); static int ar_interface_down(queue_t *q, mblk_t *mp); static int ar_interface_on(queue_t *q, mblk_t *mp); static int ar_interface_off(queue_t *q, mblk_t *mp); +static int ar_ipmp_activate(queue_t *q, mblk_t *mp); +static int ar_ipmp_deactivate(queue_t *q, mblk_t *mp); static void ar_ll_cleanup_arl_queue(queue_t *q); static void ar_ll_down(arl_t *arl); static arl_t *ar_ll_lookup_by_name(arp_stack_t *as, const char *name); @@ -208,7 +242,7 @@ static int ar_param_set(queue_t *q, mblk_t *mp, char *value, static void ar_query_delete(ace_t *ace, void *ar); static void ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, uint32_t proto_addr_len); -static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace); +static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace); static void ar_rput(queue_t *q, mblk_t *mp_orig); static void ar_rput_dlpi(queue_t *q, mblk_t *mp); static void ar_set_address(ace_t *ace, uchar_t *addrpos, @@ -344,6 
+378,10 @@ static arct_t ar_cmd_tbl[] = { ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_ON" }, { ar_interface_off, AR_INTERFACE_OFF, sizeof (arc_t), ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_OFF" }, + { ar_ipmp_activate, AR_IPMP_ACTIVATE, sizeof (arie_t), + ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_ACTIVATE" }, + { ar_ipmp_deactivate, AR_IPMP_DEACTIVATE, sizeof (arie_t), + ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_DEACTIVATE" }, { ar_set_ppa, (uint32_t)IF_UNITSEL, sizeof (int), ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "IF_UNITSEL" }, { ar_nd_ioctl, ND_GET, 1, @@ -358,6 +396,65 @@ static arct_t ar_cmd_tbl[] = { }; /* + * Lookup and return an arl appropriate for sending packets with either source + * hardware address `hw_addr' or source protocol address `ip_addr', in that + * order. If neither was specified or neither match, return any arl in the + * same group as `arl'. + */ +static arl_t * +ar_ipmp_lookup_xmit_arl(arl_t *arl, uchar_t *hw_addr, uint_t hw_addrlen, + uchar_t *ip_addr) +{ + arlphy_t *ap; + ace_t *src_ace; + arl_t *xmit_arl = NULL; + arp_stack_t *as = ARL_TO_ARPSTACK(arl); + + ASSERT(arl->arl_flags & ARL_F_IPMP); + + if (hw_addr != NULL && hw_addrlen != 0) { + xmit_arl = as->as_arl_head; + for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next) { + /* + * There may be arls with the same HW address that are + * not in our IPMP group; we don't want those. 
+ */ + if (xmit_arl->arl_ipmp_arl != arl) + continue; + + ap = xmit_arl->arl_phy; + if (ap != NULL && ap->ap_hw_addrlen == hw_addrlen && + bcmp(ap->ap_hw_addr, hw_addr, hw_addrlen) == 0) + break; + } + + DTRACE_PROBE4(xmit_arl_hwsrc, arl_t *, arl, arl_t *, + xmit_arl, uchar_t *, hw_addr, uint_t, hw_addrlen); + } + + if (xmit_arl == NULL && ip_addr != NULL) { + src_ace = ar_ce_lookup_permanent(as, IP_ARP_PROTO_TYPE, ip_addr, + IP_ADDR_LEN); + if (src_ace != NULL) + xmit_arl = src_ace->ace_xmit_arl; + + DTRACE_PROBE4(xmit_arl_ipsrc, arl_t *, arl, arl_t *, + xmit_arl, uchar_t *, ip_addr, uint_t, IP_ADDR_LEN); + } + + if (xmit_arl == NULL) { + xmit_arl = as->as_arl_head; + for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next) + if (xmit_arl->arl_ipmp_arl == arl && xmit_arl != arl) + break; + + DTRACE_PROBE2(xmit_arl_any, arl_t *, arl, arl_t *, xmit_arl); + } + + return (xmit_arl); +} + +/* * ARP Cache Entry creation routine. * Cache entries are allocated within timer messages and inserted into * the global hash list based on protocol and protocol address. 
@@ -365,7 +462,8 @@ static arct_t ar_cmd_tbl[] = { static int ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, uchar_t *proto_addr, uint_t proto_addr_len, uchar_t *proto_mask, - uchar_t *proto_extract_mask, uint_t hw_extract_start, uint_t flags) + uchar_t *proto_extract_mask, uint_t hw_extract_start, uchar_t *sender_addr, + uint_t flags) { static ace_t ace_null; ace_t *ace; @@ -373,17 +471,35 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, uchar_t *dst; mblk_t *mp; arp_stack_t *as = ARL_TO_ARPSTACK(arl); + arl_t *xmit_arl; arlphy_t *ap; if ((flags & ~ACE_EXTERNAL_FLAGS_MASK) || arl == NULL) return (EINVAL); - if ((ap = arl->arl_phy) == NULL) + if (proto_addr == NULL || proto_addr_len == 0 || + (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN)) return (EINVAL); if (flags & ACE_F_MYADDR) flags |= ACE_F_PUBLISH | ACE_F_AUTHORITY; + /* + * Latch a transmit arl for this ace. + */ + if (arl->arl_flags & ARL_F_IPMP) { + ASSERT(proto == IP_ARP_PROTO_TYPE); + xmit_arl = ar_ipmp_lookup_xmit_arl(arl, hw_addr, hw_addr_len, + sender_addr); + } else { + xmit_arl = arl; + } + + if (xmit_arl == NULL || xmit_arl->arl_phy == NULL) + return (EINVAL); + + ap = xmit_arl->arl_phy; + if (!hw_addr && hw_addr_len == 0) { if (flags == ACE_F_PERMANENT) { /* Not publish */ /* 224.0.0.0 to zero length address */ @@ -398,9 +514,6 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, flags |= ACE_F_RESOLVED; } - if (proto_addr == NULL || proto_addr_len == 0 || - (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN)) - return (EINVAL); /* Handle hw_addr_len == 0 for DL_ENABMULTI_REQ etc. 
*/ if (hw_addr_len != 0 && hw_addr == NULL) return (EINVAL); @@ -432,6 +545,7 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, ace->ace_proto = proto; ace->ace_mp = mp; ace->ace_arl = arl; + ace->ace_xmit_arl = xmit_arl; dst = (uchar_t *)&ace[1]; @@ -510,12 +624,73 @@ ar_ce_delete(ace_t *ace) static void ar_ce_delete_per_arl(ace_t *ace, void *arl) { - if (ace->ace_arl == arl) { + if (ace->ace_arl == arl || ace->ace_xmit_arl == arl) { ace->ace_flags &= ~ACE_F_PERMANENT; ar_ce_delete(ace); } } +/* + * ar_ce_walk routine used when deactivating an `arl' in a group. Deletes + * `ace' if it was using `arl_arg' as its output interface. + */ +static void +ar_ce_ipmp_deactivate(ace_t *ace, void *arl_arg) +{ + arl_t *arl = arl_arg; + + ASSERT(!(arl->arl_flags & ARL_F_IPMP)); + + if (ace->ace_arl == arl) { + ASSERT(ace->ace_xmit_arl == arl); + /* + * This ACE is tied to the arl leaving the group (e.g., an + * ACE_F_PERMANENT for a test address) and is not used by the + * group, so we can leave it be. + */ + return; + } + + if (ace->ace_xmit_arl != arl) + return; + + ASSERT(ace->ace_arl == arl->arl_ipmp_arl); + + /* + * IP should've already sent us messages asking us to move any + * ACE_F_MYADDR entries to another arl, but there are two exceptions: + * + * 1. The group was misconfigured with interfaces that have duplicate + * hardware addresses, but in.mpathd was unable to offline those + * duplicate interfaces. + * + * 2. The messages from IP were lost or never created (e.g. due to + * memory pressure). + * + * We handle the first case by just quietly deleting the ACE. Since + * the second case cannot be distinguished from a more serious bug in + * the IPMP framework, we ASSERT() that this can't happen on DEBUG + * systems, but quietly delete the ACE on production systems (the + * deleted ACE will render the IP address unreachable). 
+ */ + if (ace->ace_flags & ACE_F_MYADDR) { + arlphy_t *ap = arl->arl_phy; + uint_t hw_addrlen = ap->ap_hw_addrlen; + + ASSERT(hw_addrlen == ace->ace_hw_addr_length && + bcmp(ap->ap_hw_addr, ace->ace_hw_addr, hw_addrlen) == 0); + } + + /* + * NOTE: it's possible this arl got selected as the ace_xmit_arl when + * creating an ACE_F_PERMANENT ACE on behalf of an SIOCS*ARP ioctl for + * an IPMP IP interface. But it's still OK for us to delete such an + * ACE since ipmp_illgrp_refresh_arpent() will ask us to recreate it + * and we'll pick another arl then. + */ + ar_ce_delete(ace); +} + /* Cache entry hash routine, based on protocol and protocol address. */ static ace_t ** ar_ce_hash(arp_stack_t *as, uint32_t proto, const uchar_t *proto_addr, @@ -559,7 +734,8 @@ ar_ce_lookup_entry(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, return (NULL); ace = *ar_ce_hash(as, proto, proto_addr, proto_addr_length); for (; ace; ace = ace->ace_next) { - if (ace->ace_arl == arl && + if ((ace->ace_arl == arl || + ace->ace_arl == arl->arl_ipmp_arl) && ace->ace_proto_addr_length == proto_addr_length && ace->ace_proto == proto) { int i1 = proto_addr_length; @@ -632,13 +808,6 @@ ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, /* * Look for a permanent entry for proto_addr across all interfaces. - * This is used for sending ARP requests out. Requests may come from - * IP on le0 with the source address of le1 and we need to send out - * the request on le1 so that ARP does not think that somebody else - * is using its PERMANENT address. If le0 and le1 are sitting on - * the same wire, the same IP -> ethernet mapping might exist on - * both the interfaces. But we should look for the permanent - * mapping to avoid arp interpreting it as a duplicate. 
*/ static ace_t * ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr, @@ -653,8 +822,8 @@ ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr, if (ace->ace_proto_addr_length == proto_addr_length && ace->ace_proto == proto) { int i1 = proto_addr_length; - uchar_t *ace_addr = ace->ace_proto_addr; - uchar_t *mask = ace->ace_proto_mask; + uchar_t *ace_addr = ace->ace_proto_addr; + uchar_t *mask = ace->ace_proto_mask; /* * Note that the ace_proto_mask is applied to the @@ -703,12 +872,8 @@ ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length) * 1. Resolution of unresolved entries and update of resolved entries. * 2. Detection of nodes with our own IP address (duplicates). * - * This is complicated by ill groups. We don't currently have knowledge of ill - * groups, so we can't distinguish between a packet that comes in on one of the - * arls that's part of the group versus one that's on an unrelated arl. Thus, - * we take a conservative approach. If the arls match, then we update resolved - * and unresolved entries alike. If they don't match, then we update only - * unresolved entries. + * If the resolving ARL is in the same group as a matching ACE's ARL, then + * update the ACE. Otherwise, make no updates. * * For all entries, we first check to see if this is a duplicate (probable * loopback) message. If so, then just ignore it. 
@@ -741,7 +906,7 @@ ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length) static int ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr, - uint32_t hlen, const uchar_t *src_paddr, uint32_t plen) + uint32_t hlen, const uchar_t *src_paddr, uint32_t plen, arl_t **ace_arlp) { ace_t *ace; ace_t *ace_next; @@ -778,31 +943,35 @@ ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr, if (i1 >= 0) continue; + *ace_arlp = ace->ace_arl; + /* - * If both IP addr and hardware address match what we already - * have, then this is a broadcast packet emitted by one of our - * interfaces, reflected by the switch and received on another - * interface. We return AR_LOOPBACK. + * If the IP address is ours, and the hardware address matches + * one of our own arls, then this is a broadcast packet + * emitted by one of our interfaces, reflected by the switch + * and received on another interface. We return AR_LOOPBACK. */ - if ((ace->ace_flags & ACE_F_MYADDR) && - hlen == ace->ace_hw_addr_length && - bcmp(ace->ace_hw_addr, src_haddr, - ace->ace_hw_addr_length) == 0) { - return (AR_LOOPBACK); + if (ace->ace_flags & ACE_F_MYADDR) { + arl_t *hw_arl = as->as_arl_head; + arlphy_t *ap; + + for (; hw_arl != NULL; hw_arl = hw_arl->arl_next) { + ap = hw_arl->arl_phy; + if (ap != NULL && ap->ap_hw_addrlen == hlen && + bcmp(ap->ap_hw_addr, src_haddr, hlen) == 0) + return (AR_LOOPBACK); + } } /* * If the entry is unverified, then we've just verified that * someone else already owns this address, because this is a * message with the same protocol address but different - * hardware address. Conflicts received via an interface which - * doesn't own the conflict address are not actioned. Multiple - * interfaces on the same segment imply any conflict will also - * be seen via the correct interface, so we can ignore anything - * not matching the arl from the ace. + * hardware address. 
NOTE: the ace_xmit_arl check ensures we + * don't send duplicate AR_FAILEDs if arl is in an IPMP group. */ if ((ace->ace_flags & ACE_F_UNVERIFIED) && - arl == ace->ace_arl) { + arl == ace->ace_xmit_arl) { ar_ce_delete(ace); return (AR_FAILED); } @@ -814,30 +983,29 @@ ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr, * that, if we're currently in initial announcement mode, we * switch back to the lazier defense mode. Knowing that * there's at least one duplicate out there, we ought not - * blindly announce. Conflicts received via an interface which - * doesn't own the conflict address are not actioned. Multiple - * interfaces on the same segment imply the conflict will also - * be seen via the correct interface, so we can ignore anything - * not matching the arl from the ace. + * blindly announce. NOTE: the ace_xmit_arl check ensures we + * don't send duplicate AR_BOGONs if arl is in an IPMP group. */ if ((ace->ace_flags & ACE_F_AUTHORITY) && - arl == ace->ace_arl) { + arl == ace->ace_xmit_arl) { ace->ace_xmit_count = 0; return (AR_BOGON); } /* - * Limit updating across other ills to unresolved - * entries only. We don't want to inadvertently update - * published entries. + * Only update this ACE if it's on the same network -- i.e., + * it's for our ARL or another ARL in the same IPMP group. 
*/ - if (ace->ace_arl == arl || !ACE_RESOLVED(ace)) { + if (ace->ace_arl == arl || ace->ace_arl == arl->arl_ipmp_arl) { if (ar_ce_resolve(ace, src_haddr, hlen)) retv = AR_CHANGED; else if (retv == AR_NOTFOUND) retv = AR_MERGED; } } + + if (retv == AR_NOTFOUND) + *ace_arlp = NULL; return (retv); } @@ -917,7 +1085,7 @@ static void ar_delete_notify(const ace_t *ace) { const arl_t *arl = ace->ace_arl; - const arlphy_t *ap = arl->arl_phy; + const arlphy_t *ap = ace->ace_xmit_arl->arl_phy; mblk_t *mp; size_t len; arh_t *arh; @@ -945,7 +1113,7 @@ ar_close(queue_t *q) { ar_t *ar = (ar_t *)q->q_ptr; char name[LIFNAMSIZ]; - arl_t *arl; + arl_t *arl, *xarl; arl_t **arlp; cred_t *cr; arc_t *arc; @@ -999,6 +1167,21 @@ ar_close(queue_t *q) while (arl->arl_state != ARL_S_DOWN) qwait(arl->arl_rq); + if (arl->arl_flags & ARL_F_IPMP) { + /* + * Though rude, someone could force the IPMP arl + * closed without removing the underlying interfaces. + * In that case, force the ARLs out of the group. + */ + xarl = as->as_arl_head; + for (; xarl != NULL; xarl = xarl->arl_next) { + if (xarl->arl_ipmp_arl != arl || xarl == arl) + continue; + ar_ce_walk(as, ar_ce_ipmp_deactivate, xarl); + xarl->arl_ipmp_arl = NULL; + } + } + ar_ll_clear_defaults(arl); /* * If this is the control stream for an arl, delete anything @@ -1417,9 +1600,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) area_t *area; ace_t *ace; uchar_t *hw_addr; - uint32_t hw_addr_len; + uint32_t hw_addr_len; uchar_t *proto_addr; - uint32_t proto_addr_len; + uint32_t proto_addr_len; uchar_t *proto_mask; arl_t *arl; mblk_t *mp = mp_orig; @@ -1494,6 +1677,7 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) proto_mask, NULL, (uint32_t)0, + NULL, aflags & ~ACE_F_MAPPING & ~ACE_F_UNVERIFIED & ~ACE_F_DEFEND); if (err != 0) { DTRACE_PROBE3(eadd_create_failed, arl_t *, arl, area_t *, area, @@ -1502,7 +1686,13 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) } if (aflags & ACE_F_PUBLISH) { - arlphy_t *ap = arl->arl_phy; + arlphy_t *ap; + + ace = 
ar_ce_lookup(arl, area->area_proto, proto_addr, + proto_addr_len); + ASSERT(ace != NULL); + + ap = ace->ace_xmit_arl->arl_phy; if (hw_addr == NULL || hw_addr_len == 0) { hw_addr = ap->ap_hw_addr; @@ -1519,10 +1709,6 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) ap->ap_hw_addrlen = hw_addr_len; } - ace = ar_ce_lookup(arl, area->area_proto, proto_addr, - proto_addr_len); - ASSERT(ace != NULL); - if (ace->ace_flags & ACE_F_FAST) { ace->ace_xmit_count = as->as_fastprobe_count; ace->ace_xmit_interval = as->as_fastprobe_delay; @@ -1555,9 +1741,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) */ DTRACE_PROBE2(eadd_probe, ace_t *, ace, area_t *, area); - ar_xmit(arl, ARP_REQUEST, area->area_proto, - proto_addr_len, hw_addr, NULL, NULL, - proto_addr, NULL, as); + ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, + area->area_proto, proto_addr_len, + hw_addr, NULL, NULL, proto_addr, NULL, as); ace->ace_xmit_count--; ace->ace_xmit_interval = (ace->ace_flags & ACE_F_FAST) ? @@ -1573,9 +1759,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) } else { DTRACE_PROBE2(eadd_announce, ace_t *, ace, area_t *, area); - ar_xmit(arl, ARP_REQUEST, area->area_proto, - proto_addr_len, hw_addr, proto_addr, - ap->ap_arp_addr, proto_addr, NULL, as); + ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, + area->area_proto, proto_addr_len, hw_addr, + proto_addr, ap->ap_arp_addr, proto_addr, NULL, as); ace->ace_last_bcast = ddi_get_lbolt(); /* @@ -1583,9 +1769,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) * entry; we believe we're the authority for this * entry. In that case, and if we're not just doing * one-off defense of the address, we send more than - * one copy, so that if this is an IPMP failover, we'll - * still have a good chance of updating everyone even - * when there's a packet loss or two. + * one copy, so we'll still have a good chance of + * updating everyone even when there's a packet loss + * or two. 
*/ if ((aflags & ACE_F_AUTHORITY) && !(aflags & ACE_F_DEFEND) && @@ -1667,7 +1853,6 @@ static int ar_entry_query(queue_t *q, mblk_t *mp_orig) { ace_t *ace; - ace_t *src_ace = NULL; areq_t *areq; arl_t *arl; int err; @@ -1782,20 +1967,12 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) err = ENXIO; goto err_ret; } - if (arl->arl_phy == NULL) { - /* Can't get help if we don't know how. */ - DTRACE_PROBE2(query_no_phy, ace_t *, ace, - areq_t *, areq); - mpp[0] = NULL; - mp->b_prev = NULL; - err = ENXIO; - goto err_ret; - } DTRACE_PROBE2(query_unresolved, ace_t, ace, areq_t *, areq); } else { /* No ace yet. Make one now. (This is the common case.) */ - if (areq->areq_xmit_count == 0 || arl->arl_phy == NULL) { - DTRACE_PROBE2(query_phy, arl_t *, arl, areq_t *, areq); + if (areq->areq_xmit_count == 0) { + DTRACE_PROBE2(query_template, arl_t *, arl, + areq_t *, areq); mp->b_prev = NULL; err = ENXIO; goto err_ret; @@ -1814,9 +1991,9 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) err = EINVAL; goto err_ret; } - err = ar_ce_create(arl, areq->areq_proto, NULL, 0, + err = ar_ce_create(OWNING_ARL(arl), areq->areq_proto, NULL, 0, proto_addr, proto_addr_len, NULL, - NULL, (uint32_t)0, + NULL, (uint32_t)0, sender_addr, areq->areq_flags); if (err != 0) { DTRACE_PROBE3(query_create_failed, arl_t *, arl, @@ -1835,49 +2012,13 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) goto err_ret; } ace->ace_query_mp = mp; - /* - * We don't have group information here. But if the sender - * address belongs to a different arl, we might as well - * search the other arl for a resolved ACE. If we find one, - * we resolve it rather than sending out a ARP request. - */ - src_ace = ar_ce_lookup_permanent(as, areq->areq_proto, - sender_addr, areq->areq_sender_addr_length); - if (src_ace == NULL) { - DTRACE_PROBE3(query_source_missing, arl_t *, arl, - areq_t *, areq, ace_t *, ace); - ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); - /* - * ar_query_reply has already freed the mp. 
- * Return EINPROGRESS, so that caller won't attempt - * to free the 'mp' again. - */ - return (EINPROGRESS); - } - if (src_ace->ace_arl != ace->ace_arl) { - ace_t *dst_ace; - - /* - * Check for a resolved entry in the src_ace->ace_arl. - */ - dst_ace = ar_ce_lookup_entry(src_ace->ace_arl, - areq->areq_proto, proto_addr, proto_addr_len); - - if (dst_ace != NULL && ACE_RESOLVED(dst_ace)) { - DTRACE_PROBE3(query_other_arl, arl_t *, arl, - areq_t *, areq, ace_t *, dst_ace); - (void) ar_ce_resolve(ace, dst_ace->ace_hw_addr, - dst_ace->ace_hw_addr_length); - return (EINPROGRESS); - } - } } - ms = ar_query_xmit(as, ace, src_ace); + ms = ar_query_xmit(as, ace); if (ms == 0) { /* Immediate reply requested. */ ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); } else { - mi_timer(arl->arl_wq, ace->ace_mp, ms); + mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, ms); } return (EINPROGRESS); err_ret: @@ -2073,6 +2214,80 @@ done: } /* + * Given an arie_t `mp', find the arl_t's that it names and return them + * in `*arlp' and `*ipmp_arlp'. If they cannot be found, return B_FALSE. + */ +static boolean_t +ar_ipmp_lookup(arp_stack_t *as, mblk_t *mp, arl_t **arlp, arl_t **ipmp_arlp) +{ + arie_t *arie = (arie_t *)mp->b_rptr; + + *arlp = ar_ll_lookup_from_mp(as, mp); + if (*arlp == NULL) { + DTRACE_PROBE1(ipmp_lookup_no_arl, mblk_t *, mp); + return (B_FALSE); + } + + arie->arie_grifname[LIFNAMSIZ - 1] = '\0'; + *ipmp_arlp = ar_ll_lookup_by_name(as, arie->arie_grifname); + if (*ipmp_arlp == NULL) { + DTRACE_PROBE1(ipmp_lookup_no_ipmp_arl, mblk_t *, mp); + return (B_FALSE); + } + + DTRACE_PROBE2(ipmp_lookup, arl_t *, *arlp, arl_t *, *ipmp_arlp); + return (B_TRUE); +} + +/* + * Bind an arl_t to an IPMP group arl_t. 
+ */ +static int +ar_ipmp_activate(queue_t *q, mblk_t *mp) +{ + arl_t *arl, *ipmp_arl; + arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; + + if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl)) + return (EINVAL); + + if (arl->arl_ipmp_arl != NULL) { + DTRACE_PROBE1(ipmp_activated_already, arl_t *, arl); + return (EALREADY); + } + + DTRACE_PROBE2(ipmp_activate, arl_t *, arl, arl_t *, ipmp_arl); + arl->arl_ipmp_arl = ipmp_arl; + return (0); +} + +/* + * Unbind an arl_t from an IPMP group arl_t and update the ace_t's so + * that it is no longer part of the group. + */ +static int +ar_ipmp_deactivate(queue_t *q, mblk_t *mp) +{ + arl_t *arl, *ipmp_arl; + arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; + + if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl)) + return (EINVAL); + + if (ipmp_arl != arl->arl_ipmp_arl) { + DTRACE_PROBE2(ipmp_deactivate_notactive, arl_t *, arl, arl_t *, + ipmp_arl); + return (EINVAL); + } + + DTRACE_PROBE2(ipmp_deactivate, arl_t *, arl, arl_t *, + arl->arl_ipmp_arl); + ar_ce_walk(as, ar_ce_ipmp_deactivate, arl); + arl->arl_ipmp_arl = NULL; + return (0); +} + +/* * Enable an interface to process ARP_REQUEST and ARP_RESPONSE messages. */ /* ARGSUSED */ @@ -2199,6 +2414,11 @@ ar_ll_init(arp_stack_t *as, ar_t *ar, mblk_t *mp) if ((arl = (arl_t *)mi_zalloc(sizeof (arl_t))) == NULL) return; + if (dlia->dl_mac_type == SUNW_DL_IPMP) { + arl->arl_flags |= ARL_F_IPMP; + arl->arl_ipmp_arl = arl; + } + arl->arl_provider_style = dlia->dl_provider_style; arl->arl_rq = ar->ar_rq; arl->arl_wq = ar->ar_wq; @@ -2261,7 +2481,7 @@ ar_ll_set_defaults(arl_t *arl, mblk_t *mp) dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr; dl_unitdata_req_t *dlur; uchar_t *up; - arlphy_t *ap; + arlphy_t *ap; ASSERT(arl != NULL); @@ -2270,6 +2490,14 @@ ar_ll_set_defaults(arl_t *arl, mblk_t *mp) */ ar_ll_clear_defaults(arl); + if (arl->arl_flags & ARL_F_IPMP) { + /* + * If this is an IPMP arl_t, we have nothing to do, + * since we will never transmit or receive. 
+ */ + return; + } + ap = kmem_zalloc(sizeof (arlphy_t), KM_NOSLEEP); if (ap == NULL) goto bad; @@ -2470,12 +2698,12 @@ ar_mapping_add(queue_t *q, mblk_t *mp_orig) mblk_t *mp = mp_orig; ace_t *ace; uchar_t *hw_addr; - uint32_t hw_addr_len; + uint32_t hw_addr_len; uchar_t *proto_addr; - uint32_t proto_addr_len; + uint32_t proto_addr_len; uchar_t *proto_mask; uchar_t *proto_extract_mask; - uint32_t hw_extract_start; + uint32_t hw_extract_start; arl_t *arl; arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; @@ -2524,6 +2752,7 @@ ar_mapping_add(queue_t *q, mblk_t *mp_orig) proto_mask, proto_extract_mask, hw_extract_start, + NULL, arma->arma_flags | ACE_F_MAPPING)); } @@ -2857,12 +3086,12 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, uint32_t proto_addr_len) { mblk_t *areq_mp; - arl_t *arl = ace->ace_arl; mblk_t *mp; mblk_t *xmit_mp; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); + queue_t *arl_wq = ace->ace_arl->arl_wq; + arp_stack_t *as = ARL_TO_ARPSTACK(ace->ace_arl); ip_stack_t *ipst = as->as_netstack->netstack_ip; - arlphy_t *ap = arl->arl_phy; + arlphy_t *ap = ace->ace_xmit_arl->arl_phy; /* * On error or completion for a query, we need to shut down the timer. @@ -2870,7 +3099,8 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, * Duplicate Address Detection, or it will never finish that phase. */ if (!(ace->ace_flags & (ACE_F_UNVERIFIED | ACE_F_AUTHORITY))) - mi_timer(arl->arl_wq, ace->ace_mp, -1L); + mi_timer(arl_wq, ace->ace_mp, -1L); + /* Establish the return value appropriate. */ if (ret_val == 0) { if (!ACE_RESOLVED(ace) || ap == NULL) @@ -2973,25 +3203,24 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, */ ar_ce_delete(ace); } else { - mi_timer(arl->arl_wq, ace->ace_mp, - as->as_cleanup_interval); + mi_timer(arl_wq, ace->ace_mp, as->as_cleanup_interval); } } } /* * Returns number of milliseconds after which we should either rexmit or abort. - * Return of zero means we should abort. 
src_ace is the ace corresponding - * to the source address in the areq sent by IP. + * Return of zero means we should abort. */ static clock_t -ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace) +ar_query_xmit(arp_stack_t *as, ace_t *ace) { areq_t *areq; mblk_t *mp; uchar_t *proto_addr; uchar_t *sender_addr; - arl_t *src_arl; + ace_t *src_ace; + arl_t *xmit_arl = ace->ace_xmit_arl; mp = ace->ace_query_mp; /* @@ -3016,18 +3245,15 @@ ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace) areq->areq_sender_addr_length); /* - * Get the source h/w address for the sender addr. With interface - * groups, IP sends us source address belonging to a different - * interface. + * Get the ace for the sender address, so that we can verify that + * we have one and that DAD has completed. */ + src_ace = ar_ce_lookup(xmit_arl, areq->areq_proto, sender_addr, + areq->areq_sender_addr_length); if (src_ace == NULL) { - src_ace = ar_ce_lookup_permanent(as, areq->areq_proto, - sender_addr, areq->areq_sender_addr_length); - if (src_ace == NULL) { - DTRACE_PROBE3(xmit_no_source, ace_t *, ace, - areq_t *, areq, uchar_t *, sender_addr); - return (0); - } + DTRACE_PROBE3(xmit_no_source, ace_t *, ace, areq_t *, areq, + uchar_t *, sender_addr); + return (0); } /* @@ -3044,18 +3270,12 @@ ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace) return (areq->areq_xmit_interval); } - /* - * Transmit on src_arl. We should transmit on src_arl. Otherwise - * the switch will send back a copy on other interfaces of the - * same group and as we could be using somebody else's source - * address + hardware address, ARP will treat this as a bogon. 
- */ - src_arl = src_ace->ace_arl; DTRACE_PROBE3(xmit_send, ace_t *, ace, ace_t *, src_ace, areq_t *, areq); - ar_xmit(src_arl, ARP_REQUEST, areq->areq_proto, - areq->areq_sender_addr_length, src_arl->arl_phy->ap_hw_addr, - sender_addr, src_arl->arl_phy->ap_arp_addr, proto_addr, NULL, as); + + ar_xmit(xmit_arl, ARP_REQUEST, areq->areq_proto, + areq->areq_sender_addr_length, xmit_arl->arl_phy->ap_hw_addr, + sender_addr, xmit_arl->arl_phy->ap_arp_addr, proto_addr, NULL, as); src_ace->ace_last_bcast = ddi_get_lbolt(); return (areq->areq_xmit_interval); } @@ -3066,6 +3286,7 @@ ar_rput(queue_t *q, mblk_t *mp) { arh_t *arh; arl_t *arl; + arl_t *client_arl; ace_t *dst_ace; uchar_t *dst_paddr; int err; @@ -3079,6 +3300,8 @@ ar_rput(queue_t *q, mblk_t *mp) uchar_t *src_paddr; uchar_t *dst_haddr; boolean_t is_probe; + boolean_t is_unicast = B_FALSE; + dl_unitdata_ind_t *dlindp; int i; arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; @@ -3135,9 +3358,10 @@ ar_rput(queue_t *q, mblk_t *mp) return; case M_PCPROTO: case M_PROTO: + dlindp = (dl_unitdata_ind_t *)mp->b_rptr; if (MBLKL(mp) >= sizeof (dl_unitdata_ind_t) && - ((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive == - DL_UNITDATA_IND) { + dlindp->dl_primitive == DL_UNITDATA_IND) { + is_unicast = (dlindp->dl_group_address == 0); arl = ((ar_t *)q->q_ptr)->ar_arl; if (arl != NULL && arl->arl_phy != NULL) { /* Real messages from the wire! */ @@ -3261,19 +3485,24 @@ ar_rput(queue_t *q, mblk_t *mp) * RFC 826: first check if the <protocol, sender protocol address> is * in the cache, if there is a sender protocol address. Note that this * step also handles resolutions based on source. + * + * Note that IP expects that each notification it receives will be + * tied to the ill it received it on. Thus, we must talk to it over + * the arl tied to the resolved IP address (if any), hence client_arl. 
*/ if (is_probe) err = AR_NOTFOUND; else err = ar_ce_resolve_all(arl, proto, src_haddr, hlen, src_paddr, - plen); + plen, &client_arl); + switch (err) { case AR_BOGON: - ar_client_notify(arl, mp1, AR_CN_BOGON); + ar_client_notify(client_arl, mp1, AR_CN_BOGON); mp1 = NULL; break; case AR_FAILED: - ar_client_notify(arl, mp1, AR_CN_FAILED); + ar_client_notify(client_arl, mp1, AR_CN_FAILED); mp1 = NULL; break; case AR_LOOPBACK: @@ -3293,7 +3522,9 @@ ar_rput(queue_t *q, mblk_t *mp) * Now look up the destination address. By RFC 826, we ignore the * packet at this step if the target isn't one of our addresses. This * is true even if the target is something we're trying to resolve and - * the packet is a response. + * the packet is a response. To avoid duplicate responses, we also + * ignore the packet if it was multicast/broadcast to an arl that's in + * an IPMP group but was not the designated xmit_arl for the ACE. * * Note that in order to do this correctly, we need to know when to * notify IP of a change implied by the source address of the ARP @@ -3304,6 +3535,7 @@ ar_rput(queue_t *q, mblk_t *mp) */ dst_ace = ar_ce_lookup_entry(arl, proto, dst_paddr, plen); if (dst_ace == NULL || !ACE_RESOLVED(dst_ace) || + (dst_ace->ace_xmit_arl != arl && !is_unicast) || !(dst_ace->ace_flags & ACE_F_PUBLISH)) { /* * Let the client know if the source mapping has changed, even @@ -3311,7 +3543,7 @@ ar_rput(queue_t *q, mblk_t *mp) * client. */ if (err == AR_CHANGED) - ar_client_notify(arl, mp1, AR_CN_ANNOUNCE); + ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); else freemsg(mp1); freeb(mp); @@ -3341,6 +3573,7 @@ ar_rput(queue_t *q, mblk_t *mp) "arp_rput_end: q %p (%S)", q, "reflection"); return; } + /* * Conflicts seen via the wrong interface may be bogus. * Multiple interfaces on the same segment imply any conflict @@ -3378,12 +3611,21 @@ ar_rput(queue_t *q, mblk_t *mp) * the src_paddr field before sending it to IP. 
The same is * required for probes, where src_paddr will be INADDR_ANY. */ - if (is_probe || op == ARP_RESPONSE) { + if (is_probe) { + /* + * In this case, client_arl will be invalid (e.g., + * since probes don't have a valid sender address). + * But dst_ace has the appropriate arl. + */ bcopy(dst_paddr, src_paddr, plen); - ar_client_notify(arl, mp1, AR_CN_FAILED); + ar_client_notify(dst_ace->ace_arl, mp1, AR_CN_FAILED); + ar_ce_delete(dst_ace); + } else if (op == ARP_RESPONSE) { + bcopy(dst_paddr, src_paddr, plen); + ar_client_notify(client_arl, mp1, AR_CN_FAILED); ar_ce_delete(dst_ace); } else if (err == AR_CHANGED) { - ar_client_notify(arl, mp1, AR_CN_ANNOUNCE); + ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); } else { DTRACE_PROBE3(rput_request_unverified, arl_t *, arl, arh_t *, arh, ace_t *, dst_ace); @@ -3431,19 +3673,19 @@ ar_rput(queue_t *q, mblk_t *mp) dst_ace->ace_hw_addr, dst_ace->ace_proto_addr, src_haddr, src_paddr, dstaddr, as); if (!is_probe && err == AR_NOTFOUND && - ar_ce_create(arl, proto, src_haddr, hlen, src_paddr, plen, - NULL, NULL, 0, 0) == 0) { + ar_ce_create(OWNING_ARL(arl), proto, src_haddr, hlen, + src_paddr, plen, NULL, NULL, 0, NULL, 0) == 0) { ace_t *ace; ace = ar_ce_lookup(arl, proto, src_paddr, plen); ASSERT(ace != NULL); - mi_timer(arl->arl_wq, ace->ace_mp, + mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, as->as_cleanup_interval); } } if (err == AR_CHANGED) { freeb(mp); - ar_client_notify(arl, mp1, AR_CN_ANNOUNCE); + ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, "arp_rput_end: q %p (%S)", q, "reqchange"); } else { @@ -3459,7 +3701,7 @@ ar_ce_restart_dad(ace_t *ace, void *arl_arg) arl_t *arl = arl_arg; arp_stack_t *as = ARL_TO_ARPSTACK(arl); - if ((ace->ace_arl == arl) && + if ((ace->ace_xmit_arl == arl) && (ace->ace_flags & (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) == (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) { /* @@ -4060,9 +4302,9 @@ ar_wput(queue_t *q, mblk_t *mp) static boolean_t 
arp_say_ready(ace_t *ace) { - mblk_t *mp; + mblk_t *mp; arl_t *arl = ace->ace_arl; - arlphy_t *ap = arl->arl_phy; + arlphy_t *ap = ace->ace_xmit_arl->arl_phy; arh_t *arh; uchar_t *cp; @@ -4107,7 +4349,7 @@ ace_reschedule(ace_t *ace, void *arg) ace_t **acemax; ace_t *atemp; - if (ace->ace_arl != art->art_arl) + if (ace->ace_xmit_arl != art->art_arl) return; /* * Only published entries that are ready for announcement are eligible. @@ -4179,7 +4421,6 @@ static void ar_wsrv(queue_t *q) { ace_t *ace; - arl_t *arl; arlphy_t *ap; mblk_t *mp; clock_t ms; @@ -4196,8 +4437,7 @@ ar_wsrv(queue_t *q) ace = (ace_t *)mp->b_rptr; if (ace->ace_flags & ACE_F_DYING) continue; - arl = ace->ace_arl; - ap = arl->arl_phy; + ap = ace->ace_xmit_arl->arl_phy; if (ace->ace_flags & ACE_F_UNVERIFIED) { ASSERT(ace->ace_flags & ACE_F_PUBLISH); ASSERT(ace->ace_query_mp == NULL); @@ -4216,7 +4456,7 @@ ar_wsrv(queue_t *q) DTRACE_PROBE1(timer_probe, ace_t *, ace); ace->ace_xmit_count--; - ar_xmit(arl, ARP_REQUEST, + ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, ace->ace_proto, ace->ace_proto_addr_length, ace->ace_hw_addr, NULL, NULL, @@ -4247,7 +4487,7 @@ ar_wsrv(queue_t *q) now - ap->ap_defend_start > SEC_TO_TICK(as->as_defend_period)) { ap->ap_defend_start = now; - arl_reschedule(arl); + arl_reschedule(ace->ace_xmit_arl); } /* * Finish the job that we started in @@ -4288,12 +4528,12 @@ ar_wsrv(queue_t *q) DTRACE_PROBE1(timer_defend, ace_t *, ace); } - ar_xmit(arl, ARP_REQUEST, + ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, ace->ace_proto, ace->ace_proto_addr_length, ace->ace_hw_addr, ace->ace_proto_addr, - ap->ap_arp_addr, + ace->ace_xmit_arl->arl_phy->ap_arp_addr, ace->ace_proto_addr, NULL, as); ace->ace_last_bcast = now; if (ace->ace_xmit_count == 0) @@ -4316,7 +4556,8 @@ ar_wsrv(queue_t *q) ndp_lookup_ipaddr(*(ipaddr_t *) ace->ace_proto_addr, as->as_netstack)) { ace->ace_flags |= ACE_F_OLD; - mi_timer(arl->arl_wq, ace->ace_mp, + mi_timer(ace->ace_arl->arl_wq, + ace->ace_mp, as->as_cleanup_interval); } 
else { ar_delete_notify(ace); @@ -4333,7 +4574,7 @@ ar_wsrv(queue_t *q) * we complete the operation with a failure indication. * Otherwise, we restart the timer. */ - ms = ar_query_xmit(as, ace, NULL); + ms = ar_query_xmit(as, ace); if (ms == 0) ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); else @@ -4360,6 +4601,8 @@ ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, uint32_t plen, mblk_t *mp; arlphy_t *ap = arl->arl_phy; + ASSERT(!(arl->arl_flags & ARL_F_IPMP)); + if (ap == NULL) { DTRACE_PROBE1(xmit_no_arl_phy, arl_t *, arl); return; diff --git a/usr/src/uts/common/inet/arp_impl.h b/usr/src/uts/common/inet/arp_impl.h index a2564d5602..f16fdc97a0 100644 --- a/usr/src/uts/common/inet/arp_impl.h +++ b/usr/src/uts/common/inet/arp_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,6 +67,7 @@ typedef struct arl_s { uint_t arl_closing : 1; /* stream is closing */ uint32_t arl_index; /* instance number */ struct arlphy_s *arl_phy; /* physical info, if any */ + struct arl_s *arl_ipmp_arl; /* pointer to group arl_t */ } arl_t; /* @@ -75,7 +76,7 @@ typedef struct arl_s { */ #define ARL_TO_ARPSTACK(_arl) (((ar_t *)(_arl)->arl_rq->q_ptr)->ar_as) -/* ARL physical info structure for a link level device */ +/* ARL physical info structure, one per physical link level device */ typedef struct arlphy_s { uint32_t ap_arp_hw_type; /* hardware type */ uchar_t *ap_arp_addr; /* multicast address to use */ @@ -110,6 +111,7 @@ typedef struct ace_s { clock_t ace_last_bcast; /* last broadcast Response */ clock_t ace_xmit_interval; int ace_xmit_count; + arl_t *ace_xmit_arl; /* xmit on this arl */ } ace_t; #define ARPHOOK_INTERESTED_PHYSICAL_IN(as) \ @@ -216,6 +218,7 @@ struct arp_stack { typedef struct arp_stack arp_stack_t; #define ARL_F_NOARP 0x01 +#define ARL_F_IPMP 0x02 #define ARL_S_DOWN 0x00 #define ARL_S_PENDING 
0x01 diff --git a/usr/src/uts/common/inet/dlpistub/dlpistub.c b/usr/src/uts/common/inet/dlpistub/dlpistub.c new file mode 100644 index 0000000000..961876ac47 --- /dev/null +++ b/usr/src/uts/common/inet/dlpistub/dlpistub.c @@ -0,0 +1,370 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * DLPI stub driver; currently supports VNI and IPMP stub devices. + */ + +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/dlpi.h> +#include <sys/stat.h> +#include <sys/strsun.h> +#include <sys/stropts.h> +#include <sys/types.h> +#include <sys/id_space.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/modctl.h> +#include <sys/mkdev.h> +#include <sys/sdt.h> + +#include "dlpistub_impl.h" + +static id_space_t *ds_minors; +static dev_info_t *ds_dip; + +/* + * DL_INFO_ACK template. 
+ */ +static dl_info_ack_t ds_infoack = { + DL_INFO_ACK, /* dl_primitive */ + 0, /* dl_max_sdu */ + 0, /* dl_min_sdu */ + 0, /* dl_addr_length */ + 0, /* dl_mac_type */ + 0, /* dl_reserved */ + 0, /* dl_current_state */ + 0, /* dl_sap_length */ + DL_CLDLS, /* dl_service_mode */ + 0, /* dl_qos_length */ + 0, /* dl_qos_offset */ + 0, /* dl_qos_range_length */ + 0, /* dl_qos_range_offset */ + DL_STYLE2, /* dl_provider_style */ + 0, /* dl_addr_offset */ + DL_VERSION_2, /* dl_version */ + 0, /* dl_brdcst_addr_length */ + 0, /* dl_brdcst_addr_offset */ + 0 /* dl_growth */ +}; + +static int +ds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, "vni", S_IFCHR, DS_MINOR_VNI, + DDI_PSEUDO, 0) == DDI_FAILURE || + ddi_create_minor_node(dip, "ipmpstub", S_IFCHR, DS_MINOR_IPMP, + DDI_PSEUDO, 0) == DDI_FAILURE) { + ddi_remove_minor_node(dip, NULL); + cmn_err(CE_NOTE, "ds_attach: cannot create minor nodes"); + return (DDI_FAILURE); + } + + ds_dip = dip; + ds_minors = id_space_create("ds_minors", DS_MINOR_START, MAXMIN32); + return (DDI_SUCCESS); +} + +static int +ds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + id_space_destroy(ds_minors); + ds_minors = NULL; + ASSERT(dip == ds_dip); + ddi_remove_minor_node(dip, NULL); + ds_dip = NULL; + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +static int +ds_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + int error = DDI_FAILURE; + + switch (infocmd) { + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2DEVINFO: + if (ds_dip != NULL) { + *result = ds_dip; + error = DDI_SUCCESS; + } + break; + } + return (error); +} + +/* ARGSUSED */ +static int +ds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) +{ + int type; + dlpistub_t *dsp; + + if (sflag == CLONEOPEN || sflag == MODOPEN) + return (EINVAL); + + if 
(q->q_ptr != NULL) + return (0); + + switch (getminor(*devp)) { + case DS_MINOR_VNI: + type = SUNW_DL_VNI; + break; + case DS_MINOR_IPMP: + type = SUNW_DL_IPMP; + break; + default: + return (ENXIO); + } + + dsp = kmem_zalloc(sizeof (dlpistub_t), KM_SLEEP); + dsp->ds_type = type; + dsp->ds_minor = (minor_t)id_alloc(ds_minors); + dsp->ds_state = DL_UNATTACHED; + *devp = makedevice(getmajor(*devp), dsp->ds_minor); + q->q_ptr = WR(q)->q_ptr = dsp; + qprocson(q); + + return (0); +} + +/* ARGSUSED */ +static int +ds_close(queue_t *q, int flag, cred_t *credp) +{ + dlpistub_t *dsp = q->q_ptr; + + qprocsoff(q); + q->q_ptr = WR(q)->q_ptr = NULL; + + id_free(ds_minors, dsp->ds_minor); + kmem_free(dsp, sizeof (dlpistub_t)); + + return (0); +} + +static int +ds_badprim(queue_t *q, mblk_t *mp, t_scalar_t prim) +{ + dlerrorack(q, mp, prim, DL_BADPRIM, 0); + return (0); +} + +static int +ds_outstate(queue_t *q, mblk_t *mp, t_scalar_t prim) +{ + dlerrorack(q, mp, prim, DL_OUTSTATE, 0); + return (0); +} + +static int +ds_wput(queue_t *q, mblk_t *mp) +{ + union DL_primitives *dlp; + dl_info_ack_t *dlip; + dlpistub_t *dsp = q->q_ptr; + t_scalar_t prim; + + switch (DB_TYPE(mp)) { + case M_PROTO: + case M_PCPROTO: + if (MBLKL(mp) < sizeof (t_scalar_t)) { + dlerrorack(q, mp, DL_PRIM_INVAL, DL_UNSUPPORTED, 0); + return (0); + } + + dlp = (void *)mp->b_rptr; + prim = dlp->dl_primitive; + switch (prim) { + case DL_ATTACH_REQ: + if (MBLKL(mp) < DL_ATTACH_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + if (dsp->ds_state != DL_UNATTACHED) + return (ds_outstate(q, mp, prim)); + + dsp->ds_state = DL_UNBOUND; + dlokack(q, mp, DL_ATTACH_REQ); + break; + + case DL_BIND_REQ: + if (MBLKL(mp) < DL_BIND_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + if (dsp->ds_state != DL_UNBOUND) + return (ds_outstate(q, mp, prim)); + + dsp->ds_state = DL_IDLE; + dlbindack(q, mp, dlp->bind_req.dl_sap, NULL, 0, 0, 0); + break; + + case DL_INFO_REQ: + if (MBLKL(mp) < DL_INFO_REQ_SIZE) + return (ds_badprim(q, mp, 
prim)); + + mp = mexchange(q, mp, sizeof (dl_info_ack_t), + M_PCPROTO, DL_INFO_ACK); + if (mp != NULL) { + dlip = (void *)mp->b_rptr; + *dlip = ds_infoack; + dlip->dl_mac_type = dsp->ds_type; + dlip->dl_current_state = dsp->ds_state; + qreply(q, mp); + } + break; + + case DL_PHYS_ADDR_REQ: + if (MBLKL(mp) < DL_PHYS_ADDR_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + dlphysaddrack(q, mp, NULL, 0); + break; + + case DL_UNBIND_REQ: + if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + if (dsp->ds_state != DL_IDLE) + return (ds_outstate(q, mp, prim)); + + dsp->ds_state = DL_UNBOUND; + dlokack(q, mp, DL_UNBIND_REQ); + break; + + case DL_DETACH_REQ: + if (MBLKL(mp) < DL_DETACH_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + if (dsp->ds_state != DL_UNBOUND) + return (ds_outstate(q, mp, prim)); + + dsp->ds_state = DL_UNATTACHED; + dlokack(q, mp, DL_DETACH_REQ); + break; + + case DL_UNITDATA_REQ: + DTRACE_PROBE2(dlpistub__data, dlpistub_t *, dsp, + mblk_t *, mp); + freemsg(mp); + break; + + default: + dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0); + } + break; + + case M_IOCTL: + miocnak(q, mp, 0, EINVAL); + break; + + case M_FLUSH: + *mp->b_rptr &= ~FLUSHW; + if (*mp->b_rptr & FLUSHR) + qreply(q, mp); + else + freemsg(mp); + break; + default: + freemsg(mp); + break; + } + + return (0); +} + +static struct module_info ds_minfo = { + DS_IDNUM, /* mi_idnum */ + "dlpistub", /* mi_idname */ + 0, /* mi_minpsz */ + INFPSZ, /* mi_maxpsz */ + 0, /* mi_hiwat */ + 0, /* mi_lowat */ +}; + +static struct qinit ds_rinit = { + NULL, /* qi_putp */ + NULL, /* qi_srvp */ + ds_open, /* qi_qopen */ + ds_close, /* qi_qclose */ + NULL, /* qi_qadmin */ + &ds_minfo, /* qi_minfo */ +}; + +static struct qinit ds_winit = { + ds_wput, /* qi_putp */ + NULL, /* qi_srvp */ + NULL, /* qi_qopen */ + NULL, /* qi_qclose */ + NULL, /* qi_qadmin */ + &ds_minfo, /* qi_minfo */ +}; + +static struct streamtab ds_info = { + &ds_rinit, /* st_rdinit */ + &ds_winit /* st_wrinit */ +}; + 
+DDI_DEFINE_STREAM_OPS(ds_ops, nulldev, nulldev, ds_attach, ds_detach, + nodev, ds_devinfo, D_MP|D_MTPERMOD, &ds_info, ddi_quiesce_not_supported); + +static struct modldrv modldrv = { + &mod_driverops, + "DLPI stub driver", + &ds_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/inet/vni/vni.conf b/usr/src/uts/common/inet/dlpistub/dlpistub.conf index d79915e01c..72264ca466 100644 --- a/usr/src/uts/common/inet/vni/vni.conf +++ b/usr/src/uts/common/inet/dlpistub/dlpistub.conf @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,10 +19,7 @@ # CDDL HEADER END # # -# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # - -#ident "%Z%%M% %I% %E% SMI" -# -name="vni" parent="pseudo" instance=0; +name="dlpistub" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h b/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h new file mode 100644 index 0000000000..ece15320ee --- /dev/null +++ b/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _INET_DLPISTUB_IMPL_H +#define _INET_DLPISTUB_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> + +typedef struct dlpistub { + int ds_type; /* DLPI MAC type */ + t_uscalar_t ds_state; /* DLPI state */ + minor_t ds_minor; /* corresponding minor */ +} dlpistub_t; + +#define DS_IDNUM 0x2a84 + +enum { DS_MINOR_VNI = 1, DS_MINOR_IPMP, DS_MINOR_START }; + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_DLPISTUB_IMPL_H */ diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index 323c8fd0de..41595280cb 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -56,6 +56,7 @@ extern "C" { #include <net/route.h> #include <sys/systm.h> #include <sys/multidata.h> +#include <sys/list.h> #include <net/radix.h> #include <sys/modhash.h> @@ -565,15 +566,21 @@ typedef struct ipha_s { #define IPH_ECN_ECT0 0x2 /* ECN-Capable Transport, ECT(0) */ #define IPH_ECN_CE 0x3 /* ECN-Congestion Experienced (CE) */ +struct ill_s; + +typedef boolean_t ip_v6intfid_func_t(struct ill_s *, in6_addr_t *); +typedef boolean_t ip_v6mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *, + in6_addr_t *); +typedef boolean_t ip_v4mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *, + ipaddr_t *); + /* IP Mac info structure */ typedef struct ip_m_s { - t_uscalar_t ip_m_mac_type; /* From <sys/dlpi.h> */ - int ip_m_type; /* From <net/if_types.h> */ - boolean_t (*ip_m_v4mapinfo)(uint_t, uint8_t *, uint8_t *, - uint32_t *, ipaddr_t *); - boolean_t (*ip_m_v6mapinfo)(uint_t, uint8_t *, uint8_t *, - uint32_t *, in6_addr_t *); - boolean_t (*ip_m_v6intfid)(uint_t, uint8_t *, in6_addr_t *); + t_uscalar_t ip_m_mac_type; /* From <sys/dlpi.h> */ + int ip_m_type; /* From <net/if_types.h> */ + ip_v4mapinfo_func_t *ip_m_v4mapinfo; + ip_v6mapinfo_func_t *ip_m_v6mapinfo; + ip_v6intfid_func_t *ip_m_v6intfid; } ip_m_t; /* @@ -583,18 +590,22 @@ typedef struct ip_m_s { * layer multicast address range. * b. map from IPv6 multicast address range (ff00::/8) to the link * layer multicast address range. - * c. derive the default IPv6 interface identifier from the link layer - * address. + * c. derive the default IPv6 interface identifier from the interface. + * d. derive the default IPv6 destination interface identifier from + * the interface (point-to-point only). 
*/ #define MEDIA_V4MINFO(ip_m, plen, bphys, maddr, hwxp, v4ptr) \ (((ip_m)->ip_m_v4mapinfo != NULL) && \ (*(ip_m)->ip_m_v4mapinfo)(plen, bphys, maddr, hwxp, v4ptr)) -#define MEDIA_V6INTFID(ip_m, plen, phys, v6ptr) \ - (((ip_m)->ip_m_v6intfid != NULL) && \ - (*(ip_m)->ip_m_v6intfid)(plen, phys, v6ptr)) #define MEDIA_V6MINFO(ip_m, plen, bphys, maddr, hwxp, v6ptr) \ (((ip_m)->ip_m_v6mapinfo != NULL) && \ (*(ip_m)->ip_m_v6mapinfo)(plen, bphys, maddr, hwxp, v6ptr)) +#define MEDIA_V6INTFID(ip_m, ill, v6ptr) \ + (((ip_m)->ip_m_v6intfid != NULL) && \ + (*(ip_m)->ip_m_v6intfid)(ill, v6ptr)) +#define MEDIA_V6DESTINTFID(ip_m, ill, v6ptr) \ + (((ip_m)->ip_m_v6destintfid != NULL) && \ + (*(ip_m)->ip_m_v6destintfid)(ill, v6ptr)) /* Router entry types */ #define IRE_BROADCAST 0x0001 /* Route entry for broadcast address */ @@ -621,18 +632,12 @@ typedef struct ip_m_s { * the bucket should delete this IRE from this bucket. */ #define IRE_MARK_CONDEMNED 0x0001 + /* - * If a broadcast IRE is marked with IRE_MARK_NORECV, ip_rput will drop the - * broadcast packets received on that interface. This is marked only - * on broadcast ires. Employed by IPMP, where we have multiple NICs on the - * same subnet receiving the same broadcast packet. - */ -#define IRE_MARK_NORECV 0x0002 -/* - * IRE_CACHE marked this way won't be returned by ire_cache_lookup. Need - * to look specifically using MATCH_IRE_MARK_HIDDEN. Used by IPMP. + * An IRE with IRE_MARK_TESTHIDDEN is used by in.mpathd for test traffic. It + * can only be looked up by requesting MATCH_IRE_MARK_TESTHIDDEN. */ -#define IRE_MARK_HIDDEN 0x0004 /* Typically Used by in.mpathd */ +#define IRE_MARK_TESTHIDDEN 0x0004 /* * An IRE with IRE_MARK_NOADD is created in ip_newroute_ipif when the outgoing @@ -788,45 +793,18 @@ typedef struct mrec_s { * ilm records the state of multicast memberships with the driver and is * maintained per interface. * - * Notes : - * - * 1) There is no direct link between a given ilg and ilm. 
If the - * application has joined a group G with ifindex I, we will have - * an ilg with ilg_v6group and ilg_ill. There will be a corresponding - * ilm with ilm_ill/ilm_v6addr recording the multicast membership. - * To delete the membership, - * - * a) Search for ilg matching on G and I with ilg_v6group - * and ilg_ill. Delete ilg_ill. - * b) Search the corresponding ilm matching on G and I with - * ilm_v6addr and ilm_ill. Delete ilm. - * - * In IPv4, the only difference is, we look using ipifs instead of - * ills. - * - * 2) With IP multipathing, we want to keep receiving even after the - * interface has failed. We do this by moving multicast memberships - * to a new_ill within the group. This is achieved by sending - * DL_DISABMULTI_REQS on ilg_ill/ilm_ill and sending DL_ENABMULTIREQS - * on the new_ill and changing ilg_ill/ilm_ill to new_ill. But, we - * need to be able to delete memberships which will still come down - * with the ifindex of the old ill which is what the application - * knows of. Thus we store the ilm_/ilg_orig_ifindex to keep track - * of where we joined initially so that we can lookup even after we - * moved the membership. It is also used for moving back the membership - * when the old ill has been repaired. This is done by looking up for - * ilms with ilm_orig_ifindex matching on the old ill's ifindex. Only - * ilms actually move from old ill to new ill. ilgs don't move (just - * the ilg_ill is changed when it moves) as it just records the state - * of the application that has joined a group G where as ilm records - * the state joined with the driver. Thus when we send DL_XXXMULTI_REQs - * we also need to keep the ilm in the right ill. - * - * In IPv4, as ipifs move from old ill to new_ill, ilgs and ilms move - * implicitly as we use only ipifs in IPv4. Thus, one can always lookup - * a given ilm/ilg even after it fails without the support of - * orig_ifindex. We move ilms still to record the driver state as - * mentioned above. 
+ * There is no direct link between a given ilg and ilm. If the + * application has joined a group G with ifindex I, we will have + * an ilg with ilg_v6group and ilg_ill. There will be a corresponding + * ilm with ilm_ill/ilm_v6addr recording the multicast membership. + * To delete the membership: + * + * a) Search for ilg matching on G and I with ilg_v6group + * and ilg_ill. Delete ilg_ill. + * b) Search the corresponding ilm matching on G and I with + * ilm_v6addr and ilm_ill. Delete ilm. + * + * For IPv4 the only difference is that we look using ipifs, not ills. */ /* @@ -839,7 +817,6 @@ typedef struct ilg_s { in6_addr_t ilg_v6group; struct ipif_s *ilg_ipif; /* Logical interface we are member on */ struct ill_s *ilg_ill; /* Used by IPv6 */ - int ilg_orig_ifindex; /* Interface originally joined on */ uint_t ilg_flags; mcast_record_t ilg_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */ slist_t *ilg_filter; @@ -866,9 +843,7 @@ typedef struct ilm_s { struct ilm_s *ilm_next; /* Linked list for each ill */ uint_t ilm_state; /* state of the membership */ struct ill_s *ilm_ill; /* Back pointer to ill for IPv6 */ - int ilm_orig_ifindex; /* V6_MULTICAST_IF/ilm_ipif index */ uint_t ilm_flags; - boolean_t ilm_is_new; /* new ilm */ boolean_t ilm_notify_driver; /* Need to notify the driver */ zoneid_t ilm_zoneid; int ilm_no_ilg_cnt; /* number of joins w/ no ilg */ @@ -881,28 +856,11 @@ typedef struct ilm_s { #define ilm_addr V4_PART_OF_V6(ilm_v6addr) -/* - * ilm_walker_cleanup needs to execute when the ilm_walker_cnt goes down to - * zero. In addition it needs to block new walkers while it is unlinking ilm's - * from the list. Thus simple atomics for the ill_ilm_walker_cnt don't suffice. 
- */ -#define ILM_WALKER_HOLD(ill) { \ - mutex_enter(&(ill)->ill_lock); \ - ill->ill_ilm_walker_cnt++; \ - mutex_exit(&(ill)->ill_lock); \ -} - -/* - * ilm_walker_cleanup releases ill_lock - */ -#define ILM_WALKER_RELE(ill) { \ - mutex_enter(&(ill)->ill_lock); \ - (ill)->ill_ilm_walker_cnt--; \ - if ((ill)->ill_ilm_walker_cnt == 0 && (ill)->ill_ilm_cleanup_reqd) \ - ilm_walker_cleanup(ill); \ - else \ - mutex_exit(&(ill)->ill_lock); \ -} +typedef struct ilm_walker { + struct ill_s *ilw_ill; /* associated ill */ + struct ill_s *ilw_ipmp_ill; /* associated ipmp ill (if any) */ + struct ill_s *ilw_walk_ill; /* current ill being walked */ +} ilm_walker_t; /* * Soft reference to an IPsec SA. @@ -1047,11 +1005,8 @@ typedef struct conn_s conn_t; * ipc_acking_unbind conn_acking_unbind * ipc_pad_to_bit_31 conn_pad_to_bit_31 * - * ipc_nofailover_ill conn_nofailover_ill - * * ipc_proto conn_proto * ipc_incoming_ill conn_incoming_ill - * ipc_outgoing_pill conn_outgoing_pill * ipc_pending_ill conn_pending_ill * ipc_unbind_mp conn_unbind_mp * ipc_ilg conn_ilg @@ -1061,8 +1016,6 @@ typedef struct conn_s conn_t; * ipc_refcv conn_refcv * ipc_multicast_ipif conn_multicast_ipif * ipc_multicast_ill conn_multicast_ill - * ipc_orig_bound_ifindex conn_orig_bound_ifindex - * ipc_orig_multicast_ifindex conn_orig_multicast_ifindex * ipc_drain_next conn_drain_next * ipc_drain_prev conn_drain_prev * ipc_idl conn_idl @@ -1263,7 +1216,6 @@ typedef struct th_hash_s { /* The following are ipif_state_flags */ #define IPIF_CONDEMNED 0x1 /* The ipif is being removed */ #define IPIF_CHANGING 0x2 /* A critcal ipif field is changing */ -#define IPIF_MOVING 0x8 /* The ipif is being moved */ #define IPIF_SET_LINKLOCAL 0x10 /* transient flag during bringup */ #define IPIF_ZERO_SOURCE 0x20 /* transient flag during bringup */ @@ -1273,7 +1225,6 @@ typedef struct ipif_s { struct ill_s *ipif_ill; /* Back pointer to our ill */ int ipif_id; /* Logical unit number */ uint_t ipif_mtu; /* Starts at 
ipif_ill->ill_max_frag */ - uint_t ipif_saved_mtu; /* Save of mtu during ipif_move() */ in6_addr_t ipif_v6lcl_addr; /* Local IP address for this if. */ in6_addr_t ipif_v6src_addr; /* Source IP address for this if. */ in6_addr_t ipif_v6subnet; /* Subnet prefix for this if. */ @@ -1306,17 +1257,15 @@ typedef struct ipif_s { uint_t ipif_ob_pkt_count; /* Outbound packets to our dead IREs */ /* Exclusive bit fields, protected by ipsq_t */ unsigned int - ipif_multicast_up : 1, /* We have joined the allhosts group */ - ipif_replace_zero : 1, /* Replacement for zero */ + ipif_multicast_up : 1, /* ipif_multicast_up() successful */ ipif_was_up : 1, /* ipif was up before */ ipif_addr_ready : 1, /* DAD is done */ - ipif_was_dup : 1, /* DAD had failed */ + + ipif_joined_allhosts : 1, /* allhosts joined */ ipif_pad_to_31 : 27; - int ipif_orig_ifindex; /* ifindex before SLIFFAILOVER */ uint_t ipif_seqid; /* unique index across all ills */ - uint_t ipif_orig_ipifid; /* ipif_id before SLIFFAILOVER */ uint_t ipif_state_flags; /* See IPIF_* flag defs above */ uint_t ipif_refcnt; /* active consistent reader cnt */ @@ -1328,6 +1277,16 @@ typedef struct ipif_s { zoneid_t ipif_zoneid; /* zone ID number */ timeout_id_t ipif_recovery_id; /* Timer for DAD recovery */ boolean_t ipif_trace_disable; /* True when alloc fails */ + /* + * For an IPMP interface, ipif_bound_ill tracks the ill whose hardware + * information this ipif is associated with via ARP/NDP. We can use + * an ill pointer (rather than an index) because only ills that are + * part of a group will be pointed to, and an ill cannot disappear + * while it's in a group. 
+ */ + struct ill_s *ipif_bound_ill; + struct ipif_s *ipif_bound_next; /* bound ipif chain */ + boolean_t ipif_bound; /* B_TRUE if we successfully bound */ } ipif_t; /* @@ -1405,8 +1364,6 @@ typedef struct ipif_s { * * bit fields ill_lock ill_lock * - * ipif_orig_ifindex ipsq None - * ipif_orig_ipifid ipsq None * ipif_seqid ipsq Write once * * ipif_state_flags ill_lock ill_lock @@ -1414,6 +1371,10 @@ typedef struct ipif_s { * ipif_ire_cnt ill_lock ill_lock * ipif_ilm_cnt ill_lock ill_lock * ipif_saved_ire_cnt + * + * ipif_bound_ill ipsq + ipmp_lock ipsq OR ipmp_lock + * ipif_bound_next ipsq ipsq + * ipif_bound ipsq ipsq */ #define IP_TR_HASH(tid) ((((uintptr_t)tid) >> 6) & (IP_TR_HASH_MAX - 1)) @@ -1457,103 +1418,154 @@ typedef struct ipif_s { #define IPI2MODE(ipi) ((ipi)->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT) /* - * The IP-MT design revolves around the serialization object ipsq_t. - * It is associated with an IPMP group. If IPMP is not enabled, there is - * 1 ipsq_t per phyint. Eg. an ipsq_t would cover both hme0's IPv4 stream - * - * ipsq_lock protects - * ipsq_reentry_cnt, ipsq_writer, ipsq_xopq_mphead, ipsq_xopq_mptail, - * ipsq_mphead, ipsq_mptail, ipsq_split - * - * ipsq_pending_ipif, ipsq_current_ipif, ipsq_pending_mp, ipsq_flags, - * ipsq_waitfor - * - * The fields in the last line above below are set mostly by a writer thread - * But there is an exception in the last call to ipif_ill_refrele_tail which - * could also race with a conn close which could be cleaning up the - * fields. So we choose to protect using ipsq_lock instead of depending on - * the property of the writer. 
- * ill_g_lock protects - * ipsq_refs, ipsq_phyint_list - */ -typedef struct ipsq_s { - kmutex_t ipsq_lock; - int ipsq_reentry_cnt; - kthread_t *ipsq_writer; /* current owner (thread id) */ - int ipsq_flags; - mblk_t *ipsq_xopq_mphead; /* list of excl ops mostly ioctls */ - mblk_t *ipsq_xopq_mptail; - mblk_t *ipsq_mphead; /* msgs on ipsq linked thru b_next */ - mblk_t *ipsq_mptail; /* msgs on ipsq linked thru b_next */ - int ipsq_current_ioctl; /* current ioctl, or 0 if no ioctl */ - boolean_t ipsq_current_done; /* is the current op done? */ - ipif_t *ipsq_current_ipif; /* ipif associated with current op */ - ipif_t *ipsq_pending_ipif; /* ipif associated w. ipsq_pending_mp */ - mblk_t *ipsq_pending_mp; /* current ioctl mp while waiting for */ - /* response from another module */ - struct ipsq_s *ipsq_next; /* list of all syncq's (ipsq_g_list) */ - uint_t ipsq_refs; /* Number of phyints on this ipsq */ - struct phyint *ipsq_phyint_list; /* List of phyints on this ipsq */ - boolean_t ipsq_split; /* ipsq may need to be split */ - int ipsq_waitfor; /* Values encoded below */ - char ipsq_name[LIFNAMSIZ+1]; /* same as phyint_groupname */ - ip_stack_t *ipsq_ipst; /* Does not have a netstack_hold */ - + * The IP-MT design revolves around the serialization objects ipsq_t (IPSQ) + * and ipxop_t (exclusive operation or "xop"). Becoming "writer" on an IPSQ + * ensures that no other threads can become "writer" on any IPSQs sharing that + * IPSQ's xop until the writer thread is done. + * + * Each phyint points to one IPSQ that remains fixed over the phyint's life. + * Each IPSQ points to one xop that can change over the IPSQ's life. If a + * phyint is *not* in an IPMP group, then its IPSQ will refer to the IPSQ's + * "own" xop (ipsq_ownxop). If a phyint *is* part of an IPMP group, then its + * IPSQ will refer to the "group" xop, which is shorthand for the xop of the + * IPSQ of the IPMP meta-interface's phyint. 
Thus, all phyints that are part + * of the same IPMP group will have their IPSQ's point to the group xop, and + * thus becoming "writer" on any phyint in the group will prevent any other + * writer on any other phyint in the group. All IPSQs sharing the same xop + * are chained together through ipsq_next (in the degenerate common case, + * ipsq_next simply refers to itself). Note that the group xop is guaranteed + * to exist at least as long as there are members in the group, since the IPMP + * meta-interface can only be destroyed if the group is empty. + * + * Incoming exclusive operation requests are enqueued on the IPSQ they arrived + * on rather than the xop. This makes switching xop's (as would happen when a + * phyint leaves an IPMP group) simple, because after the phyint leaves the + * group, any operations enqueued on its IPSQ can be safely processed with + * respect to its new xop, and any operations enqueued on the IPSQs of its + * former group can be processed with respect to their existing group xop. + * Even so, switching xops is a subtle dance; see ipsq_dq() for details. + * + * An IPSQ's "own" xop is embedded within the IPSQ itself since they have + * identical lifetimes, and because doing so simplifies pointer management. + * While each phyint and IPSQ point to each other, it is not possible to free + * the IPSQ when the phyint is freed, since we may still be *inside* the IPSQ + * when the phyint is being freed. Thus, ipsq_phyint is set to NULL when the + * phyint is freed, and the IPSQ free is later done in ipsq_exit().
+ * + * ipsq_t synchronization: read write + * + * ipsq_xopq_mphead ipx_lock ipx_lock + * ipsq_xopq_mptail ipx_lock ipx_lock + * ipsq_switch_mp ipsq_lock ipsq_lock + * ipsq_phyint write once write once + * ipsq_next RW_READER ill_g_lock RW_WRITER ill_g_lock + * ipsq_xop ipsq_lock or ipsq ipsq_lock + ipsq + * ipsq_swxop ipsq ipsq + * ipsq_ownxop see ipxop_t see ipxop_t + * ipsq_ipst write once write once + * + * ipxop_t synchronization: read write + * + * ipx_writer ipx_lock ipx_lock + * ipx_xop_queued ipx_lock ipx_lock + * ipx_mphead ipx_lock ipx_lock + * ipx_mptail ipx_lock ipx_lock + * ipx_ipsq write once write once + * ipx_ipsq_queued ipx_lock ipx_lock + * ipx_waitfor ipsq or ipx_lock ipsq + ipx_lock + * ipx_reentry_cnt ipsq or ipx_lock ipsq + ipx_lock + * ipx_current_done ipsq ipsq + * ipx_current_ioctl ipsq ipsq + * ipx_current_ipif ipsq or ipx_lock ipsq + ipx_lock + * ipx_pending_ipif ipsq or ipx_lock ipsq + ipx_lock + * ipx_pending_mp ipsq or ipx_lock ipsq + ipx_lock + * ipx_forced ipsq ipsq + * ipx_depth ipsq ipsq + * ipx_stack ipsq ipsq + */ +typedef struct ipxop_s { + kmutex_t ipx_lock; /* see above */ + kthread_t *ipx_writer; /* current owner */ + mblk_t *ipx_mphead; /* messages tied to this op */ + mblk_t *ipx_mptail; + struct ipsq_s *ipx_ipsq; /* associated ipsq */ + boolean_t ipx_ipsq_queued; /* ipsq using xop has queued op */ + int ipx_waitfor; /* waiting; values encoded below */ + int ipx_reentry_cnt; + boolean_t ipx_current_done; /* is the current operation done?
*/ + int ipx_current_ioctl; /* current ioctl, or 0 if no ioctl */ + ipif_t *ipx_current_ipif; /* ipif for current op */ + ipif_t *ipx_pending_ipif; /* ipif for ipsq_pending_mp */ + mblk_t *ipx_pending_mp; /* current ioctl mp while waiting */ + boolean_t ipx_forced; /* debugging aid */ #ifdef DEBUG - int ipsq_depth; /* debugging aid */ -#define IPSQ_STACK_DEPTH 15 - pc_t ipsq_stack[IPSQ_STACK_DEPTH]; /* debugging aid */ + int ipx_depth; /* debugging aid */ +#define IPX_STACK_DEPTH 15 + pc_t ipx_stack[IPX_STACK_DEPTH]; /* debugging aid */ #endif -} ipsq_t; +} ipxop_t; -/* ipsq_flags */ -#define IPSQ_GROUP 0x1 /* This ipsq belongs to an IPMP group */ +typedef struct ipsq_s { + kmutex_t ipsq_lock; /* see above */ + mblk_t *ipsq_switch_mp; /* op to handle right after switch */ + mblk_t *ipsq_xopq_mphead; /* list of excl ops (mostly ioctls) */ + mblk_t *ipsq_xopq_mptail; + struct phyint *ipsq_phyint; /* associated phyint */ + struct ipsq_s *ipsq_next; /* next ipsq sharing ipsq_xop */ + struct ipxop_s *ipsq_xop; /* current xop synchronization info */ + struct ipxop_s *ipsq_swxop; /* switch xop to on ipsq_exit() */ + struct ipxop_s ipsq_ownxop; /* our own xop (may not be in-use) */ + ip_stack_t *ipsq_ipst; /* does not have a netstack_hold */ +} ipsq_t; /* - * ipsq_waitfor: - * - * IPIF_DOWN 1 ipif_down waiting for refcnts to drop - * ILL_DOWN 2 ill_down waiting for refcnts to drop - * IPIF_FREE 3 ipif_free waiting for refcnts to drop - * ILL_FREE 4 ill unplumb waiting for refcnts to drop - * ILL_MOVE_OK 5 failover waiting for refcnts to drop + * ipx_waitfor values: */ +enum { + IPIF_DOWN = 1, /* ipif_down() waiting for refcnts to drop */ + ILL_DOWN, /* ill_down() waiting for refcnts to drop */ + IPIF_FREE, /* ipif_free() waiting for refcnts to drop */ + ILL_FREE /* ill unplumb waiting for refcnts to drop */ +}; -enum { IPIF_DOWN = 1, ILL_DOWN, IPIF_FREE, ILL_FREE, ILL_MOVE_OK }; +/* Operation types for ipsq_try_enter() */ +#define CUR_OP 0 /* request writer within current 
operation */ +#define NEW_OP 1 /* request writer for a new operation */ +#define SWITCH_OP 2 /* request writer once IPSQ XOP switches */ -/* Flags passed to ipsq_try_enter */ -#define CUR_OP 0 /* Current ioctl continuing again */ -#define NEW_OP 1 /* New ioctl starting afresh */ +/* + * Kstats tracked on each IPMP meta-interface. Order here must match + * ipmp_kstats[] in ip/ipmp.c. + */ +enum { + IPMP_KSTAT_OBYTES, IPMP_KSTAT_OBYTES64, IPMP_KSTAT_RBYTES, + IPMP_KSTAT_RBYTES64, IPMP_KSTAT_OPACKETS, IPMP_KSTAT_OPACKETS64, + IPMP_KSTAT_OERRORS, IPMP_KSTAT_IPACKETS, IPMP_KSTAT_IPACKETS64, + IPMP_KSTAT_IERRORS, IPMP_KSTAT_MULTIRCV, IPMP_KSTAT_MULTIXMT, + IPMP_KSTAT_BRDCSTRCV, IPMP_KSTAT_BRDCSTXMT, IPMP_KSTAT_LINK_UP, + IPMP_KSTAT_MAX /* keep last */ +}; /* * phyint represents state that is common to both IPv4 and IPv6 interfaces. * There is a separate ill_t representing IPv4 and IPv6 which has a * backpointer to the phyint structure for accessing common state. - * - * NOTE : It just stores the group name as there is only one name for - * IPv4 and IPv6 i.e it is a underlying link property. Actually - * IPv4 and IPv6 ill are grouped together when their phyints have - * the same name. 
*/ typedef struct phyint { struct ill_s *phyint_illv4; struct ill_s *phyint_illv6; - uint_t phyint_ifindex; /* SIOCLSLIFINDEX */ - char *phyint_groupname; /* SIOCSLIFGROUPNAME */ - uint_t phyint_groupname_len; + uint_t phyint_ifindex; /* SIOCSLIFINDEX */ uint64_t phyint_flags; avl_node_t phyint_avl_by_index; /* avl tree by index */ avl_node_t phyint_avl_by_name; /* avl tree by name */ kmutex_t phyint_lock; struct ipsq_s *phyint_ipsq; /* back pointer to ipsq */ - struct phyint *phyint_ipsq_next; /* phyint list on this ipsq */ - /* Once Clearview IPMP is added the follow two fields can be removed */ - uint_t phyint_group_ifindex; /* index assigned to group */ - uint_t phyint_hook_ifindex; /* index used with neti/hook */ + struct ipmp_grp_s *phyint_grp; /* associated IPMP group */ + char phyint_name[LIFNAMSIZ]; /* physical interface name */ + uint64_t phyint_kstats0[IPMP_KSTAT_MAX]; /* baseline kstats */ } phyint_t; #define CACHE_ALIGN_SIZE 64 - #define CACHE_ALIGN(align_struct) P2ROUNDUP(sizeof (struct align_struct),\ CACHE_ALIGN_SIZE) struct _phyint_list_s_ { @@ -1568,34 +1580,6 @@ typedef union phyint_list_u { #define phyint_list_avl_by_index phyint_list_s.phyint_list_avl_by_index #define phyint_list_avl_by_name phyint_list_s.phyint_list_avl_by_name -/* - * ILL groups. We group ills, - * - * - if the ills have the same group name. (New way) - * - * ill_group locking notes: - * - * illgrp_lock protects ill_grp_ill_schednext. - * - * ill_g_lock protects ill_grp_next, illgrp_ill, illgrp_ill_count. - * Holding ill_g_lock freezes the memberships of ills in IPMP groups. - * It also freezes the global list of ills and all ipifs in all ills. - * - * To remove an ipif from the linked list of ipifs of that ill ipif_free_tail - * holds both ill_g_lock, and ill_lock. Similarly to remove an ill from the - * global list of ills, ill_glist_delete() holds ill_g_lock as writer. - * This simplifies things for ipif_select_source, illgrp_scheduler etc. 
- * that need to walk the members of an illgrp. They just hold ill_g_lock - * as reader to do the walk. - * - */ -typedef struct ill_group { - kmutex_t illgrp_lock; - struct ill_group *illgrp_next; /* Next ill_group */ - struct ill_s *illgrp_ill_schednext; /* Next ill to be scheduled */ - struct ill_s *illgrp_ill; /* First ill in the group */ - int illgrp_ill_count; -} ill_group_t; /* * Fragmentation hash bucket @@ -1792,6 +1776,108 @@ typedef struct ill_lso_capab_s ill_lso_capab_t; #define IS_LOOPBACK(ill) \ ((ill)->ill_phyint->phyint_flags & PHYI_LOOPBACK) +/* Is this an IPMP meta-interface ILL? */ +#define IS_IPMP(ill) \ + ((ill)->ill_phyint->phyint_flags & PHYI_IPMP) + +/* Is this ILL under an IPMP meta-interface? (aka "in a group?") */ +#define IS_UNDER_IPMP(ill) \ + ((ill)->ill_grp != NULL && !IS_IPMP(ill)) + +/* Is ill1 in the same illgrp as ill2? */ +#define IS_IN_SAME_ILLGRP(ill1, ill2) \ + ((ill1)->ill_grp != NULL && ((ill1)->ill_grp == (ill2)->ill_grp)) + +/* Is ill1 on the same LAN as ill2? */ +#define IS_ON_SAME_LAN(ill1, ill2) \ + ((ill1) == (ill2) || IS_IN_SAME_ILLGRP(ill1, ill2)) + +#define ILL_OTHER(ill) \ + ((ill)->ill_isv6 ? (ill)->ill_phyint->phyint_illv4 : \ + (ill)->ill_phyint->phyint_illv6) + +/* + * IPMP group ILL state structure -- up to two per IPMP group (V4 and V6). + * Created when the V4 and/or V6 IPMP meta-interface is I_PLINK'd. It is + * guaranteed to persist while there are interfaces of that type in the group. + * In general, most fields are accessed outside of the IPSQ (e.g., in the + * datapath), and thus use locks in addition to the IPSQ for protection. 
+ * + * synchronization: read write + * + * ig_if ipsq or ill_g_lock ipsq and ill_g_lock + * ig_actif ipsq or ipmp_lock ipsq and ipmp_lock + * ig_nactif ipsq or ipmp_lock ipsq and ipmp_lock + * ig_next_ill ipsq or ipmp_lock ipsq and ipmp_lock + * ig_ipmp_ill write once write once + * ig_cast_ill ipsq or ipmp_lock ipsq and ipmp_lock + * ig_arpent ipsq ipsq + * ig_mtu ipsq ipsq + */ +typedef struct ipmp_illgrp_s { + list_t ig_if; /* list of all interfaces */ + list_t ig_actif; /* list of active interfaces */ + uint_t ig_nactif; /* number of active interfaces */ + struct ill_s *ig_next_ill; /* next active interface to use */ + struct ill_s *ig_ipmp_ill; /* backpointer to IPMP meta-interface */ + struct ill_s *ig_cast_ill; /* nominated ill for multi/broadcast */ + list_t ig_arpent; /* list of ARP entries */ + uint_t ig_mtu; /* ig_ipmp_ill->ill_max_mtu */ +} ipmp_illgrp_t; + +/* + * IPMP group state structure -- one per IPMP group. Created when the + * IPMP meta-interface is plumbed; it is guaranteed to persist while there + * are interfaces in it. 
+ * + * ipmp_grp_t synchronization: read write + * + * gr_name ipmp_lock ipmp_lock + * gr_ifname write once write once + * gr_mactype ipmp_lock ipmp_lock + * gr_phyint write once write once + * gr_nif ipmp_lock ipmp_lock + * gr_nactif ipsq ipsq + * gr_v4 ipmp_lock ipmp_lock + * gr_v6 ipmp_lock ipmp_lock + * gr_nv4 ipmp_lock ipmp_lock + * gr_nv6 ipmp_lock ipmp_lock + * gr_pendv4 ipmp_lock ipmp_lock + * gr_pendv6 ipmp_lock ipmp_lock + * gr_linkdownmp ipsq ipsq + * gr_ksp ipmp_lock ipmp_lock + * gr_kstats0 atomic atomic + */ +typedef struct ipmp_grp_s { + char gr_name[LIFGRNAMSIZ]; /* group name */ + char gr_ifname[LIFNAMSIZ]; /* interface name */ + t_uscalar_t gr_mactype; /* DLPI mactype of group */ + phyint_t *gr_phyint; /* IPMP group phyint */ + uint_t gr_nif; /* number of interfaces in group */ + uint_t gr_nactif; /* number of active interfaces */ + ipmp_illgrp_t *gr_v4; /* V4 group information */ + ipmp_illgrp_t *gr_v6; /* V6 group information */ + uint_t gr_nv4; /* number of ills in V4 group */ + uint_t gr_nv6; /* number of ills in V6 group */ + uint_t gr_pendv4; /* number of pending ills in V4 group */ + uint_t gr_pendv6; /* number of pending ills in V6 group */ + mblk_t *gr_linkdownmp; /* message used to bring link down */ + kstat_t *gr_ksp; /* group kstat pointer */ + uint64_t gr_kstats0[IPMP_KSTAT_MAX]; /* baseline group kstats */ +} ipmp_grp_t; + +/* + * IPMP ARP entry -- one per SIOCS*ARP entry tied to the group. Used to keep + * ARP up-to-date as the active set of interfaces in the group changes. + */ +typedef struct ipmp_arpent_s { + mblk_t *ia_area_mp; /* AR_ENTRY_ADD pointer */ + ipaddr_t ia_ipaddr; /* IP address for this entry */ + boolean_t ia_proxyarp; /* proxy ARP entry? */ + boolean_t ia_notified; /* ARP notified about this entry? */ + list_node_t ia_node; /* next ARP entry in list */ +} ipmp_arpent_t; + /* * IP Lower level Structure. * Instance data structure in ip_open when there is a device below us. 
@@ -1851,6 +1937,7 @@ typedef struct ill_s { mblk_t *ill_unbind_mp; /* unbind mp from ill_dl_up() */ mblk_t *ill_promiscoff_mp; /* for ill_leave_allmulti() */ mblk_t *ill_dlpi_deferred; /* b_next chain of control messages */ + mblk_t *ill_ardeact_mp; /* deact mp from ipmp_ill_activate() */ mblk_t *ill_phys_addr_mp; /* mblk which holds ill_phys_addr */ #define ill_last_mp_to_free ill_phys_addr_mp @@ -1867,21 +1954,19 @@ typedef struct ill_s { ill_dlpi_style_set : 1, ill_ifname_pending : 1, - ill_move_in_progress : 1, /* FAILOVER/FAILBACK in progress */ ill_join_allmulti : 1, ill_logical_down : 1, - ill_is_6to4tun : 1, /* Interface is a 6to4 tunnel */ + ill_promisc_on_phys : 1, /* phys interface in promisc mode */ ill_dl_up : 1, ill_up_ipifs : 1, - ill_note_link : 1, /* supports link-up notification */ + ill_capab_reneg : 1, /* capability renegotiation to be done */ ill_dld_capab_inprog : 1, /* direct dld capab call in prog */ ill_need_recover_multicast : 1, - - ill_pad_to_bit_31 : 16; + ill_pad_to_bit_31 : 17; /* Following bit fields protected by ill_lock */ uint_t @@ -1891,10 +1976,8 @@ typedef struct ill_s { ill_arp_closing : 1, ill_arp_bringup_pending : 1, - ill_mtu_userspecified : 1, /* SIOCSLIFLNKINFO has set the mtu */ ill_arp_extend : 1, /* ARP has DAD extensions */ - - ill_pad_bit_31 : 25; + ill_pad_bit_31 : 26; /* * Used in SIOCSIFMUXID and SIOCGIFMUXID for 'ifconfig unplumb'. @@ -1931,6 +2014,7 @@ typedef struct ill_s { */ uint8_t ill_max_hops; /* Maximum hops for any logical interface */ uint_t ill_max_mtu; /* Maximum MTU for any logical interface */ + uint_t ill_user_mtu; /* User-specified MTU via SIOCSLIFLNKINFO */ uint32_t ill_reachable_time; /* Value for ND algorithm in msec */ uint32_t ill_reachable_retrans_time; /* Value for ND algorithm msec */ uint_t ill_max_buf; /* Max # of req to buffer for ND */ @@ -1953,13 +2037,9 @@ typedef struct ill_s { * of the ipif. 
*/ mblk_t *ill_arp_on_mp; - /* Peer ill of an IPMP move operation */ - struct ill_s *ill_move_peer; phyint_t *ill_phyint; uint64_t ill_flags; - ill_group_t *ill_group; - struct ill_s *ill_group_next; kmutex_t ill_lock; /* Please see table below */ /* @@ -2005,6 +2085,18 @@ typedef struct ill_s { void *ill_flownotify_mh; /* Tx flow ctl, mac cb handle */ uint_t ill_ilm_cnt; /* ilms referencing this ill */ uint_t ill_ipallmulti_cnt; /* ip_join_allmulti() calls */ + /* + * IPMP fields. + */ + ipmp_illgrp_t *ill_grp; /* IPMP group information */ + list_node_t ill_actnode; /* next active ill in group */ + list_node_t ill_grpnode; /* next ill in group */ + ipif_t *ill_src_ipif; /* source address selection rotor */ + ipif_t *ill_move_ipif; /* ipif awaiting move to new ill */ + boolean_t ill_nom_cast; /* nominated for mcast/bcast */ + uint_t ill_bound_cnt; /* # of data addresses bound to ill */ + ipif_t *ill_bound_ipif; /* ipif chain bound to ill */ + timeout_id_t ill_refresh_tid; /* ill refresh retry timeout id */ } ill_t; /* @@ -2088,6 +2180,7 @@ typedef struct ill_s { * * ill_max_mtu * + * ill_user_mtu ipsq + ill_lock ill_lock * ill_reachable_time ipsq + ill_lock ill_lock * ill_reachable_retrans_time ipsq + ill_lock ill_lock * ill_max_buf ipsq + ill_lock ill_lock @@ -2102,12 +2195,9 @@ typedef struct ill_s { * ill_arp_down_mp ipsq ipsq * ill_arp_del_mapping_mp ipsq ipsq * ill_arp_on_mp ipsq ipsq - * ill_move_peer ipsq ipsq * * ill_phyint ipsq, ill_g_lock, ill_lock Any of them * ill_flags ill_lock ill_lock - * ill_group ipsq, ill_g_lock, ill_lock Any of them - * ill_group_next ipsq, ill_g_lock, ill_lock Any of them * ill_nd_lla_mp ipsq + down ill only when ill is up * ill_nd_lla ipsq + down ill only when ill is up * ill_nd_lla_len ipsq + down ill only when ill is up @@ -2122,11 +2212,26 @@ typedef struct ill_s { * ill_ilm_walker_cnt ill_lock ill_lock * ill_nce_cnt ill_lock ill_lock * ill_ilm_cnt ill_lock ill_lock + * ill_src_ipif ill_g_lock ill_g_lock * ill_trace ill_lock 
ill_lock * ill_usesrc_grp_next ill_g_usesrc_lock ill_g_usesrc_lock * ill_dhcpinit atomics atomics * ill_flownotify_mh write once write once * ill_capab_pending_cnt ipsq ipsq + * + * ill_bound_cnt ipsq ipsq + * ill_bound_ipif ipsq ipsq + * ill_actnode ipsq + ipmp_lock ipsq OR ipmp_lock + * ill_grpnode ipsq + ill_g_lock ipsq OR ill_g_lock + * ill_src_ipif ill_g_lock ill_g_lock + * ill_move_ipif ipsq ipsq + * ill_nom_cast ipsq ipsq OR advisory + * ill_refresh_tid ill_lock ill_lock + * ill_grp (for IPMP ill) write once write once + * ill_grp (for underlying ill) ipsq + ill_g_lock ipsq OR ill_g_lock + * + * NOTE: It's OK to make heuristic decisions on an underlying interface + * by using IS_UNDER_IPMP() or comparing ill_grp's raw pointer value. */ /* @@ -2167,7 +2272,7 @@ enum { IF_CMD = 1, LIF_CMD, TUN_CMD, ARP_CMD, XARP_CMD, MSFILT_CMD, MISC_CMD }; #define IPI_MODOK 0x2 /* Permitted on mod instance of IP */ #define IPI_WR 0x4 /* Need to grab writer access */ #define IPI_GET_CMD 0x8 /* branch to mi_copyout on success */ -#define IPI_REPL 0x10 /* valid for replacement ipif created in MOVE */ +/* unused 0x10 */ #define IPI_NULL_BCONT 0x20 /* ioctl has not data and hence no b_cont */ #define IPI_PASS_DOWN 0x40 /* pass this ioctl down when a module only */ @@ -2176,17 +2281,6 @@ extern ip_ioctl_cmd_t ip_misc_ioctl_table[]; extern int ip_ndx_ioctl_count; extern int ip_misc_ioctl_count; -#define ILL_CLEAR_MOVE(ill) { \ - ill_t *peer_ill; \ - \ - peer_ill = (ill)->ill_move_peer; \ - ASSERT(peer_ill != NULL); \ - (ill)->ill_move_in_progress = B_FALSE; \ - peer_ill->ill_move_in_progress = B_FALSE; \ - (ill)->ill_move_peer = NULL; \ - peer_ill->ill_move_peer = NULL; \ -} - /* Passed down by ARP to IP during I_PLINK/I_PUNLINK */ typedef struct ipmx_s { char ipmx_name[LIFNAMSIZ]; /* if name */ @@ -2799,19 +2893,11 @@ typedef struct ip_pktinfo { (!((ipif)->ipif_state_flags & (IPIF_CONDEMNED)) || \ IAM_WRITER_IPIF(ipif)) -/* - * These macros are used by critical set ioctls and 
failover ioctls to - * mark the ipif appropriately before starting the operation and to clear the - * marks after completing the operation. - */ -#define IPIF_UNMARK_MOVING(ipif) \ - (ipif)->ipif_state_flags &= ~IPIF_MOVING & ~IPIF_CHANGING; - #define ILL_UNMARK_CHANGING(ill) \ (ill)->ill_state_flags &= ~ILL_CHANGING; /* Macros used to assert that this thread is a writer */ -#define IAM_WRITER_IPSQ(ipsq) ((ipsq)->ipsq_writer == curthread) +#define IAM_WRITER_IPSQ(ipsq) ((ipsq)->ipsq_xop->ipx_writer == curthread) #define IAM_WRITER_ILL(ill) IAM_WRITER_IPSQ((ill)->ill_phyint->phyint_ipsq) #define IAM_WRITER_IPIF(ipif) IAM_WRITER_ILL((ipif)->ipif_ill) @@ -2837,9 +2923,9 @@ typedef struct ip_pktinfo { #define RELEASE_ILL_LOCKS(ill_1, ill_2) \ { \ if (ill_1 != NULL) \ - mutex_exit(&(ill_1)->ill_lock); \ + mutex_exit(&(ill_1)->ill_lock); \ if (ill_2 != NULL && ill_2 != ill_1) \ - mutex_exit(&(ill_2)->ill_lock); \ + mutex_exit(&(ill_2)->ill_lock); \ } /* Get the other protocol instance ill */ @@ -2847,14 +2933,9 @@ typedef struct ip_pktinfo { ((ill)->ill_isv6 ? 
(ill)->ill_phyint->phyint_illv4 : \ (ill)->ill_phyint->phyint_illv6) -#define MATCH_V4_ONLY 0x1 -#define MATCH_V6_ONLY 0x2 -#define MATCH_ILL_ONLY 0x4 - /* ioctl command info: Ioctl properties extracted and stored in here */ typedef struct cmd_info_s { - char ci_groupname[LIFNAMSIZ + 1]; /* SIOCSLIFGROUPNAME */ ipif_t *ci_ipif; /* ipif associated with [l]ifreq ioctl's */ sin_t *ci_sin; /* the sin struct passed down */ sin6_t *ci_sin6; /* the sin6_t struct passed down */ @@ -2990,10 +3071,8 @@ extern struct module_info ip_mod_info; ((ipst)->ips_ip6_loopback_out_event.he_interested) /* - * Hooks marcos used inside of ip + * Hooks macros used inside of ip */ -#define IPHA_VHL ipha_version_and_hdr_length - #define FW_HOOKS(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst) \ \ if ((_hook).he_interested) { \ @@ -3002,21 +3081,8 @@ extern struct module_info ip_mod_info; _NOTE(CONSTCOND) \ ASSERT((_ilp != NULL) || (_olp != NULL)); \ \ - _NOTE(CONSTCOND) \ - if ((_ilp != NULL) && \ - (((ill_t *)(_ilp))->ill_phyint != NULL)) \ - info.hpe_ifp = (phy_if_t)((ill_t *) \ - (_ilp))->ill_phyint->phyint_hook_ifindex; \ - else \ - info.hpe_ifp = 0; \ - \ - _NOTE(CONSTCOND) \ - if ((_olp != NULL) && \ - (((ill_t *)(_olp))->ill_phyint != NULL)) \ - info.hpe_ofp = (phy_if_t)((ill_t *) \ - (_olp))->ill_phyint->phyint_hook_ifindex; \ - else \ - info.hpe_ofp = 0; \ + FW_SET_ILL_INDEX(info.hpe_ifp, (ill_t *)_ilp); \ + FW_SET_ILL_INDEX(info.hpe_ofp, (ill_t *)_olp); \ info.hpe_protocol = ipst->ips_ipv4_net_data; \ info.hpe_hdr = _iph; \ info.hpe_mp = &(_fm); \ @@ -3026,10 +3092,8 @@ extern struct module_info ip_mod_info; _event, (hook_data_t)&info) != 0) { \ ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\ (_hook).he_name, (void *)_fm, (void *)_m)); \ - if (_fm != NULL) { \ - freemsg(_fm); \ - _fm = NULL; \ - } \ + freemsg(_fm); \ + _fm = NULL; \ _iph = NULL; \ _m = NULL; \ } else { \ @@ -3046,21 +3110,8 @@ extern struct module_info ip_mod_info; _NOTE(CONSTCOND) \ ASSERT((_ilp != NULL) 
|| (_olp != NULL)); \ \ - _NOTE(CONSTCOND) \ - if ((_ilp != NULL) && \ - (((ill_t *)(_ilp))->ill_phyint != NULL)) \ - info.hpe_ifp = (phy_if_t)((ill_t *) \ - (_ilp))->ill_phyint->phyint_hook_ifindex; \ - else \ - info.hpe_ifp = 0; \ - \ - _NOTE(CONSTCOND) \ - if ((_olp != NULL) && \ - (((ill_t *)(_olp))->ill_phyint != NULL)) \ - info.hpe_ofp = (phy_if_t)((ill_t *) \ - (_olp))->ill_phyint->phyint_hook_ifindex; \ - else \ - info.hpe_ofp = 0; \ + FW_SET_ILL_INDEX(info.hpe_ifp, (ill_t *)_ilp); \ + FW_SET_ILL_INDEX(info.hpe_ofp, (ill_t *)_olp); \ info.hpe_protocol = ipst->ips_ipv6_net_data; \ info.hpe_hdr = _iph; \ info.hpe_mp = &(_fm); \ @@ -3070,10 +3121,8 @@ extern struct module_info ip_mod_info; _event, (hook_data_t)&info) != 0) { \ ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\ (_hook).he_name, (void *)_fm, (void *)_m)); \ - if (_fm != NULL) { \ - freemsg(_fm); \ - _fm = NULL; \ - } \ + freemsg(_fm); \ + _fm = NULL; \ _iph = NULL; \ _m = NULL; \ } else { \ @@ -3082,6 +3131,17 @@ extern struct module_info ip_mod_info; } \ } +#define FW_SET_ILL_INDEX(fp, ill) \ + _NOTE(CONSTCOND) \ + if ((ill) == NULL || (ill)->ill_phyint == NULL) { \ + (fp) = 0; \ + _NOTE(CONSTCOND) \ + } else if (IS_UNDER_IPMP(ill)) { \ + (fp) = ipmp_ill_get_ipmp_ifindex(ill); \ + } else { \ + (fp) = (ill)->ill_phyint->phyint_ifindex; \ + } + /* * Network byte order macros */ @@ -3146,16 +3206,15 @@ struct ipsec_out_s; struct mac_header_info_s; -extern boolean_t ip_assign_ifindex(uint_t *, ip_stack_t *); extern void ill_frag_timer(void *); extern ill_t *ill_first(int, int, ill_walk_context_t *, ip_stack_t *); extern ill_t *ill_next(ill_walk_context_t *, ill_t *); extern void ill_frag_timer_start(ill_t *); extern void ill_nic_event_dispatch(ill_t *, lif_if_t, nic_event_t, nic_event_data_t, size_t); -extern void ill_nic_event_plumb(ill_t *, boolean_t); extern mblk_t *ip_carve_mp(mblk_t **, ssize_t); extern mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t); +extern mblk_t *ip_dlnotify_alloc(uint_t, 
uint_t); extern char *ip_dot_addr(ipaddr_t, char *); extern const char *mac_colon_addr(const uint8_t *, size_t, char *, size_t); extern void ip_lwput(queue_t *, mblk_t *); @@ -3239,8 +3298,49 @@ extern int ip_hdr_complete(ipha_t *, zoneid_t, ip_stack_t *); extern struct qinit iprinitv6; extern struct qinit ipwinitv6; -extern void conn_drain_insert(conn_t *connp); -extern int conn_ipsec_length(conn_t *connp); +extern void ipmp_init(ip_stack_t *); +extern void ipmp_destroy(ip_stack_t *); +extern ipmp_grp_t *ipmp_grp_create(const char *, phyint_t *); +extern void ipmp_grp_destroy(ipmp_grp_t *); +extern void ipmp_grp_info(const ipmp_grp_t *, lifgroupinfo_t *); +extern int ipmp_grp_rename(ipmp_grp_t *, const char *); +extern ipmp_grp_t *ipmp_grp_lookup(const char *, ip_stack_t *); +extern int ipmp_grp_vet_phyint(ipmp_grp_t *, phyint_t *); +extern ipmp_illgrp_t *ipmp_illgrp_create(ill_t *); +extern void ipmp_illgrp_destroy(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_add_ipif(ipmp_illgrp_t *, ipif_t *); +extern void ipmp_illgrp_del_ipif(ipmp_illgrp_t *, ipif_t *); +extern ill_t *ipmp_illgrp_next_ill(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_cast_ill(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *); +extern void ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *); +extern ipmp_arpent_t *ipmp_illgrp_create_arpent(ipmp_illgrp_t *, mblk_t *, + boolean_t); +extern void ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *, ipmp_arpent_t *); +extern ipmp_arpent_t *ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *, ipaddr_t *); +extern void ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *); +extern void ipmp_illgrp_mark_arpent(ipmp_illgrp_t *, ipmp_arpent_t *); +extern ill_t *ipmp_illgrp_find_ill(ipmp_illgrp_t *, uchar_t *, uint_t); +extern void ipmp_illgrp_link_grp(ipmp_illgrp_t *, ipmp_grp_t *); +extern int ipmp_illgrp_unlink_grp(ipmp_illgrp_t *); +extern uint_t 
ipmp_ill_get_ipmp_ifindex(const ill_t *); +extern void ipmp_ill_join_illgrp(ill_t *, ipmp_illgrp_t *); +extern void ipmp_ill_leave_illgrp(ill_t *); +extern ill_t *ipmp_ill_hold_ipmp_ill(ill_t *); +extern boolean_t ipmp_ill_is_active(ill_t *); +extern void ipmp_ill_refresh_active(ill_t *); +extern void ipmp_phyint_join_grp(phyint_t *, ipmp_grp_t *); +extern void ipmp_phyint_leave_grp(phyint_t *); +extern void ipmp_phyint_refresh_active(phyint_t *); +extern ill_t *ipmp_ipif_bound_ill(const ipif_t *); +extern ill_t *ipmp_ipif_hold_bound_ill(const ipif_t *); +extern boolean_t ipmp_ipif_is_dataaddr(const ipif_t *); +extern boolean_t ipmp_ipif_is_stubaddr(const ipif_t *); + +extern void conn_drain_insert(conn_t *connp); +extern int conn_ipsec_length(conn_t *connp); extern void ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *, ire_t *); extern ipaddr_t ip_get_dst(ipha_t *); @@ -3274,9 +3374,6 @@ extern int ip_srcid_report(queue_t *, mblk_t *, caddr_t, cred_t *); extern uint8_t ipoptp_next(ipoptp_t *); extern uint8_t ipoptp_first(ipoptp_t *, ipha_t *); extern int ip_opt_get_user(const ipha_t *, uchar_t *); -extern ill_t *ip_grab_attach_ill(ill_t *, mblk_t *, int, boolean_t, - ip_stack_t *); -extern ire_t *conn_set_outgoing_ill(conn_t *, ire_t *, ill_t **); extern int ipsec_req_from_conn(conn_t *, ipsec_req_t *, int); extern int ip_snmp_get(queue_t *q, mblk_t *mctl, int level); extern int ip_snmp_set(queue_t *q, int, int, uchar_t *, int); @@ -3295,7 +3392,6 @@ extern void ip_savebuf(void **, uint_t *, boolean_t, const void *, uint_t); extern boolean_t ipsq_pending_mp_cleanup(ill_t *, conn_t *); extern void conn_ioctl_cleanup(conn_t *); extern ill_t *conn_get_held_ill(conn_t *, ill_t **, int *); -extern ill_t *ip_newroute_get_dst_ill(ill_t *); struct multidata_s; struct pdesc_s; @@ -3314,9 +3410,6 @@ extern boolean_t ip_md_zcopy_attr(struct multidata_s *, struct pdesc_s *, uint_t); extern void ip_unbind(conn_t *connp); -extern phyint_t *phyint_lookup_group(char *, 
boolean_t, ip_stack_t *); -extern phyint_t *phyint_lookup_group_ifindex(uint_t, ip_stack_t *); - extern void tnet_init(void); extern void tnet_fini(void); @@ -3434,6 +3527,8 @@ typedef struct ipobs_cb { * ihd_ifindex Interface index that the packet was received/sent over. * For local packets, this is the index of the interface * associated with the local destination address. + * ihd_grifindex IPMP group interface index (zero unless ihd_ifindex + * is an IPMP underlying interface). * ihd_stack Netstack the packet is from. */ typedef struct ipobs_hook_data { @@ -3443,6 +3538,7 @@ typedef struct ipobs_hook_data { ipobs_hook_type_t ihd_htype; uint16_t ihd_ipver; uint64_t ihd_ifindex; + uint64_t ihd_grifindex; netstack_t *ihd_stack; } ipobs_hook_data_t; diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 3f967ea183..d484831a3c 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -1892,7 +1892,6 @@ icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) * case MRT_VERSION: * case MRT_ASSERT: * case IP_SEC_OPT: - * case IP_DONTFAILOVER_IF: * case IP_NEXTHOP: */ default: @@ -2481,7 +2480,6 @@ icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, case MRT_VERSION: case MRT_ASSERT: case IP_SEC_OPT: - case IP_DONTFAILOVER_IF: case IP_NEXTHOP: /* * "soft" error (negative) @@ -3014,9 +3012,7 @@ icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, case IPV6_PATHMTU: return (EINVAL); - case IPV6_BOUND_PIF: case IPV6_SEC_OPT: - case IPV6_DONTFAILOVER_IF: case IPV6_SRC_PREFERENCES: case IPV6_V6ONLY: /* Handled at IP level */ diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c index 4f15801dfb..24ba9d689c 100644 --- a/usr/src/uts/common/inet/ip/icmp_opt_data.c +++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -138,9 +138,6 @@ opdes_t icmp_opt_arr[] = { { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 /* no ifindex */ }, -{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (struct in_addr), 0 /* not initialized */ }, - { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, sizeof (int), 0 }, @@ -222,12 +219,6 @@ opdes_t icmp_opt_arr[] = { { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 /* no ifindex */ }, -{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - -{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, sizeof (int), 0 }, diff --git a/usr/src/uts/common/inet/ip/igmp.c b/usr/src/uts/common/inet/ip/igmp.c index 091509c71e..681f198aa7 100644 --- a/usr/src/uts/common/inet/ip/igmp.c +++ b/usr/src/uts/common/inet/ip/igmp.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -46,7 +46,7 @@ #include <sys/cmn_err.h> #include <sys/atomic.h> #include <sys/zone.h> - +#include <sys/callb.h> #include <sys/param.h> #include <sys/socket.h> #include <inet/ipclassifier.h> @@ -83,7 +83,7 @@ static mrec_t *mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, static void mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype, slist_t *flist); static mrec_t *mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist); - +static void mcast_signal_restart_thread(ip_stack_t *ipst); /* * Macros used to do timer len conversions. 
Timer values are always @@ -122,7 +122,7 @@ static mrec_t *mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist); * The first multicast join will trigger the igmp timers / mld timers * The unit for next is milliseconds. */ -void +static void igmp_start_timers(unsigned next, ip_stack_t *ipst) { int time_left; @@ -207,7 +207,7 @@ igmp_start_timers(unsigned next, ip_stack_t *ipst) * mld_start_timers: * The unit for next is milliseconds. */ -void +static void mld_start_timers(unsigned next, ip_stack_t *ipst) { int time_left; @@ -306,7 +306,8 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) uint32_t group; uint_t next; ipif_t *ipif; - ip_stack_t *ipst; + ip_stack_t *ipst; + ilm_walker_t ilw; ASSERT(ill != NULL); ASSERT(!ill->ill_isv6); @@ -401,8 +402,7 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) "igmp_input: we are only " "member src 0x%x ipif_local 0x%x", (int)ntohl(src), - (int) - ntohl(ipif->ipif_lcl_addr)); + (int)ntohl(ipif->ipif_lcl_addr)); } mutex_exit(&ill->ill_lock); return (mp); @@ -440,23 +440,20 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) } /* - * If we belong to the group being reported, and - * we are a 'Delaying member' in the RFC terminology, - * stop our timer for that group and 'clear flag' i.e. - * mark as IGMP_OTHERMEMBER. Do this for all logical - * interfaces on the given physical interface. + * If our ill has ILMs that belong to the group being + * reported, and we are a 'Delaying Member' in the RFC + * terminology, stop our timer for that group and 'clear + * flag' i.e. mark as IGMP_OTHERMEMBER. 
*/ - mutex_enter(&ill->ill_lock); - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - ilm = ilm_lookup_ipif(ipif, group); - if (ilm != NULL) { + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (ilm->ilm_addr == group) { ++ipst->ips_igmpstat.igps_rcv_ourreports; ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; } - } /* for */ - mutex_exit(&ill->ill_lock); + } + ilm_walker_finish(&ilw); break; case IGMP_V3_MEMBERSHIP_REPORT: @@ -485,6 +482,7 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) int timer; uint_t next, current; ip_stack_t *ipst; + ilm_walker_t ilw; ipst = ill->ill_ipst; ++ipst->ips_igmpstat.igps_rcv_queries; @@ -583,11 +581,12 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) * the maximum timeout. */ next = (unsigned)INFINITY; - mutex_enter(&ill->ill_lock); + ilm = ilm_walker_start(&ilw, ill); + mutex_enter(&ill->ill_lock); current = CURRENT_MSTIME; - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { /* * A multicast router joins INADDR_ANY address * to enable promiscuous reception of all @@ -610,6 +609,7 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) } } mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); return (next); } @@ -623,6 +623,7 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) ipaddr_t *src_array; uint8_t qrv; ip_stack_t *ipst; + ilm_walker_t ilw; ipst = ill->ill_ipst; /* make sure numsrc matches packet size */ @@ -693,8 +694,9 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) } else { /* group or group/source specific query */ + ilm = ilm_walker_start(&ilw, ill); mutex_enter(&ill->ill_lock); - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) || (ilm->ilm_addr == htonl(INADDR_ANY)) || (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) || 
@@ -749,6 +751,7 @@ group_query: ilm->ilm_timer += current; } mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); } return (next); @@ -819,13 +822,22 @@ igmp_joingroup(ilm_t *ilm) mutex_exit(&ill->ill_lock); /* - * To avoid deadlock, we defer igmp_start_timers() to - * ipsq_exit(). See the comment in ipsq_exit() for details. + * We need to restart the IGMP timers, but we can't do it here + * since we're inside the IPSQ and thus igmp_start_timers() -> + * untimeout() (inside the IPSQ, waiting for a running timeout + * to finish) could deadlock with igmp_timeout_handler() -> + * ipsq_enter() (running the timeout, waiting to get inside + * the IPSQ). We also can't just delay it until after we + * ipsq_exit() since we could be inside more than one IPSQ and + * thus still have the other IPSQs pinned after we exit -- and + * igmp_start_timers() may be trying to enter one of those. + * Instead, signal a dedicated thread that will do it for us. */ mutex_enter(&ipst->ips_igmp_timer_lock); ipst->ips_igmp_deferred_next = MIN(timer, ipst->ips_igmp_deferred_next); mutex_exit(&ipst->ips_igmp_timer_lock); + mcast_signal_restart_thread(ipst); } if (ip_debug > 1) { @@ -897,13 +909,14 @@ mld_joingroup(ilm_t *ilm) mutex_exit(&ill->ill_lock); /* - * To avoid deadlock, we defer mld_start_timers() to - * ipsq_exit(). See the comment in ipsq_exit() for details. + * Signal another thread to restart the timers. See the + * comment in igmp_joingroup() for details. */ mutex_enter(&ipst->ips_mld_timer_lock); ipst->ips_mld_deferred_next = MIN(timer, ipst->ips_mld_deferred_next); mutex_exit(&ipst->ips_mld_timer_lock); + mcast_signal_restart_thread(ipst); } if (ip_debug > 1) { @@ -1073,8 +1086,8 @@ send_to_in: /* * Need to set up retransmission state; merge the new info with the * current state (which may be null). If the timer is not currently - * running, start it (need to do a delayed start of the timer as - * we're currently in the sq). 
+ * running, signal a thread to restart it -- see the comment in + * igmp_joingroup() for details. */ rp = mcast_merge_rtx(ilm, rp, flist); if (ilm->ilm_rtx.rtx_timer == INFINITY) { @@ -1085,6 +1098,7 @@ send_to_in: ilm->ilm_rtx.rtx_timer); ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; mutex_exit(&ipst->ips_igmp_timer_lock); + mcast_signal_restart_thread(ipst); } mutex_exit(&ill->ill_lock); @@ -1161,8 +1175,8 @@ send_to_in: /* * Need to set up retransmission state; merge the new info with the * current state (which may be null). If the timer is not currently - * running, start it (need to do a deferred start of the timer as - * we're currently in the sq). + * running, signal a thread to restart it -- see the comment in + * igmp_joingroup() for details. */ rp = mcast_merge_rtx(ilm, rp, flist); ASSERT(ilm->ilm_rtx.rtx_cnt > 0); @@ -1174,6 +1188,7 @@ send_to_in: MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer); ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; mutex_exit(&ipst->ips_mld_timer_lock); + mcast_signal_restart_thread(ipst); } mutex_exit(&ill->ill_lock); @@ -1397,12 +1412,10 @@ per_ilm_rtxtimer: * * igmp_input() receives igmp queries and responds to the queries * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers(). - * Later the igmp_timer fires, the timeout handler igmp_timerout_handler() + * Later the igmp_timer fires, the timeout handler igmp_timeout_handler() * performs the action exclusively after entering each ill's ipsq as writer. - * The actual igmp timeout handler needs to run in the ipsq since it has to - * access the ilm's and we don't want another exclusive operation like - * say an IPMP failover to be simultaneously moving the ilms from one ill to - * another. + * (The need to enter the IPSQ is largely historical but there are still some + * fields like ilm_filter that rely on it.) * * The igmp_slowtimeo() function is called thru another timer. 
* igmp_slowtimeout_lock protects the igmp_slowtimeout_id @@ -1420,7 +1433,6 @@ igmp_timeout_handler(void *arg) ASSERT(arg != NULL); mutex_enter(&ipst->ips_igmp_timer_lock); ASSERT(ipst->ips_igmp_timeout_id != 0); - ipst->ips_igmp_timer_thread = curthread; ipst->ips_igmp_timer_scheduled_last = 0; ipst->ips_igmp_time_to_next = 0; mutex_exit(&ipst->ips_igmp_timer_lock); @@ -1452,7 +1464,6 @@ igmp_timeout_handler(void *arg) mutex_enter(&ipst->ips_igmp_timer_lock); ASSERT(ipst->ips_igmp_timeout_id != 0); ipst->ips_igmp_timeout_id = 0; - ipst->ips_igmp_timer_thread = NULL; mutex_exit(&ipst->ips_igmp_timer_lock); if (global_next != INFINITY) @@ -1663,7 +1674,6 @@ mld_timeout_handler(void *arg) ASSERT(arg != NULL); mutex_enter(&ipst->ips_mld_timer_lock); ASSERT(ipst->ips_mld_timeout_id != 0); - ipst->ips_mld_timer_thread = curthread; ipst->ips_mld_timer_scheduled_last = 0; ipst->ips_mld_time_to_next = 0; mutex_exit(&ipst->ips_mld_timer_lock); @@ -1695,7 +1705,6 @@ mld_timeout_handler(void *arg) mutex_enter(&ipst->ips_mld_timer_lock); ASSERT(ipst->ips_mld_timeout_id != 0); ipst->ips_mld_timeout_id = 0; - ipst->ips_mld_timer_thread = NULL; mutex_exit(&ipst->ips_mld_timer_lock); if (global_next != INFINITY) @@ -1871,7 +1880,7 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) int hdrlen = sizeof (ipha_t) + RTRALERT_LEN; size_t size = hdrlen + sizeof (igmpa_t); ipif_t *ipif = ilm->ilm_ipif; - ill_t *ill = ipif->ipif_ill; /* Will be the "lower" ill */ + ill_t *ill = ipif->ipif_ill; mblk_t *first_mp; ipsec_out_t *io; zoneid_t zoneid; @@ -1887,14 +1896,6 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) * not get forwarded on other interfaces or looped back, we * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop * to B_FALSE. - * - * We also need to make sure that this does not get load balanced - * if it hits ip_newroute_ipif. So, we initialize ipsec_out_attach_if - * here. 
If it gets load balanced, switches supporting igmp snooping - * will send the packet that it receives for this multicast group - * to the interface that we are sending on. As we have joined the - * multicast group on this ill, by sending the packet out on this - * ill, we receive all the packets back on this ill. */ first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI); if (first_mp == NULL) @@ -1909,7 +1910,6 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) io->ipsec_out_len = sizeof (ipsec_out_t); io->ipsec_out_use_global_policy = B_TRUE; io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_attach_if = B_TRUE; io->ipsec_out_multicast_loop = B_FALSE; io->ipsec_out_dontroute = B_TRUE; if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES) @@ -1995,6 +1995,8 @@ igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist) zoneid_t zoneid; ip_stack_t *ipst = ill->ill_ipst; + ASSERT(IAM_WRITER_IPIF(ipif)); + /* if there aren't any records, there's nothing to send */ if (reclist == NULL) return; @@ -2022,6 +2024,14 @@ nextpkt: int srcspace, srcsperpkt; srcspace = ill->ill_max_frag - (size + sizeof (grphdra_t)); + + /* + * Skip if there's not even enough room in + * a single packet to send something useful. 
+ */ + if (srcspace <= sizeof (ipaddr_t)) + continue; + srcsperpkt = srcspace / sizeof (ipaddr_t); /* * Increment size and numrec, because we will @@ -2082,7 +2092,6 @@ nextpkt: io->ipsec_out_len = sizeof (ipsec_out_t); io->ipsec_out_use_global_policy = B_TRUE; io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_attach_if = B_TRUE; io->ipsec_out_multicast_loop = B_FALSE; io->ipsec_out_dontroute = B_TRUE; if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES) @@ -2188,6 +2197,7 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) uint_t next; int mldlen; ip_stack_t *ipst = ill->ill_ipst; + ilm_walker_t ilw; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal); @@ -2294,7 +2304,6 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) return; } - /* * If we belong to the group being reported, and we are a * 'Delaying member' per the RFC terminology, stop our timer @@ -2303,8 +2312,8 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) * membership entries for the same group address (one per zone) * so we need to walk the ill_ilm list. */ - mutex_enter(&ill->ill_lock); - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr)) continue; BUMP_MIB(ill->ill_icmp6_mib, @@ -2313,7 +2322,7 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; } - mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); break; } case MLD_LISTENER_REDUCTION: @@ -2343,6 +2352,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) int timer; uint_t next, current; in6_addr_t *v6group; + ilm_walker_t ilw; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries); @@ -2397,10 +2407,12 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) * maximum timeout. 
*/ next = INFINITY; - mutex_enter(&ill->ill_lock); + ilm = ilm_walker_start(&ilw, ill); + mutex_enter(&ill->ill_lock); current = CURRENT_MSTIME; - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { + + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr)); if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) || @@ -2430,6 +2442,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) } } mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); return (next); } @@ -2446,6 +2459,7 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen) in6_addr_t *v6group, *src_array; uint_t next, numsrc, i, mrd, delay, qqi, current; uint8_t qrv; + ilm_walker_t ilw; v6group = &mld2q->mld2q_addr; numsrc = ntohs(mld2q->mld2q_numsrc); @@ -2518,8 +2532,9 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen) } else { /* group or group/source specific query */ + ilm = ilm_walker_start(&ilw, ill); mutex_enter(&ill->ill_lock); - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) || IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) || IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) || @@ -2574,6 +2589,7 @@ group_query: break; } mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); } return (next); @@ -2591,9 +2607,8 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr) ip6_hbh_t *ip6hbh; struct ip6_opt_router *ip6router; size_t size = IPV6_HDR_LEN + sizeof (mld_hdr_t); - ill_t *ill = ilm->ilm_ill; /* Will be the "lower" ill */ + ill_t *ill = ilm->ilm_ill; ipif_t *ipif; - ip6i_t *ip6i; /* * We need to place a router alert option in this packet. The length @@ -2605,30 +2620,14 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr) ASSERT(ill->ill_isv6); - /* - * We need to make sure that this packet does not get load balanced. - * So, we allocate an ip6i_t and set ATTACH_IF. 
ip_wput_v6 and - * ip_newroute_ipif_v6 knows how to handle such packets. - * If it gets load balanced, switches supporting MLD snooping - * (in the future) will send the packet that it receives for this - * multicast group to the interface that we are sending on. As we have - * joined the multicast group on this ill, by sending the packet out - * on this ill, we receive all the packets back on this ill. - */ - size += sizeof (ip6i_t) + router_alert_length; + size += router_alert_length; mp = allocb(size, BPRI_HI); if (mp == NULL) return; bzero(mp->b_rptr, size); mp->b_wptr = mp->b_rptr + size; - ip6i = (ip6i_t *)mp->b_rptr; - ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - ip6i->ip6i_nxt = IPPROTO_RAW; - ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; - - ip6h = (ip6_t *)&ip6i[1]; + ip6h = (ip6_t *)mp->b_rptr; ip6hbh = (struct ip6_hbh *)&ip6h[1]; ip6router = (struct ip6_opt_router *)&ip6hbh[1]; /* @@ -2698,7 +2697,6 @@ mldv2_sendrpt(ill_t *ill, mrec_t *reclist) in6_addr_t *srcarray; ip6_t *ip6h; ip6_hbh_t *ip6hbh; - ip6i_t *ip6i; struct ip6_opt_router *ip6router; size_t size, optlen, padlen, icmpsize, rsize; ipif_t *ipif; @@ -2707,6 +2705,8 @@ mldv2_sendrpt(ill_t *ill, mrec_t *reclist) mrec_t *next_reclist = reclist; boolean_t morepkts; + ASSERT(IAM_WRITER_ILL(ill)); + /* If there aren't any records, there's nothing to send */ if (reclist == NULL) return; @@ -2743,6 +2743,14 @@ nextpkt: int srcspace, srcsperpkt; srcspace = ill->ill_max_frag - (size + sizeof (mld2mar_t)); + + /* + * Skip if there's not even enough room in + * a single packet to send something useful. + */ + if (srcspace <= sizeof (in6_addr_t)) + continue; + srcsperpkt = srcspace / sizeof (in6_addr_t); /* * Increment icmpsize and size, because we will @@ -2787,30 +2795,13 @@ nextpkt: size += rsize; } - /* - * We need to make sure that this packet does not get load balanced. - * So, we allocate an ip6i_t and set ATTACH_IF. 
ip_wput_v6 and - * ip_newroute_ipif_v6 know how to handle such packets. - * If it gets load balanced, switches supporting MLD snooping - * (in the future) will send the packet that it receives for this - * multicast group to the interface that we are sending on. As we have - * joined the multicast group on this ill, by sending the packet out - * on this ill, we receive all the packets back on this ill. - */ - size += sizeof (ip6i_t); mp = allocb(size, BPRI_HI); if (mp == NULL) goto free_reclist; bzero(mp->b_rptr, size); mp->b_wptr = mp->b_rptr + size; - ip6i = (ip6i_t *)mp->b_rptr; - ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - ip6i->ip6i_nxt = IPPROTO_RAW; - ip6i->ip6i_flags = IP6I_ATTACH_IF; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; - - ip6h = (ip6_t *)&(ip6i[1]); + ip6h = (ip6_t *)mp->b_rptr; ip6hbh = (ip6_hbh_t *)&(ip6h[1]); ip6router = (struct ip6_opt_router *)&(ip6hbh[1]); mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen); @@ -3102,3 +3093,64 @@ mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist) return (rtnmrec); } + +/* + * Convenience routine to signal the restart-timer thread. + */ +static void +mcast_signal_restart_thread(ip_stack_t *ipst) +{ + mutex_enter(&ipst->ips_mrt_lock); + ipst->ips_mrt_flags |= IP_MRT_RUN; + cv_signal(&ipst->ips_mrt_cv); + mutex_exit(&ipst->ips_mrt_lock); +} + +/* + * Thread to restart IGMP/MLD timers. See the comment in igmp_joingroup() for + * the story behind this unfortunate thread. 
+ */ +void +mcast_restart_timers_thread(ip_stack_t *ipst) +{ + int next; + char name[64]; + callb_cpr_t cprinfo; + + (void) snprintf(name, sizeof (name), "mcast_restart_timers_thread_%d", + ipst->ips_netstack->netstack_stackid); + CALLB_CPR_INIT(&cprinfo, &ipst->ips_mrt_lock, callb_generic_cpr, name); + + for (;;) { + mutex_enter(&ipst->ips_mrt_lock); + while (!(ipst->ips_mrt_flags & (IP_MRT_STOP|IP_MRT_RUN))) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&ipst->ips_mrt_cv, &ipst->ips_mrt_lock); + CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_mrt_lock); + } + if (ipst->ips_mrt_flags & IP_MRT_STOP) + break; + ipst->ips_mrt_flags &= ~IP_MRT_RUN; + mutex_exit(&ipst->ips_mrt_lock); + + mutex_enter(&ipst->ips_igmp_timer_lock); + next = ipst->ips_igmp_deferred_next; + ipst->ips_igmp_deferred_next = INFINITY; + mutex_exit(&ipst->ips_igmp_timer_lock); + + if (next != INFINITY) + igmp_start_timers(next, ipst); + + mutex_enter(&ipst->ips_mld_timer_lock); + next = ipst->ips_mld_deferred_next; + ipst->ips_mld_deferred_next = INFINITY; + mutex_exit(&ipst->ips_mld_timer_lock); + if (next != INFINITY) + mld_start_timers(next, ipst); + } + + ipst->ips_mrt_flags |= IP_MRT_DONE; + cv_signal(&ipst->ips_mrt_done_cv); + CALLB_CPR_EXIT(&cprinfo); /* drops ips_mrt_lock */ + thread_exit(); +} diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 1d0bcf37de..dd87a09974 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -170,11 +170,14 @@ typedef struct listptr_s listptr_t; */ typedef struct iproutedata_s { uint_t ird_idx; + uint_t ird_flags; /* see below */ listptr_t ird_route; /* ipRouteEntryTable */ listptr_t ird_netmedia; /* ipNetToMediaEntryTable */ listptr_t ird_attrs; /* ipRouteAttributeTable */ } iproutedata_t; +#define IRD_REPORT_TESTHIDDEN 0x01 /* include IRE_MARK_TESTHIDDEN routes */ + /* * Cluster specific hooks. These should be NULL when booted as a non-cluster */ @@ -228,31 +231,27 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any * MT level protection given by STREAMS. IP uses a combination of its own * internal serialization mechanism and standard Solaris locking techniques. - * The internal serialization is per phyint (no IPMP) or per IPMP group. - * This is used to serialize plumbing operations, IPMP operations, certain - * multicast operations, most set ioctls, igmp/mld timers etc. + * The internal serialization is per phyint. This is used to serialize + * plumbing operations, certain multicast operations, most set ioctls, + * igmp/mld timers etc. * * Plumbing is a long sequence of operations involving message * exchanges between IP, ARP and device drivers. Many set ioctls are typically * involved in plumbing operations. A natural model is to serialize these * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in * parallel without any interference. But various set ioctls on hme0 are best - * serialized. However if the system uses IPMP, the operations are easier if - * they are serialized on a per IPMP group basis since IPMP operations - * happen across ill's of a group. Thus the lowest common denominator is to - * serialize most set ioctls, multicast join/leave operations, IPMP operations - * igmp/mld timer operations, and processing of DLPI control messages received - * from drivers on a per IPMP group basis. 
If the system does not employ - * IPMP the serialization is on a per phyint basis. This serialization is - * provided by the ipsq_t and primitives operating on this. Details can - * be found in ip_if.c above the core primitives operating on ipsq_t. + * serialized, along with multicast join/leave operations, igmp/mld timer + * operations, and processing of DLPI control messages received from drivers + * on a per phyint basis. This serialization is provided by the ipsq_t and + * primitives operating on this. Details can be found in ip_if.c above the + * core primitives operating on ipsq_t. * * Lookups of an ipif or ill by a thread return a refheld ipif / ill. * Simiarly lookup of an ire by a thread also returns a refheld ire. * In addition ipif's and ill's referenced by the ire are also indirectly * refheld. Thus no ipif or ill can vanish nor can critical parameters like * the ipif's address or netmask change as long as an ipif is refheld - * directly or indirectly. For example an SIOCLIFADDR ioctl that changes the + * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the * address of an ipif has to go through the ipsq_t. This ensures that only * 1 such exclusive operation proceeds at any time on the ipif. It then * deletes all ires associated with this ipif, and waits for all refcnts @@ -281,33 +280,24 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * - ill_g_lock: This is a global reader/writer lock. Protects the following * * The AVL tree based global multi list of all ills. * * The linked list of all ipifs of an ill - * * The <ill-ipsq> mapping - * * The ipsq->ipsq_phyint_list threaded by phyint_ipsq_next - * * The illgroup list threaded by ill_group_next. 
+ * * The <ipsq-xop> mapping * * <ill-phyint> association * Insertion/deletion of an ill in the system, insertion/deletion of an ipif - * into an ill, changing the <ill-ipsq> mapping of an ill, insertion/deletion - * of an ill into the illgrp list, changing the <ill-phyint> assoc of an ill - * will all have to hold the ill_g_lock as writer for the actual duration - * of the insertion/deletion/change. More details about the <ill-ipsq> mapping - * may be found in the IPMP section. + * into an ill, changing the <ipsq-xop> mapping of an ill, changing the + * <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as + * writer for the actual duration of the insertion/deletion/change. * * - ill_lock: This is a per ill mutex. - * It protects some members of the ill and is documented below. - * It also protects the <ill-ipsq> mapping - * It also protects the illgroup list threaded by ill_group_next. + * It protects some members of the ill_t struct; see ip.h for details. * It also protects the <ill-phyint> assoc. * It also protects the list of ipifs hanging off the ill. * * - ipsq_lock: This is a per ipsq_t mutex lock. - * This protects all the other members of the ipsq struct except - * ipsq_refs and ipsq_phyint_list which are protected by ill_g_lock + * This protects some members of the ipsq_t struct; see ip.h for details. + * It also protects the <ipsq-ipxop> mapping * - * - illgrp_lock: This is a per ill_group mutex lock. - * The only thing it protects is the illgrp_ill_schednext member of ill_group - * which dictates which is the next ill in an ill_group that is to be chosen - * for sending outgoing packets, through creation of an IRE_CACHE that - * references this ill. + * - ipx_lock: This is a per ipxop_t mutex lock. + * This protects some members of the ipxop_t struct; see ip.h for details. * * - phyint_lock: This is a per phyint mutex lock. 
Protects just the * phyint_flags @@ -335,27 +325,24 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * Note, it is only necessary to take this lock if the ill_usesrc_grp_next * field is changing state i.e from NULL to non-NULL or vice-versa. For * example, it is not necessary to take this lock in the initial portion - * of ip_sioctl_slifusesrc or at all in ip_sioctl_groupname and - * ip_sioctl_flags since the these operations are executed exclusively and - * that ensures that the "usesrc group state" cannot change. The "usesrc - * group state" change can happen only in the latter part of - * ip_sioctl_slifusesrc and in ill_delete. + * of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these + * operations are executed exclusively and that ensures that the "usesrc + * group state" cannot change. The "usesrc group state" change can happen + * only in the latter part of ip_sioctl_slifusesrc and in ill_delete. * - * Changing <ill-phyint>, <ill-ipsq>, <ill-illgroup> assocications. + * Changing <ill-phyint>, <ipsq-xop> assocications: * * To change the <ill-phyint> association, the ill_g_lock must be held * as writer, and the ill_locks of both the v4 and v6 instance of the ill * must be held. * - * To change the <ill-ipsq> association the ill_g_lock must be held as writer - * and the ill_lock of the ill in question must be held. - * - * To change the <ill-illgroup> association the ill_g_lock must be held as - * writer and the ill_lock of the ill in question must be held. + * To change the <ipsq-xop> association, the ill_g_lock must be held as + * writer, the ipsq_lock must be held, and one must be writer on the ipsq. + * This is only done when ills are added or removed from IPMP groups. * * To add or delete an ipif from the list of ipifs hanging off the ill, * ill_g_lock (writer) and ill_lock must be held and the thread must be - * a writer on the associated ipsq,. + * a writer on the associated ipsq. 
* * To add or delete an ill to the system, the ill_g_lock must be held as * writer and the thread must be a writer on the associated ipsq. @@ -367,8 +354,7 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * * Some lock hierarchy scenarios are listed below. * - * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock - * ill_g_lock -> illgrp_lock -> ill_lock + * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock * ill_g_lock -> ill_lock(s) -> phyint_lock * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock * ill_g_lock -> ip_addr_avail_lock @@ -587,8 +573,7 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * back, i.e. the loopback which is required since neither Ethernet drivers * nor Ethernet hardware loops them back. This is the case when the normal * routes (ignoring IREs with different zoneids) would send out the packet on - * the same ill (or ill group) as the ill with which is IRE_LOCAL is - * associated. + * the same ill as the ill with which the IRE_LOCAL is associated. * * Multiple zones can share a common broadcast address; typically all zones * share the 255.255.255.255 address.
Incoming as well as locally originated @@ -695,8 +680,8 @@ static boolean_t ip_rput_multimblk_ipoptions(queue_t *, ill_t *, mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *); static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *, ip_stack_t *); -static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *, - uint16_t *); +static boolean_t ip_rput_fragment(ill_t *, ill_t *, mblk_t **, ipha_t *, + uint32_t *, uint16_t *); int ip_snmp_get(queue_t *, mblk_t *, int); static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *, mib2_ipIfStatsEntry_t *, ip_stack_t *); @@ -723,9 +708,9 @@ static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *, ip_stack_t *ipst); -static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, +static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int, ip_stack_t *ipst); -static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, +static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int, ip_stack_t *ipst); static void ip_snmp_get2_v4(ire_t *, iproutedata_t *); static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *); @@ -775,8 +760,6 @@ static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); -static int ipmp_hook_emulation_set(queue_t *, mblk_t *, char *, caddr_t, - cred_t *); static int ip_squeue_switch(int); static void *ip_kstat_init(netstackid_t, ip_stack_t *); @@ -946,8 +929,6 @@ static ipndp_t lcl_ndp_arr[] = { { ip_cgtp_filter_get, ip_cgtp_filter_set, NULL, "ip_cgtp_filter" }, #define IPNDP_IPMP_HOOK_OFFSET 10 - { ip_param_generic_get, ipmp_hook_emulation_set, NULL, - "ipmp_hook_emulation" }, { ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug, "ip_debug" }, }; @@ -984,20 +965,19 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 012 */ { SIOCSIFADDR, sizeof (struct 
ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, - /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, + /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD, IF_CMD, ip_sioctl_get_addr, NULL }, /* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, /* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq), - IPI_GET_CMD | IPI_REPL, - IF_CMD, ip_sioctl_get_dstaddr, NULL }, + IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL }, /* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq), - IPI_PRIV | IPI_WR | IPI_REPL, + IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, /* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq), - IPI_MODOK | IPI_GET_CMD | IPI_REPL, + IPI_MODOK | IPI_GET_CMD, IF_CMD, ip_sioctl_get_flags, NULL }, /* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, @@ -1009,31 +989,28 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_mtu, NULL }, - /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, + /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD, IF_CMD, ip_sioctl_get_mtu, NULL }, /* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq), - IPI_GET_CMD | IPI_REPL, - IF_CMD, ip_sioctl_get_brdaddr, NULL }, + IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL }, /* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_brdaddr, NULL }, /* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq), - IPI_GET_CMD | IPI_REPL, - IF_CMD, ip_sioctl_get_netmask, NULL }, + IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL }, /* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, /* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq), - IPI_GET_CMD | IPI_REPL, - IF_CMD, ip_sioctl_get_metric, NULL }, + IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL }, /* 028 */ { 
SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV, IF_CMD, ip_sioctl_metric, NULL }, /* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* See 166-168 below for extended SIOC*XARP ioctls */ - /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV, + /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR, ARP_CMD, ip_sioctl_arp, NULL }, - /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD | IPI_REPL, + /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD, ARP_CMD, ip_sioctl_arp, NULL }, - /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV, + /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR, ARP_CMD, ip_sioctl_arp, NULL }, /* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, @@ -1098,21 +1075,19 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, - /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD | IPI_REPL, + /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD, MISC_CMD, ip_sioctl_get_ifnum, NULL }, - /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, + /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD, IF_CMD, ip_sioctl_get_muxid, NULL }, /* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - IF_CMD, ip_sioctl_muxid, NULL }, + IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL }, /* Both if and lif variants share same func */ - /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, + /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD, IF_CMD, ip_sioctl_get_lifindex, NULL }, /* Both if and lif variants share same func */ /* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - IF_CMD, ip_sioctl_slifindex, NULL }, + IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL }, /* copyin size cannot be coded for SIOCGIFCONF */ /* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD, @@ -1136,28 +1111,25 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 109 */ { 
IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_removeif, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif, ip_sioctl_removeif_restart }, /* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq), - IPI_GET_CMD | IPI_PRIV | IPI_WR | IPI_REPL, + IPI_GET_CMD | IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_addif, NULL }, #define SIOCLIFADDR_NDX 112 /* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, /* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_addr, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL }, /* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, /* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_dstaddr, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL }, /* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, /* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq), - IPI_GET_CMD | IPI_MODOK | IPI_REPL, + IPI_GET_CMD | IPI_MODOK, LIF_CMD, ip_sioctl_get_flags, NULL }, /* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, @@ -1167,58 +1139,48 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { ip_sioctl_get_lifconf, NULL }, /* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_mtu, NULL }, - /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD | IPI_REPL, + /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD, LIF_CMD, ip_sioctl_get_mtu, NULL }, /* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_brdaddr, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL }, /* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 
LIF_CMD, ip_sioctl_brdaddr, NULL }, /* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_netmask, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL }, /* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, /* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_metric, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL }, /* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_metric, NULL }, /* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_MODOK | IPI_REPL, + IPI_PRIV | IPI_WR | IPI_MODOK, LIF_CMD, ip_sioctl_slifname, ip_sioctl_slifname_restart }, - /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD | IPI_REPL, + /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD, MISC_CMD, ip_sioctl_get_lifnum, NULL }, /* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_muxid, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL }, /* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_muxid, NULL }, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL }, /* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_lifindex, 0 }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 }, /* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_slifindex, 0 }, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 }, /* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_token, NULL }, /* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_token, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL }, /* 137 */ { SIOCSLIFSUBNET, sizeof (struct 
lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart }, /* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_subnet, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL }, /* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_lnkinfo, NULL }, /* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_lnkinfo, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL }, /* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV, LIF_CMD, ip_siocdelndp_v6, NULL }, /* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD, @@ -1231,8 +1193,8 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { MISC_CMD, ip_sioctl_tonlink, NULL }, /* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0, MISC_CMD, ip_sioctl_tmysite, NULL }, - /* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), IPI_REPL, - TUN_CMD, ip_sioctl_tunparam, NULL }, + /* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), 0, + TUN_CMD, ip_sioctl_tunparam, NULL }, /* 148 */ { SIOCSTUNPARAM, sizeof (struct iftun_req), IPI_PRIV | IPI_WR, TUN_CMD, ip_sioctl_tunparam, NULL }, @@ -1243,29 +1205,24 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, /* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, - /* 153 */ { SIOCLIFFAILOVER, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_move, ip_sioctl_move }, - /* 154 */ { SIOCLIFFAILBACK, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_move, ip_sioctl_move }, + /* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, + + /* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD | + IPI_WR, LIF_CMD, ip_sioctl_get_binding, NULL }, /* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname }, 
/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_groupname, NULL }, - /* 157 */ { SIOCGLIFOINDEX, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_oindex, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL }, + /* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t), + IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL }, /* Leave 158-160 unused; used to be SIOC*IFARP ioctls */ /* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, - /* 161 */ { SIOCSLIFOINDEX, sizeof (struct lifreq), IPI_PRIV | IPI_WR, - LIF_CMD, ip_sioctl_slifoindex, NULL }, + /* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* These are handled in ip_sioctl_copyin_setup itself */ /* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT, @@ -1277,22 +1234,20 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD, ip_sioctl_get_lifconf, NULL }, - /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV, + /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR, XARP_CMD, ip_sioctl_arp, NULL }, - /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD | IPI_REPL, + /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD, XARP_CMD, ip_sioctl_arp, NULL }, - /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV, + /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR, XARP_CMD, ip_sioctl_arp, NULL }, /* SIOCPOPSOCKFS is not handled by IP */ /* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL }, /* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_lifzone, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL }, /* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_slifzone, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone, 
ip_sioctl_slifzone_restart }, /* 172-174 are SCTP ioctls and not handled by IP */ /* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, @@ -1315,8 +1270,7 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { MSFILT_CMD, ip_sioctl_msfilter, NULL }, /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR, MSFILT_CMD, ip_sioctl_msfilter, NULL }, - /* 182 */ { SIOCSIPMPFAILBACK, sizeof (int), IPI_PRIV, MISC_CMD, - ip_sioctl_set_ipmpfailback, NULL }, + /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* SIOCSENABLESDP is handled by SDP */ /* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL }, /* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL }, @@ -1326,7 +1280,7 @@ int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t); ip_ioctl_cmd_t ip_misc_ioctl_table[] = { { OSIOCGTUNPARAM, sizeof (struct old_iftun_req), - IPI_GET_CMD | IPI_REPL, TUN_CMD, ip_sioctl_tunparam, NULL }, + IPI_GET_CMD, TUN_CMD, ip_sioctl_tunparam, NULL }, { OSIOCSTUNPARAM, sizeof (struct old_iftun_req), IPI_PRIV | IPI_WR, TUN_CMD, ip_sioctl_tunparam, NULL }, { I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, @@ -1336,11 +1290,11 @@ ip_ioctl_cmd_t ip_misc_ioctl_table[] = { { ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL }, { ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, { IP_IOCTL, 0, 0, 0, NULL, NULL }, - { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_REPL | IPI_GET_CMD, + { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD, MISC_CMD, mrt_ioctl}, - { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_REPL | IPI_GET_CMD, + { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_GET_CMD, MISC_CMD, mrt_ioctl}, - { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_REPL | IPI_GET_CMD, + { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD, MISC_CMD, mrt_ioctl} }; @@ -1629,8 +1583,6 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, ipif_t *ipif; mblk_t *first_mp; ipsec_in_t *ii; - ire_t *src_ire; - boolean_t 
onlink; timestruc_t now; uint32_t ill_index; ip_stack_t *ipst; @@ -2014,59 +1966,6 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, if (!IS_SIMPLE_IPH(ipha)) icmp_options_update(ipha); - /* - * ICMP echo replies should go out on the same interface - * the request came on as probes used by in.mpathd for detecting - * NIC failures are ECHO packets. We turn-off load spreading - * by setting ipsec_in_attach_if to B_TRUE, which is copied - * to ipsec_out_attach_if by ipsec_in_to_out called later in this - * function. This is in turn handled by ip_wput and ip_newroute - * to make sure that the packet goes out on the interface it came - * in on. If we don't turnoff load spreading, the packets might get - * dropped if there are no non-FAILED/INACTIVE interfaces for it - * to go out and in.mpathd would wrongly detect a failure or - * mis-detect a NIC failure for link failure. As load spreading - * can happen only if ill_group is not NULL, we do only for - * that case and this does not affect the normal case. - * - * We turn off load spreading only on echo packets that came from - * on-link hosts. If the interface route has been deleted, this will - * not be enforced as we can't do much. For off-link hosts, as the - * default routes in IPv4 does not typically have an ire_ipif - * pointer, we can't force MATCH_IRE_ILL in ip_wput/ip_newroute. - * Moreover, expecting a default route through this interface may - * not be correct. We use ipha_dst because of the swap above. - */ - onlink = B_FALSE; - if (icmph->icmph_type == ICMP_ECHO_REPLY && ill->ill_group != NULL) { - /* - * First, we need to make sure that it is not one of our - * local addresses. If we set onlink when it is one of - * our local addresses, we will end up creating IRE_CACHES - * for one of our local addresses. Then, we will never - * accept packets for them afterwards. 
- */ - src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (src_ire == NULL) { - ipif = ipif_get_next_ipif(NULL, ill); - if (ipif == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(mp); - return; - } - src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, - NULL, MATCH_IRE_ILL | MATCH_IRE_TYPE, ipst); - ipif_refrele(ipif); - if (src_ire != NULL) { - onlink = B_TRUE; - ire_refrele(src_ire); - } - } else { - ire_refrele(src_ire); - } - } if (!mctl_present) { /* * This packet should go out the same way as it @@ -2085,20 +1984,7 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, /* This is not a secure packet */ ii->ipsec_in_secure = B_FALSE; - if (onlink) { - ii->ipsec_in_attach_if = B_TRUE; - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - recv_ill->ill_phyint->phyint_ifindex; - } first_mp->b_cont = mp; - } else if (onlink) { - ii = (ipsec_in_t *)first_mp->b_rptr; - ii->ipsec_in_attach_if = B_TRUE; - ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = recv_ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ } else { ii = (ipsec_in_t *)first_mp->b_rptr; ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ @@ -3733,7 +3619,6 @@ ipif_dup_recovery(void *arg) ill_t *ill = ipif->ipif_ill; mblk_t *arp_add_mp; mblk_t *arp_del_mp; - area_t *area; ip_stack_t *ipst = ill->ill_ipst; ipif->ipif_recovery_id = 0; @@ -3744,12 +3629,13 @@ ipif_dup_recovery(void *arg) */ if (ill->ill_arp_closing || !(ipif->ipif_flags & IPIF_DUPLICATE) || (ipif->ipif_flags & IPIF_POINTOPOINT) || - (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))) { + (ipif->ipif_state_flags & (IPIF_CONDEMNED))) { /* No reason to try to bring this address back. 
*/ return; } - if ((arp_add_mp = ipif_area_alloc(ipif)) == NULL) + /* ACE_F_UNVERIFIED restarts DAD */ + if ((arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) goto alloc_fail; if (ipif->ipif_arp_del_mp == NULL) { @@ -3758,10 +3644,6 @@ ipif_dup_recovery(void *arg) ipif->ipif_arp_del_mp = arp_del_mp; } - /* Setting the 'unverified' flag restarts DAD */ - area = (area_t *)arp_add_mp->b_rptr; - area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | - ACE_F_UNVERIFIED; putnext(ill->ill_rq, arp_add_mp); return; @@ -3873,6 +3755,7 @@ ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) EINPROGRESS) { ipif->ipif_addr_ready = 1; (void) ipif_up_done(ipif); + ASSERT(ill->ill_move_ipif == NULL); } continue; } @@ -3893,6 +3776,7 @@ ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) ill->ill_net_type == IRE_IF_RESOLVER && !(ipif->ipif_state_flags & IPIF_CONDEMNED) && ipst->ips_ip_dup_recovery > 0) { + ASSERT(ipif->ipif_recovery_id == 0); ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); } @@ -4196,8 +4080,9 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, { mblk_t *mp; ip_pktinfo_t *pinfo; - ipha_t *ipha; + ipha_t *ipha; struct ether_header *pether; + boolean_t ipmp_ill_held = B_FALSE; mp = allocb(sizeof (ip_pktinfo_t), BPRI_MED); if (mp == NULL) { @@ -4205,12 +4090,53 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, return (data_mp); } - ipha = (ipha_t *)data_mp->b_rptr; + ipha = (ipha_t *)data_mp->b_rptr; pinfo = (ip_pktinfo_t *)mp->b_rptr; bzero(pinfo, sizeof (ip_pktinfo_t)); pinfo->ip_pkt_flags = (uchar_t)flags; pinfo->ip_pkt_ulp_type = IN_PKTINFO; /* Tell ULP what type of info */ + pether = (struct ether_header *)((char *)ipha + - sizeof (struct ether_header)); + + /* + * Make sure the interface is an ethernet type, since this option + * is currently supported only on this type of interface. 
Also make + * sure we are pointing correctly above db_base. + */ + if ((flags & IPF_RECVSLLA) && + ((uchar_t *)pether >= data_mp->b_datap->db_base) && + (ill->ill_type == IFT_ETHER) && + (ill->ill_net_type == IRE_IF_RESOLVER)) { + pinfo->ip_pkt_slla.sdl_type = IFT_ETHER; + bcopy(pether->ether_shost.ether_addr_octet, + pinfo->ip_pkt_slla.sdl_data, ETHERADDRL); + } else { + /* + * Clear the bit. Indicate to upper layer that IP is not + * sending this ancillary info. + */ + pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA; + } + + /* + * If `ill' is in an IPMP group, use the IPMP ill to determine + * IPF_RECVIF and IPF_RECVADDR. (This currently assumes that + * IPF_RECVADDR support on test addresses is not needed.) + * + * Note that `ill' may already be an IPMP ill if e.g. we're + * processing a packet looped back to an IPMP data address + * (since those IRE_LOCALs are tied to IPMP ills). + */ + if (IS_UNDER_IPMP(ill)) { + if ((ill = ipmp_ill_hold_ipmp_ill(ill)) == NULL) { + ip1dbg(("ip_add_info: cannot hold IPMP ill.\n")); + freemsg(mp); + return (data_mp); + } + ipmp_ill_held = B_TRUE; + } + if (flags & (IPF_RECVIF | IPF_RECVADDR)) pinfo->ip_pkt_ifindex = ill->ill_phyint->phyint_ifindex; if (flags & IPF_RECVADDR) { @@ -4239,7 +4165,7 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL | IRE_LOOPBACK, ipif, zoneid, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst); + MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); if (ire == NULL) { /* * packet must have come on a different @@ -4276,29 +4202,8 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, } } - pether = (struct ether_header *)((char *)ipha - - sizeof (struct ether_header)); - /* - * Make sure the interface is an ethernet type, since this option - * is currently supported only on this type of interface. Also make - * sure we are pointing correctly above db_base. 
- */ - - if ((flags & IPF_RECVSLLA) && - ((uchar_t *)pether >= data_mp->b_datap->db_base) && - (ill->ill_type == IFT_ETHER) && - (ill->ill_net_type == IRE_IF_RESOLVER)) { - - pinfo->ip_pkt_slla.sdl_type = IFT_ETHER; - bcopy((uchar_t *)pether->ether_shost.ether_addr_octet, - (uchar_t *)pinfo->ip_pkt_slla.sdl_data, ETHERADDRL); - } else { - /* - * Clear the bit. Indicate to upper layer that IP is not - * sending this ancillary info. - */ - pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA; - } + if (ipmp_ill_held) + ill_refrele(ill); mp->b_datap->db_type = M_CTL; mp->b_wptr += sizeof (ip_pktinfo_t); @@ -4946,8 +4851,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, } } - if (dst_ire != NULL && - dst_ire->ire_type == IRE_LOCAL && + if (dst_ire != NULL && dst_ire->ire_type == IRE_LOCAL && dst_ire->ire_zoneid != zoneid && dst_ire->ire_zoneid != ALL_ZONES) { /* * If the IRE belongs to a different zone, look for a matching @@ -4983,7 +4887,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, * Pick a source address so that a proper inbound * load spreading would happen. */ - ill_t *dst_ill = dst_ire->ire_ipif->ipif_ill; + ill_t *ire_ill = dst_ire->ire_ipif->ipif_ill; ipif_t *src_ipif = NULL; ire_t *ipif_ire; @@ -4998,10 +4902,10 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, * found above so that upper layers know that the * destination address is a broadcast address. * - * 2) If this is part of a group, select a better - * source address so that better inbound load - * balancing happens. Do the same if the ipif - * is DEPRECATED. + * 2) If the ipif is DEPRECATED, select a better + * source address. Similarly, if the ipif is on + * the IPMP meta-interface, pick a source address + * at random to improve inbound load spreading. 
* * 3) If the outgoing interface is part of a usesrc * group, then try selecting a source address from @@ -5011,9 +4915,9 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, dst_ire->ire_zoneid != ALL_ZONES) || (!(dst_ire->ire_flags & RTF_SETSRC)) && (!(dst_ire->ire_type & IRE_BROADCAST) && - ((dst_ill->ill_group != NULL) || + (IS_IPMP(ire_ill) || (dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || - (dst_ill->ill_usesrc_ifindex != 0)))) { + (ire_ill->ill_usesrc_ifindex != 0)))) { /* * If the destination is reachable via a * given gateway, the selected source address @@ -5035,7 +4939,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, */ ipaddr_t saddr = dst_ire->ire_ipif->ipif_src_addr; - src_ipif = ipif_select_source(dst_ill, + src_ipif = ipif_select_source(ire_ill, saddr, zoneid); if (src_ipif != NULL) { if (IS_VNI(src_ipif->ipif_ill)) { @@ -5478,14 +5382,6 @@ ip_modclose(ill_t *ill) (void) ill_frag_timeout(ill, 0); /* - * If MOVE was in progress, clear the - * move_in_progress fields also. - */ - if (ill->ill_move_in_progress) { - ILL_CLEAR_MOVE(ill); - } - - /* * Call ill_delete to bring down the ipifs, ilms and ill on * this ill. Then wait for the refcnts to drop to zero. * ill_is_freeable checks whether the ill is really quiescent. 
@@ -5510,7 +5406,7 @@ ip_modclose(ill_t *ill) */ netstack_hold(ipst->ips_netstack); - /* qprocsoff is called in ill_delete_tail */ + /* qprocsoff is done via ill_delete_tail */ ill_delete_tail(ill); ASSERT(ill->ill_ipst == NULL); @@ -5755,6 +5651,11 @@ ip_stack_shutdown(netstackid_t stackid, void *arg) ipst->ips_capab_taskq_quit = B_TRUE; cv_signal(&ipst->ips_capab_taskq_cv); mutex_exit(&ipst->ips_capab_taskq_lock); + + mutex_enter(&ipst->ips_mrt_lock); + ipst->ips_mrt_flags |= IP_MRT_STOP; + cv_signal(&ipst->ips_mrt_cv); + mutex_exit(&ipst->ips_mrt_lock); } /* @@ -5766,6 +5667,9 @@ ip_stack_fini(netstackid_t stackid, void *arg) ip_stack_t *ipst = (ip_stack_t *)arg; int ret; +#ifdef NS_DEBUG + printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid); +#endif /* * At this point, all of the notifications that the events and * protocols are going away have been run, meaning that we can @@ -5779,9 +5683,14 @@ ip_stack_fini(netstackid_t stackid, void *arg) cv_destroy(&ipst->ips_capab_taskq_cv); list_destroy(&ipst->ips_capab_taskq_list); -#ifdef NS_DEBUG - printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid); -#endif + mutex_enter(&ipst->ips_mrt_lock); + while (!(ipst->ips_mrt_flags & IP_MRT_DONE)) + cv_wait(&ipst->ips_mrt_done_cv, &ipst->ips_mrt_lock); + mutex_destroy(&ipst->ips_mrt_lock); + cv_destroy(&ipst->ips_mrt_cv); + cv_destroy(&ipst->ips_mrt_done_cv); + + ipmp_destroy(ipst); rw_destroy(&ipst->ips_srcid_lock); ip_kstat_fini(stackid, ipst->ips_ip_mibkp); @@ -6038,10 +5947,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) "ip_cgtp_filter") == 0); ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_data = (caddr_t)&ipst->ips_ip_cgtp_filter; - ASSERT(strcmp(ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_name, - "ipmp_hook_emulation") == 0); - ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_data = - (caddr_t)&ipst->ips_ipmp_hook_emulation; (void) ip_param_register(&ipst->ips_ip_g_nd, ipst->ips_param_arr, A_CNT(lcl_param_arr), @@ -6053,8 
+5958,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) ipst->ips_ip6_kstat = ip6_kstat_init(stackid, &ipst->ips_ip6_statistics); - ipst->ips_ipmp_enable_failback = B_TRUE; - ipst->ips_ip_src_id = 1; rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL); @@ -6062,6 +5965,7 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) ip_net_init(ipst, ns); ipv4_hook_init(ipst); ipv6_hook_init(ipst); + ipmp_init(ipst); /* * Create the taskq dispatcher thread and initialize related stuff. @@ -6073,6 +5977,15 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) list_create(&ipst->ips_capab_taskq_list, sizeof (mblk_t), offsetof(mblk_t, b_next)); + /* + * Create the mcast_restart_timers_thread() worker thread. + */ + mutex_init(&ipst->ips_mrt_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&ipst->ips_mrt_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ipst->ips_mrt_done_cv, NULL, CV_DEFAULT, NULL); + ipst->ips_mrt_thread = thread_create(NULL, 0, + mcast_restart_timers_thread, ipst, 0, &p0, TS_RUN, minclsyspri); + major = mod_name_to_major(INET_NAME); (void) ldi_ident_from_major(major, &ipst->ips_ldi_ident); return (ipst); @@ -6109,6 +6022,24 @@ ip_dlpi_alloc(size_t len, t_uscalar_t prim) } /* + * Allocate and initialize a DLPI notification. (May be called as writer.) + */ +mblk_t * +ip_dlnotify_alloc(uint_t notification, uint_t data) +{ + dl_notify_ind_t *notifyp; + mblk_t *mp; + + if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL) + return (NULL); + + notifyp = (dl_notify_ind_t *)mp->b_rptr; + notifyp->dl_notification = notification; + notifyp->dl_data = data; + return (mp); +} + +/* * Debug formatting routine. Returns a character string representation of the * addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address * in the form of a ipaddr_t and calls ip_dot_saddr with a pointer. 
@@ -7753,71 +7684,30 @@ ip_net_mask(ipaddr_t addr) } /* - * Select an ill for the packet by considering load spreading across - * a different ill in the group if dst_ill is part of some group. - */ -ill_t * -ip_newroute_get_dst_ill(ill_t *dst_ill) -{ - ill_t *ill; - - /* - * We schedule irrespective of whether the source address is - * INADDR_ANY or not. illgrp_scheduler returns a held ill. - */ - ill = illgrp_scheduler(dst_ill); - if (ill == NULL) - return (NULL); - - /* - * For groups with names ip_sioctl_groupname ensures that all - * ills are of same type. For groups without names, ifgrp_insert - * ensures this. - */ - ASSERT(dst_ill->ill_type == ill->ill_type); - - return (ill); -} - -/* - * Helper function for the IPIF_NOFAILOVER/ATTACH_IF interface attachment case. + * Helper ill lookup function used by IPsec. */ ill_t * -ip_grab_attach_ill(ill_t *ill, mblk_t *first_mp, int ifindex, boolean_t isv6, - ip_stack_t *ipst) +ip_grab_ill(mblk_t *first_mp, int ifindex, boolean_t isv6, ip_stack_t *ipst) { ill_t *ret_ill; ASSERT(ifindex != 0); + ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, ipst); - if (ret_ill == NULL || - (ret_ill->ill_phyint->phyint_flags & PHYI_OFFLINE)) { + if (ret_ill == NULL) { if (isv6) { - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip6_mib, - ipIfStatsOutDiscards); - } - ip1dbg(("ip_grab_attach_ill (IPv6): " - "bad ifindex %d.\n", ifindex)); + BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); + ip1dbg(("ip_grab_ill (IPv6): bad ifindex %d.\n", + ifindex)); } else { - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - } - ip1dbg(("ip_grab_attach_ill (IPv4): " - "bad ifindex %d.\n", ifindex)); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip1dbg(("ip_grab_ill (IPv4): bad ifindex %d.\n", + ifindex)); } - if (ret_ill != NULL) - ill_refrele(ret_ill); freemsg(first_mp); 
return (NULL); } - return (ret_ill); } @@ -7859,7 +7749,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, ire_t *sire = NULL; mblk_t *first_mp; ire_t *save_ire; - ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER address */ ushort_t ire_marks = 0; boolean_t mctl_present; ipsec_out_t *io; @@ -7873,7 +7762,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, boolean_t multirt_is_resolvable; boolean_t multirt_resolve_next; boolean_t unspec_src; - boolean_t do_attach_ill = B_FALSE; boolean_t ip_nexthop = B_FALSE; tsol_ire_gw_secattr_t *attrp = NULL; tsol_gcgrp_t *gcgrp = NULL; @@ -7902,22 +7790,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, return; } - if (mctl_present && io->ipsec_out_attach_if) { - /* ip_grab_attach_ill returns a held ill */ - attach_ill = ip_grab_attach_ill(NULL, first_mp, - io->ipsec_out_ill_index, B_FALSE, ipst); - - /* Failure case frees things for us. */ - if (attach_ill == NULL) - return; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(attach_ill)) - ire_marks = IRE_MARK_HIDDEN; - } if (mctl_present && io->ipsec_out_ip_nexthop) { ip_nexthop = B_TRUE; nexthop_addr = io->ipsec_out_nexthop_addr; @@ -7997,31 +7869,15 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, dst = nexthop_addr; } } - } else if (attach_ill == NULL) { + } else { ire = ire_ftable_lookup(dst, 0, 0, 0, NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp), MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT | MATCH_IRE_SECATTR | MATCH_IRE_COMPLETE, ipst); - } else { - /* - * attach_ill is set only for communicating with - * on-link hosts. So, don't look for DEFAULT. 
- */ - ipif_t *attach_ipif; - - attach_ipif = ipif_get_next_ipif(NULL, attach_ill); - if (attach_ipif == NULL) { - ill_refrele(attach_ill); - goto icmp_err_ret; - } - ire = ire_ftable_lookup(dst, 0, 0, 0, attach_ipif, - &sire, zoneid, 0, MBLK_GETLABEL(mp), - MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL | - MATCH_IRE_SECATTR, ipst); - ipif_refrele(attach_ipif); } + ip3dbg(("ip_newroute: ire_ftable_lookup() " "returned ire %p, sire %p\n", (void *)ire, (void *)sire)); @@ -8122,8 +7978,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, } ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst); - if (attach_ill != NULL) - ill_refrele(attach_ill); goto icmp_err_ret; } @@ -8134,8 +7988,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, */ if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) { - if (attach_ill != NULL) - ill_refrele(attach_ill); goto icmp_err_ret; } /* @@ -8157,119 +8009,51 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, sire->ire_last_used_time = lbolt; } /* - * We have a route to reach the destination. - * - * 1) If the interface is part of ill group, try to get a new - * ill taking load spreading into account. - * - * 2) After selecting the ill, get a source address that - * might create good inbound load spreading. - * ipif_select_source does this for us. + * We have a route to reach the destination. Find the + * appropriate ill, then get a source address using + * ipif_select_source(). * - * If the application specified the ill (ifindex), we still - * load spread. Only if the packets needs to go out - * specifically on a given ill e.g. binding to - * IPIF_NOFAILOVER address, then we don't try to use a - * different ill for load spreading. + * If we are here trying to create an IRE_CACHE for an offlink + * destination and have an IRE_CACHE entry for VNI, then use + * ire_stq instead since VNI's queue is a black hole. 
*/ - if (attach_ill == NULL) { - /* - * Don't perform outbound load spreading in the - * case of an RTF_MULTIRT route, as we actually - * typically want to replicate outgoing packets - * through particular interfaces. - */ - if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { - dst_ill = ire->ire_ipif->ipif_ill; - /* for uniformity */ - ill_refhold(dst_ill); - } else { - /* - * If we are here trying to create an IRE_CACHE - * for an offlink destination and have the - * IRE_CACHE for the next hop and the latter is - * using virtual IP source address selection i.e - * it's ire->ire_ipif is pointing to a virtual - * network interface (vni) then - * ip_newroute_get_dst_ll() will return the vni - * interface as the dst_ill. Since the vni is - * virtual i.e not associated with any physical - * interface, it cannot be the dst_ill, hence - * in such a case call ip_newroute_get_dst_ll() - * with the stq_ill instead of the ire_ipif ILL. - * The function returns a refheld ill. - */ - if ((ire->ire_type == IRE_CACHE) && - IS_VNI(ire->ire_ipif->ipif_ill)) - dst_ill = ip_newroute_get_dst_ill( - ire->ire_stq->q_ptr); - else - dst_ill = ip_newroute_get_dst_ill( - ire->ire_ipif->ipif_ill); - } - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute: " - "no dst ill for dst" - " %s\n", AF_INET, &dst); - } - goto icmp_err_ret; - } - } else { - dst_ill = ire->ire_ipif->ipif_ill; - /* for uniformity */ + if ((ire->ire_type == IRE_CACHE) && + IS_VNI(ire->ire_ipif->ipif_ill)) { + dst_ill = ire->ire_stq->q_ptr; ill_refhold(dst_ill); - /* - * We should have found a route matching ill as we - * called ire_ftable_lookup with MATCH_IRE_ILL. - * Rather than asserting, when there is a mismatch, - * we just drop the packet. 
- */ - if (dst_ill != attach_ill) { - ip0dbg(("ip_newroute: Packet dropped as " - "IPIF_NOFAILOVER ill is %s, " - "ire->ire_ipif->ipif_ill is %s\n", - attach_ill->ill_name, - dst_ill->ill_name)); - ill_refrele(attach_ill); - goto icmp_err_ret; + } else { + ill_t *ill = ire->ire_ipif->ipif_ill; + + if (IS_IPMP(ill)) { + dst_ill = + ipmp_illgrp_hold_next_ill(ill->ill_grp); + } else { + dst_ill = ill; + ill_refhold(dst_ill); } } - /* attach_ill can't go in loop. IPMP and CGTP are disjoint */ - if (attach_ill != NULL) { - ill_refrele(attach_ill); - attach_ill = NULL; - do_attach_ill = B_TRUE; + + if (dst_ill == NULL) { + if (ip_debug > 2) { + pr_addr_dbg("ip_newroute: no dst " + "ill for dst %s\n", AF_INET, &dst); + } + goto icmp_err_ret; } - ASSERT(dst_ill != NULL); ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name)); /* * Pick the best source address from dst_ill. * - * 1) If it is part of a multipathing group, we would - * like to spread the inbound packets across different - * interfaces. ipif_select_source picks a random source - * across the different ills in the group. - * - * 2) If it is not part of a multipathing group, we try - * to pick the source address from the destination + * 1) Try to pick the source address from the destination * route. Clustering assumes that when we have multiple * prefixes hosted on an interface, the prefix of the * source address matches the prefix of the destination * route. We do this only if the address is not * DEPRECATED. * - * 3) If the conn is in a different zone than the ire, we + * 2) If the conn is in a different zone than the ire, we * need to pick a source address from the right zone. - * - * NOTE : If we hit case (1) above, the prefix of the source - * address picked may not match the prefix of the - * destination routes prefix as ipif_select_source - * does not look at "dst" while picking a source - * address. - * If we want the same behavior as (2), we will need - * to change the behavior of ipif_select_source. 
*/ ASSERT(src_ipif == NULL); if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { @@ -8287,7 +8071,8 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, if (src_ipif == NULL && (!unspec_src || ipha->ipha_src != INADDR_ANY)) { ire_marks |= IRE_MARK_USESRC_CHECK; - if ((dst_ill->ill_group != NULL) || + if (!IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && + IS_IPMP(ire->ire_ipif->ipif_ill) || (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || (connp != NULL && ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES) || @@ -8312,6 +8097,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, * as dst_ire source address. */ ipaddr_t saddr = ire->ire_ipif->ipif_src_addr; + src_ipif = ipif_select_source(dst_ill, saddr, zoneid); if (src_ipif == NULL) { @@ -8319,7 +8105,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, pr_addr_dbg("ip_newroute: " "no src for dst %s ", AF_INET, &dst); - printf("through interface %s\n", + printf("on interface %s\n", dst_ill->ill_name); } goto icmp_err_ret; @@ -8558,6 +8344,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, MULTIRT_DEBUG_TAG(first_mp); } } + ire_add_then_send(q, ire, xmit_mp); ire_refrele(save_ire); @@ -8766,7 +8553,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, "ip_newroute: no " "src for gw %s ", AF_INET, &gw); - printf("through " + printf("on " "interface %s\n", dst_ill->ill_name); } @@ -8867,16 +8654,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, areq = (areq_t *)mp->b_rptr; addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset); - if (do_attach_ill) { - /* - * This is bind to no failover case. - * arp packet also must go out on attach_ill. 
- */ - ASSERT(ipha->ipha_src != NULL); - *addrp = ipha->ipha_src; - } else { - *addrp = save_ire->ire_src_addr; - } + *addrp = save_ire->ire_src_addr; ire_refrele(save_ire); addrp = (ipaddr_t *)((char *)areq + @@ -9076,14 +8854,10 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, ipaddr_t *addrp; mblk_t *first_mp; ire_t *save_ire = NULL; - ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER */ ipif_t *src_ipif = NULL; ushort_t ire_marks = 0; ill_t *dst_ill = NULL; - boolean_t mctl_present; - ipsec_out_t *io; ipha_t *ipha; - int ihandle = 0; mblk_t *saved_mp; ire_t *fire = NULL; mblk_t *copy_mp = NULL; @@ -9117,10 +8891,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst), ipif->ipif_ill->ill_name)); - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - if (mctl_present) - io = (ipsec_out_t *)first_mp->b_rptr; - + first_mp = mp; + if (DB_TYPE(mp) == M_CTL) + mp = mp->b_cont; ipha = (ipha_t *)mp->b_rptr; /* @@ -9161,64 +8934,29 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, (void *)ipif, ntohl(dst), (void *)fire)); } - if (mctl_present && io->ipsec_out_attach_if) { - attach_ill = ip_grab_attach_ill(NULL, first_mp, - io->ipsec_out_ill_index, B_FALSE, ipst); - - /* Failure case frees things for us. */ - if (attach_ill == NULL) { - ipif_refrele(ipif); - if (fire != NULL) - ire_refrele(fire); - return; - } + /* + * Note: While we pick a dst_ill we are really only + * interested in the ill for load spreading. The source + * ipif is determined by source address selection below. + */ + if (IS_IPMP(ipif->ipif_ill)) { + ipmp_illgrp_t *illg = ipif->ipif_ill->ill_grp; - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(attach_ill)) { - ire_marks = IRE_MARK_HIDDEN; - } - /* - * ip_wput passes the right ipif for IPIF_NOFAILOVER - * case. 
- */ - dst_ill = ipif->ipif_ill; - /* attach_ill has been refheld by ip_grab_attach_ill */ - ASSERT(dst_ill == attach_ill); + if (CLASSD(ipha_dst)) + dst_ill = ipmp_illgrp_hold_cast_ill(illg); + else + dst_ill = ipmp_illgrp_hold_next_ill(illg); } else { - /* - * If the interface belongs to an interface group, - * make sure the next possible interface in the group - * is used. This encourages load spreading among - * peers in an interface group. - * Note: load spreading is disabled for RTF_MULTIRT - * routes. - */ - if ((flags & RTF_MULTIRT) && (fire != NULL) && - (fire->ire_flags & RTF_MULTIRT)) { - /* - * Don't perform outbound load spreading - * in the case of an RTF_MULTIRT issued route, - * we actually typically want to replicate - * outgoing packets through particular - * interfaces. - */ - dst_ill = ipif->ipif_ill; - ill_refhold(dst_ill); - } else { - dst_ill = ip_newroute_get_dst_ill( - ipif->ipif_ill); - } - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_ipif: " - "no dst ill for dst %s\n", - AF_INET, &dst); - } - goto err_ret; + dst_ill = ipif->ipif_ill; + ill_refhold(dst_ill); + } + + if (dst_ill == NULL) { + if (ip_debug > 2) { + pr_addr_dbg("ip_newroute_ipif: no dst ill " + "for dst %s\n", AF_INET, &dst); } + goto err_ret; } /* @@ -9242,7 +8980,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, unspec_src = (connp != NULL && connp->conn_unspec_src); - if (((!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) || + if (!IS_UNDER_IPMP(ipif->ipif_ill) && + (IS_IPMP(ipif->ipif_ill) || + (!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) || (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_UP)) != IPIF_UP || (connp != NULL && ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES)) && @@ -9256,7 +8996,7 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, "no src for dst %s", AF_INET, &dst); } - ip1dbg((" through interface %s\n", + ip1dbg((" on interface %s\n", dst_ill->ill_name)); goto 
err_ret; } @@ -9291,12 +9031,7 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, goto err_ret; if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) goto err_ret; - /* - * ihandle is needed when the ire is added to - * cache table. - */ save_ire = ire; - ihandle = save_ire->ire_ihandle; ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, " "flags %04x\n", @@ -9328,10 +9063,6 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, ipha->ipha_src = fire->ire_src_addr; } } else { - ASSERT((connp == NULL) || - (connp->conn_outgoing_ill != NULL) || - (connp->conn_dontroute) || - infop->ip_opt_ill_index != 0); /* * The only ways we can come here are: * 1) IP_BOUND_IF socket option is set @@ -9340,6 +9071,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, * In all cases, the new ire will not be added * into cache table. */ + ASSERT(connp == NULL || connp->conn_dontroute || + connp->conn_outgoing_ill != NULL || + infop->ip_opt_ill_index != 0); ire_marks |= IRE_MARK_NOADD; } @@ -9374,7 +9108,8 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, (save_ire != NULL ? save_ire->ire_mask : 0), (fire != NULL) ? /* Parent handle */ fire->ire_phandle : 0, - ihandle, /* Interface handle */ + (save_ire != NULL) ? /* Interface handle */ + save_ire->ire_ihandle : 0, (fire != NULL) ? (fire->ire_flags & (RTF_SETSRC | RTF_MULTIRT)) : 0, @@ -9533,7 +9268,8 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, (save_ire != NULL ? save_ire->ire_mask : 0), (fire != NULL) ? /* Parent handle */ fire->ire_phandle : 0, - ihandle, /* Interface handle */ + (save_ire != NULL) ? /* Interface handle */ + save_ire->ire_ihandle : 0, (fire != NULL) ? /* flags if any */ (fire->ire_flags & (RTF_SETSRC | RTF_MULTIRT)) : 0, @@ -9593,12 +9329,20 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, /* * Fill in the source and dest addrs for the resolver. 
* NOTE: this depends on memory layouts imposed by - * ill_init(). + * ill_init(). There are corner cases above where we + * might've created the IRE with an INADDR_ANY source + * address (e.g., if the zeroth ipif on an underlying + * ill in an IPMP group is 0.0.0.0, but another ipif + * on the ill has a usable test address). If so, tell + * ARP to use ipha_src as its sender address. */ areq = (areq_t *)mp->b_rptr; addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset); - *addrp = ire->ire_src_addr; + if (ire->ire_src_addr != INADDR_ANY) + *addrp = ire->ire_src_addr; + else + *addrp = ipha->ipha_src; addrp = (ipaddr_t *)((char *)areq + areq->areq_target_addr_offset); *addrp = dst; @@ -10136,7 +9880,7 @@ ip_ipsec_load_complete(ipsec_stack_t *ipss) /* * Can't be used. Need to call svr4* -> optset directly. the leaf routine * determines the grp on which it has to become exclusive, queues the mp - * and sq draining restarts the optmgmt + * and IPSQ draining restarts the optmgmt */ static boolean_t ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp) @@ -10482,28 +10226,6 @@ ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option, } switch (option) { - case IP_DONTFAILOVER_IF: - /* - * This option is used by in.mpathd to ensure - * that IPMP probe packets only go out on the - * test interfaces. in.mpathd sets this option - * on the non-failover interfaces. - * For backward compatibility, this option - * implicitly sets IP_MULTICAST_IF, as used - * be done in bind(), so that ip_wput gets - * this ipif to send mcast packets. 
- */ - if (ipif != NULL) { - ASSERT(addr != INADDR_ANY); - connp->conn_nofailover_ill = ipif->ipif_ill; - connp->conn_multicast_ipif = ipif; - } else { - ASSERT(addr == INADDR_ANY); - connp->conn_nofailover_ill = NULL; - connp->conn_multicast_ipif = NULL; - } - break; - case IP_MULTICAST_IF: connp->conn_multicast_ipif = ipif; break; @@ -10551,7 +10273,7 @@ ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly, ill_refrele(ill); return (0); } - if (!ipif_lookup_zoneid_group(ill, connp->conn_zoneid, + if (!ipif_lookup_zoneid(ill, connp->conn_zoneid, 0, NULL)) { ill_refrele(ill); ill = NULL; @@ -10596,8 +10318,6 @@ setit: case IP_BOUND_IF: connp->conn_incoming_ill = ill; connp->conn_outgoing_ill = ill; - connp->conn_orig_bound_ifindex = (ill == NULL) ? - 0 : ifindex; break; case IP_MULTICAST_IF: @@ -10650,40 +10370,6 @@ setit: case IPV6_BOUND_IF: connp->conn_incoming_ill = ill; connp->conn_outgoing_ill = ill; - connp->conn_orig_bound_ifindex = (ill == NULL) ? - 0 : ifindex; - break; - - case IPV6_BOUND_PIF: - /* - * Limit all transmit to this ill. - * Unlike IPV6_BOUND_IF, using this option - * prevents load spreading and failover from - * happening when the interface is part of the - * group. That's why we don't need to remember - * the ifindex in orig_bound_ifindex as in - * IPV6_BOUND_IF. - */ - connp->conn_outgoing_pill = ill; - break; - - case IPV6_DONTFAILOVER_IF: - /* - * This option is used by in.mpathd to ensure - * that IPMP probe packets only go out on the - * test interfaces. in.mpathd sets this option - * on the non-failover interfaces. - */ - connp->conn_nofailover_ill = ill; - /* - * For backward compatibility, this option - * implicitly sets ip_multicast_ill as used in - * IPV6_MULTICAST_IF so that ip_wput gets - * this ill to send mcast packets. - */ - connp->conn_multicast_ill = ill; - connp->conn_orig_multicast_ifindex = (ill == NULL) ? 
- 0 : ifindex; break; case IPV6_MULTICAST_IF: @@ -10700,12 +10386,9 @@ setit: if (!checkonly) { if (ifindex == 0) { connp->conn_multicast_ill = NULL; - connp->conn_orig_multicast_ifindex = 0; connp->conn_multicast_ipif = NULL; } else if (ill != NULL) { connp->conn_multicast_ill = ill; - connp->conn_orig_multicast_ifindex = - ifindex; } } break; @@ -10867,8 +10550,7 @@ ip_opt_set(queue_t *q, uint_t optset_context, int level, int name, if (secpolicy_ip_config(cr, B_FALSE) != 0) return (EPERM); /* FALLTHRU */ - case IP_MULTICAST_IF: - case IP_DONTFAILOVER_IF: { + case IP_MULTICAST_IF: { ipaddr_t addr = *i1; error = ip_opt_set_ipif(connp, addr, checkonly, name, @@ -11189,8 +10871,6 @@ ip_opt_set(queue_t *q, uint_t optset_context, int level, int name, case IPPROTO_IPV6: switch (name) { case IPV6_BOUND_IF: - case IPV6_BOUND_PIF: - case IPV6_DONTFAILOVER_IF: error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, level, name, first_mp); if (error != 0) @@ -12288,11 +11968,10 @@ ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha, * frees mp on failure. */ static boolean_t -ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha, +ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, uint32_t *cksum_val, uint16_t *cksum_flags) { uint32_t frag_offset_flags; - ill_t *ill = (ill_t *)q->q_ptr; mblk_t *mp = *mpp; mblk_t *t_mp; ipaddr_t dst; @@ -12337,12 +12016,12 @@ ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha, /* * We utilize hardware computed checksum info only for UDP since - * IP fragmentation is a normal occurence for the protocol. In + * IP fragmentation is a normal occurrence for the protocol. In * addition, checksum offload support for IP fragments carrying * UDP payload is commonly implemented across network adapters. 
*/ - ASSERT(ill != NULL); - if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) && + ASSERT(recv_ill != NULL); + if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(recv_ill) && (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { mblk_t *mp1 = mp->b_cont; int32_t len; @@ -12808,7 +12487,7 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, goto ipoptions; /* Check the IP header checksum. */ - if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { + if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) { /* Clear the IP header h/w cksum flag */ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; } else if (!mctl_present) { @@ -12871,7 +12550,7 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, * Revert to software checksum calculation if the interface * isn't capable of checksum offload or if IPsec is present. */ - if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) + if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum) hck_flags = DB_CKSUMFLAGS(mp); if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) @@ -12958,8 +12637,11 @@ fragmented: * reassembled packet has a valid hardware computed * checksum information associated with it. */ - if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags)) + if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, &sum, + &reass_hck_flags)) { goto slow_done; + } + /* * Make sure that first_mp points back to mp as * the mp we came in with could have changed in @@ -13073,7 +12755,7 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, goto ipoptions; } else if (!mctl_present) { /* Check the IP header checksum. 
*/ - if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { + if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) { /* Clear the IP header h/w cksum flag */ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; } else if (!mctl_present) { @@ -13159,7 +12841,7 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, * Revert to software checksum calculation if the interface * isn't capable of checksum offload or if IPsec is present. */ - if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) + if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum) hck_flags = DB_CKSUMFLAGS(mp); if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) @@ -13386,7 +13068,7 @@ ipoptions: u1 = ntohs(ipha->ipha_fragment_offset_and_flags); if (u1 & (IPH_MF | IPH_OFFSET)) { fragmented: - if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { + if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) { if (mctl_present) freeb(first_mp); goto slow_done; @@ -13530,7 +13212,7 @@ ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, goto ipoptions; } else { /* Check the IP header checksum. */ - if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, ill) && + if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill) && !mctl_present) { #define uph ((uint16_t *)ipha) sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + @@ -13644,7 +13326,7 @@ ipoptions: u1 = ntohs(ipha->ipha_fragment_offset_and_flags); if (u1 & (IPH_MF | IPH_OFFSET)) { fragmented: - if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) + if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) goto slow_done; /* * Make sure that first_mp points back to mp as @@ -13877,6 +13559,11 @@ ip_check_and_align_header(queue_t *q, mblk_t *mp, ip_stack_t *ipst) return (B_TRUE); } +/* + * Handle the situation where a packet came in on `ill' but matched an IRE + * whose ire_rfq doesn't match `ill'. We return the IRE that should be used + * for interface statistics. 
+ */ ire_t * ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) { @@ -13887,16 +13574,22 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) boolean_t strict_check = B_FALSE; /* - * This packet came in on an interface other than the one associated - * with the first ire we found for the destination address. We do - * another ire lookup here, using the ingress ill, to see if the - * interface is in an interface group. + * IPMP common case: if IRE and ILL are in the same group, there's no + * issue (e.g. packet received on an underlying interface matched an + * IRE_LOCAL on its associated group interface). + */ + if (ire->ire_rfq != NULL && + IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) { + return (ire); + } + + /* + * Do another ire lookup here, using the ingress ill, to see if the + * interface is in a usesrc group. * As long as the ills belong to the same group, we don't consider * them to be arriving on the wrong interface. Thus, if the switch * is doing inbound load spreading, we won't drop packets when the - * ip*_strict_dst_multihoming switch is on. Note, the same holds true - * for 'usesrc groups' where the destination address may belong to - * another interface to allow multipathing to happen. + * ip*_strict_dst_multihoming switch is on. * We also need to check for IPIF_UNNUMBERED point2point interfaces * where the local address may not be unique. 
In this case we were * at the mercy of the initial ire cache lookup and the IRE_LOCAL it @@ -13910,18 +13603,18 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) strict_check = B_TRUE; new_ire = ire_ctable_lookup(*((ipaddr_t *)addr), 0, IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst); + (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst); } else { ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr)); if (ipst->ips_ipv6_strict_dst_multihoming) strict_check = B_TRUE; new_ire = ire_ctable_lookup_v6((in6_addr_t *)addr, NULL, IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst); + (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst); } /* * If the same ire that was returned in ip_input() is found then this - * is an indication that interface groups are in use. The packet + * is an indication that usesrc groups are in use. The packet * arrived on a different ill in the group than the one associated with * the destination address. If a different ire was found then the same * IP address must be hosted on multiple ills. This is possible with @@ -14075,11 +13768,10 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) /* * Forwarding fastpath exception case: - * If either of the follwoing case is true, we take - * the slowpath + * If any of the following are true, we take the slowpath: * o forwarding is not enabled - * o incoming and outgoing interface are the same, or the same - * IPMP group + * o incoming and outgoing interface are the same, or in the same + * IPMP group. 
* o corresponding ire is in incomplete state * o packet needs fragmentation * o ARP cache is not resolved @@ -14090,8 +13782,7 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) pkt_len = ntohs(ipha->ipha_length); stq_ill = (ill_t *)ire->ire_stq->q_ptr; if (!(stq_ill->ill_flags & ILLF_ROUTER) || - (ill == stq_ill) || - (ill->ill_group != NULL && ill->ill_group == stq_ill->ill_group) || + (ill == stq_ill) || IS_IN_SAME_ILLGRP(ill, stq_ill) || (ire->ire_nce == NULL) || (pkt_len > ire->ire_max_frag) || ((fpmp = ire->ire_nce->nce_fp_mp) == NULL) || @@ -14185,11 +13876,10 @@ static void ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, ill_t *ill, boolean_t ll_multicast, boolean_t from_ip_fast_forward) { - ill_group_t *ill_group; - ill_group_t *ire_group; queue_t *dev_q; ire_t *src_ire; ip_stack_t *ipst = ill->ill_ipst; + boolean_t same_illgrp = B_FALSE; ASSERT(ire->ire_stq != NULL); @@ -14200,11 +13890,8 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, * If the caller of this function is ip_fast_forward() skip the * next three checks as it does not apply. */ - if (from_ip_fast_forward) { - ill_group = ill->ill_group; - ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group; + if (from_ip_fast_forward) goto skip; - } if (ll_multicast != 0) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); @@ -14230,13 +13917,10 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, goto drop_pkt; } - ill_group = ill->ill_group; - ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group; /* * Check if we want to forward this one at this time. * We allow source routed packets on a host provided that - * they go out the same interface or same interface group - * as they came in on. + * they go out the same ill or illgrp as they came in on. 
* * XXX To be quicker, we may wish to not chase pointers to * get the ILLF_ROUTER flag and instead store the @@ -14245,11 +13929,12 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, * whenever the ILLF_ROUTER flag changes. */ skip: + same_illgrp = IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr); + if (((ill->ill_flags & - ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & - ILLF_ROUTER) == 0) && - !(ip_source_routed(ipha, ipst) && (ire->ire_rfq == q || - (ill_group != NULL && ill_group == ire_group)))) { + ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_ROUTER) == 0) && + !(ip_source_routed(ipha, ipst) && + (ire->ire_rfq == q || same_illgrp))) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); if (ip_source_routed(ipha, ipst)) { q = WR(q); @@ -14290,12 +13975,10 @@ skip: ire_t *nhop_ire = NULL; /* - * Check whether ire_rfq and q are from the same ill - * or if they are not same, they at least belong - * to the same group. If so, send redirects. + * Check whether ire_rfq and q are from the same ill or illgrp. + * If so, send redirects. */ - if ((ire->ire_rfq == q || - (ill_group != NULL && ill_group == ire_group)) && + if ((ire->ire_rfq == q || same_illgrp) && !ip_source_routed(ipha, ipst)) { nhop = (ire->ire_gateway_addr != 0 ? @@ -14396,26 +14079,15 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha, } /* * For multicast we have set dst to be INADDR_BROADCAST - * for delivering to all STREAMS. IRE_MARK_NORECV is really - * only for broadcast packets. + * for delivering to all STREAMS. */ if (!CLASSD(ipha->ipha_dst)) { ire_t *new_ire; ipif_t *ipif; - /* - * For ill groups, as the switch duplicates broadcasts - * across all the ports, we need to filter out and - * send up only one copy. There is one copy for every - * broadcast address on each ill. Thus, we look for a - * specific IRE on this ill and look at IRE_MARK_NORECV - * later to see whether this ill is eligible to receive - * them or not. 
ill_nominate_bcast_rcv() nominates only - * one set of IREs for receiving. - */ ipif = ipif_get_next_ipif(NULL, ill); if (ipif == NULL) { - ire_refrele(ire); +discard: ire_refrele(ire); freemsg(mp); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); return (NULL); @@ -14425,13 +14097,17 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha, ipif_refrele(ipif); if (new_ire != NULL) { - if (new_ire->ire_marks & IRE_MARK_NORECV) { - ire_refrele(ire); + /* + * If the matching IRE_BROADCAST is part of an IPMP + * group, then drop the packet unless our ill has been + * nominated to receive for the group. + */ + if (IS_IPMP(new_ire->ire_ipif->ipif_ill) && + new_ire->ire_rfq != q) { ire_refrele(new_ire); - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (NULL); + goto discard; } + /* * In the special case of multirouted broadcast * packets, we unconditionally need to "gateway" @@ -14571,6 +14247,13 @@ ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, ntohs(ipha->ipha_length)); /* + * So that we don't end up with dups, only one ill an IPMP group is + * nominated to receive multicast traffic. + */ + if (IS_UNDER_IPMP(ill) && !ill->ill_nom_cast) + goto drop_pkt; + + /* * Forward packets only if we have joined the allmulti * group on this interface. */ @@ -14619,18 +14302,15 @@ ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, } } - ILM_WALKER_HOLD(ill); if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) { /* * This might just be caused by the fact that * multiple IP Multicast addresses map to the same * link layer multicast - no need to increment counter! */ - ILM_WALKER_RELE(ill); freemsg(mp); return (B_TRUE); } - ILM_WALKER_RELE(ill); done: ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp))); /* @@ -15498,8 +15178,8 @@ local: * broadcast ire. 
*/ if ((ire->ire_rfq != q) && (ire->ire_type != IRE_BROADCAST)) { - if ((ire = ip_check_multihome(&ipha->ipha_dst, ire, - ill)) == NULL) { + ire = ip_check_multihome(&ipha->ipha_dst, ire, ill); + if (ire == NULL) { /* Drop packet */ BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); @@ -15935,19 +15615,12 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ip1dbg(("ip_rput_dlpi_writer ..")); ill = (ill_t *)q->q_ptr; - ASSERT(ipsq == ill->ill_phyint->phyint_ipsq); - + ASSERT(ipsq->ipsq_xop == ill->ill_phyint->phyint_ipsq->ipsq_xop); ASSERT(IAM_WRITER_ILL(ill)); ipst = ill->ill_ipst; - /* - * ipsq_pending_mp and ipsq_pending_ipif track each other. i.e. - * both are null or non-null. However we can assert that only - * after grabbing the ipsq_lock. So we don't make any assertion - * here and in other places in the code. - */ - ipif = ipsq->ipsq_pending_ipif; + ipif = ipsq->ipsq_xop->ipx_pending_ipif; /* * The current ioctl could have been aborted by the user and a new * ioctl to bring up another ill could have started. We could still @@ -16045,9 +15718,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) */ ASSERT(connp != NULL); q = CONNP_TO_WQ(connp); - if (ill->ill_move_in_progress) { - ILL_CLEAR_MOVE(ill); - } (void) ipif_down(ipif, NULL, NULL); /* error is set below the switch */ } @@ -16196,45 +15866,31 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * ill_dl_up(), which stopped ipif_up()'s processing. */ if (ill->ill_isv6) { - /* - * v6 interfaces. - * Unlike ARP which has to do another bind - * and attach, once we get here we are - * done with NDP. Except in the case of - * ILLF_XRESOLV, in which case we send an - * AR_INTERFACE_UP to the external resolver. - * If all goes well, the ioctl will complete - * in ip_rput(). If there's an error, we - * complete it here. 
- */ - if ((err = ipif_ndp_up(ipif)) == 0) { - if (ill->ill_flags & ILLF_XRESOLV) { - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - success = ipsq_pending_mp_add( - connp, ipif, q, mp1, 0); - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - if (success) { - err = ipif_resolver_up(ipif, - Res_act_initial); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - ASSERT(err != 0); - mp1 = ipsq_pending_mp_get(ipsq, - &connp); - ASSERT(mp1 != NULL); - } else { - /* conn has started closing */ - err = EINTR; - } - } else { /* Non XRESOLV interface */ - (void) ipif_resolver_up(ipif, + if (ill->ill_flags & ILLF_XRESOLV) { + mutex_enter(&connp->conn_lock); + mutex_enter(&ill->ill_lock); + success = ipsq_pending_mp_add(connp, ipif, q, + mp1, 0); + mutex_exit(&ill->ill_lock); + mutex_exit(&connp->conn_lock); + if (success) { + err = ipif_resolver_up(ipif, Res_act_initial); - err = ipif_up_done_v6(ipif); + if (err == EINPROGRESS) { + freemsg(mp); + return; + } + ASSERT(err != 0); + mp1 = ipsq_pending_mp_get(ipsq, &connp); + ASSERT(mp1 != NULL); + } else { + /* conn has started closing */ + err = EINTR; } + } else { /* Non XRESOLV interface */ + (void) ipif_resolver_up(ipif, Res_act_initial); + if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0) + err = ipif_up_done_v6(ipif); } } else if (ill->ill_net_type == IRE_IF_RESOLVER) { /* @@ -16275,14 +15931,31 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } } - if (ill->ill_up_ipifs) { - ill_group_cleanup(ill); + /* + * If we have a moved ipif to bring up, and everything has + * succeeded to this point, bring it up on the IPMP ill. + * Otherwise, leave it down -- the admin can try to bring it + * up by hand if need be. 
+ */ + if (ill->ill_move_ipif != NULL) { + if (err != 0) { + ill->ill_move_ipif = NULL; + } else { + ipif = ill->ill_move_ipif; + ill->ill_move_ipif = NULL; + err = ipif_up(ipif, q, mp1); + if (err == EINPROGRESS) { + freemsg(mp); + return; + } + } } - break; + case DL_NOTIFY_IND: { dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr; ire_t *ire; + uint_t orig_mtu; boolean_t need_ire_walk_v4 = B_FALSE; boolean_t need_ire_walk_v6 = B_FALSE; @@ -16322,17 +15995,27 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * which it is being derived. */ mutex_enter(&ill->ill_lock); + + orig_mtu = ill->ill_max_mtu; ill->ill_max_frag = (uint_t)notify->dl_data; + ill->ill_max_mtu = (uint_t)notify->dl_data; + + /* + * If ill_user_mtu was set (via SIOCSLIFLNKINFO), + * clamp ill_max_mtu at it. + */ + if (ill->ill_user_mtu != 0 && + ill->ill_user_mtu < ill->ill_max_mtu) + ill->ill_max_mtu = ill->ill_user_mtu; /* - * If an SIOCSLIFLNKINFO has changed the ill_max_mtu - * leave it alone + * If the MTU is unchanged, we're done. */ - if (ill->ill_mtu_userspecified) { + if (orig_mtu == ill->ill_max_mtu) { mutex_exit(&ill->ill_lock); break; } - ill->ill_max_mtu = ill->ill_max_frag; + if (ill->ill_isv6) { if (ill->ill_max_mtu < IPV6_MIN_MTU) ill->ill_max_mtu = IPV6_MIN_MTU; @@ -16371,7 +16054,14 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) if (need_ire_walk_v6) ire_walk_v6(ill_mtu_change, (char *)ill, ALL_ZONES, ipst); + + /* + * Refresh IPMP meta-interface MTU if necessary. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_illgrp_refresh_mtu(ill->ill_grp); break; + case DL_NOTE_LINK_UP: case DL_NOTE_LINK_DOWN: { /* @@ -16385,9 +16075,17 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) went_up = notify->dl_notification == DL_NOTE_LINK_UP; mutex_enter(&phyint->phyint_lock); + new_phyint_flags = went_up ? 
phyint->phyint_flags | PHYI_RUNNING : phyint->phyint_flags & ~PHYI_RUNNING; + + if (IS_IPMP(ill)) { + new_phyint_flags = went_up ? + new_phyint_flags & ~PHYI_FAILED : + new_phyint_flags | PHYI_FAILED; + } + if (new_phyint_flags != phyint->phyint_flags) { phyint->phyint_flags = new_phyint_flags; changed = B_TRUE; @@ -16474,7 +16172,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * is invoked from an ill queue, conn_oper_pending_ill is not * available, but we know the ioctl is pending on ill_wq.) */ - uint_t paddrlen, paddroff; + uint_t paddrlen, paddroff; paddrreq = ill->ill_phys_addr_pend; paddrlen = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_length; @@ -16592,29 +16290,59 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } freemsg(mp); - if (mp1 != NULL) { + if (mp1 == NULL) + return; + + /* + * The operation must complete without EINPROGRESS since + * ipsq_pending_mp_get() has removed the mblk (mp1). Otherwise, + * the operation will be stuck forever inside the IPSQ. + */ + ASSERT(err != EINPROGRESS); + + switch (ipsq->ipsq_xop->ipx_current_ioctl) { + case 0: + ipsq_current_finish(ipsq); + break; + + case SIOCSLIFNAME: + case IF_UNITSEL: { + ill_t *ill_other = ILL_OTHER(ill); + /* - * The operation must complete without EINPROGRESS - * since ipsq_pending_mp_get() has removed the mblk - * from ipsq_pending_mp. Otherwise, the operation - * will be stuck forever in the ipsq. + * If SIOCSLIFNAME or IF_UNITSEL is about to succeed, and the + * ill has a peer which is in an IPMP group, then place ill + * into the same group. One catch: although ifconfig plumbs + * the appropriate IPMP meta-interface prior to plumbing this + * ill, it is possible for multiple ifconfig applications to + * race (or for another application to adjust plumbing), in + * which case the IPMP meta-interface we need will be missing. + * If so, kick the phyint out of the group. 
*/ - ASSERT(err != EINPROGRESS); + if (err == 0 && ill_other != NULL && IS_UNDER_IPMP(ill_other)) { + ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; + ipmp_illgrp_t *illg; - switch (ipsq->ipsq_current_ioctl) { - case 0: - ipsq_current_finish(ipsq); - break; + illg = ill->ill_isv6 ? grp->gr_v6 : grp->gr_v4; + if (illg == NULL) + ipmp_phyint_leave_grp(ill->ill_phyint); + else + ipmp_ill_join_illgrp(ill, illg); + } - case SIOCLIFADDIF: - case SIOCSLIFNAME: + if (ipsq->ipsq_xop->ipx_current_ioctl == IF_UNITSEL) + ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); + else ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq); - break; + break; + } + case SIOCLIFADDIF: + ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq); + break; - default: - ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); - break; - } + default: + ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); + break; } } @@ -16626,20 +16354,16 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) void ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) { - ill_t *ill; + ill_t *ill = q->q_ptr; struct iocblk *iocp; mblk_t *mp1; conn_t *connp = NULL; ip1dbg(("ip_rput_other ")); - ill = (ill_t *)q->q_ptr; - /* - * This routine is not a writer in the case of SIOCGTUNPARAM - * in which case ipsq is NULL. - */ if (ipsq != NULL) { ASSERT(IAM_WRITER_IPSQ(ipsq)); - ASSERT(ipsq == ill->ill_phyint->phyint_ipsq); + ASSERT(ipsq->ipsq_xop == + ill->ill_phyint->phyint_ipsq->ipsq_xop); } switch (mp->b_datap->db_type) { @@ -16752,7 +16476,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) case DL_IOC_HDR_INFO: /* - * If this was the first attempt turn of the + * If this was the first attempt, turn off the * fastpath probing. 
*/ mutex_enter(&ill->ill_lock); @@ -16768,7 +16492,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } freemsg(mp); break; - case SIOCSTUNPARAM: + case SIOCSTUNPARAM: case OSIOCSTUNPARAM: ASSERT(ipsq != NULL); /* @@ -17017,14 +16741,13 @@ ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif) /* * Find an IRE which matches the destination and the outgoing * queue in the cache table. All we need is an IRE_CACHE which - * is pointing at ipif->ipif_ill. If it is part of some ill group, - * then it is enough to have some IRE_CACHE in the group. + * is pointing at ipif->ipif_ill. */ if (ipif->ipif_flags & IPIF_POINTOPOINT) dst = ipif->ipif_pp_dst_addr; ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, MBLK_GETLABEL(mp), - MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR, ipst); + MATCH_IRE_ILL | MATCH_IRE_SECATTR, ipst); if (ire == NULL) { /* * Mark this packet to make it be delivered to @@ -17321,7 +17044,8 @@ ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire) */ mp->b_datap->db_type = M_DATA; icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp, - ip6h, icmp6, ill, B_TRUE, ii->ipsec_in_zoneid); + ip6h, icmp6, ill, recv_ill, B_TRUE, + ii->ipsec_in_zoneid); } if (ill_need_rele) ill_refrele(ill); @@ -17357,37 +17081,36 @@ ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire) } switch (ipha->ipha_protocol) { - case IPPROTO_UDP: - ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire, - recv_ill); - if (ire_need_rele) - ire_refrele(ire); - break; - case IPPROTO_TCP: - if (!ire_need_rele) - IRE_REFHOLD(ire); - mp = ip_tcp_input(mp, ipha, ill, B_TRUE, - ire, ipsec_mp, 0, ill->ill_rq, NULL); - IRE_REFRELE(ire); - if (mp != NULL) { - - SQUEUE_ENTER(GET_SQUEUE(mp), mp, - mp, 1, SQ_PROCESS, - SQTAG_IP_PROTO_AGAIN); - } - break; - case IPPROTO_SCTP: - if (!ire_need_rele) - IRE_REFHOLD(ire); - ip_sctp_input(mp, ipha, ill, B_TRUE, ire, - ipsec_mp, 0, ill->ill_rq, dst); - break; - default: - 
ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire, - recv_ill, 0); - if (ire_need_rele) - ire_refrele(ire); - break; + case IPPROTO_UDP: + ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire, + recv_ill); + if (ire_need_rele) + ire_refrele(ire); + break; + case IPPROTO_TCP: + if (!ire_need_rele) + IRE_REFHOLD(ire); + mp = ip_tcp_input(mp, ipha, ill, B_TRUE, + ire, ipsec_mp, 0, ill->ill_rq, NULL); + IRE_REFRELE(ire); + if (mp != NULL) { + SQUEUE_ENTER(GET_SQUEUE(mp), mp, + mp, 1, SQ_PROCESS, + SQTAG_IP_PROTO_AGAIN); + } + break; + case IPPROTO_SCTP: + if (!ire_need_rele) + IRE_REFHOLD(ire); + ip_sctp_input(mp, ipha, ill, B_TRUE, ire, + ipsec_mp, 0, ill->ill_rq, dst); + break; + default: + ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire, + recv_ill, 0); + if (ire_need_rele) + ire_refrele(ire); + break; } } else { uint32_t rput_flags = 0; @@ -17621,9 +17344,9 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, */ ASSERT(!mctl_present); ASSERT(first_mp == mp); - if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { + if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) return; - } + /* * Make sure that first_mp points back to mp as * the mp we came in with could have changed in @@ -17647,17 +17370,10 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, ilm_t *ilm; mblk_t *mp1; zoneid_t last_zoneid; + ilm_walker_t ilw; if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(recv_ill)) { ASSERT(ire->ire_type == IRE_BROADCAST); - /* - * Inactive/Failed interfaces are not supposed to - * respond to the multicast packets. - */ - if (ill_is_probeonly(ill)) { - freemsg(first_mp); - return; - } /* * In the multicast case, applications may have joined @@ -17680,11 +17396,9 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, * have been exhausted. 
*/ last_zoneid = -1; - ILM_WALKER_HOLD(recv_ill); - for (ilm = recv_ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if ((ilm->ilm_flags & ILM_DELETED) || - ipha->ipha_dst != ilm->ilm_addr || + ilm = ilm_walker_start(&ilw, recv_ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (ipha->ipha_dst != ilm->ilm_addr || ilm->ilm_zoneid == last_zoneid || ilm->ilm_zoneid == ire->ire_zoneid || ilm->ilm_zoneid == ALL_ZONES || @@ -17693,12 +17407,12 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, mp1 = ip_copymsg(first_mp); if (mp1 == NULL) continue; - icmp_inbound(q, mp1, B_TRUE, ill, + icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill, 0, sum, mctl_present, B_TRUE, recv_ill, ilm->ilm_zoneid); last_zoneid = ilm->ilm_zoneid; } - ILM_WALKER_RELE(recv_ill); + ilm_walker_finish(&ilw); } else if (ire->ire_type == IRE_BROADCAST) { /* * In the broadcast case, there may be many zones @@ -18580,14 +18294,13 @@ ip_snmp_get(queue_t *q, mblk_t *mpctl, int level) return (1); } - if ((mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, ipst)) == NULL) { + mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, level, ipst); + if (mpctl == NULL) return (1); - } - mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, ipst); - if (mpctl == NULL) { + mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, level, ipst); + if (mpctl == NULL) return (1); - } if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) { return (1); @@ -19048,6 +18761,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mblk_t *mp_tail = NULL; ill_walk_context_t ctx; zoneid_t zoneid; + ilm_walker_t ilw; /* * make a copy of the original message @@ -19064,7 +18778,10 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - ILM_WALKER_HOLD(ill); + if (IS_UNDER_IPMP(ill)) + continue; + + ilm = ilm_walker_start(&ilw, 
ill); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (ipif->ipif_zoneid != zoneid && @@ -19074,7 +18791,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) OCTET_LENGTH); ipm.ipGroupMemberIfIndex.o_length = mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes); - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(ilm->ilm_ipif != NULL); ASSERT(ilm->ilm_ill == NULL); if (ilm->ilm_ipif != ipif) @@ -19090,7 +18807,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) } } } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } rw_exit(&ipst->ips_ill_g_lock); optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); @@ -19112,6 +18829,7 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mblk_t *mp_tail = NULL; ill_walk_context_t ctx; zoneid_t zoneid; + ilm_walker_t ilw; /* * make a copy of the original message @@ -19127,9 +18845,12 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - ILM_WALKER_HOLD(ill); + if (IS_UNDER_IPMP(ill)) + continue; + + ilm = ilm_walker_start(&ilw, ill); ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex; - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(ilm->ilm_ipif == NULL); ASSERT(ilm->ilm_ill != NULL); if (ilm->ilm_zoneid != zoneid) @@ -19145,7 +18866,7 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) (uint_t)sizeof (ipm6))); } } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } rw_exit(&ipst->ips_ill_g_lock); @@ -19171,6 +18892,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) zoneid_t zoneid; int i; slist_t *sl; + ilm_walker_t ilw; /* * make a copy of the original message @@ -19187,7 
+18909,10 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - ILM_WALKER_HOLD(ill); + if (IS_UNDER_IPMP(ill)) + continue; + + ilm = ilm_walker_start(&ilw, ill); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (ipif->ipif_zoneid != zoneid) @@ -19196,7 +18921,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) OCTET_LENGTH); ips.ipGroupSourceIfIndex.o_length = mi_strlen(ips.ipGroupSourceIfIndex.o_bytes); - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(ilm->ilm_ipif != NULL); ASSERT(ilm->ilm_ill == NULL); sl = ilm->ilm_filter; @@ -19220,7 +18945,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) } } } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } rw_exit(&ipst->ips_ill_g_lock); optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); @@ -19244,6 +18969,7 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) zoneid_t zoneid; int i; slist_t *sl; + ilm_walker_t ilw; /* * make a copy of the original message @@ -19259,9 +18985,12 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - ILM_WALKER_HOLD(ill); + if (IS_UNDER_IPMP(ill)) + continue; + + ilm = ilm_walker_start(&ilw, ill); ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex; - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(ilm->ilm_ipif == NULL); ASSERT(ilm->ilm_ill != NULL); sl = ilm->ilm_filter; @@ -19279,7 +19008,7 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) } } } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } 
rw_exit(&ipst->ips_ill_g_lock); @@ -19345,7 +19074,8 @@ ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) * in one IRE walk. */ static mblk_t * -ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) +ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level, + ip_stack_t *ipst) { struct opthdr *optp; mblk_t *mp2ctl; /* Returned */ @@ -19377,6 +19107,14 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) ird.ird_route.lp_head = mpctl->b_cont; ird.ird_netmedia.lp_head = mp3ctl->b_cont; ird.ird_attrs.lp_head = mp4ctl->b_cont; + /* + * If the level has been set the special EXPER_IP_AND_TESTHIDDEN + * value, then also include IRE_MARK_TESTHIDDEN IREs. This is + * intended a temporary solution until a proper MIB API is provided + * that provides complete filtering/caller-opt-in. + */ + if (level == EXPER_IP_AND_TESTHIDDEN) + ird.ird_flags |= IRD_REPORT_TESTHIDDEN; zoneid = Q_TO_CONN(q)->conn_zoneid; ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst); @@ -19419,7 +19157,8 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) * ipv6NetToMediaEntryTable in an NDP walk. */ static mblk_t * -ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) +ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level, + ip_stack_t *ipst) { struct opthdr *optp; mblk_t *mp2ctl; /* Returned */ @@ -19451,6 +19190,14 @@ ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) ird.ird_route.lp_head = mpctl->b_cont; ird.ird_netmedia.lp_head = mp3ctl->b_cont; ird.ird_attrs.lp_head = mp4ctl->b_cont; + /* + * If the level has been set the special EXPER_IP_AND_TESTHIDDEN + * value, then also include IRE_MARK_TESTHIDDEN IREs. This is + * intended a temporary solution until a proper MIB API is provided + * that provides complete filtering/caller-opt-in. 
+ */ + if (level == EXPER_IP_AND_TESTHIDDEN) + ird.ird_flags |= IRD_REPORT_TESTHIDDEN; zoneid = Q_TO_CONN(q)->conn_zoneid; ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst); @@ -19671,6 +19418,11 @@ ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) ASSERT(ire->ire_ipversion == IPV4_VERSION); + if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) && + ire->ire_marks & IRE_MARK_TESTHIDDEN) { + return; + } + if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) return; @@ -19812,6 +19564,11 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) ASSERT(ire->ire_ipversion == IPV6_VERSION); + if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) && + ire->ire_marks & IRE_MARK_TESTHIDDEN) { + return; + } + if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) return; @@ -20518,8 +20275,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, boolean_t mctl_present; ipsec_out_t *io; int match_flags; - ill_t *attach_ill = NULL; - /* Bind to IPIF_NOFAILOVER ill etc. */ ill_t *xmit_ill = NULL; /* IP_PKTINFO etc. */ ipif_t *dst_ipif; boolean_t multirt_need_resolve = B_FALSE; @@ -20639,16 +20394,11 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, } /* - * IP_DONTFAILOVER_IF and IP_BOUND_IF have precedence over ill index - * passed in IP_PKTINFO. + * IP_BOUND_IF has precedence over the ill index passed in IP_PKTINFO. */ - if (infop->ip_opt_ill_index != 0 && - connp->conn_outgoing_ill == NULL && - connp->conn_nofailover_ill == NULL) { - - xmit_ill = ill_lookup_on_ifindex( - infop->ip_opt_ill_index, B_FALSE, NULL, NULL, NULL, NULL, - ipst); + if (infop->ip_opt_ill_index != 0 && connp->conn_outgoing_ill == NULL) { + xmit_ill = ill_lookup_on_ifindex(infop->ip_opt_ill_index, + B_FALSE, NULL, NULL, NULL, NULL, ipst); if (xmit_ill == NULL || IS_VNI(xmit_ill)) goto drop_pkt; @@ -20659,7 +20409,7 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, * accessible from all zones i.e has a valid ipif in * all zones. 
*/ - if (!ipif_lookup_zoneid_group(xmit_ill, zoneid, 0, NULL)) { + if (!ipif_lookup_zoneid(xmit_ill, zoneid, 0, NULL)) { goto drop_pkt; } } @@ -20696,18 +20446,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, goto version_hdrlen_check; dst = ipha->ipha_dst; - if (connp->conn_nofailover_ill != NULL) { - attach_ill = conn_get_held_ill(connp, - &connp->conn_nofailover_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - } - /* If IP_BOUND_IF has been set, use that ill. */ if (connp->conn_outgoing_ill != NULL) { xmit_ill = conn_get_held_ill(connp, @@ -20761,9 +20499,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, ire = NULL; } - if (attach_ill != NULL) - goto send_from_ill; - /* * We cache IRE_CACHEs to avoid lookups. We don't do * this for the tcp global queue and listen end point @@ -21074,45 +20809,21 @@ notdata: } ASSERT(first_mp != NULL); - /* - * ICMP echo replies attach an ipsec_out and set ipsec_out_attach_if - * to make sure that this packet goes out on the same interface it - * came in. We handle that here. - */ - if (mctl_present) { - uint_t ifindex; + if (mctl_present) { io = (ipsec_out_t *)first_mp->b_rptr; - if (io->ipsec_out_attach_if || io->ipsec_out_ip_nexthop) { + if (io->ipsec_out_ip_nexthop) { /* * We may have lost the conn context if we are * coming here from ip_newroute(). Copy the * nexthop information. 
*/ - if (io->ipsec_out_ip_nexthop) { - ip_nexthop = B_TRUE; - nexthop_addr = io->ipsec_out_nexthop_addr; + ip_nexthop = B_TRUE; + nexthop_addr = io->ipsec_out_nexthop_addr; - ipha = (ipha_t *)mp->b_rptr; - dst = ipha->ipha_dst; - goto send_from_ill; - } else { - ASSERT(io->ipsec_out_ill_index != 0); - ifindex = io->ipsec_out_ill_index; - attach_ill = ill_lookup_on_ifindex(ifindex, - B_FALSE, NULL, NULL, NULL, NULL, ipst); - if (attach_ill == NULL) { - ASSERT(xmit_ill == NULL); - ip1dbg(("ip_output: bad ifindex for " - "(BIND TO IPIF_NOFAILOVER) %d\n", - ifindex)); - freemsg(first_mp); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - ASSERT(!need_decref); - return; - } - } + ipha = (ipha_t *)mp->b_rptr; + dst = ipha->ipha_dst; + goto send_from_ill; } } @@ -21161,7 +20872,7 @@ hdrtoosmall: ipha = (ipha_t *)mp->b_rptr; if (first_mp == NULL) { - ASSERT(attach_ill == NULL && xmit_ill == NULL); + ASSERT(xmit_ill == NULL); /* * If we got here because of "goto hdrtoosmall" * We need to attach a IPSEC_OUT. @@ -21213,8 +20924,6 @@ version_hdrlen_check: */ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutWrongIPVersion); ASSERT(xmit_ill == NULL); - if (attach_ill != NULL) - ill_refrele(attach_ill); if (need_decref) mp->b_flag |= MSGHASREF; (void) ip_output_v6(arg, first_mp, arg2, caller); @@ -21255,8 +20964,6 @@ version_hdrlen_check: zoneid, ipst)) { ASSERT(xmit_ill == NULL); BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - if (attach_ill != NULL) - ill_refrele(attach_ill); TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, "ip_wput_end: q %p (%S)", q, "badopts"); if (need_decref) @@ -21295,22 +21002,6 @@ multicast: */ ill_t *ill = (ill_t *)q->q_ptr; - /* - * Don't honor attach_if for this case. If ill - * is part of the group, ipif could belong to - * any ill and we cannot maintain attach_ill - * and ipif_ill same anymore and the assert - * below would fail. 
- */ - if (mctl_present && io->ipsec_out_attach_if) { - io->ipsec_out_ill_index = 0; - io->ipsec_out_attach_if = B_FALSE; - ASSERT(attach_ill != NULL); - ill_refrele(attach_ill); - attach_ill = NULL; - } - - ASSERT(attach_ill == NULL); ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID); if (ipif == NULL) { if (need_decref) @@ -21429,25 +21120,11 @@ multicast: first_mp->b_cont = mp; mctl_present = B_TRUE; } - if (attach_ill != NULL) { - ASSERT(attach_ill == ipif->ipif_ill); - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(attach_ill)) { - match_flags |= MATCH_IRE_MARK_HIDDEN; - } - io->ipsec_out_ill_index = - attach_ill->ill_phyint->phyint_ifindex; - io->ipsec_out_attach_if = B_TRUE; - } else { - match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; - io->ipsec_out_ill_index = - ipif->ipif_ill->ill_phyint->phyint_ifindex; - } + match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; + io->ipsec_out_ill_index = + ipif->ipif_ill->ill_phyint->phyint_ifindex; + if (connp != NULL) { io->ipsec_out_multicast_loop = connp->conn_multicast_loop; @@ -21469,9 +21146,7 @@ multicast: * * NOTE : We need to do it for non-secure case also as * this might go out secure if there is a global policy - * match in ip_wput_ire. For bind to IPIF_NOFAILOVER - * address, the source should be initialized already and - * hence we won't be initializing here. + * match in ip_wput_ire. * * As we do not have the ire yet, it is possible that * we set the source address here and then later discover @@ -21507,14 +21182,6 @@ multicast: zoneid, MBLK_GETLABEL(mp), match_flags, ipst); } - /* - * refrele attach_ill as its not needed anymore. 
- */ - if (attach_ill != NULL) { - ill_refrele(attach_ill); - attach_ill = NULL; - } - if (ire == NULL) { /* * Multicast loopback and multicast forwarding is @@ -21630,33 +21297,9 @@ noroute: ipif_refrele(dst_ipif); } } - /* - * If we are bound to IPIF_NOFAILOVER address, look for - * an IRE_CACHE matching the ill. - */ -send_from_ill: - if (attach_ill != NULL) { - ipif_t *attach_ipif; - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(attach_ill)) { - match_flags |= MATCH_IRE_MARK_HIDDEN; - } - - attach_ipif = ipif_get_next_ipif(NULL, attach_ill); - if (attach_ipif == NULL) { - ip1dbg(("ip_wput: No ipif for attach_ill\n")); - goto discard_pkt; - } - ire = ire_ctable_lookup(dst, 0, 0, attach_ipif, - zoneid, MBLK_GETLABEL(mp), match_flags, ipst); - ipif_refrele(attach_ipif); - } else if (xmit_ill != NULL) { +send_from_ill: + if (xmit_ill != NULL) { ipif_t *ipif; /* @@ -21681,6 +21324,10 @@ send_from_ill: goto drop_pkt; } + match_flags = 0; + if (IS_UNDER_IPMP(xmit_ill)) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + /* * Look for a ire that is part of the group, * if found use it else call ip_newroute_ipif. @@ -21689,7 +21336,7 @@ send_from_ill: * ill is accessible from all zones i.e has a * valid ipif in all zones. */ - match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; + match_flags |= MATCH_IRE_ILL | MATCH_IRE_SECATTR; ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, MBLK_GETLABEL(mp), match_flags, ipst); /* @@ -21729,12 +21376,7 @@ send_from_ill: ipst); } if (!ire) { - /* - * Make sure we don't load spread if this - * is IPIF_NOFAILOVER case. 
- */ - if ((attach_ill != NULL) || - (ip_nexthop && !ignore_nexthop)) { + if (ip_nexthop && !ignore_nexthop) { if (mctl_present) { io = (ipsec_out_t *)first_mp->b_rptr; ASSERT(first_mp->b_datap->db_type == @@ -21764,15 +21406,8 @@ send_from_ill: first_mp->b_cont = mp; mctl_present = B_TRUE; } - if (attach_ill != NULL) { - io->ipsec_out_ill_index = attach_ill-> - ill_phyint->phyint_ifindex; - io->ipsec_out_attach_if = B_TRUE; - } else { - io->ipsec_out_ip_nexthop = ip_nexthop; - io->ipsec_out_nexthop_addr = - nexthop_addr; - } + io->ipsec_out_ip_nexthop = ip_nexthop; + io->ipsec_out_nexthop_addr = nexthop_addr; } noirefound: /* @@ -21787,8 +21422,6 @@ noirefound: ip_newroute(q, first_mp, dst, connp, zoneid, ipst); TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, "ip_wput_end: q %p (%S)", q, "newroute"); - if (attach_ill != NULL) - ill_refrele(attach_ill); if (xmit_ill != NULL) ill_refrele(xmit_ill); if (need_decref) @@ -21869,8 +21502,6 @@ noirefound: ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); } } - if (attach_ill != NULL) - ill_refrele(attach_ill); if (xmit_ill != NULL) ill_refrele(xmit_ill); if (need_decref) @@ -21896,8 +21527,6 @@ drop_pkt: if (need_decref) CONN_DEC_REF(connp); freemsg(first_mp); - if (attach_ill != NULL) - ill_refrele(attach_ill); if (xmit_ill != NULL) ill_refrele(xmit_ill); TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, @@ -21923,8 +21552,8 @@ ip_wput(queue_t *q, mblk_t *mp) /* * * The following rules must be observed when accessing any ipif or ill - * that has been cached in the conn. Typically conn_nofailover_ill, - * conn_outgoing_ill, conn_multicast_ipif and conn_multicast_ill. + * that has been cached in the conn. Typically conn_outgoing_ill, + * conn_multicast_ipif and conn_multicast_ill. 
* * Access: The ipif or ill pointed to from the conn can be accessed under * the protection of the conn_lock or after it has been refheld under the @@ -21944,10 +21573,8 @@ ip_wput(queue_t *q, mblk_t *mp) * The list of ipifs hanging off the ill is protected by ill_g_lock and ill_lock * On the other hand to access ipif->ipif_ill, we need one of either ill_g_lock * or a reference to the ipif or a reference to an ire that references the - * ipif. An ipif does not change its ill except for failover/failback. Since - * failover/failback happens only after bringing down the ipif and making sure - * the ipif refcnt has gone to zero and holding the ill_g_lock and ill_lock - * the above holds. + * ipif. An ipif only changes its ill when migrating from an underlying ill + * to an IPMP ill in ipif_up(). */ ipif_t * conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err) @@ -22302,96 +21929,6 @@ ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, zoneid)); } -ire_t * -conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill) -{ - ipaddr_t addr; - ire_t *save_ire; - irb_t *irb; - ill_group_t *illgrp; - int err; - - save_ire = ire; - addr = ire->ire_addr; - - ASSERT(ire->ire_type == IRE_BROADCAST); - - illgrp = connp->conn_outgoing_ill->ill_group; - if (illgrp == NULL) { - *conn_outgoing_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - ire_refrele(save_ire); - return (NULL); - } - return (save_ire); - } - /* - * If IP_BOUND_IF has been done, conn_outgoing_ill will be set. - * If it is part of the group, we need to send on the ire - * that has been cleared of IRE_MARK_NORECV and that belongs - * to this group. This is okay as IP_BOUND_IF really means - * any ill in the group. We depend on the fact that the - * first ire in the group is always cleared of IRE_MARK_NORECV - * if such an ire exists. 
This is possible only if you have - * at least one ill in the group that has not failed. - * - * First get to the ire that matches the address and group. - * - * We don't look for an ire with a matching zoneid because a given zone - * won't always have broadcast ires on all ills in the group. - */ - irb = ire->ire_bucket; - rw_enter(&irb->irb_lock, RW_READER); - if (ire->ire_marks & IRE_MARK_NORECV) { - /* - * If the current zone only has an ire broadcast for this - * address marked NORECV, the ire we want is ahead in the - * bucket, so we look it up deliberately ignoring the zoneid. - */ - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_addr != addr) - continue; - /* skip over deleted ires */ - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - } - } - while (ire != NULL) { - /* - * If a new interface is coming up, we could end up - * seeing the loopback ire and the non-loopback ire - * may not have been added yet. So check for ire_stq - */ - if (ire->ire_stq != NULL && (ire->ire_addr != addr || - ire->ire_ipif->ipif_ill->ill_group == illgrp)) { - break; - } - ire = ire->ire_next; - } - if (ire != NULL && ire->ire_addr == addr && - ire->ire_ipif->ipif_ill->ill_group == illgrp) { - IRE_REFHOLD(ire); - rw_exit(&irb->irb_lock); - ire_refrele(save_ire); - *conn_outgoing_ill = ire_to_ill(ire); - /* - * Refhold the ill to make the conn_outgoing_ill - * independent of the ire. ip_wput_ire goes in a loop - * and may refrele the ire. Since we have an ire at this - * point we don't need to use ILL_CAN_LOOKUP on the ill. - */ - ill_refhold(*conn_outgoing_ill); - return (ire); - } - rw_exit(&irb->irb_lock); - ip1dbg(("conn_set_outgoing_ill: No matching ire\n")); - /* - * If we can't find a suitable ire, return the original ire. - */ - return (save_ire); -} - /* * This function does the ire_refrele of the ire passed in as the * argument. 
As this function looks up more ires i.e broadcast ires, @@ -22401,7 +21938,6 @@ conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill) * IPQoS Notes: * IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for * IPsec packets are done in ipsec_out_process. - * */ void ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, @@ -22471,9 +22007,8 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, if ((first_ire->ire_flags & RTF_MULTIRT) && (first_ire->ire_addr == ire->ire_addr) && !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) break; - } } if ((first_ire != NULL) && (first_ire != ire)) { @@ -22489,36 +22024,15 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, * conn_outgoing_ill variable is used only in the broadcast loop. * for performance we don't grab the mutexs in the fastpath */ - if ((connp != NULL) && - (ire->ire_type == IRE_BROADCAST) && - ((connp->conn_nofailover_ill != NULL) || - (connp->conn_outgoing_ill != NULL))) { - /* - * Bind to IPIF_NOFAILOVER address overrides IP_BOUND_IF - * option. So, see if this endpoint is bound to a - * IPIF_NOFAILOVER address. If so, honor it. This implies - * that if the interface is failed, we will still send - * the packet on the same ill which is what we want. - */ + if (ire->ire_type == IRE_BROADCAST && connp != NULL && + connp->conn_outgoing_ill != NULL) { conn_outgoing_ill = conn_get_held_ill(connp, - &connp->conn_nofailover_ill, &err); + &connp->conn_outgoing_ill, &err); if (err == ILL_LOOKUP_FAILED) { ire_refrele(ire); freemsg(mp); return; } - if (conn_outgoing_ill == NULL) { - /* - * Choose a good ill in the group to send the - * packets on. 
- */ - ire = conn_set_outgoing_ill(connp, ire, - &conn_outgoing_ill); - if (ire == NULL) { - freemsg(mp); - return; - } - } } if (mp->b_datap->db_type != M_CTL) { @@ -22578,7 +22092,7 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, if (src_ire != NULL && !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) && (!ipst->ips_ip_restrict_interzone_loopback || - ire_local_same_ill_group(ire, src_ire))) { + ire_local_same_lan(ire, src_ire))) { if (ipha->ipha_src == INADDR_ANY && !unspec_src) ipha->ipha_src = src_ire->ire_src_addr; ire_refrele(src_ire); @@ -22741,39 +22255,7 @@ another:; */ ASSERT(ire->ire_ipversion == IPV4_VERSION); - /* - * With IP multipathing, broadcast packets are sent on the ire - * that has been cleared of IRE_MARK_NORECV and that belongs to - * the group. However, this ire might not be in the same zone so - * we can't always use its source address. We look for a - * broadcast ire in the same group and in the right zone. - */ - if (ire->ire_type == IRE_BROADCAST && - ire->ire_zoneid != zoneid) { - ire_t *src_ire = ire_ctable_lookup(dst, 0, - IRE_BROADCAST, ire->ire_ipif, zoneid, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst); - if (src_ire != NULL) { - src = src_ire->ire_src_addr; - ire_refrele(src_ire); - } else { - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - freemsg(first_mp); - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - } - return; - } - } else { - src = ire->ire_src_addr; - } - + src = ire->ire_src_addr; if (connp == NULL) { ip1dbg(("ip_wput_ire: no connp and no src " "address for dst 0x%x, using src 0x%x\n", @@ -22917,10 +22399,9 @@ another:; ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t)); io = (ipsec_out_t *)first_mp->b_rptr; - io->ipsec_out_ill_index = ((ill_t *)stq->q_ptr)-> - ill_phyint->phyint_ifindex; - - ipsec_out_process(q, first_mp, ire, ill_index); + 
io->ipsec_out_ill_index = + ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex; + ipsec_out_process(q, first_mp, ire, 0); ire_refrele(ire); if (conn_outgoing_ill != NULL) ill_refrele(conn_outgoing_ill); @@ -22960,7 +22441,7 @@ another:; if (ire1->ire_addr != ire->ire_addr) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) continue; /* Got one */ @@ -23147,71 +22628,16 @@ broadcast: * back outbound packets in different zones but on the * same ill, as the application would see duplicates. * - * If the interfaces are part of the same group, - * we would want to send only one copy out for - * whole group. - * * This logic assumes that ire_add_v4() groups the * IRE_BROADCAST entries so that those with the same - * ire_addr and ill_group are kept together. + * ire_addr are kept together. */ ire_ill = ire->ire_ipif->ipif_ill; - if (ire->ire_stq == NULL && ire1->ire_stq != NULL) { - if (ire_ill->ill_group != NULL && - (ire->ire_marks & IRE_MARK_NORECV)) { - /* - * If the current zone only has an ire - * broadcast for this address marked - * NORECV, the ire we want is ahead in - * the bucket, so we look it up - * deliberately ignoring the zoneid. 
- */ - for (ire1 = ire->ire_bucket->irb_ire; - ire1 != NULL; - ire1 = ire1->ire_next) { - ire1_ill = - ire1->ire_ipif->ipif_ill; - if (ire1->ire_addr != dst) - continue; - /* skip over the current ire */ - if (ire1 == ire) - continue; - /* skip over deleted ires */ - if (ire1->ire_marks & - IRE_MARK_CONDEMNED) - continue; - /* - * non-loopback ire in our - * group: use it for the next - * pass in the loop - */ - if (ire1->ire_stq != NULL && - ire1_ill->ill_group == - ire_ill->ill_group) - break; - } - } - } else { + if (ire->ire_stq != NULL || ire1->ire_stq == NULL) { while (ire1 != NULL && ire1->ire_addr == dst) { ire1_ill = ire1->ire_ipif->ipif_ill; - /* - * We can have two broadcast ires on the - * same ill in different zones; here - * we'll send a copy of the packet on - * each ill and the fanout code will - * call conn_wantpacket() to check that - * the zone has the broadcast address - * configured on the ill. If the two - * ires are in the same group we only - * send one copy up. - */ - if (ire1_ill != ire_ill && - (ire1_ill->ill_group == NULL || - ire_ill->ill_group == NULL || - ire1_ill->ill_group != - ire_ill->ill_group)) { + if (ire1_ill != ire_ill) break; - } ire1 = ire1->ire_next; } } @@ -23403,13 +22829,8 @@ multi_loopback: * logic. */ if (ill != NULL) { - ilm_t *ilm; - - ILM_WALKER_HOLD(ill); - ilm = ilm_lookup_ill(ill, ipha->ipha_dst, - ALL_ZONES); - ILM_WALKER_RELE(ill); - if (ilm != NULL) { + if (ilm_lookup_ill(ill, ipha->ipha_dst, + ALL_ZONES) != NULL) { /* * Pass along the virtual output q. 
* ip_wput_local() will distribute the @@ -23565,18 +22986,17 @@ checksumoptions: ire1 != NULL; ire1 = ire1->ire_next) { if (!(ire1->ire_flags & - RTF_MULTIRT)) { + RTF_MULTIRT)) continue; - } + if (ire1->ire_addr != - ire->ire_addr) { + ire->ire_addr) continue; - } + if (ire1->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) { + (IRE_MARK_CONDEMNED | + IRE_MARK_TESTHIDDEN)) continue; - } /* Got one */ IRE_REFHOLD(ire1); @@ -24743,9 +24163,8 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, if ((first_ire->ire_flags & RTF_MULTIRT) && (first_ire->ire_addr == ire->ire_addr) && !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) break; - } } if (first_ire != NULL) { @@ -24808,7 +24227,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, if (ire1->ire_addr != ire->ire_addr) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) continue; /* * Ensure we do not exceed the MTU @@ -25130,10 +24549,9 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, if (ire1->ire_addr != ire->ire_addr) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) { + (IRE_MARK_CONDEMNED | + IRE_MARK_TESTHIDDEN)) continue; - } /* * Ensure we do not exceed the MTU * of the next route. @@ -25500,6 +24918,7 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, ilm_t *ilm; mblk_t *mp1; zoneid_t last_zoneid; + ilm_walker_t ilw; if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(ill)) { ASSERT(ire_type == IRE_BROADCAST); @@ -25524,11 +24943,9 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, * have been exhausted. 
*/ last_zoneid = -1; - ILM_WALKER_HOLD(ill); - for (ilm = ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if ((ilm->ilm_flags & ILM_DELETED) || - ipha->ipha_dst != ilm->ilm_addr || + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (ipha->ipha_dst != ilm->ilm_addr || ilm->ilm_zoneid == last_zoneid || ilm->ilm_zoneid == zoneid || !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) @@ -25536,12 +24953,12 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, mp1 = ip_copymsg(first_mp); if (mp1 == NULL) continue; - icmp_inbound(q, mp1, B_TRUE, ill, 0, 0, - mctl_present, B_FALSE, ill, + icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill, + 0, 0, mctl_present, B_FALSE, ill, ilm->ilm_zoneid); last_zoneid = ilm->ilm_zoneid; } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); /* * Loopback case: the sending endpoint has * IP_MULTICAST_LOOP disabled, therefore we don't @@ -25859,14 +25276,9 @@ ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif, zoneid_t zoneid) * caller and hence matching on ILL (MATCH_IRE_ILL) would * be sufficient rather than MATCH_IRE_IPIF. * - * This function is used for sending IGMP packets. We need - * to make sure that we send the packet out of the interface - * (ipif->ipif_ill) where we joined the group. This is to - * prevent from switches doing IGMP snooping to send us multicast - * packets for a given group on the interface we have joined. - * If we can't find an ire, igmp_sendpkt has already initialized - * ipsec_out_attach_if so that this will not be load spread in - * ip_newroute_ipif. + * This function is used for sending IGMP packets. For IPMP, + * we sidestep IGMP snooping issues by sending all multicast + * traffic on a single interface in the IPMP group. 
*/ ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, NULL, MATCH_IRE_ILL, ipst); @@ -26035,7 +25447,7 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, ip6_t *ip6h1; uint_t ill_index; ipsec_out_t *io; - boolean_t attach_if, hwaccel; + boolean_t hwaccel; uint32_t flags = IP6_NO_IPPOLICY; int match_flags; zoneid_t zoneid; @@ -26052,42 +25464,22 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, if (io->ipsec_out_reachable) { flags |= IPV6_REACHABILITY_CONFIRMATION; } - attach_if = io->ipsec_out_attach_if; hwaccel = io->ipsec_out_accelerated; zoneid = io->ipsec_out_zoneid; ASSERT(zoneid != ALL_ZONES); - match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; + match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; /* Multicast addresses should have non-zero ill_index. */ v6dstp = &ip6h->ip6_dst; ASSERT(ip6h->ip6_nxt != IPPROTO_RAW); ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0); - ASSERT(!attach_if || ill_index != 0); - if (ill_index != 0) { - if (ill == NULL) { - ill = ip_grab_attach_ill(NULL, ipsec_mp, ill_index, - B_TRUE, ipst); - /* Failure case frees things for us. */ - if (ill == NULL) - return; - - ill_need_rele = B_TRUE; - } - /* - * If this packet needs to go out on a particular interface - * honor it. - */ - if (attach_if) { - match_flags = MATCH_IRE_ILL; + if (ill == NULL && ill_index != 0) { + ill = ip_grab_ill(ipsec_mp, ill_index, B_TRUE, ipst); + /* Failure case frees things for us. */ + if (ill == NULL) + return; - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. 
- */ - if (ill_is_probeonly(ill)) { - match_flags |= MATCH_IRE_MARK_HIDDEN; - } - } + ill_need_rele = B_TRUE; } ASSERT(mp != NULL); @@ -26138,32 +25530,15 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, return; } - ip_newroute_ipif_v6(q, ipsec_mp, ipif, *v6dstp, + ip_newroute_ipif_v6(q, ipsec_mp, ipif, v6dstp, &ip6h->ip6_src, unspec_src, zoneid); ipif_refrele(ipif); } else { - if (attach_if) { - ipif_t *ipif; - - ipif = ipif_get_next_ipif(NULL, ill); - if (ipif == NULL) { - if (ill_need_rele) - ill_refrele(ill); - freemsg(ipsec_mp); - return; - } - ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, - zoneid, MBLK_GETLABEL(mp), match_flags, ipst); - ire_need_rele = B_TRUE; - ipif_refrele(ipif); + if (ire_arg != NULL) { + ire = ire_arg; } else { - if (ire_arg != NULL) { - ire = ire_arg; - } else { - ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, - ipst); - ire_need_rele = B_TRUE; - } + ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, ipst); + ire_need_rele = B_TRUE; } if (ire != NULL) goto send; @@ -26350,7 +25725,6 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, ipha_t *ipha1; uint_t ill_index; ipsec_out_t *io; - boolean_t attach_if; int match_flags; irb_t *irb = NULL; boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE; @@ -26372,39 +25746,19 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, io = (ipsec_out_t *)ipsec_mp->b_rptr; ill_index = io->ipsec_out_ill_index; - attach_if = io->ipsec_out_attach_if; zoneid = io->ipsec_out_zoneid; ASSERT(zoneid != ALL_ZONES); ipst = io->ipsec_out_ns->netstack_ip; ASSERT(io->ipsec_out_ns != NULL); - match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; - if (ill_index != 0) { - if (ill == NULL) { - ill = ip_grab_attach_ill(NULL, ipsec_mp, - ill_index, B_FALSE, ipst); - - /* Failure case frees things for us. 
*/ - if (ill == NULL) - return; - - ill_need_rele = B_TRUE; - } - /* - * If this packet needs to go out on a particular interface - * honor it. - */ - if (attach_if) { - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; + match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; + if (ill == NULL && ill_index != 0) { + ill = ip_grab_ill(ipsec_mp, ill_index, B_FALSE, ipst); + /* Failure case frees things for us. */ + if (ill == NULL) + return; - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) { - match_flags |= MATCH_IRE_MARK_HIDDEN; - } - } + ill_need_rele = B_TRUE; } if (CLASSD(dst)) { @@ -26474,17 +25828,12 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT, zoneid, &zero_info); } else { - if (attach_if) { - ire = ire_ctable_lookup(dst, 0, 0, ill->ill_ipif, - zoneid, MBLK_GETLABEL(mp), match_flags, ipst); + if (ire_arg != NULL) { + ire = ire_arg; + ire_need_rele = B_FALSE; } else { - if (ire_arg != NULL) { - ire = ire_arg; - ire_need_rele = B_FALSE; - } else { - ire = ire_cache_lookup(dst, zoneid, - MBLK_GETLABEL(mp), ipst); - } + ire = ire_cache_lookup(dst, zoneid, + MBLK_GETLABEL(mp), ipst); } if (ire != NULL) { goto send; @@ -26613,11 +25962,9 @@ send: (void *)ire->ire_ipif, (void *)ipif)); /* - * Multiroute the secured packet, unless IPsec really - * requires the packet to go out only through a particular - * interface. + * Multiroute the secured packet. 
*/ - if ((ire->ire_flags & RTF_MULTIRT) && !attach_if) { + if (ire->ire_flags & RTF_MULTIRT) { ire_t *first_ire; irb = ire->ire_bucket; ASSERT(irb != NULL); @@ -26634,9 +25981,8 @@ send: if ((first_ire->ire_flags & RTF_MULTIRT) && (first_ire->ire_addr == ire->ire_addr) && !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) break; - } } if ((first_ire != NULL) && (first_ire != ire)) { @@ -26657,11 +26003,6 @@ send: multirt_send = B_TRUE; max_frag = ire->ire_max_frag; - } else { - if ((ire->ire_flags & RTF_MULTIRT) && attach_if) { - ip1dbg(("ip_wput_ipsec_out: ignoring multirouting " - "flag, attach_if %d\n", attach_if)); - } } /* @@ -26689,7 +26030,7 @@ send: if (ire1->ire_addr != ire->ire_addr) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) continue; /* No loopback here */ if (ire1->ire_stq == NULL) @@ -27155,10 +26496,8 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) * before sending it the accelerated packet. */ if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) { - int ifindex; ill = ire_to_ill(ire); - ifindex = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_capab_ill_index = ifindex; + io->ipsec_out_capab_ill_index = ill->ill_phyint->phyint_ifindex; } /* @@ -27284,17 +26623,18 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) } } /* - * We are done with IPsec processing. Send it over - * the wire. + * We are done with IPsec processing. Send it over the wire. 
*/ done: mp = ipsec_mp->b_cont; ipha = (ipha_t *)mp->b_rptr; if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - ip_wput_ipsec_out(q, ipsec_mp, ipha, ill, ire); + ip_wput_ipsec_out(q, ipsec_mp, ipha, ire->ire_ipif->ipif_ill, + ire); } else { ip6h = (ip6_t *)ipha; - ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ill, ire); + ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ire->ire_ipif->ipif_ill, + ire); } if (ill != NULL && ill_need_rele) ill_refrele(ill); @@ -27356,18 +26696,16 @@ ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ipip = ip_sioctl_lookup(iocp->ioc_cmd); if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) { /* - * Special case where ipsq_current_ipif is not set: + * Special case where ipx_current_ipif is not set: * ill_phyint_reinit merged the v4 and v6 into a single ipsq. - * ill could also have become part of a ipmp group in the - * process, we are here as were not able to complete the - * operation in ipif_set_values because we could not become - * exclusive on the new ipsq, In such a case ipsq_current_ipif - * will not be set so we need to set it. + * We are here as were not able to complete the operation in + * ipif_set_values because we could not become exclusive on + * the new ipsq. 
*/ ill_t *ill = q->q_ptr; ipsq_current_start(ipsq, ill->ill_ipif, ipip->ipi_cmd); } - ASSERT(ipsq->ipsq_current_ipif != NULL); + ASSERT(ipsq->ipsq_xop->ipx_current_ipif != NULL); if (ipip->ipi_cmd_type == IF_CMD) { /* This a old style SIOC[GS]IF* command */ @@ -27381,8 +26719,8 @@ ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) sin = NULL; } - err = (*ipip->ipi_func_restart)(ipsq->ipsq_current_ipif, sin, q, mp, - ipip, mp1->b_rptr); + err = (*ipip->ipi_func_restart)(ipsq->ipsq_xop->ipx_current_ipif, sin, + q, mp, ipip, mp1->b_rptr); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); } @@ -27424,6 +26762,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ip_extract_func_t *extract_funcp; cmd_info_t ci; int err; + boolean_t entered_ipsq = B_FALSE; ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd)); @@ -27505,18 +26844,21 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) return; } + ASSERT(ci.ci_ipif != NULL); + /* - * If ipsq is non-null, we are already being called exclusively on an - * ill but in the case of a failover in progress it is the "from" ill, - * rather than the "to" ill (which is the ill ptr passed in). - * In order to ensure we are exclusive on both ILLs we rerun - * ipsq_try_enter() here, ipsq's support recursive entry. + * If ipsq is non-NULL, we are already being called exclusively. */ ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); - ASSERT(ci.ci_ipif != NULL); - - ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl, - NEW_OP, B_TRUE); + if (ipsq == NULL) { + ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl, + NEW_OP, B_TRUE); + if (ipsq == NULL) { + ipif_refrele(ci.ci_ipif); + return; + } + entered_ipsq = B_TRUE; + } /* * Release the ipif so that ipif_down and friends that wait for @@ -27525,8 +26867,6 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) * the ipif. 
*/ ipif_refrele(ci.ci_ipif); - if (ipsq == NULL) - return; ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); @@ -27535,19 +26875,12 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) * where we set the IPIF_CHANGING flag. This ensures that there won't * be any new references to the ipif. This helps functions that go * through this path and end up trying to wait for the refcnts - * associated with the ipif to go down to zero. Some exceptions are - * Failover, Failback, and Groupname commands that operate on more than - * just the ci.ci_ipif. These commands internally determine the - * set of ipif's they operate on and set and clear the IPIF_CHANGING - * flags on that set. Another exception is the Removeif command that - * sets the IPIF_CONDEMNED flag internally after identifying the right - * ipif to operate on. + * associated with the ipif to go down to zero. The exception is + * SIOCSLIFREMOVEIF, which sets IPIF_CONDEMNED internally after + * identifying the right ipif to operate on. */ mutex_enter(&(ci.ci_ipif)->ipif_ill->ill_lock); - if (ipip->ipi_cmd != SIOCLIFREMOVEIF && - ipip->ipi_cmd != SIOCLIFFAILOVER && - ipip->ipi_cmd != SIOCLIFFAILBACK && - ipip->ipi_cmd != SIOCSLIFGROUPNAME) + if (ipip->ipi_cmd != SIOCLIFREMOVEIF) (ci.ci_ipif)->ipif_state_flags |= IPIF_CHANGING; mutex_exit(&(ci.ci_ipif)->ipif_ill->ill_lock); @@ -27560,7 +26893,8 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); - ipsq_exit(ipsq); + if (entered_ipsq) + ipsq_exit(ipsq); } /* @@ -27708,7 +27042,7 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * Refhold the conn, till the ioctl completes. This is * needed in case the ioctl ends up in the pending mp * list. Every mp in the ill_pending_mp list and - * the ipsq_pending_mp must have a refhold on the conn + * the ipx_pending_mp must have a refhold on the conn * to resume processing. 
The refhold is released when * the ioctl completes. (normally or abnormally) * In all cases ip_ioctl_finish is called to finish @@ -27753,8 +27087,25 @@ nak: if (CONN_Q(q)) goto nak; - /* Finish socket ioctls passed through to ARP. */ - ip_sioctl_iocack(q, mp); + /* + * Finish socket ioctls passed through to ARP. We use the + * ioc_cmd values we set in ip_sioctl_arp() to decide whether + * we need to become writer before calling ip_sioctl_iocack(). + * Note that qwriter_ip() will release the refhold, and that a + * refhold is OK without ILL_CAN_LOOKUP() since we're on the + * ill stream. + */ + iocp = (struct iocblk *)mp->b_rptr; + if (iocp->ioc_cmd == AR_ENTRY_SQUERY) { + ip_sioctl_iocack(NULL, q, mp, NULL); + return; + } + + ASSERT(iocp->ioc_cmd == AR_ENTRY_DELETE || + iocp->ioc_cmd == AR_ENTRY_ADD); + ill = q->q_ptr; + ill_refhold(ill); + qwriter_ip(ill, q, mp, ip_sioctl_iocack, CUR_OP, B_FALSE); return; case M_FLUSH: if (*mp->b_rptr & FLUSHW) @@ -28021,11 +27372,11 @@ nak: gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - nce = ndp_lookup_v6(ill, + nce = ndp_lookup_v6(ill, B_FALSE, &ire->ire_addr_v6, B_FALSE); } else { - nce = ndp_lookup_v6(ill, &gw_addr_v6, - B_FALSE); + nce = ndp_lookup_v6(ill, B_FALSE, + &gw_addr_v6, B_FALSE); } if (nce != NULL) { nce_resolv_failed(nce); @@ -28061,10 +27412,11 @@ nak: gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, - B_FALSE); + nce = ndp_lookup_v6(ill, B_FALSE, + &ire->ire_addr_v6, B_FALSE); } else { - nce = ndp_lookup_v6(ill, &gw_addr_v6, B_FALSE); + nce = ndp_lookup_v6(ill, B_FALSE, + &gw_addr_v6, B_FALSE); } if (nce != NULL) { /* @@ -28238,13 +27590,14 @@ nak: fake_ire = (ire_t *)mp->b_rptr; /* - * By the time we come back here from ARP the incomplete ire - * created in ire_forward() could have been removed. 
We use - * the parameters stored in the fake_ire to specify the real - * ire as explicitly as possible. This avoids problems when - * IPMP groups are configured as an ipif can 'float' - * across several ill queues. We can be confident that the - * the inability to find an ire is because it no longer exists. + * By the time we come back here from ARP the logical outgoing + * interface of the incomplete ire we added in ire_forward() + * could have disappeared, causing the incomplete ire to also + * disappear. So we need to retreive the proper ipif for the + * ire before looking in ctable. In the case of IPMP, the + * ipif may be on the IPMP ill, so look it up based on the + * ire_ipif_ifindex we stashed back in ire_init_common(). + * Then, we can verify that ire_ipif_seqid still exists. */ ill = ill_lookup_on_ifindex(fake_ire->ire_ipif_ifindex, B_FALSE, NULL, NULL, NULL, NULL, ipst); @@ -28299,6 +27652,7 @@ nak: freemsg(mp); /* fake ire */ return; } + nce = ire->ire_nce; DTRACE_PROBE2(ire__arpresolve__type, ire_t *, ire, nce_t *, nce); @@ -29030,7 +28384,7 @@ boolean_t conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, zoneid_t zoneid) { - ill_t *in_ill; + ill_t *bound_ill; boolean_t found; ipif_t *ipif; ire_t *ire; @@ -29045,32 +28399,15 @@ conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, * unicast, broadcast and multicast reception to * conn_incoming_ill. conn_wantpacket itself is called * only for BROADCAST and multicast. - * - * 1) ip_rput supresses duplicate broadcasts if the ill - * is part of a group. Hence, we should be receiving - * just one copy of broadcast for the whole group. - * Thus, if it is part of the group the packet could - * come on any ill of the group and hence we need a - * match on the group. Otherwise, match on ill should - * be sufficient. - * - * 2) ip_rput does not suppress duplicate multicast packets. 
- * If there are two interfaces in a ill group and we have - * 2 applications (conns) joined a multicast group G on - * both the interfaces, ilm_lookup_ill filter in ip_rput - * will give us two packets because we join G on both the - * interfaces rather than nominating just one interface - * for receiving multicast like broadcast above. So, - * we have to call ilg_lookup_ill to filter out duplicate - * copies, if ill is part of a group. - */ - in_ill = connp->conn_incoming_ill; - if (in_ill != NULL) { - if (in_ill->ill_group == NULL) { - if (in_ill != ill) + */ + bound_ill = connp->conn_incoming_ill; + if (bound_ill != NULL) { + if (IS_IPMP(bound_ill)) { + if (bound_ill->ill_grp != ill->ill_grp) + return (B_FALSE); + } else { + if (bound_ill != ill) return (B_FALSE); - } else if (in_ill->ill_group != ill->ill_group) { - return (B_FALSE); } } @@ -29079,15 +28416,14 @@ conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, return (B_TRUE); /* * The conn is in a different zone; we need to check that this - * broadcast address is configured in the application's zone and - * on one ill in the group. + * broadcast address is configured in the application's zone. 
*/ ipif = ipif_get_next_ipif(NULL, ill); if (ipif == NULL) return (B_FALSE); ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif, connp->conn_zoneid, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst); + (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); ipif_refrele(ipif); if (ire != NULL) { ire_refrele(ire); @@ -29171,7 +28507,7 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) } ipsq = ill->ill_phyint->phyint_ipsq; - ipif = ipsq->ipsq_pending_ipif; + ipif = ipsq->ipsq_xop->ipx_pending_ipif; mp1 = ipsq_pending_mp_get(ipsq, &connp); ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); if (mp1 == NULL) { @@ -29181,12 +28517,12 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) } /* - * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we + * If an IOCTL is waiting on this (ipx_current_ioctl != 0), then we * must have an associated conn_t. Otherwise, we're bringing this * interface back up as part of handling an asynchronous event (e.g., * physical address change). */ - if (ipsq->ipsq_current_ioctl != 0) { + if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { ASSERT(connp != NULL); q = CONNP_TO_WQ(connp); } else { @@ -29219,16 +28555,28 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) return; } - if (ill->ill_up_ipifs) - ill_group_cleanup(ill); + /* + * If we have a moved ipif to bring up, and everything has succeeded + * to this point, bring it up on the IPMP ill. Otherwise, leave it + * down -- the admin can try to bring it up by hand if need be. + */ + if (ill->ill_move_ipif != NULL) { + ipif = ill->ill_move_ipif; + ill->ill_move_ipif = NULL; + if (err == 0) { + err = ipif_up(ipif, q, mp1); + if (err == EINPROGRESS) + return; + } + } /* * The operation must complete without EINPROGRESS since - * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp. - * Otherwise, the operation will be stuck forever in the ipsq. + * ipsq_pending_mp_get() has removed the mblk. 
Otherwise, the + * operation will be stuck forever in the ipsq. */ ASSERT(err != EINPROGRESS); - if (ipsq->ipsq_current_ioctl != 0) + if (ipsq->ipsq_xop->ipx_current_ioctl != 0) ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); else ipsq_current_finish(ipsq); @@ -29649,124 +28997,6 @@ ip_int_set(queue_t *q, mblk_t *mp, char *value, return (0); } -/* - * Handle changes to ipmp_hook_emulation ndd variable. - * Need to update phyint_hook_ifindex. - * Also generate a nic plumb event should a new ifidex be assigned to a group. - */ -static void -ipmp_hook_emulation_changed(ip_stack_t *ipst) -{ - phyint_t *phyi; - phyint_t *phyi_tmp; - char *groupname; - int namelen; - ill_t *ill; - boolean_t new_group; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - /* - * Group indicies are stored in the phyint - a common structure - * to both IPv4 and IPv6. - */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - /* Ignore the ones that do not have a group */ - if (phyi->phyint_groupname_len == 0) - continue; - - /* - * Look for other phyint in group. - * Clear name/namelen so the lookup doesn't find ourselves. - */ - namelen = phyi->phyint_groupname_len; - groupname = phyi->phyint_groupname; - phyi->phyint_groupname_len = 0; - phyi->phyint_groupname = NULL; - - phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst); - /* Restore */ - phyi->phyint_groupname_len = namelen; - phyi->phyint_groupname = groupname; - - new_group = B_FALSE; - if (ipst->ips_ipmp_hook_emulation) { - /* - * If the group already exists and has already - * been assigned a group ifindex, we use the existing - * group_ifindex, otherwise we pick a new group_ifindex - * here. - */ - if (phyi_tmp != NULL && - phyi_tmp->phyint_group_ifindex != 0) { - phyi->phyint_group_ifindex = - phyi_tmp->phyint_group_ifindex; - } else { - /* XXX We need a recovery strategy here. 
*/ - if (!ip_assign_ifindex( - &phyi->phyint_group_ifindex, ipst)) - cmn_err(CE_PANIC, - "ip_assign_ifindex() failed"); - new_group = B_TRUE; - } - } else { - phyi->phyint_group_ifindex = 0; - } - if (ipst->ips_ipmp_hook_emulation) - phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex; - else - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - - /* - * For IP Filter to find out the relationship between - * names and interface indicies, we need to generate - * a NE_PLUMB event when a new group can appear. - * We always generate events when a new interface appears - * (even when ipmp_hook_emulation is set) so there - * is no need to generate NE_PLUMB events when - * ipmp_hook_emulation is turned off. - * And since it isn't critical for IP Filter to get - * the NE_UNPLUMB events we skip those here. - */ - if (new_group) { - /* - * First phyint in group - generate group PLUMB event. - * Since we are not running inside the ipsq we do - * the dispatch immediately. - */ - if (phyi->phyint_illv4 != NULL) - ill = phyi->phyint_illv4; - else - ill = phyi->phyint_illv6; - - if (ill != NULL) - ill_nic_event_plumb(ill, B_TRUE); - } - } - rw_exit(&ipst->ips_ill_g_lock); -} - -/* ARGSUSED */ -static int -ipmp_hook_emulation_set(queue_t *q, mblk_t *mp, char *value, - caddr_t addr, cred_t *cr) -{ - int *v = (int *)addr; - long new_value; - ip_stack_t *ipst = CONNQ_TO_IPST(q); - - if (ddi_strtol(value, NULL, 10, &new_value) != 0) - return (EINVAL); - - if (*v != new_value) { - *v = new_value; - ipmp_hook_emulation_changed(ipst); - } - return (0); -} - static void * ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp) { @@ -30448,12 +29678,12 @@ next_mp: arpce->nce_state = ND_INCOMPLETE; mutex_exit(&arpce->nce_lock); + /* * Note that ire_add() (called from ire_forward()) * holds a ref on the ire until ARP is completed. 
*/ - - ire_arpresolve(ire, ire_to_ill(ire)); + ire_arpresolve(ire); return (LOOKUP_IN_PROGRESS); default: ASSERT(0); @@ -30596,7 +29826,7 @@ ip_get_zoneid_v6(in6_addr_t *addr, mblk_t *mp, const ill_t *ill, return (ALL_ZONES); if (IN6_IS_ADDR_LINKLOCAL(addr)) { - ire_flags |= MATCH_IRE_ILL_GROUP; + ire_flags |= MATCH_IRE_ILL; ipif_arg = ill->ill_ipif; } if (lookup_zoneid != ALL_ZONES) @@ -30648,20 +29878,24 @@ void ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst, const ill_t *ill, int ipver, uint32_t hlen, ip_stack_t *ipst) { + mblk_t *mp2; ipobs_cb_t *ipobs_cb; + ipobs_hook_data_t *ihd; + uint64_t grifindex = 0; ASSERT(DB_TYPE(mp) == M_DATA); + if (IS_UNDER_IPMP(ill)) + grifindex = ipmp_ill_get_ipmp_ifindex(ill); + mutex_enter(&ipst->ips_ipobs_cb_lock); ipst->ips_ipobs_cb_nwalkers++; mutex_exit(&ipst->ips_ipobs_cb_lock); for (ipobs_cb = list_head(&ipst->ips_ipobs_cb_list); ipobs_cb != NULL; ipobs_cb = list_next(&ipst->ips_ipobs_cb_list, ipobs_cb)) { - mblk_t *mp2 = allocb(sizeof (ipobs_hook_data_t), - BPRI_HI); + mp2 = allocb(sizeof (ipobs_hook_data_t), BPRI_HI); if (mp2 != NULL) { - ipobs_hook_data_t *ihd = - (ipobs_hook_data_t *)mp2->b_rptr; + ihd = (ipobs_hook_data_t *)mp2->b_rptr; if (((ihd->ihd_mp = dupmsg(mp)) == NULL) && ((ihd->ihd_mp = copymsg(mp)) == NULL)) { freemsg(mp2); @@ -30673,6 +29907,7 @@ ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst, ihd->ihd_zsrc = zsrc; ihd->ihd_zdst = zdst; ihd->ihd_ifindex = ill->ill_phyint->phyint_ifindex; + ihd->ihd_grifindex = grifindex; ihd->ihd_stack = ipst->ips_netstack; mp2->b_wptr += sizeof (*ihd); ipobs_cb->ipobs_cbfunc(mp2); diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index fe326778c2..6e63af32b3 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ /* @@ -95,7 +95,6 @@ #include <sys/pattr.h> #include <inet/ipclassifier.h> #include <inet/ipsecah.h> -#include <inet/udp_impl.h> #include <inet/rawip_impl.h> #include <inet/rts_impl.h> #include <sys/squeue_impl.h> @@ -186,7 +185,7 @@ const in6_addr_t ipv6_solicited_node_mcast = #define IP6_MBLK_HDR_ERR 1 #define IP6_MBLK_LEN_ERR 2 -static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *ill, +static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *, ill_t *, boolean_t, zoneid_t); static void icmp_pkt_v6(queue_t *, mblk_t *, void *, size_t, const in6_addr_t *, boolean_t, zoneid_t, ip_stack_t *); @@ -208,11 +207,13 @@ static void ip_fanout_udp_v6(queue_t *, mblk_t *, ip6_t *, uint32_t, ill_t *, ill_t *, uint_t, boolean_t, zoneid_t); static int ip_process_options_v6(queue_t *, mblk_t *, ip6_t *, uint8_t *, uint_t, uint8_t, ip_stack_t *); -static mblk_t *ip_rput_frag_v6(queue_t *, mblk_t *, ip6_t *, +static mblk_t *ip_rput_frag_v6(ill_t *, ill_t *, mblk_t *, ip6_t *, ip6_frag_t *, uint_t, uint_t *, uint32_t *, uint16_t *); static boolean_t ip_source_routed_v6(ip6_t *, mblk_t *, ip_stack_t *); static void ip_wput_ire_v6(queue_t *, mblk_t *, ire_t *, int, int, - conn_t *, int, int, int, zoneid_t); + conn_t *, int, int, zoneid_t); +static boolean_t ipif_lookup_testaddr_v6(ill_t *, const in6_addr_t *, + ipif_t **); /* * A template for an IPv6 AR_ENTRY_QUERY @@ -248,15 +249,14 @@ static areq_t ipv6_areq_template = { * call icmp_inbound_v6() for each relevant zone. 
*/ static void -icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, - boolean_t mctl_present, uint_t flags, zoneid_t zoneid, mblk_t *dl_mp) +icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, + uint_t hdr_length, boolean_t mctl_present, uint_t flags, zoneid_t zoneid, + mblk_t *dl_mp) { icmp6_t *icmp6; ip6_t *ip6h; boolean_t interested; - ip6i_t *ip6i; in6_addr_t origsrc; - ire_t *ire; mblk_t *first_mp; ipsec_in_t *ii; ip_stack_t *ipst = ill->ill_ipst; @@ -344,7 +344,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, break; case ICMP6_PACKET_TOO_BIG: - icmp_inbound_too_big_v6(q, first_mp, ill, mctl_present, + icmp_inbound_too_big_v6(q, first_mp, ill, inill, mctl_present, zoneid); return; case ICMP6_ECHO_REQUEST: @@ -422,66 +422,6 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, * checksum field. The checksum is calculated in ip_wput_v6. */ icmp6->icmp6_cksum = ip6h->ip6_plen; - /* - * ICMP echo replies should go out on the same interface - * the request came on as probes used by in.mpathd for - * detecting NIC failures are ECHO packets. We turn-off load - * spreading by allocating a ip6i and setting ip6i_attach_if - * to B_TRUE which is handled both by ip_wput_v6 and - * ip_newroute_v6. If we don't turnoff load spreading, - * the packets might get dropped if there are no - * non-FAILED/INACTIVE interfaces for it to go out on and - * in.mpathd would wrongly detect a failure or mis-detect - * a NIC failure as a link failure. As load spreading can - * happen only if ill_group is not NULL, we do only for - * that case and this does not affect the normal case. - * - * We force this only on echo packets that came from on-link - * hosts. We restrict this to link-local addresses which - * is used by in.mpathd for probing. In the IPv6 case, - * default routes typically have an ire_ipif pointer and - * hence a MATCH_IRE_ILL later in ip_newroute_v6/ip_wput_v6 - * might work. 
As a default route out of this interface - * may not be present, enforcing this packet to go out in - * this case may not work. - */ - if (ill->ill_group != NULL && - IN6_IS_ADDR_LINKLOCAL(&origsrc)) { - /* - * If we are sending replies to ourselves, don't - * set ATTACH_IF as we may not be able to find - * the IRE_LOCAL on this ill i.e setting ATTACH_IF - * causes ip_wput_v6 to look for an IRE_LOCAL on - * "ill" which it may not find and will try to - * create an IRE_CACHE for our local address. Once - * we do this, we will try to forward all packets - * meant to our LOCAL address. - */ - ire = ire_cache_lookup_v6(&ip6h->ip6_dst, ALL_ZONES, - NULL, ipst); - if (ire == NULL || ire->ire_type != IRE_LOCAL) { - mp = ip_add_info_v6(mp, NULL, &ip6h->ip6_dst); - if (mp == NULL) { - BUMP_MIB(ill->ill_icmp6_mib, - ipv6IfIcmpInErrors); - if (ire != NULL) - ire_refrele(ire); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - ip6i = (ip6i_t *)mp->b_rptr; - ip6i->ip6i_flags = IP6I_ATTACH_IF; - ip6i->ip6i_ifindex = - ill->ill_phyint->phyint_ifindex; - } - if (ire != NULL) - ire_refrele(ire); - } if (!mctl_present) { /* @@ -529,7 +469,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, if (mctl_present) freeb(first_mp); /* XXX may wish to pass first_mp up to ndp_input someday. */ - ndp_input(ill, mp, dl_mp); + ndp_input(inill, mp, dl_mp); return; case ND_NEIGHBOR_ADVERT: @@ -538,7 +478,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, if (mctl_present) freeb(first_mp); /* XXX may wish to pass first_mp up to ndp_input someday. 
*/ - ndp_input(ill, mp, dl_mp); + ndp_input(inill, mp, dl_mp); return; case ND_REDIRECT: { @@ -579,7 +519,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, } if (interested) { icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, - mctl_present, zoneid); + inill, mctl_present, zoneid); } else { freemsg(first_mp); } @@ -592,7 +532,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, */ /* ARGSUSED */ static void -icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, +icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, boolean_t mctl_present, zoneid_t zoneid) { ip6_t *ip6h; @@ -658,11 +598,10 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, * sufficient. Same link local addresses for different ILL's is * possible. */ - if (IN6_IS_ADDR_LINKLOCAL(&inner_ip6h->ip6_dst)) { first_ire = ire_ctable_lookup_v6(&inner_ip6h->ip6_dst, NULL, IRE_CACHE, ill->ill_ipif, ALL_ZONES, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst); + MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); if (first_ire == NULL) { if (ip_debug > 2) { @@ -773,7 +712,7 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, } rw_exit(&irb->irb_lock); } - icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, + icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, inill, mctl_present, zoneid); } @@ -783,7 +722,8 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, */ void icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, - icmp6_t *icmp6, ill_t *ill, boolean_t mctl_present, zoneid_t zoneid) + icmp6_t *icmp6, ill_t *ill, ill_t *inill, boolean_t mctl_present, + zoneid_t zoneid) { uint16_t *up; /* Pointer to ports in ULP header */ uint32_t ports; /* reversed ports for fanout */ @@ -861,7 +801,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ((uint16_t *)&ports)[0] = up[1]; ((uint16_t *)&ports)[1] = up[0]; - ip_fanout_udp_v6(q, first_mp, &rip6h, ports, ill, ill, 
+ ip_fanout_udp_v6(q, first_mp, &rip6h, ports, ill, inill, IP6_NO_IPPOLICY, mctl_present, zoneid); return; } @@ -908,7 +848,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, up = (uint16_t *)((uchar_t *)ip6h + hdr_length); ((uint16_t *)&ports)[0] = up[1]; ((uint16_t *)&ports)[1] = up[0]; - ip_fanout_sctp(first_mp, ill, (ipha_t *)ip6h, ports, 0, + ip_fanout_sctp(first_mp, inill, (ipha_t *)ip6h, ports, 0, mctl_present, IP6_NO_IPPOLICY, zoneid); return; case IPPROTO_ESP: @@ -940,7 +880,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ASSERT(ill != NULL); ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = ii->ipsec_in_ill_index; + ii->ipsec_in_rill_index = + inill->ill_phyint->phyint_ifindex; first_mp->b_cont->b_datap->db_type = M_CTL; } else { /* @@ -970,7 +911,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, mp->b_datap->db_type = M_CTL; ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = ii->ipsec_in_ill_index; + ii->ipsec_in_rill_index = + inill->ill_phyint->phyint_ifindex; } if (!ipsec_loaded(ipss)) { @@ -985,7 +927,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, if (ipsec_rc == IPSEC_STATUS_FAILED) return; - ip_fanout_proto_again(first_mp, ill, ill, NULL); + ip_fanout_proto_again(first_mp, ill, inill, NULL); return; } case IPPROTO_ENCAP: @@ -1083,8 +1025,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, * doing here. 
*/ icmp_inbound_error_fanout_v6(q, first_mp, - (ip6_t *)mp->b_rptr, icmp6, ill, mctl_present, - zoneid); + (ip6_t *)mp->b_rptr, icmp6, ill, inill, + mctl_present, zoneid); return; } /* FALLTHRU */ @@ -1096,7 +1038,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, rip6h.ip6_src = ip6h->ip6_dst; rip6h.ip6_dst = ip6h->ip6_src; rip6h.ip6_nxt = nexthdr; - ip_fanout_proto_v6(q, first_mp, &rip6h, ill, ill, nexthdr, 0, + ip_fanout_proto_v6(q, first_mp, &rip6h, ill, inill, nexthdr, 0, IP6_NO_IPPOLICY, mctl_present, zoneid); return; } @@ -1194,9 +1136,8 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) * redirect packet.) */ - prev_ire = ire_route_lookup_v6(dst, 0, src, 0, ipif, NULL, - ALL_ZONES, NULL, MATCH_IRE_GW | MATCH_IRE_ILL_GROUP | - MATCH_IRE_DEFAULT, ipst); + prev_ire = ire_route_lookup_v6(dst, 0, src, 0, ipif, NULL, ALL_ZONES, + NULL, MATCH_IRE_GW | MATCH_IRE_ILL | MATCH_IRE_DEFAULT, ipst); /* * Check that @@ -1260,6 +1201,7 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) opt = ndp_get_option(opt, optlen, ND_OPT_TARGET_LINKADDR); if (opt != NULL) { err = ndp_lookup_then_add_v6(ill, + B_FALSE, /* don't match across illgrp */ (uchar_t *)&opt[1], /* Link layer address */ gateway, &ipv6_all_ones, /* prefix mask */ @@ -1367,8 +1309,7 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) */ redir_ire = ire_ftable_lookup_v6(dst, 0, src, IRE_HOST, ire->ire_ipif, NULL, ALL_ZONES, 0, NULL, - (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), - ipst); + (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); ire_refrele(ire); /* Held in ire_add_v6 */ @@ -1457,15 +1398,11 @@ icmp_pick_source_v6(queue_t *wq, in6_addr_t *origsrc, in6_addr_t *origdst, BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutNoRoutes); return (NULL); } - /* - * Does not matter whether we use ire_stq or ire_ipif here. - * Just pick an ill for ICMP replies. 
- */ ASSERT(ire->ire_ipif != NULL); ill = ire->ire_ipif->ipif_ill; ire_refrele(ire); } - ipif = ipif_select_source_v6(ill, origsrc, RESTRICT_TO_NONE, + ipif = ipif_select_source_v6(ill, origsrc, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); if (ipif != NULL) { *src = ipif->ipif_v6src_addr; @@ -1858,7 +1795,7 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp, mp = icmp_pkt_err_ok_v6(q, mp, llbcast, B_FALSE, ipst); if (mp == NULL) return; - nce = ndp_lookup_v6(ill, targetp, B_FALSE); + nce = ndp_lookup_v6(ill, B_TRUE, targetp, B_FALSE); if (nce != NULL && nce->nce_state != ND_INCOMPLETE) { ll_opt_len = (sizeof (nd_opt_hdr_t) + ill->ill_phys_addr_length + 7)/8 * 8; @@ -1908,31 +1845,8 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp, rdh->nd_opt_rh_reserved1 = 0; rdh->nd_opt_rh_reserved2 = 0; /* ipif_v6src_addr contains the link-local source address */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if (ill->ill_group != NULL) { - /* - * The receiver of the redirect will verify whether it - * had a route through us (srcp that we will use in - * the redirect) or not. As we load spread even link-locals, - * we don't know which source address the receiver of - * redirect has in its route for communicating with us. - * Thus we randomly choose a source here and finally we - * should get to the right one and it will eventually - * accept the redirect from us. We can't call - * ip_lookup_scope_v6 because we don't have the right - * link-local address here. Thus we randomly choose one. 
- */ - int cnt = ill->ill_group->illgrp_ill_count; + srcp = &ill->ill_ipif->ipif_v6src_addr; - ill = ill->ill_group->illgrp_ill; - cnt = ++ipst->ips_icmp_redirect_v6_src_index % cnt; - while (cnt--) - ill = ill->ill_group_next; - srcp = &ill->ill_ipif->ipif_v6src_addr; - } else { - srcp = &ill->ill_ipif->ipif_v6src_addr; - } - rw_exit(&ipst->ips_ill_g_lock); /* Redirects sent by router, and router is global zone */ icmp_pkt_v6(q, mp, buf, len, srcp, B_FALSE, GLOBAL_ZONEID, ipst); kmem_free(buf, len); @@ -2231,6 +2145,7 @@ ip_bind_post_handling_v6(conn_t *connp, mblk_t *mp, if (version_changed) { ip_setpktversion(connp, connp->conn_pkt_isv6, B_TRUE, ipst); } + /* * Pass the IPSEC headers size in ire_ipsec_overhead. * We can't do this in ip_bind_insert_ire because the policy @@ -2771,8 +2686,8 @@ ip_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, } if (ip6_asp_can_lookup(ipst)) { src_ipif = ipif_select_source_v6(dst_ill, - v6dst, RESTRICT_TO_NONE, - connp->conn_src_preferences, zoneid); + v6dst, B_FALSE, connp->conn_src_preferences, + zoneid); ip6_asp_table_refrele(ipst); if (src_ipif == NULL) { pr_addr_dbg("ip_bind_connected_v6: " @@ -3111,7 +3026,15 @@ ip_add_info_v6(mblk_t *mp, ill_t *ill, const in6_addr_t *dst) ip6i->ip6i_nxt = IPPROTO_RAW; if (ill != NULL) { ip6i->ip6i_flags = IP6I_IFINDEX; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; + /* + * If `ill' is in an IPMP group, make sure we use the IPMP + * interface index so that e.g. IPV6_RECVPKTINFO will get the + * IPMP interface index and not an underlying interface index. + */ + if (IS_UNDER_IPMP(ill)) + ip6i->ip6i_ifindex = ipmp_ill_get_ipmp_ifindex(ill); + else + ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; } else { ip6i->ip6i_flags = 0; } @@ -4257,33 +4180,6 @@ ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h) } /* - * Select an ill for the packet by considering load spreading across - * a different ill in the group if dst_ill is part of some group. 
- */ -static ill_t * -ip_newroute_get_dst_ill_v6(ill_t *dst_ill) -{ - ill_t *ill; - - /* - * We schedule irrespective of whether the source address is - * INADDR_UNSPECIED or not. - */ - ill = illgrp_scheduler(dst_ill); - if (ill == NULL) - return (NULL); - - /* - * For groups with names ip_sioctl_groupname ensures that all - * ills are of same type. For groups without names, ifgrp_insert - * ensures this. - */ - ASSERT(dst_ill->ill_type == ill->ill_type); - - return (ill); -} - -/* * IPv6 - * ip_newroute_v6 is called by ip_rput_data_v6 or ip_wput_v6 whenever we need * to send out a packet to a destination address for which we do not have @@ -4303,14 +4199,6 @@ ip_newroute_get_dst_ill_v6(ill_t *dst_ill) * node sits at a site boundary). * We create the cache entries in the regular ctable since * it can not "confuse" things for other destinations. - * table. - * - * When ill is part of a ill group, we subject the packets - * to load spreading even if the ill is specified by the - * means described above. We disable only for IPV6_BOUND_PIF - * and for the cases where IP6I_ATTACH_IF is set i.e NS/NA/ - * Echo replies to link-local destinations have IP6I_ATTACH_IF - * set. * * NOTE : These are the scopes of some of the variables that point at IRE, * which needs to be followed while making any future modifications @@ -4327,8 +4215,6 @@ ip_newroute_get_dst_ill_v6(ill_t *dst_ill) * * Thus on failures, we have to REFRELE only ire and sire, if they * are not NULL. - * - * v6srcp may be used in the future. Currently unused. 
*/ /* ARGSUSED */ void @@ -4346,10 +4232,8 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, int err = 0; mblk_t *first_mp; ipsec_out_t *io; - ill_t *attach_ill = NULL; ushort_t ire_marks = 0; int match_flags; - boolean_t ip6i_present; ire_t *first_sire = NULL; mblk_t *copy_mp = NULL; mblk_t *xmit_mp = NULL; @@ -4359,7 +4243,6 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, boolean_t multirt_is_resolvable; boolean_t multirt_resolve_next; boolean_t need_rele = B_FALSE; - boolean_t do_attach_ill = B_FALSE; boolean_t ip6_asp_table_held = B_FALSE; tsol_ire_gw_secattr_t *attrp = NULL; tsol_gcgrp_t *gcgrp = NULL; @@ -4376,39 +4259,12 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, io = NULL; } - /* - * If this end point is bound to IPIF_NOFAILOVER, set bnf_ill and - * bind_to_nofailover B_TRUE. We can't use conn to determine as it - * could be NULL. - * - * This information can appear either in an ip6i_t or an IPSEC_OUT - * message. - */ ip6h = (ip6_t *)mp->b_rptr; - ip6i_present = (ip6h->ip6_nxt == IPPROTO_RAW); - if (ip6i_present || (io != NULL && io->ipsec_out_attach_if)) { - if (!ip6i_present || - ((ip6i_t *)ip6h)->ip6i_flags & IP6I_ATTACH_IF) { - attach_ill = ip_grab_attach_ill(ill, first_mp, - (ip6i_present ? ((ip6i_t *)ip6h)->ip6i_ifindex : - io->ipsec_out_ill_index), B_TRUE, ipst); - /* Failure case frees things for us. */ - if (attach_ill == NULL) - return; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. 
- */ - if (ill_is_probeonly(attach_ill)) - ire_marks = IRE_MARK_HIDDEN; - } - } if (IN6_IS_ADDR_LOOPBACK(v6dstp)) { ip1dbg(("ip_newroute_v6: dst with loopback addr\n")); goto icmp_err_ret; - } else if ((v6srcp != NULL) && IN6_IS_ADDR_LOOPBACK(v6srcp)) { + } else if (IN6_IS_ADDR_LOOPBACK(v6srcp)) { ip1dbg(("ip_newroute_v6: src with loopback addr\n")); goto icmp_err_ret; } @@ -4436,30 +4292,24 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0, NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp), match_flags, ipst); + } else { + match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | + MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL; + match_flags |= MATCH_IRE_PARENT | MATCH_IRE_SECATTR; + /* - * ire_add_then_send -> ip_newroute_v6 in the CGTP case passes - * in a NULL ill, but the packet could be a neighbor - * solicitation/advertisment and could have a valid attach_ill. + * Because nce_xmit() calls ip_output_v6() and NCEs are always + * tied to an underlying interface, IS_UNDER_IPMP() may be + * true even when building IREs that will be used for data + * traffic. As such, use the packet's source address to + * determine whether the traffic is test traffic, and set + * MATCH_IRE_MARK_TESTHIDDEN if so. */ - if (attach_ill != NULL) - ill_refrele(attach_ill); - } else { - if (attach_ill != NULL) { - /* - * attach_ill is set only for communicating with - * on-link hosts. So, don't look for DEFAULT. - * ip_wput_v6 passes the right ill in this case and - * hence we can assert. 
- */ - ASSERT(ill == attach_ill); - ill_refrele(attach_ill); - do_attach_ill = B_TRUE; - match_flags = MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL; - } else { - match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL_GROUP; + if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) { + if (ipif_lookup_testaddr_v6(ill, v6srcp, NULL)) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; } - match_flags |= MATCH_IRE_PARENT | MATCH_IRE_SECATTR; + ire = ire_ftable_lookup_v6(v6dstp, NULL, NULL, 0, ill->ill_ipif, &sire, zoneid, 0, MBLK_GETLABEL(mp), match_flags, ipst); } @@ -4601,106 +4451,56 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, } /* - * We have a route to reach the destination. - * - * 1) If the interface is part of ill group, try to get a new - * ill taking load spreading into account. + * We have a route to reach the destination. Find the + * appropriate ill, then get a source address that matches the + * right scope via ipif_select_source_v6(). * - * 2) After selecting the ill, get a source address that might - * create good inbound load spreading and that matches the - * right scope. ipif_select_source_v6 does this for us. + * If we are here trying to create an IRE_CACHE for an offlink + * destination and have an IRE_CACHE entry for VNI, then use + * ire_stq instead since VNI's queue is a black hole. * - * If the application specified the ill (ifindex), we still - * load spread. Only if the packets needs to go out specifically - * on a given ill e.g. bind to IPIF_NOFAILOVER address, - * IPV6_BOUND_PIF we don't try to use a different ill for load - * spreading. + * Note: While we pick a dst_ill we are really only interested + * in the ill for load spreading. The source ipif is + * determined by source address selection below. */ - if (!do_attach_ill) { - /* - * If the interface belongs to an interface group, - * make sure the next possible interface in the group - * is used. 
This encourages load spreading among - * peers in an interface group. However, in the case - * of multirouting, load spreading is not used, as we - * actually want to replicate outgoing packets through - * particular interfaces. - * - * Note: While we pick a dst_ill we are really only - * interested in the ill for load spreading. - * The source ipif is determined by source address - * selection below. - */ - if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { - dst_ill = ire->ire_ipif->ipif_ill; - /* For uniformity do a refhold */ - ill_refhold(dst_ill); + if ((ire->ire_type == IRE_CACHE) && + IS_VNI(ire->ire_ipif->ipif_ill)) { + dst_ill = ire->ire_stq->q_ptr; + ill_refhold(dst_ill); + } else { + ill_t *ill = ire->ire_ipif->ipif_ill; + + if (IS_IPMP(ill)) { + dst_ill = + ipmp_illgrp_hold_next_ill(ill->ill_grp); } else { - /* - * If we are here trying to create an IRE_CACHE - * for an offlink destination and have the - * IRE_CACHE for the next hop and the latter is - * using virtual IP source address selection i.e - * it's ire->ire_ipif is pointing to a virtual - * network interface (vni) then - * ip_newroute_get_dst_ll() will return the vni - * interface as the dst_ill. Since the vni is - * virtual i.e not associated with any physical - * interface, it cannot be the dst_ill, hence - * in such a case call ip_newroute_get_dst_ll() - * with the stq_ill instead of the ire_ipif ILL. - * The function returns a refheld ill. 
- */ - if ((ire->ire_type == IRE_CACHE) && - IS_VNI(ire->ire_ipif->ipif_ill)) - dst_ill = ip_newroute_get_dst_ill_v6( - ire->ire_stq->q_ptr); - else - dst_ill = ip_newroute_get_dst_ill_v6( - ire->ire_ipif->ipif_ill); + dst_ill = ill; + ill_refhold(dst_ill); } - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_v6 : no dst " - "ill for dst %s\n", - AF_INET6, v6dstp); - } - goto icmp_err_ret; - } else if (dst_ill->ill_group == NULL && ill != NULL && - dst_ill != ill) { - /* - * If "ill" is not part of any group, we should - * have found a route matching "ill" as we - * called ire_ftable_lookup_v6 with - * MATCH_IRE_ILL_GROUP. - * Rather than asserting when there is a - * mismatch, we just drop the packet. - */ - ip0dbg(("ip_newroute_v6: BOUND_IF failed : " - "dst_ill %s ill %s\n", - dst_ill->ill_name, - ill->ill_name)); - goto icmp_err_ret; + } + + if (dst_ill == NULL) { + if (ip_debug > 2) { + pr_addr_dbg("ip_newroute_v6 : no dst " + "ill for dst %s\n", AF_INET6, v6dstp); } - } else { - dst_ill = ire->ire_ipif->ipif_ill; - /* For uniformity do refhold */ - ill_refhold(dst_ill); + goto icmp_err_ret; + } + + if (ill != NULL && dst_ill != ill && + !IS_IN_SAME_ILLGRP(dst_ill, ill)) { /* - * We should have found a route matching ill as we - * called ire_ftable_lookup_v6 with MATCH_IRE_ILL. - * Rather than asserting, while there is a mismatch, - * we just drop the packet. + * We should have found a route matching "ill" + * as we called ire_ftable_lookup_v6 with + * MATCH_IRE_ILL. Rather than asserting when + * there is a mismatch, we just drop the packet. 
*/ - if (dst_ill != ill) { - ip0dbg(("ip_newroute_v6: Packet dropped as " - "IP6I_ATTACH_IF ill is %s, " - "ire->ire_ipif->ipif_ill is %s\n", - ill->ill_name, - dst_ill->ill_name)); - goto icmp_err_ret; - } + ip0dbg(("ip_newroute_v6: BOUND_IF failed: " + "dst_ill %s ill %s\n", dst_ill->ill_name, + ill->ill_name)); + goto icmp_err_ret; } + /* * Pick a source address which matches the scope of the * destination address. @@ -4708,7 +4508,20 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, * parent ire (sire). */ ASSERT(src_ipif == NULL); - if (ire->ire_type == IRE_IF_RESOLVER && + + /* + * Because nce_xmit() calls ip_output_v6() and NCEs are always + * tied to the underlying interface, IS_UNDER_IPMP() may be + * true even when building IREs that will be used for data + * traffic. As such, see if the packet's source address is a + * test address, and if so use that test address's ipif for + * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in + * ire_add_v6() can work properly. 
+ */ + if (ill != NULL && IS_UNDER_IPMP(ill)) + (void) ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif); + + if (src_ipif == NULL && ire->ire_type == IRE_IF_RESOLVER && !IN6_IS_ADDR_UNSPECIFIED(&v6gw) && ip6_asp_can_lookup(ipst)) { /* @@ -4718,10 +4531,10 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, */ ip6_asp_table_held = B_TRUE; src_ipif = ipif_select_source_v6(dst_ill, &v6gw, - RESTRICT_TO_GROUP, IPV6_PREFER_SRC_DEFAULT, zoneid); + B_TRUE, IPV6_PREFER_SRC_DEFAULT, zoneid); if (src_ipif != NULL) ire_marks |= IRE_MARK_USESRC_CHECK; - } else { + } else if (src_ipif == NULL) { if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { /* * Check that the ipif matching the requested @@ -4732,14 +4545,9 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, NULL, NULL, NULL, NULL, ipst); } if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) { - uint_t restrict_ill = RESTRICT_TO_NONE; - - if (ip6i_present && ((ip6i_t *)ip6h)->ip6i_flags - & IP6I_ATTACH_IF) - restrict_ill = RESTRICT_TO_ILL; ip6_asp_table_held = B_TRUE; src_ipif = ipif_select_source_v6(dst_ill, - v6dstp, restrict_ill, + v6dstp, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); if (src_ipif != NULL) ire_marks |= IRE_MARK_USESRC_CHECK; @@ -4750,7 +4558,7 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg("ip_newroute_v6: no src for " - "dst %s\n, ", AF_INET6, v6dstp); + "dst %s\n", AF_INET6, v6dstp); printf("ip_newroute_v6: interface name %s\n", dst_ill->ill_name); } @@ -4837,14 +4645,7 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, "ire_ihandle_lookup_offlink_v6 failed\n")); goto icmp_err_ret; } - /* - * Assume DL_UNITDATA_REQ is same for all physical - * interfaces in the ifgrp. If it isn't, this code will - * have to be seriously rewhacked to allow the - * fastpath probing (such that I cache the link - * header in the IRE_CACHE) to work over ifgrps. - * We have what we need to build an IRE_CACHE. 
- */ + /* * Note: the new ire inherits RTF_SETSRC * and RTF_MULTIRT to propagate these flags from prefix @@ -5659,24 +5460,22 @@ icmp_err_ret: */ void ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, - in6_addr_t v6dst, int unspec_src, zoneid_t zoneid) + const in6_addr_t *v6dstp, const in6_addr_t *v6srcp, int unspec_src, + zoneid_t zoneid) { ire_t *ire = NULL; ipif_t *src_ipif = NULL; int err = 0; ill_t *dst_ill = NULL; ire_t *save_ire; - ushort_t ire_marks = 0; ipsec_out_t *io; - ill_t *attach_ill = NULL; ill_t *ill; - ip6_t *ip6h; mblk_t *first_mp; - boolean_t ip6i_present; ire_t *fire = NULL; mblk_t *copy_mp = NULL; + const in6_addr_t *ire_v6srcp; + boolean_t probe = B_FALSE; boolean_t multirt_resolve_next; - in6_addr_t *v6dstp = &v6dst; boolean_t ipif_held = B_FALSE; boolean_t ill_held = B_FALSE; boolean_t ip6_asp_table_held = B_FALSE; @@ -5728,35 +5527,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, if (!(ill->ill_flags & ILLF_MULTICAST)) { goto err_ret; } - /* - * If this end point is bound to IPIF_NOFAILOVER, set bnf_ill - * and bind_to_nofailover B_TRUE. We can't use conn to determine - * as it could be NULL. - * - * This information can appear either in an ip6i_t or an - * IPSEC_OUT message. - */ - ip6h = (ip6_t *)mp->b_rptr; - ip6i_present = (ip6h->ip6_nxt == IPPROTO_RAW); - if (ip6i_present || (io != NULL && io->ipsec_out_attach_if)) { - if (!ip6i_present || - ((ip6i_t *)ip6h)->ip6i_flags & IP6I_ATTACH_IF) { - attach_ill = ip_grab_attach_ill(ill, first_mp, - (ip6i_present ? - ((ip6i_t *)ip6h)->ip6i_ifindex : - io->ipsec_out_ill_index), B_TRUE, ipst); - /* Failure case frees things for us. */ - if (attach_ill == NULL) - return; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. 
- */ - if (ill_is_probeonly(attach_ill)) - ire_marks = IRE_MARK_HIDDEN; - } - } /* * We check if an IRE_OFFSUBNET for the addr that goes through @@ -5770,76 +5540,93 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, (void *)ipif, ntohl(V4_PART_OF_V6((*v6dstp))), (void *)fire)); + ASSERT(src_ipif == NULL); + /* - * If the application specified the ill (ifindex), we still - * load spread. Only if the packets needs to go out specifically - * on a given ill e.g. binding to IPIF_NOFAILOVER address or - * IPV6_BOUND_PIF, or there is a parent ire entry that specified - * multirouting, then we don't try to use a different ill for - * load spreading. + * Because nce_xmit() calls ip_output_v6() and NCEs are always + * tied to the underlying interface, IS_UNDER_IPMP() may be + * true even when building IREs that will be used for data + * traffic. As such, see if the packet's source address is a + * test address, and if so use that test address's ipif for + * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in + * ire_add_v6() can work properly. + */ + if (IS_UNDER_IPMP(ill)) + probe = ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif); + + /* + * Determine the outbound (destination) ill for this route. + * If IPMP is not in use, that's the same as our ill. If IPMP + * is in-use and we're on the IPMP interface, or we're on an + * underlying ill but sending data traffic, use a suitable + * destination ill from the group. The latter case covers a + * subtle edge condition with multicast: when we bring up an + * IPv6 data address, we will create an NCE on an underlying + * interface, and send solicitations to ff02::1, which would + * take us through here, and cause us to create an IRE for + * ff02::1. To meet our defined semantics for multicast (and + * ensure there aren't unexpected echoes), that IRE needs to + * use the IPMP group's nominated multicast interface. + * + * Note: the source ipif is determined by source address + * selection later. 
*/ - if (attach_ill == NULL) { - /* - * If the interface belongs to an interface group, - * make sure the next possible interface in the group - * is used. This encourages load spreading among peers - * in an interface group. - * - * Note: While we pick a dst_ill we are really only - * interested in the ill for load spreading. The source - * ipif is determined by source address selection below. - */ - if ((fire != NULL) && (fire->ire_flags & RTF_MULTIRT)) { - dst_ill = ipif->ipif_ill; - /* For uniformity do a refhold */ - ill_refhold(dst_ill); + if (IS_IPMP(ill) || (IS_UNDER_IPMP(ill) && !probe)) { + ill_t *ipmp_ill; + ipmp_illgrp_t *illg; + + if (IS_UNDER_IPMP(ill)) { + ipmp_ill = ipmp_ill_hold_ipmp_ill(ill); } else { - /* refheld by ip_newroute_get_dst_ill_v6 */ - dst_ill = - ip_newroute_get_dst_ill_v6(ipif->ipif_ill); + ipmp_ill = ill; + ill_refhold(ipmp_ill); /* for symmetry */ } - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_ipif_v6: " - "no dst ill for dst %s\n", - AF_INET6, v6dstp); - } + + if (ipmp_ill == NULL) goto err_ret; - } + + illg = ipmp_ill->ill_grp; + if (IN6_IS_ADDR_MULTICAST(v6dstp)) + dst_ill = ipmp_illgrp_hold_cast_ill(illg); + else + dst_ill = ipmp_illgrp_hold_next_ill(illg); + + ill_refrele(ipmp_ill); } else { - dst_ill = ipif->ipif_ill; - /* - * ip_wput_v6 passes the right ipif for IPIF_NOFAILOVER - * and IPV6_BOUND_PIF case. - */ - ASSERT(dst_ill == attach_ill); - /* attach_ill is already refheld */ + dst_ill = ill; + ill_refhold(dst_ill); /* for symmetry */ + } + + if (dst_ill == NULL) { + if (ip_debug > 2) { + pr_addr_dbg("ip_newroute_ipif_v6: " + "no dst ill for dst %s\n", + AF_INET6, v6dstp); + } + goto err_ret; } + /* * Pick a source address which matches the scope of the * destination address. * For RTF_SETSRC routes, the source address is imposed by the * parent ire (fire). 
*/ - ASSERT(src_ipif == NULL); - if ((fire != NULL) && (fire->ire_flags & RTF_SETSRC)) { + + if (src_ipif == NULL && fire != NULL && + (fire->ire_flags & RTF_SETSRC)) { /* * Check that the ipif matching the requested source * address still exists. */ - src_ipif = - ipif_lookup_addr_v6(&fire->ire_src_addr_v6, + src_ipif = ipif_lookup_addr_v6(&fire->ire_src_addr_v6, NULL, zoneid, NULL, NULL, NULL, NULL, ipst); } - if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) { - uint_t restrict_ill = RESTRICT_TO_NONE; - if (ip6i_present && ((ip6i_t *)ip6h)->ip6i_flags - & IP6I_ATTACH_IF) - restrict_ill = RESTRICT_TO_ILL; + if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) { ip6_asp_table_held = B_TRUE; src_ipif = ipif_select_source_v6(dst_ill, v6dstp, - restrict_ill, IPV6_PREFER_SRC_DEFAULT, zoneid); + B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); } if (src_ipif == NULL) { @@ -5847,16 +5634,20 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg("ip_newroute_ipif_v6: " - "no src for dst %s\n,", + "no src for dst %s\n", AF_INET6, v6dstp); printf(" through interface %s\n", dst_ill->ill_name); } goto err_ret; } + ire_v6srcp = &ipv6_all_zeros; src_ipif = ipif; ipif_refhold(src_ipif); + } else { + ire_v6srcp = &src_ipif->ipif_v6src_addr; } + ire = ipif_to_ire_v6(ipif); if (ire == NULL) { if (ip_debug > 2) { @@ -5903,7 +5694,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, } } - ASSERT((attach_ill == NULL) || (dst_ill == attach_ill)); switch (ire->ire_type) { case IRE_IF_NORESOLVER: { /* @@ -5921,7 +5711,7 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, ire = ire_create_v6( v6dstp, /* dest address */ &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, /* source address */ + ire_v6srcp, /* source address */ NULL, /* gateway address */ &save_ire->ire_max_frag, NULL, /* no src nce */ @@ -5946,8 +5736,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, break; } - ire->ire_marks |= ire_marks; - err 
= ndp_noresolver(dst_ill, v6dstp); if (err != 0) { ire_refrele(save_ire); @@ -6051,7 +5839,7 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, ire = ire_create_v6( v6dstp, /* dest address */ &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, /* source address */ + ire_v6srcp, /* source address */ NULL, /* gateway address */ &save_ire->ire_max_frag, NULL, /* src nce */ @@ -6076,8 +5864,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, break; } - ire->ire_marks |= ire_marks; - /* Resolve and add ire to the ctable */ err = ndp_resolver(dst_ill, v6dstp, first_mp, zoneid); switch (err) { @@ -6273,8 +6059,8 @@ err_ret: ipif_refrele(ipif); if (src_ipif != NULL) ipif_refrele(src_ipif); + /* Multicast - no point in trying to generate ICMP error */ - ASSERT((attach_ill == NULL) || (dst_ill == attach_ill)); if (dst_ill != NULL) { ill = dst_ill; ill_held = B_TRUE; @@ -6499,7 +6285,7 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, &ip6h->ip6_dst)) { ipif = ipif_select_source_v6( ill, &ip6h->ip6_src, - RESTRICT_TO_GROUP, + B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES); if (ipif != NULL) { @@ -7050,7 +6836,7 @@ ipsec_needs_processing_v6(mblk_t *mp, uint8_t *nexthdr) */ static boolean_t ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present, - ill_t *ill, mblk_t *hada_mp, zoneid_t zoneid) + ill_t *ill, ill_t *inill, mblk_t *hada_mp, zoneid_t zoneid) { mblk_t *mp; uint8_t nexthdr; @@ -7093,7 +6879,7 @@ ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present, */ ii = (ipsec_in_t *)first_mp->b_rptr; ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = ii->ipsec_in_ill_index; + ii->ipsec_in_rill_index = inill->ill_phyint->phyint_ifindex; first_mp->b_cont = mp; } /* @@ -7122,7 +6908,7 @@ ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present, switch (ipsec_rc) { case IPSEC_STATUS_SUCCESS: /* we're done with IPsec processing, send it up */ - 
ip_fanout_proto_again(first_mp, ill, ill, NULL); + ip_fanout_proto_again(first_mp, ill, inill, NULL); break; case IPSEC_STATUS_FAILED: BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); @@ -7225,7 +7011,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, ip6_hbh_t *hbhhdr; boolean_t ll_multicast = (flags & IP6_IN_LLMCAST); conn_t *connp; - ilm_t *ilm; uint32_t ports; zoneid_t zoneid = GLOBAL_ZONEID; uint16_t hck_flags, reass_hck_flags; @@ -7347,10 +7132,8 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, /* * XXX TODO Give to mrouted to for multicast forwarding. */ - ILM_WALKER_HOLD(ill); - ilm = ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, ALL_ZONES); - ILM_WALKER_RELE(ill); - if (ilm == NULL) { + if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE, + ALL_ZONES) == NULL) { if (ip_debug > 3) { /* ip2dbg */ pr_addr_dbg("ip_rput_data_v6: got mcast packet" @@ -7405,7 +7188,7 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst)) { ire = ire_ctable_lookup_v6(&ip6h->ip6_dst, NULL, IRE_CACHE|IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst); + MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); } else { ire = ire_cache_lookup_v6(&ip6h->ip6_dst, ALL_ZONES, MBLK_GETLABEL(mp), ipst); @@ -7466,9 +7249,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, } /* we have a matching IRE */ if (ire->ire_stq != NULL) { - ill_group_t *ill_group; - ill_group_t *ire_group; - /* * To be quicker, we may wish not to chase pointers * (ire->ire_ipif->ipif_ill...) and instead store the @@ -7483,7 +7263,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, no_forward = ((ill->ill_flags & ire->ire_ipif->ipif_ill->ill_flags & ILLF_ROUTER) == 0); - ASSERT(first_mp == mp); /* * This ire has a send-to queue - forward the packet. 
@@ -7568,10 +7347,8 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, * we're forwarding onto the same link), conditionally send * a redirect message. */ - ill_group = ill->ill_group; - ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group; - if (ire->ire_rfq != q && (ill_group == NULL || - ill_group != ire_group)) { + if (ire->ire_rfq != q && + !IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) { if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst) || IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) { BUMP_MIB(ill->ill_ip_mib, @@ -8006,7 +7783,10 @@ tcp_fanout: * where there is no conn. */ if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - ASSERT(!IS_LOOPBACK((ill))); + ilm_t *ilm; + ilm_walker_t ilw; + + ASSERT(!IS_LOOPBACK(ill)); /* * In the multicast case, applications may have * joined the group from different zones, so we @@ -8015,32 +7795,32 @@ tcp_fanout: * structures (ilm) on the receive ill and send * a copy of the packet up each matching one. */ - ILM_WALKER_HOLD(ill); - for (ilm = ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) - continue; + ilm = ilm_walker_start(&ilw, inill); + for (; ilm != NULL; + ilm = ilm_walker_step(&ilw, ilm)) { if (!IN6_ARE_ADDR_EQUAL( &ilm->ilm_v6addr, &ip6h->ip6_dst)) continue; - if (!ipif_lookup_zoneid(ill, - ilm->ilm_zoneid, IPIF_UP, NULL)) + if (!ipif_lookup_zoneid( + ilw.ilw_walk_ill, ilm->ilm_zoneid, + IPIF_UP, NULL)) continue; first_mp1 = ip_copymsg(first_mp); if (first_mp1 == NULL) continue; - icmp_inbound_v6(q, first_mp1, ill, + icmp_inbound_v6(q, first_mp1, + ilw.ilw_walk_ill, inill, hdr_len, mctl_present, 0, ilm->ilm_zoneid, dl_mp); } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } else { first_mp1 = ip_copymsg(first_mp); if (first_mp1 != NULL) icmp_inbound_v6(q, first_mp1, ill, - hdr_len, mctl_present, 0, zoneid, - dl_mp); + inill, hdr_len, mctl_present, 0, + zoneid, dl_mp); } } /* FALLTHRU */ @@ -8082,7 +7862,7 @@ tcp_fanout: /* Check if AH is present. 
*/ if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill, - hada_mp, zoneid)) { + inill, hada_mp, zoneid)) { ip0dbg(("dst early hada drop\n")); return; } @@ -8206,7 +7986,7 @@ tcp_fanout: /* Restore the flags */ DB_CKSUMFLAGS(mp) = hck_flags; - mp = ip_rput_frag_v6(q, mp, ip6h, fraghdr, + mp = ip_rput_frag_v6(ill, inill, mp, ip6h, fraghdr, remlen - used, &prev_nexthdr_offset, &reass_sum, &reass_hck_flags); if (mp == NULL) { @@ -8249,7 +8029,7 @@ tcp_fanout: /* Check if AH is present. */ if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill, - hada_mp, zoneid)) { + inill, hada_mp, zoneid)) { ip0dbg(("routing hada drop\n")); return; } @@ -8322,7 +8102,7 @@ tcp_fanout: ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; ii->ipsec_in_rill_index = - ii->ipsec_in_ill_index; + inill->ill_phyint->phyint_ifindex; first_mp->b_cont = mp; /* * Cache hardware acceleration info. @@ -8480,11 +8260,10 @@ hada_drop: * nexthdr field when reassembly completes. */ static mblk_t * -ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, +ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset, uint32_t *cksum_val, uint16_t *cksum_flags) { - ill_t *ill = (ill_t *)q->q_ptr; uint32_t ident = ntohl(fraghdr->ip6f_ident); uint16_t offset; boolean_t more_frags; @@ -8518,8 +8297,8 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, * addition, checksum offload support for IP fragments carrying * UDP payload is commonly implemented across network adapters. 
*/ - ASSERT(ill != NULL); - if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) && + ASSERT(inill != NULL); + if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(inill) && (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { mblk_t *mp1 = mp->b_cont; int32_t len; @@ -8581,7 +8360,7 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, freemsg(mp); return (NULL); } - icmp_param_problem_v6(WR(q), mp, ICMP6_PARAMPROB_HEADER, + icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER, (uint32_t)((char *)&ip6h->ip6_plen - (char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst); return (NULL); @@ -8607,7 +8386,7 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, freemsg(mp); return (NULL); } - icmp_param_problem_v6(WR(q), mp, ICMP6_PARAMPROB_HEADER, + icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER, (uint32_t)((char *)&fraghdr->ip6f_offlg - (char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst); return (NULL); @@ -9204,16 +8983,14 @@ ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst) * The routine can handle an ICMPv6 header that is not in the first mblk. * * The order to determine the outgoing interface is as follows: - * 1. IPV6_BOUND_PIF is set, use that ill (conn_outgoing_pill) - * 2. If conn_nofailover_ill is set then use that ill. - * 3. If an ip6i_t with IP6I_IFINDEX set then use that ill. - * 4. If q is an ill queue and (link local or multicast destination) then + * 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. + * 2. If q is an ill queue and (link local or multicast destination) then * use that ill. - * 5. If IPV6_BOUND_IF has been set use that ill. - * 6. For multicast: if IPV6_MULTICAST_IF has been set use it. Otherwise + * 3. If IPV6_BOUND_IF has been set use that ill. + * 4. For multicast: if IPV6_MULTICAST_IF has been set use it. Otherwise * look for the best IRE match for the unspecified group to determine * the ill. - * 7. For unicast: Just do an IRE lookup for the best match. + * 5. 
For unicast: Just do an IRE lookup for the best match. * * arg2 is always a queue_t *. * When that queue is an ill_t (i.e. q_next != NULL), then arg must be @@ -9238,12 +9015,10 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) int unspec_src; boolean_t do_outrequests; /* Increment OutRequests? */ mib2_ipIfStatsEntry_t *mibptr; - int match_flags = MATCH_IRE_ILL_GROUP; - boolean_t attach_if = B_FALSE; + int match_flags = MATCH_IRE_ILL; mblk_t *first_mp; boolean_t mctl_present; ipsec_out_t *io; - boolean_t drop_if_delayed = B_FALSE; boolean_t multirt_need_resolve = B_FALSE; mblk_t *copy_mp = NULL; int err = 0; @@ -9574,16 +9349,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) */ mp->b_rptr = (uchar_t *)ip6h; - /* - * IP6I_ATTACH_IF is set in this function when we had a - * conn and it was either bound to the IPFF_NOFAILOVER address - * or IPV6_BOUND_PIF was set. These options override other - * options that set the ifindex. We come here with - * IP6I_ATTACH_IF set when we can't find the ire and - * ip_newroute_v6 is feeding the packet for second time. - */ - if ((ip6i->ip6i_flags & IP6I_IFINDEX) || - (ip6i->ip6i_flags & IP6I_ATTACH_IF)) { + if (ip6i->ip6i_flags & IP6I_IFINDEX) { ASSERT(ip6i->ip6i_ifindex != 0); if (ill != NULL) ill_refrele(ill); @@ -9603,33 +9369,13 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) return; } mibptr = ill->ill_ip_mib; - if (ip6i->ip6i_flags & IP6I_IFINDEX) { - /* - * Preserve the index so that when we return - * from IPSEC processing, we know where to - * send the packet. - */ - if (mctl_present) { - ASSERT(io != NULL); - io->ipsec_out_ill_index = - ip6i->ip6i_ifindex; - } - } - if (ip6i->ip6i_flags & IP6I_ATTACH_IF) { - /* - * This is a multipathing probe packet that has - * been delayed in ND resolution. 
Drop the - * packet for the reasons mentioned in - * nce_queue_mp() - */ - if ((ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) && - (ip6i->ip6i_flags & IP6I_ND_DELAYED)) { - freemsg(first_mp); - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } + /* + * Preserve the index so that when we return from + * IPSEC processing, we know where to send the packet. + */ + if (mctl_present) { + ASSERT(io != NULL); + io->ipsec_out_ill_index = ip6i->ip6i_ifindex; } } if (ip6i->ip6i_flags & IP6I_VERIFY_SRC) { @@ -9698,114 +9444,20 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) if (IN6_IS_ADDR_MULTICAST(v6dstp)) goto ipv6multicast; - /* 1. IPV6_BOUND_PIF takes precedence over all the ifindex settings. */ - if (connp != NULL && connp->conn_outgoing_pill != NULL) { - ill_t *conn_outgoing_pill; - - conn_outgoing_pill = conn_get_held_ill(connp, - &connp->conn_outgoing_pill, &err); - if (err == ILL_LOOKUP_FAILED) { - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - if (conn_outgoing_pill != NULL) { - if (ill != NULL) - ill_refrele(ill); - ill = conn_outgoing_pill; - attach_if = B_TRUE; - match_flags = MATCH_IRE_ILL; - mibptr = ill->ill_ip_mib; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - goto send_from_ill; - } - } - - /* 2. If ipc_nofailover_ill is set then use that ill. 
*/ - if (connp != NULL && connp->conn_nofailover_ill != NULL) { - ill_t *conn_nofailover_ill; - - conn_nofailover_ill = conn_get_held_ill(connp, - &connp->conn_nofailover_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - if (conn_nofailover_ill != NULL) { - if (ill != NULL) - ill_refrele(ill); - ill = conn_nofailover_ill; - attach_if = B_TRUE; - /* - * Assumes that ipc_nofailover_ill is used only for - * multipathing probe packets. These packets are better - * dropped, if they are delayed in ND resolution, for - * the reasons described in nce_queue_mp(). - * IP6I_DROP_IFDELAYED will be set later on in this - * function for this packet. - */ - drop_if_delayed = B_TRUE; - match_flags = MATCH_IRE_ILL; - mibptr = ill->ill_ip_mib; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - goto send_from_ill; - } - } - - /* - * Redo 1. If we did not find an IRE_CACHE the first time, we should - * have an ip6i_t with IP6I_ATTACH_IF if IPV6_BOUND_PIF or - * bind to the IPIF_NOFAILOVER address was used on this endpoint. - */ - if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_ATTACH_IF)) { - ASSERT(ip6i->ip6i_ifindex != 0); - attach_if = B_TRUE; - ASSERT(ill != NULL); - match_flags = MATCH_IRE_ILL; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - goto send_from_ill; - } - - /* 3. If an ip6i_t with IP6I_IFINDEX set then use that ill. */ + /* 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. */ if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) { ASSERT(ill != NULL); goto send_from_ill; } /* - * 4. If q is an ill queue and (link local or multicast destination) + * 2. 
If q is an ill queue and there's a link-local destination * then use that ill. */ - if (ill != NULL && IN6_IS_ADDR_LINKLOCAL(v6dstp)) { + if (ill != NULL && IN6_IS_ADDR_LINKLOCAL(v6dstp)) goto send_from_ill; - } - /* 5. If IPV6_BOUND_IF has been set use that ill. */ + /* 3. If IPV6_BOUND_IF has been set use that ill. */ if (connp != NULL && connp->conn_outgoing_ill != NULL) { ill_t *conn_outgoing_ill; @@ -9827,7 +9479,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) } /* - * 6. For unicast: Just do an IRE lookup for the best match. + * 4. For unicast: Just do an IRE lookup for the best match. * If we get here for a link-local address it is rather random * what interface we pick on a multihomed host. * *If* there is an IRE_CACHE (and the link-local address @@ -9913,7 +9565,6 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) } BUMP_MIB(mibptr, ipIfStatsHCOutRequests); } - ASSERT(!attach_if); /* * Check if the ire has the RTF_MULTIRT flag, inherited @@ -9966,7 +9617,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) } } ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request, - connp, caller, 0, ip6i_flags, zoneid); + connp, caller, ip6i_flags, zoneid); if (need_decref) { CONN_DEC_REF(connp); connp = NULL; @@ -10086,9 +9737,6 @@ ipv6multicast: ip2dbg(("ip_wput_v6: multicast\n")); /* - * 1. IPV6_BOUND_PIF takes precedence over all the ifindex settings - * 2. If conn_nofailover_ill is set then use that ill. - * * Hold the conn_lock till we refhold the ill of interest that is * pointed to from the conn. 
Since we cannot do an ill/ipif_refrele * while holding any locks, postpone the refrele until after the @@ -10100,79 +9748,12 @@ ipv6multicast: } else { conn_lock_held = B_FALSE; } - if (connp != NULL && connp->conn_outgoing_pill != NULL) { - err = ill_check_and_refhold(connp->conn_outgoing_pill); - if (err == ILL_LOOKUP_FAILED) { - ip1dbg(("ip_output_v6: multicast" - " conn_outgoing_pill no ipif\n")); -multicast_discard: - ASSERT(saved_ill == NULL); - if (conn_lock_held) - mutex_exit(&connp->conn_lock); - if (ill != NULL) - ill_refrele(ill); - freemsg(first_mp); - if (do_outrequests) - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - saved_ill = ill; - ill = connp->conn_outgoing_pill; - attach_if = B_TRUE; - match_flags = MATCH_IRE_ILL; - mibptr = ill->ill_ip_mib; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - } else if (connp != NULL && connp->conn_nofailover_ill != NULL) { - err = ill_check_and_refhold(connp->conn_nofailover_ill); - if (err == ILL_LOOKUP_FAILED) { - ip1dbg(("ip_output_v6: multicast" - " conn_nofailover_ill no ipif\n")); - goto multicast_discard; - } - saved_ill = ill; - ill = connp->conn_nofailover_ill; - attach_if = B_TRUE; - match_flags = MATCH_IRE_ILL; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - } else if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_ATTACH_IF)) { - /* - * Redo 1. If we did not find an IRE_CACHE the first time, - * we should have an ip6i_t with IP6I_ATTACH_IF if - * IPV6_BOUND_PIF or bind to the IPIF_NOFAILOVER address was - * used on this endpoint. 
- */ - ASSERT(ip6i->ip6i_ifindex != 0); - attach_if = B_TRUE; - ASSERT(ill != NULL); - match_flags = MATCH_IRE_ILL; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - } else if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) { - /* 3. If an ip6i_t with IP6I_IFINDEX set then use that ill. */ - + if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) { + /* 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. */ ASSERT(ill != NULL); } else if (ill != NULL) { /* - * 4. If q is an ill queue and (link local or multicast + * 2. If q is an ill queue and (link local or multicast * destination) then use that ill. * We don't need the ipif initialization here. * This useless assert below is just to prevent lint from @@ -10181,9 +9762,9 @@ multicast_discard: ASSERT(ill != NULL); } else if (connp != NULL) { /* - * 5. If IPV6_BOUND_IF has been set use that ill. + * 3. If IPV6_BOUND_IF has been set use that ill. * - * 6. For multicast: if IPV6_MULTICAST_IF has been set use it. + * 4. For multicast: if IPV6_MULTICAST_IF has been set use it. * Otherwise look for the best IRE match for the unspecified * group to determine the ill. 
* @@ -10198,7 +9779,18 @@ multicast_discard: if (err == ILL_LOOKUP_FAILED) { ip1dbg(("ip_output_v6: multicast" " conn_outgoing_ill no ipif\n")); - goto multicast_discard; +multicast_discard: + ASSERT(saved_ill == NULL); + if (conn_lock_held) + mutex_exit(&connp->conn_lock); + if (ill != NULL) + ill_refrele(ill); + freemsg(first_mp); + if (do_outrequests) + BUMP_MIB(mibptr, ipIfStatsOutDiscards); + if (need_decref) + CONN_DEC_REF(connp); + return; } ill = connp->conn_outgoing_ill; } else if (connp->conn_multicast_ill != NULL) { @@ -10239,8 +9831,6 @@ multicast_discard: */ mutex_enter(&connp->conn_lock); connp->conn_multicast_ill = ill; - connp->conn_orig_multicast_ifindex = - ill->ill_phyint->phyint_ifindex; mutex_exit(&connp->conn_lock); } } @@ -10307,11 +9897,55 @@ multicast_discard: send_from_ill: ASSERT(ill != NULL); ASSERT(mibptr == ill->ill_ip_mib); + if (do_outrequests) { BUMP_MIB(mibptr, ipIfStatsHCOutRequests); do_outrequests = B_FALSE; } + /* + * Because nce_xmit() calls ip_output_v6() and NCEs are always tied to + * an underlying interface, IS_UNDER_IPMP() may be true even when + * building IREs that will be used for data traffic. As such, use the + * packet's source address to determine whether the traffic is test + * traffic, and set MATCH_IRE_MARK_TESTHIDDEN if so. + * + * Separately, we also need to mark probe packets so that ND can + * process them specially; see the comments in nce_queue_mp_common(). 
+ */ + if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && + ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL)) { + if (ip6i == NULL) { + if ((mp = ip_add_info_v6(mp, NULL, v6dstp)) == NULL) { + if (mctl_present) + freeb(first_mp); + goto discard; + } + + if (mctl_present) + first_mp->b_cont = mp; + else + first_mp = mp; + + /* ndp_resolver() expects a pulled-up message */ + if (MBLKL(mp) == sizeof (ip6i_t) && + pullupmsg(mp, -1) == 0) { + ip1dbg(("ip_output_v6: pullupmsg failed\n")); +discard: BUMP_MIB(mibptr, ipIfStatsOutDiscards); + ill_refrele(ill); + if (need_decref) + CONN_DEC_REF(connp); + return; + } + ip6i = (ip6i_t *)mp->b_rptr; + ip6h = (ip6_t *)&ip6i[1]; + v6dstp = &ip6h->ip6_dst; + mp->b_rptr = (uchar_t *)ip6h; /* rewound below */ + } + ip6i->ip6i_flags |= IP6I_IPMP_PROBE; + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + } + if (io != NULL) io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; @@ -10390,9 +10024,7 @@ send_from_ill: ill->ill_name, (void *)ire, ill->ill_phyint->phyint_ifindex)); ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request, - connp, caller, - (attach_if ? ill->ill_phyint->phyint_ifindex : 0), - ip6i_flags, zoneid); + connp, caller, ip6i_flags, zoneid); ire_refrele(ire); if (need_decref) { CONN_DEC_REF(connp); @@ -10422,7 +10054,8 @@ send_from_ill: return; } ip_newroute_ipif_v6(q, copy_mp, ipif, - ip6h->ip6_dst, unspec_src, zoneid); + &ip6h->ip6_dst, &ip6h->ip6_src, unspec_src, + zoneid); ipif_refrele(ipif); } else { ip_newroute_v6(q, copy_mp, &ip6h->ip6_dst, @@ -10440,12 +10073,11 @@ send_from_ill: /* Update rptr if there was an ip6i_t header. */ if (ip6i != NULL) mp->b_rptr -= sizeof (ip6i_t); - if (unspec_src || attach_if) { + if (unspec_src) { if (ip6i == NULL) { /* * Add ip6i_t header to carry unspec_src - * or attach_if until the packet comes back in - * ip_wput_v6. + * until the packet comes back in ip_wput_v6. 
*/ if (mctl_present) { first_mp->b_cont = @@ -10481,28 +10113,15 @@ send_from_ill: ip6h = (ip6_t *)&ip6i[1]; v6dstp = &ip6h->ip6_dst; } - if (unspec_src) - ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; - if (attach_if) { - /* - * Bind to nofailover/BOUND_PIF overrides ifindex. - */ - ip6i->ip6i_flags |= IP6I_ATTACH_IF; - ip6i->ip6i_flags &= ~IP6I_IFINDEX; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; - if (drop_if_delayed) { - /* This is a multipathing probe packet */ - ip6i->ip6i_flags |= IP6I_DROP_IFDELAYED; - } - } + ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; if (mctl_present) { ASSERT(io != NULL); io->ipsec_out_unspec_src = unspec_src; } } if (IN6_IS_ADDR_MULTICAST(v6dstp)) { - ip_newroute_ipif_v6(q, first_mp, ill->ill_ipif, *v6dstp, - unspec_src, zoneid); + ip_newroute_ipif_v6(q, first_mp, ill->ill_ipif, v6dstp, + &ip6h->ip6_src, unspec_src, zoneid); } else { ip_newroute_v6(q, first_mp, v6dstp, &ip6h->ip6_src, ill, zoneid, ipst); @@ -10544,14 +10163,6 @@ ip_wput_v6(queue_t *q, mblk_t *mp) ip_output_v6(GLOBAL_ZONEID, mp, q, IP_WPUT); } -static void -ipsec_out_attach_if(ipsec_out_t *io, int attach_index) -{ - ASSERT(io->ipsec_out_type == IPSEC_OUT); - io->ipsec_out_attach_if = B_TRUE; - io->ipsec_out_ill_index = attach_index; -} - /* * NULL send-to queue - packet is to be delivered locally. */ @@ -10731,6 +10342,8 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, */ if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && !IS_LOOPBACK(ill)) { + ilm_walker_t ilw; + /* * In the multicast case, applications may have * joined the group from different zones, so we @@ -10742,11 +10355,9 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, * on the loopback interface (PHYI_LOOPBACK flag * set) as they must stay in the sender's zone. 
*/ - ILM_WALKER_HOLD(ill); - for (ilm = ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) - continue; + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; + ilm = ilm_walker_step(&ilw, ilm)) { if (!IN6_ARE_ADDR_EQUAL( &ilm->ilm_v6addr, &ip6h->ip6_dst)) continue; @@ -10754,23 +10365,24 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, IP_FF_NO_MCAST_LOOP) && ilm->ilm_zoneid == ire->ire_zoneid) continue; - if (!ipif_lookup_zoneid(ill, - ilm->ilm_zoneid, IPIF_UP, NULL)) + if (!ipif_lookup_zoneid( + ilw.ilw_walk_ill, ilm->ilm_zoneid, + IPIF_UP, NULL)) continue; first_mp1 = ip_copymsg(first_mp); if (first_mp1 == NULL) continue; - icmp_inbound_v6(q, first_mp1, ill, - hdr_length, mctl_present, - IP6_NO_IPPOLICY, ilm->ilm_zoneid, - NULL); + icmp_inbound_v6(q, first_mp1, + ilw.ilw_walk_ill, ill, hdr_length, + mctl_present, IP6_NO_IPPOLICY, + ilm->ilm_zoneid, NULL); } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } else { first_mp1 = ip_copymsg(first_mp); if (first_mp1 != NULL) - icmp_inbound_v6(q, first_mp1, ill, + icmp_inbound_v6(q, first_mp1, ill, ill, hdr_length, mctl_present, IP6_NO_IPPOLICY, ire->ire_zoneid, NULL); @@ -10823,8 +10435,7 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, */ static void ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, - int cksum_request, conn_t *connp, int caller, int attach_index, int flags, - zoneid_t zoneid) + int cksum_request, conn_t *connp, int caller, int flags, zoneid_t zoneid) { ip6_t *ip6h; uint8_t nexthdr; @@ -10917,7 +10528,7 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, if (src_ire != NULL && !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) && (!ipst->ips_ip_restrict_interzone_loopback || - ire_local_same_ill_group(ire, src_ire))) { + ire_local_same_lan(ire, src_ire))) { if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && !unspec_src) { ip6h->ip6_src = src_ire->ire_src_addr_v6; @@ 
-10974,20 +10585,14 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, /* * Select the source address using ipif_select_source_v6. */ - if (attach_index != 0) { - ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst, - RESTRICT_TO_ILL, IPV6_PREFER_SRC_DEFAULT, zoneid); - } else { - ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst, - RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, zoneid); - } + ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst, B_FALSE, + IPV6_PREFER_SRC_DEFAULT, zoneid); if (ipif == NULL) { if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg("ip_wput_ire_v6: no src for " - "dst %s\n, ", AF_INET6, &ip6h->ip6_dst); - printf("ip_wput_ire_v6: interface name %s\n", - ill->ill_name); + "dst %s\n", AF_INET6, &ip6h->ip6_dst); + printf("through interface %s\n", ill->ill_name); } freemsg(first_mp); return; @@ -10998,12 +10603,8 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { if ((connp != NULL && connp->conn_multicast_loop) || !IS_LOOPBACK(ill)) { - ilm_t *ilm; - - ILM_WALKER_HOLD(ill); - ilm = ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, ALL_ZONES); - ILM_WALKER_RELE(ill); - if (ilm != NULL) { + if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE, + ALL_ZONES) != NULL) { mblk_t *nmp; int fanout_flags = 0; @@ -11417,8 +11018,6 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, } /* Do IPSEC processing first */ if (mctl_present) { - if (attach_index != 0) - ipsec_out_attach_if(io, attach_index); ipsec_out_process(q, first_mp, ire, ill_index); return; } @@ -11456,8 +11055,6 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, max_frag, B_FALSE, B_TRUE, zoneid, ipst); return; } - if (attach_index != 0) - ipsec_out_attach_if(io, attach_index); ipsec_out_process(q, first_mp, ire, ill_index); return; } @@ -11948,8 +11545,8 @@ boolean_t conn_wantpacket_v6(conn_t *connp, ill_t *ill, ip6_t *ip6h, int fanout_flags, zoneid_t zoneid) { - ill_t *in_ill; - boolean_t 
wantpacket = B_TRUE; + ill_t *bound_ill; + boolean_t wantpacket; in6_addr_t *v6dst_ptr = &ip6h->ip6_dst; in6_addr_t *v6src_ptr = &ip6h->ip6_src; @@ -11958,42 +11555,16 @@ conn_wantpacket_v6(conn_t *connp, ill_t *ill, ip6_t *ip6h, int fanout_flags, * unicast and multicast reception to conn_incoming_ill. * conn_wantpacket_v6 is called both for unicast and * multicast. - * - * 1) The unicast copy of the packet can come anywhere in - * the ill group if it is part of the group. Thus, we - * need to check to see whether the ill group matches - * if in_ill is part of a group. - * - * 2) ip_rput does not suppress duplicate multicast packets. - * If there are two interfaces in a ill group and we have - * 2 applications (conns) joined a multicast group G on - * both the interfaces, ilm_lookup_ill filter in ip_rput - * will give us two packets because we join G on both the - * interfaces rather than nominating just one interface - * for receiving multicast like broadcast above. So, - * we have to call ilg_lookup_ill to filter out duplicate - * copies, if ill is part of a group, to supress duplicates. */ - in_ill = connp->conn_incoming_ill; - if (in_ill != NULL) { - mutex_enter(&connp->conn_lock); - in_ill = connp->conn_incoming_ill; - mutex_enter(&ill->ill_lock); - /* - * No IPMP, and the packet did not arrive on conn_incoming_ill - * OR, IPMP in use and the packet arrived on an IPMP group - * different from the conn_incoming_ill's IPMP group. - * Reject the packet. 
- */ - if ((in_ill->ill_group == NULL && in_ill != ill) || - (in_ill->ill_group != NULL && - in_ill->ill_group != ill->ill_group)) { - wantpacket = B_FALSE; + bound_ill = connp->conn_incoming_ill; + if (bound_ill != NULL) { + if (IS_IPMP(bound_ill)) { + if (bound_ill->ill_grp != ill->ill_grp) + return (B_FALSE); + } else { + if (bound_ill != ill) + return (B_FALSE); } - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - if (!wantpacket) - return (B_FALSE); } if (connp->conn_multi_router) @@ -12140,7 +11711,7 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, (IN6_ARE_ADDR_EQUAL(&first_ire->ire_addr_v6, &ire->ire_addr_v6)) && !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) break; } @@ -12204,8 +11775,7 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, &ire->ire_addr_v6)) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) + IRE_MARK_CONDEMNED) continue; /* Got one */ @@ -13279,3 +12849,31 @@ ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah) size += ehdrlen; } } + +/* + * Utility routine that checks if `v6srcp' is a valid address on underlying + * interface `ill'. If `ipifp' is non-NULL, it's set to a held ipif + * associated with `v6srcp' on success. NOTE: if this is not called from + * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the + * group during or after this lookup. 
+ */ +static boolean_t +ipif_lookup_testaddr_v6(ill_t *ill, const in6_addr_t *v6srcp, ipif_t **ipifp) +{ + ipif_t *ipif; + + ipif = ipif_lookup_addr_exact_v6(v6srcp, ill, ill->ill_ipst); + if (ipif != NULL) { + if (ipifp != NULL) + *ipifp = ipif; + else + ipif_refrele(ipif); + return (B_TRUE); + } + + if (ip_debug > 2) { + pr_addr_dbg("ipif_lookup_testaddr_v6: cannot find ipif for " + "src %s\n", AF_INET6, v6srcp); + } + return (B_FALSE); +} diff --git a/usr/src/uts/common/inet/ip/ip6_if.c b/usr/src/uts/common/inet/ip/ip6_if.c index 81447c2e30..c729118fec 100644 --- a/usr/src/uts/common/inet/ip/ip6_if.c +++ b/usr/src/uts/common/inet/ip/ip6_if.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -53,7 +53,6 @@ #include <netinet/igmp_var.h> #include <netinet/ip6.h> #include <netinet/icmp6.h> -#include <netinet/in.h> #include <inet/common.h> #include <inet/nd.h> @@ -178,10 +177,12 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -202,16 +203,12 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, } /* - * Look for an ipif with the specified address. For point-point links - * we look for matches on either the destination address and the local - * address, but we ignore the check on the local address if IPIF_UNNUMBERED - * is set. - * Matches on a specific ill if match_ill is set. + * Common function for ipif_lookup_addr_v6() and ipif_lookup_addr_exact_v6(). 
*/ -/* ARGSUSED */ -ipif_t * -ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +static ipif_t * +ipif_lookup_addr_common_v6(const in6_addr_t *addr, ill_t *match_ill, + boolean_t match_illgrp, zoneid_t zoneid, queue_t *q, mblk_t *mp, + ipsq_func_t func, int *error, ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; @@ -230,7 +227,8 @@ ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid, repeat: ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (match_ill != NULL && ill != match_ill) { + if (match_ill != NULL && ill != match_ill && + (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { continue; } GRAB_CONN_LOCK(q); @@ -257,10 +255,12 @@ repeat: } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -323,11 +323,41 @@ ip_addr_exists_v6(const in6_addr_t *addr, zoneid_t zoneid, } /* + * Lookup an ipif with the specified address. For point-to-point links we + * look for matches on either the destination address or the local address, + * but we skip the local address check if IPIF_UNNUMBERED is set. If the + * `match_ill' argument is non-NULL, the lookup is restricted to that ill + * (or illgrp if `match_ill' is in an IPMP group). 
+ */ +ipif_t * +ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid, + queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +{ + return (ipif_lookup_addr_common_v6(addr, match_ill, B_TRUE, zoneid, q, + mp, func, error, ipst)); +} + +/* + * Special abbreviated version of ipif_lookup_addr_v6() that doesn't match + * `match_ill' across the IPMP group. This function is only needed in some + * corner-cases; almost everything should use ipif_lookup_addr_v6(). + */ +ipif_t * +ipif_lookup_addr_exact_v6(const in6_addr_t *addr, ill_t *match_ill, + ip_stack_t *ipst) +{ + ASSERT(match_ill != NULL); + return (ipif_lookup_addr_common_v6(addr, match_ill, B_FALSE, ALL_ZONES, + NULL, NULL, NULL, NULL, ipst)); +} + +/* * Look for an ipif with the specified address. For point-point links * we look for matches on either the destination address and the local * address, but we ignore the check on the local address if IPIF_UNNUMBERED * is set. - * Matches on a specific ill if match_ill is set. + * If the `match_ill' argument is non-NULL, the lookup is restricted to that + * ill (or illgrp if `match_ill' is in an IPMP group). * Return the zoneid for the ipif. ALL_ZONES if none found. 
*/ zoneid_t @@ -348,7 +378,8 @@ ipif_lookup_addr_zoneid_v6(const in6_addr_t *addr, ill_t *match_ill, repeat: ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (match_ill != NULL && ill != match_ill) { + if (match_ill != NULL && ill != match_ill && + !IS_IN_SAME_ILLGRP(ill, match_ill)) { continue; } mutex_enter(&ill->ill_lock); @@ -1120,11 +1151,10 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, boolean_t ill_setdefaulttoken(ill_t *ill) { - int i; + int i; in6_addr_t v6addr, v6mask; - if (!MEDIA_V6INTFID(ill->ill_media, ill->ill_phys_addr_length, - ill->ill_phys_addr, &v6addr)) + if (!MEDIA_V6INTFID(ill->ill_media, ill, &v6addr)) return (B_FALSE); (void) ip_plen_to_mask_v6(IPV6_TOKEN_LEN, &v6mask); @@ -1161,7 +1191,7 @@ ipif_set_tun_auto_addr(ipif_t *ipif, struct iftun_req *ta) { sin6_t sin6; sin_t *sin; - ill_t *ill = ipif->ipif_ill; + ill_t *ill = ipif->ipif_ill; tun_t *tp = (tun_t *)ill->ill_wq->q_next->q_ptr; if (ta->ifta_saddr.ss_family != AF_INET || @@ -1227,7 +1257,7 @@ ipif_set_tun_llink(ill_t *ill, struct iftun_req *ta) if ((ta->ifta_flags & IFTUN_DST) && IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)) { - sin6_t sin6; + sin6_t sin6; ASSERT(!(ipif->ipif_flags & IPIF_UP)); bzero(&sin6, sizeof (sin6_t)); @@ -1344,13 +1374,22 @@ ipif_ndp_setup_multicast(ipif_t *ipif, nce_t **ret_nce) if (ret_nce != NULL) *ret_nce = NULL; + + /* + * IPMP meta-interfaces don't have any inherent multicast mappings, + * and instead use the ones on the underlying interfaces. + */ + if (IS_IPMP(ill)) + return (0); + /* * Delete the mapping nce. Normally these should not exist * as a previous ipif_down -> ipif_ndp_down should have deleted * all the nces. But they can exist if ip_rput_dlpi_writer - * calls this when PHYI_MULTI_BCAST is set. + * calls this when PHYI_MULTI_BCAST is set. Mappings are always + * tied to the underlying ill, so don't match across the illgrp. 
*/ - mnce = ndp_lookup_v6(ill, &v6_mcast_addr, B_FALSE); + mnce = ndp_lookup_v6(ill, B_FALSE, &v6_mcast_addr, B_FALSE); if (mnce != NULL) { ndp_delete(mnce); NCE_REFRELE(mnce); @@ -1424,13 +1463,15 @@ ipif_ndp_setup_multicast(ipif_t *ipif, nce_t **ret_nce) * Get the resolver set up for a new ipif. (Always called as writer.) */ int -ipif_ndp_up(ipif_t *ipif) +ipif_ndp_up(ipif_t *ipif, boolean_t initial) { ill_t *ill = ipif->ipif_ill; int err = 0; nce_t *nce = NULL; nce_t *mnce = NULL; + boolean_t added_ipif = B_FALSE; + ASSERT(IAM_WRITER_ILL(ill)); ip1dbg(("ipif_ndp_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); /* @@ -1464,7 +1505,10 @@ ipif_ndp_up(ipif_t *ipif) if ((ipif->ipif_flags & (IPIF_UNNUMBERED|IPIF_NOLOCAL)) == 0) { uint16_t flags; - uchar_t *hw_addr = NULL; + uint16_t state; + uchar_t *hw_addr = NULL; + ill_t *bound_ill; + ipmp_illgrp_t *illg = ill->ill_grp; /* Permanent entries don't need NUD */ flags = NCE_F_PERMANENT | NCE_F_NONUD; @@ -1474,26 +1518,65 @@ ipif_ndp_up(ipif_t *ipif) if (ipif->ipif_flags & IPIF_ANYCAST) flags |= NCE_F_ANYCAST; - if (ill->ill_net_type == IRE_IF_RESOLVER) { - hw_addr = ill->ill_nd_lla; - - if (ill->ill_move_in_progress) { - /* - * Addresses are failing over to this ill. - * Don't wait for NUD to see this change. - * Publish our new link-layer address. - */ - flags |= NCE_F_UNSOL_ADV; + if (IS_IPMP(ill)) { + ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); + /* + * If we're here via ipif_up(), then the ipif won't be + * bound yet -- add it to the group, which will bind + * it if possible. (We would add it in ipif_up(), but + * deleting on failure there is gruesome.) If we're + * here via ipmp_ill_bind_ipif(), then the ipif has + * already been added to the group and we just need to + * use the binding. + */ + if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) { + bound_ill = ipmp_illgrp_add_ipif(illg, ipif); + if (bound_ill == NULL) { + /* + * We couldn't bind the ipif to an ill + * yet, so we have nothing to publish. 
+ * Set ipif_addr_ready so that this + * address can be used locally for now. + * The routing socket message will be + * sent from ipif_up_done_v6(). + */ + ipif->ipif_addr_ready = 1; + return (0); + } + added_ipif = B_TRUE; } + hw_addr = bound_ill->ill_nd_lla; + } else { + bound_ill = ill; + if (ill->ill_net_type == IRE_IF_RESOLVER) + hw_addr = ill->ill_nd_lla; + } + + /* + * If this is an initial bring-up (or the ipif was never + * completely brought up), do DAD. Otherwise, we're here + * because IPMP has rebound an address to this ill: send + * unsolicited advertisements to inform others. + */ + if (initial || !ipif->ipif_addr_ready) { + state = ND_PROBE; + } else { + state = ND_REACHABLE; + flags |= NCE_F_UNSOL_ADV; } - err = ndp_lookup_then_add_v6(ill, + /* + * NOTE: for IPMP, local addresses are always associated with + * the ill they're bound to, so don't match across the illgrp. + */ + err = ndp_lookup_then_add_v6(bound_ill, + B_FALSE, hw_addr, &ipif->ipif_v6lcl_addr, &ipv6_all_ones, &ipv6_all_zeros, 0, flags, - ND_PROBE, /* Causes Duplicate Address Detection to run */ + state, &nce); switch (err) { case 0: @@ -1509,19 +1592,11 @@ ipif_ndp_up(ipif_t *ipif) NCE_REFRELE(nce); ip1dbg(("ipif_ndp_up: NCE already exists for %s\n", ill->ill_name)); - if (mnce != NULL) { - ndp_delete(mnce); - NCE_REFRELE(mnce); - } - return (err); + goto fail; default: - ip1dbg(("ipif_ndp_up: NCE creation failed %s\n", + ip1dbg(("ipif_ndp_up: NCE creation failed for %s\n", ill->ill_name)); - if (mnce != NULL) { - ndp_delete(mnce); - NCE_REFRELE(mnce); - } - return (err); + goto fail; } } else { /* No local NCE for this entry */ @@ -1532,6 +1607,15 @@ ipif_ndp_up(ipif_t *ipif) if (mnce != NULL) NCE_REFRELE(mnce); return (0); +fail: + if (mnce != NULL) { + ndp_delete(mnce); + NCE_REFRELE(mnce); + } + if (added_ipif) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + + return (err); } /* Remove all cache entries for this logical interface */ @@ -1539,23 +1623,42 @@ void 
ipif_ndp_down(ipif_t *ipif) { nce_t *nce; + ill_t *ill = ipif->ipif_ill; + + ASSERT(IAM_WRITER_ILL(ill)); if (ipif->ipif_isv6) { - nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, - B_FALSE); - if (nce != NULL) { - ndp_delete(nce); - NCE_REFRELE(nce); + ill_t *bound_ill; + + if (IS_IPMP(ill)) + bound_ill = ipmp_ipif_bound_ill(ipif); + else + bound_ill = ill; + + if (bound_ill != NULL) { + nce = ndp_lookup_v6(bound_ill, + B_FALSE, /* see comment in ipif_ndp_up() */ + &ipif->ipif_v6lcl_addr, + B_FALSE); + if (nce != NULL) { + ndp_delete(nce); + NCE_REFRELE(nce); + } } + + /* + * Make IPMP aware of the deleted data address. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); } + /* * Remove mapping and all other nces dependent on this ill * when the last ipif is going away. */ - if (ipif->ipif_ill->ill_ipif_up_count == 0) { - ndp_walk(ipif->ipif_ill, (pfi_t)ndp_delete_per_ill, - (uchar_t *)ipif->ipif_ill, ipif->ipif_ill->ill_ipst); - } + if (ill->ill_ipif_up_count == 0) + ndp_walk(ill, (pfi_t)ndp_delete_per_ill, ill, ill->ill_ipst); } /* @@ -1936,9 +2039,7 @@ rule_preferred(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, } /* - * Prefer source addresses that are assigned to the outgoing interface, or - * to an interface that is in the same IPMP group as the outgoing - * interface. + * Prefer source addresses that are assigned to the outgoing interface. 
*/ /* ARGSUSED3 */ static rule_res_t @@ -1955,15 +2056,11 @@ rule_interface(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, return (CAND_TIE); if (!bc->cand_matchedinterface_set) { - bc->cand_matchedinterface = (bc->cand_ill == dstill || - (dstill->ill_group != NULL && - dstill->ill_group == bc->cand_ill->ill_group)); + bc->cand_matchedinterface = bc->cand_ill == dstill; bc->cand_matchedinterface_set = B_TRUE; } - cc->cand_matchedinterface = (cc->cand_ill == dstill || - (dstill->ill_group != NULL && - dstill->ill_group == cc->cand_ill->ill_group)); + cc->cand_matchedinterface = cc->cand_ill == dstill; cc->cand_matchedinterface_set = B_TRUE; if (bc->cand_matchedinterface == cc->cand_matchedinterface) @@ -2134,6 +2231,13 @@ rule_addr_type(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, static rule_res_t rule_prefix(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, ip_stack_t *ipst) { + /* + * For IPMP, we always want to choose a random source address from + * among any equally usable addresses, so always report a tie. + */ + if (IS_IPMP(dstinfo->dst_ill)) + return (CAND_TIE); + if (!bc->cand_common_pref_set) { bc->cand_common_pref = ip_common_prefix_v6(&bc->cand_srcaddr, dstinfo->dst_addr); @@ -2177,10 +2281,9 @@ rule_must_be_last(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, * specification's algorithm could traverse the list of addresses once for * every rule). * - * The restrict_ill argument restricts the algorithm to chose a source - * address that is assigned to the destination ill or an ill in the same - * IPMP group as the destination ill. This is used when the destination - * address is a link-local or multicast address, and when + * The restrict_ill argument restricts the algorithm to choose a source + * address that is assigned to the destination ill. This is used when + * the destination address is a link-local or multicast address, and when * ipv6_strict_dst_multihoming is turned on. * * src_prefs is the caller's set of source address preferences. 
If source @@ -2192,13 +2295,13 @@ rule_must_be_last(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, */ ipif_t * ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, - uint_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid) + boolean_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid) { dstinfo_t dstinfo; char dstr[INET6_ADDRSTRLEN]; char sstr[INET6_ADDRSTRLEN]; - ipif_t *ipif; - ill_t *ill, *usesrc_ill = NULL; + ipif_t *ipif, *start_ipif, *next_ipif; + ill_t *ill, *usesrc_ill = NULL, *ipmp_ill = NULL; ill_walk_context_t ctx; cand_t best_c; /* The best candidate */ cand_t curr_c; /* The current candidate */ @@ -2247,6 +2350,16 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, } else { return (NULL); } + } else if (IS_UNDER_IPMP(dstill)) { + /* + * Test addresses should never be used for source address + * selection, so if we were passed an underlying ill, switch + * to the IPMP meta-interface. + */ + if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(dstill)) != NULL) + dstinfo.dst_ill = ipmp_ill; + else + return (NULL); } else { dstinfo.dst_ill = dstill; } @@ -2286,10 +2399,7 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, */ if (IN6_IS_ADDR_LINKLOCAL(dst) || IN6_IS_ADDR_MULTICAST(dst) || ipst->ips_ipv6_strict_dst_multihoming || usesrc_ill != NULL) { - if (restrict_ill == RESTRICT_TO_NONE) - dstinfo.dst_restrict_ill = RESTRICT_TO_GROUP; - else - dstinfo.dst_restrict_ill = restrict_ill; + dstinfo.dst_restrict_ill = B_TRUE; } else { dstinfo.dst_restrict_ill = restrict_ill; } @@ -2297,39 +2407,41 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, bzero(&best_c, sizeof (cand_t)); /* - * Take a pass through the list of IPv6 interfaces to chose the - * best possible source address. If restrict_ill is true, we only - * iterate through the ill's that are in the same IPMP group as the - * destination's outgoing ill. If restrict_ill is false, we walk - * the entire list of IPv6 ill's. 
+ * Take a pass through the list of IPv6 interfaces to choose the best + * possible source address. If restrict_ill is set, just use dst_ill. */ - if (dstinfo.dst_restrict_ill != RESTRICT_TO_NONE) { - if (dstinfo.dst_ill->ill_group != NULL && - dstinfo.dst_restrict_ill == RESTRICT_TO_GROUP) { - ill = dstinfo.dst_ill->ill_group->illgrp_ill; - } else { - ill = dstinfo.dst_ill; - } - } else { + if (dstinfo.dst_restrict_ill) + ill = dstinfo.dst_ill; + else ill = ILL_START_WALK_V6(&ctx, ipst); - } - while (ill != NULL) { + for (; ill != NULL; ill = ill_next(&ctx, ill)) { ASSERT(ill->ill_isv6); /* - * Avoid FAILED/OFFLINE ills. - * Global and site local addresses will failover and - * will be available on the new ill. - * But link local addresses don't move. + * Test addresses should never be used for source address + * selection, so ignore underlying ills. */ - if (dstinfo.dst_restrict_ill != RESTRICT_TO_ILL && - ill->ill_phyint->phyint_flags & - (PHYI_OFFLINE | PHYI_FAILED)) - goto next_ill; + if (IS_UNDER_IPMP(ill)) + continue; - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { + /* + * For source address selection, we treat the ipif list as + * circular and continue until we get back to where we + * started. This allows IPMP to vary source address selection + * (which improves inbound load spreading) by caching its last + * ending point and starting from there. NOTE: we don't have + * to worry about ill_src_ipif changing ills since that can't + * happen on the IPMP ill. + */ + start_ipif = ill->ill_ipif; + if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) + start_ipif = ill->ill_src_ipif; + + ipif = start_ipif; + do { + if ((next_ipif = ipif->ipif_next) == NULL) + next_ipif = ill->ill_ipif; if (!IPIF_VALID_IPV6_SOURCE(ipif)) continue; @@ -2387,9 +2499,8 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, */ for (index = 0; rules[index] != NULL; index++) { /* Apply a comparison rule. 
*/ - rule_result = - (rules[index])(&best_c, &curr_c, &dstinfo, - ipst); + rule_result = (rules[index])(&best_c, &curr_c, + &dstinfo, ipst); if (rule_result == CAND_AVOID) { /* * The best candidate is still the @@ -2417,21 +2528,29 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, * have been prefered as the best candidate so far. */ ASSERT(rule_result != CAND_TIE); + } while ((ipif = next_ipif) != start_ipif); + + /* + * For IPMP, update the source ipif rotor to the next ipif, + * provided we can look it up. (We must not use it if it's + * IPIF_CONDEMNED since we may have grabbed ill_g_lock after + * ipif_free() checked ill_src_ipif.) + */ + if (IS_IPMP(ill) && ipif != NULL) { + mutex_enter(&ipif->ipif_ill->ill_lock); + next_ipif = ipif->ipif_next; + if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif)) + ill->ill_src_ipif = next_ipif; + else + ill->ill_src_ipif = NULL; + mutex_exit(&ipif->ipif_ill->ill_lock); } /* - * We may be walking the linked-list of ill's in an - * IPMP group or traversing the IPv6 ill avl tree. If it is a - * usesrc ILL then it can't be part of IPMP group and we - * will exit the while loop. + * Only one ill to consider if dst_restrict_ill is set. */ -next_ill: - if (dstinfo.dst_restrict_ill == RESTRICT_TO_ILL) - ill = NULL; - else if (dstinfo.dst_restrict_ill == RESTRICT_TO_GROUP) - ill = ill->ill_group_next; - else - ill = ill_next(&ctx, ill); + if (dstinfo.dst_restrict_ill) + break; } ipif = best_c.cand_ipif; @@ -2444,6 +2563,9 @@ next_ill: if (usesrc_ill != NULL) ill_refrele(usesrc_ill); + if (ipmp_ill != NULL) + ill_refrele(ipmp_ill); + if (dst_rhtp != NULL) TPC_RELE(dst_rhtp); @@ -2474,8 +2596,7 @@ next_ill: * ipif_update_other_ipifs calls us. * * If old_ipif is NULL, just redo the source address selection - * if needed. This happens when illgrp_insert or ipif_up_done_v6 - * calls us. + * if needed. This happens when ipif_up_done_v6 calls us. 
*/ void ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif) @@ -2561,8 +2682,7 @@ ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif) if (ip6_asp_can_lookup(ipst)) { ip6_asp_table_held = B_TRUE; nipif = ipif_select_source_v6(ill, &ipif->ipif_v6subnet, - RESTRICT_TO_GROUP, IPV6_PREFER_SRC_DEFAULT, - ipif->ipif_zoneid); + B_TRUE, IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid); } if (nipif == NULL) { /* Last resort - all ipif's have IPIF_NOLOCAL */ @@ -2630,13 +2750,9 @@ ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif) * Find the IRE_INTERFACE for such ipif's and recreate them * to use an different source address following the rules in * ipif_up_done_v6. - * - * This function takes an illgrp as an argument so that illgrp_delete - * can call this to update source address even after deleting the - * old_ipif->ipif_ill from the ill group. */ void -ipif_update_other_ipifs_v6(ipif_t *old_ipif, ill_group_t *illgrp) +ipif_update_other_ipifs_v6(ipif_t *old_ipif) { ipif_t *ipif; ill_t *ill; @@ -2651,23 +2767,9 @@ ipif_update_other_ipifs_v6(ipif_t *old_ipif, ill_group_t *illgrp) inet_ntop(AF_INET6, &old_ipif->ipif_v6lcl_addr, buf, sizeof (buf)))); - /* - * If this part of a group, look at all ills as ipif_select_source - * borrows a source address across all the ills in the group. 
- */ - if (illgrp != NULL) - ill = illgrp->illgrp_ill; - - /* Don't need a lock since this is a writer */ - for (; ill != NULL; ill = ill->ill_group_next) { - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (ipif == old_ipif) - continue; - + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if (ipif != old_ipif) ipif_recreate_interface_routes_v6(old_ipif, ipif); - } } } @@ -2828,12 +2930,10 @@ ipif_up_done_v6(ipif_t *ipif) boolean_t flush_ire_cache = B_TRUE; int err; char buf[INET6_ADDRSTRLEN]; - phyint_t *phyi; ire_t **ipif_saved_irep = NULL; int ipif_saved_ire_cnt; int cnt; boolean_t src_ipif_held = B_FALSE; - boolean_t ire_added = B_FALSE; boolean_t loopback = B_FALSE; boolean_t ip6_asp_table_held = B_FALSE; ip_stack_t *ipst = ill->ill_ipst; @@ -2868,8 +2968,8 @@ ipif_up_done_v6(ipif_t *ipif) break; } if (flush_ire_cache) - ire_walk_ill_v6(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE, - IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); + ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, + IRE_CACHE, ill_ipif_cache_delete, ill, ill); /* * Figure out which way the send-to queue should go. Only @@ -2900,7 +3000,9 @@ ipif_up_done_v6(ipif_t *ipif) ipif->ipif_ire_type = IRE_LOCAL; } - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) { + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) || + ((ipif->ipif_flags & IPIF_DEPRECATED) && + !(ipif->ipif_flags & IPIF_NOFAILOVER))) { /* * Can't use our source address. 
Select a different * source address for the IRE_INTERFACE and IRE_LOCAL @@ -2908,7 +3010,7 @@ ipif_up_done_v6(ipif_t *ipif) if (ip6_asp_can_lookup(ipst)) { ip6_asp_table_held = B_TRUE; src_ipif = ipif_select_source_v6(ipif->ipif_ill, - &ipif->ipif_v6subnet, RESTRICT_TO_NONE, + &ipif->ipif_v6subnet, B_FALSE, IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid); } if (src_ipif == NULL) @@ -3090,9 +3192,9 @@ ipif_up_done_v6(ipif_t *ipif) ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); /* - * Need to atomically check for ip_addr_availablity_check - * now under ill_g_lock, and if it fails got bad, and remove - * from group also + * Need to atomically check for IP address availability under + * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new + * ills or new ipifs can be added while we are checking availability. */ rw_enter(&ipst->ips_ill_g_lock, RW_READER); mutex_enter(&ipst->ips_ip_addr_avail_lock); @@ -3125,9 +3227,7 @@ ipif_up_done_v6(ipif_t *ipif) } /* - * Add in all newly created IREs. We want to add before - * we call ifgrp_insert which wants to know whether - * IRE_IF_RESOLVER exists or not. + * Add in all newly created IREs. * * NOTE : We refrele the ire though we may branch to "bad" * later on where we do ire_delete. This is okay @@ -3148,36 +3248,6 @@ ipif_up_done_v6(ipif_t *ipif) ip6_asp_table_refrele(ipst); ip6_asp_table_held = B_FALSE; } - ire_added = B_TRUE; - - /* - * Form groups if possible. - * - * If we are supposed to be in a ill_group with a name, insert it - * now as we know that at least one ipif is UP. Otherwise form - * nameless groups. - * - * If ip_enable_group_ifs is set and ipif address is not ::0, insert - * this ipif into the appropriate interface group, or create a - * new one. If this is already in a nameless group, we try to form - * a bigger group looking at other ills potentially sharing this - * ipif's prefix. 
- */ - phyi = ill->ill_phyint; - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - if (ill->ill_ipif_up_count == 1) { - ASSERT(ill->ill_group == NULL); - err = illgrp_insert(&ipst->ips_illgrp_head_v6, ill, - phyi->phyint_groupname, NULL, B_TRUE); - if (err != 0) { - ip1dbg(("ipif_up_done_v6: illgrp allocation " - "failed, error %d\n", err)); - goto bad; - } - } - ASSERT(ill->ill_group != NULL); - } /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; @@ -3190,19 +3260,23 @@ ipif_up_done_v6(ipif_t *ipif) */ ill_recover_multicast(ill); } - /* Join the allhosts multicast address and the solicited node MC */ - ipif_multicast_up(ipif); - if (!loopback) { + if (ill->ill_ipif_up_count == 1) { /* - * See whether anybody else would benefit from the - * new ipif that we added. We call this always rather - * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST - * ipif for the benefit of illgrp_insert (done above) - * which does not do source address selection as it does - * not want to re-create interface routes that we are - * having reference to it here. + * Since the interface is now up, it may now be active. */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + } + + /* Join the allhosts multicast address and the solicited node MC */ + ipif_multicast_up(ipif); + + /* + * See if anybody else would benefit from our new ipif. + */ + if (!loopback && + !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { ill_update_source_selection(ill); } @@ -3238,29 +3312,11 @@ ipif_up_done_v6(ipif_t *ipif) bad: if (ip6_asp_table_held) ip6_asp_table_refrele(ipst); - /* - * We don't have to bother removing from ill groups because - * - * 1) For groups with names, we insert only when the first ipif - * comes up. In that case if it fails, it will not be in any - * group. So, we need not try to remove for that case. 
- * - * 2) For groups without names, either we tried to insert ipif_ill - * in a group as singleton or found some other group to become - * a bigger group. For the former, if it fails we don't have - * anything to do as ipif_ill is not in the group and for the - * latter, there are no failures in illgrp_insert/illgrp_delete - * (ENOMEM can't occur for this. Check ifgrp_insert). - */ while (irep > ire_array) { irep--; - if (*irep != NULL) { + if (*irep != NULL) ire_delete(*irep); - if (ire_added) - ire_refrele(*irep); - } - } (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); @@ -3272,8 +3328,7 @@ bad: ipif_refrele(src_ipif); ipif_ndp_down(ipif); - if (ipif->ipif_ill->ill_flags & ILLF_XRESOLV) - ipif_arp_down(ipif); + ipif_resolver_down(ipif); return (err); } @@ -3286,15 +3341,14 @@ int ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) { - in6_addr_t addr; sin6_t *sin6; nce_t *nce; struct lifreq *lifr; lif_nd_req_t *lnr; - mblk_t *mp1; + ill_t *ill = ipif->ipif_ill; + ire_t *ire; - mp1 = mp->b_cont->b_cont; - lifr = (struct lifreq *)mp1->b_rptr; + lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr; lnr = &lifr->lifr_nd; /* Only allow for logical unit zero i.e. not on "le0:17" */ if (ipif->ipif_id != 0) @@ -3307,8 +3361,28 @@ ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, return (EAFNOSUPPORT); sin6 = (sin6_t *)&lnr->lnr_addr; - addr = sin6->sin6_addr; - nce = ndp_lookup_v6(ipif->ipif_ill, &addr, B_FALSE); + + /* + * Since ND mappings must be consistent across an IPMP group, prohibit + * deleting ND mappings on underlying interfaces. Also, since ND + * mappings for IPMP data addresses are owned by IP itself, prohibit + * deleting them. 
+ */ + if (IS_UNDER_IPMP(ill)) + return (EPERM); + + if (IS_IPMP(ill)) { + ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL, + ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, + ill->ill_ipst); + if (ire != NULL) { + ire_refrele(ire); + return (EPERM); + } + } + + /* See comment in ndp_query() regarding IS_IPMP(ill) usage */ + nce = ndp_lookup_v6(ill, IS_IPMP(ill), &sin6->sin6_addr, B_FALSE); if (nce == NULL) return (ESRCH); ndp_delete(nce); @@ -3354,11 +3428,11 @@ int ip_siocsetndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) { + sin6_t *sin6; ill_t *ill = ipif->ipif_ill; struct lifreq *lifr; lif_nd_req_t *lnr; - - ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); + ire_t *ire; lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr; lnr = &lifr->lifr_nd; @@ -3372,5 +3446,26 @@ ip_siocsetndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, if (lnr->lnr_addr.ss_family != AF_INET6) return (EAFNOSUPPORT); + sin6 = (sin6_t *)&lnr->lnr_addr; + + /* + * Since ND mappings must be consistent across an IPMP group, prohibit + * updating ND mappings on underlying interfaces. Also, since ND + * mappings for IPMP data addresses are owned by IP itself, prohibit + * updating them. + */ + if (IS_UNDER_IPMP(ill)) + return (EPERM); + + if (IS_IPMP(ill)) { + ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL, + ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, + ill->ill_ipst); + if (ire != NULL) { + ire_refrele(ire); + return (EPERM); + } + } + return (ndp_sioc_update(ill, lnr)); } diff --git a/usr/src/uts/common/inet/ip/ip6_ire.c b/usr/src/uts/common/inet/ip/ip6_ire.c index 41461ca96f..0d0f3621f5 100644 --- a/usr/src/uts/common/inet/ip/ip6_ire.c +++ b/usr/src/uts/common/inet/ip/ip6_ire.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* @@ -73,7 +73,6 @@ static ire_t *ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *, const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *); static ire_t *ip6_ctable_lookup_impl(ire_ctable_args_t *); - /* * Initialize the ire that is specific to IPv6 part and call * ire_init_common to finish it. @@ -261,13 +260,11 @@ ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst) * Make sure we follow ire_ipif. * * We need to determine the interface route through - * which the gateway will be reached. We don't really - * care which interface is picked if the interface is - * part of a group. + * which the gateway will be reached. */ if (ire->ire_ipif != NULL) { ipif = ire->ire_ipif; - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; } switch (ire->ire_type) { @@ -409,35 +406,54 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) ire_t *ire = *ire_p; int error; ip_stack_t *ipst = ire->ire_ipst; + uint_t marks = 0; ASSERT(ire->ire_ipversion == IPV6_VERSION); ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */ ASSERT(ire->ire_nce == NULL); + /* + * IREs with source addresses hosted on interfaces that are under IPMP + * should be hidden so that applications don't accidentally end up + * sending packets with test addresses as their source addresses, or + * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here. + * (We let IREs with unspecified source addresses slip through since + * ire_send_v6() will delete them automatically.) + */ + if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && + !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) { + DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire); + marks |= IRE_MARK_TESTHIDDEN; + } + /* Find the appropriate list head. 
*/ switch (ire->ire_type) { case IRE_HOST: ire->ire_mask_v6 = ipv6_all_ones; ire->ire_masklen = IPV6_ABITS; + ire->ire_marks |= marks; if ((ire->ire_flags & RTF_SETSRC) == 0) ire->ire_src_addr_v6 = ipv6_all_zeros; break; case IRE_CACHE: + ire->ire_mask_v6 = ipv6_all_ones; + ire->ire_masklen = IPV6_ABITS; + ire->ire_marks |= marks; + break; case IRE_LOCAL: case IRE_LOOPBACK: ire->ire_mask_v6 = ipv6_all_ones; ire->ire_masklen = IPV6_ABITS; break; case IRE_PREFIX: - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr_v6 = ipv6_all_zeros; - break; case IRE_DEFAULT: + ire->ire_marks |= marks; if ((ire->ire_flags & RTF_SETSRC) == 0) ire->ire_src_addr_v6 = ipv6_all_zeros; break; case IRE_IF_RESOLVER: case IRE_IF_NORESOLVER: + ire->ire_marks |= marks; break; default: printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n", @@ -543,9 +559,8 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) * 2) We could have multiple packets trying to create * an IRE_CACHE for the same ill. * - * Moreover, IPIF_NOFAILOVER and IPV6_BOUND_PIF endpoints wants - * to go out on a particular ill. Rather than looking at the - * packet, we depend on the above for MATCH_IRE_ILL here. + * Rather than looking at the packet, we depend on the above for + * MATCH_IRE_ILL here. * * Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have * multiple IRE_CACHES for an ill for the same destination @@ -555,20 +570,15 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) */ if (ire->ire_ipif != NULL) flags |= MATCH_IRE_IPIF; + /* - * If we are creating hidden ires, make sure we search on - * this ill (MATCH_IRE_ILL) and a hidden ire, while we are - * searching for duplicates below. Otherwise we could - * potentially find an IRE on some other interface - * and it may not be a IRE marked with IRE_MARK_HIDDEN. We - * shouldn't do this as this will lead to an infinite loop as - * eventually we need an hidden ire for this packet to go - * out. 
MATCH_IRE_ILL is already marked above. + * If we are creating a hidden IRE, make sure we search for + * hidden IREs when searching for duplicates below. + * Otherwise, we might find an IRE on some other interface + * that's not marked hidden. */ - if (ire->ire_marks & IRE_MARK_HIDDEN) { - ASSERT(ire->ire_type == IRE_CACHE); - flags |= MATCH_IRE_MARK_HIDDEN; - } + if (ire->ire_marks & IRE_MARK_TESTHIDDEN) + flags |= MATCH_IRE_MARK_TESTHIDDEN; /* * Start the atomic add of the ire. Grab the ill locks, @@ -692,7 +702,7 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) } } if (ire->ire_type == IRE_CACHE) { - in6_addr_t gw_addr_v6; + const in6_addr_t *addr_v6; ill_t *ill = ire_to_ill(ire); char buf[INET6_ADDRSTRLEN]; nce_t *nce; @@ -712,12 +722,12 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) * time on the list and rts_setgwr_v6 could not * be changing this. */ - gw_addr_v6 = ire->ire_gateway_addr_v6; - if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, B_TRUE); - } else { - nce = ndp_lookup_v6(ill, &gw_addr_v6, B_TRUE); - } + addr_v6 = &ire->ire_gateway_addr_v6; + if (IN6_IS_ADDR_UNSPECIFIED(addr_v6)) + addr_v6 = &ire->ire_addr_v6; + + /* nce fastpath is per-ill; don't match across illgrp */ + nce = ndp_lookup_v6(ill, B_FALSE, addr_v6, B_TRUE); if (nce == NULL) goto failed; @@ -1217,28 +1227,29 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, in6_addr_t gw_addr_v6; ill_t *ire_ill = NULL, *dst_ill; ill_t *ipif_ill = NULL; - ill_group_t *ire_ill_group = NULL; - ill_group_t *ipif_ill_group = NULL; ipif_t *src_ipif; ASSERT(ire->ire_ipversion == IPV6_VERSION); ASSERT(addr != NULL); ASSERT(mask != NULL); ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL); - ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) || + ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ipif != NULL && ipif->ipif_isv6)); /* - * HIDDEN cache entries have to be looked up 
specifically with - * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set - * when the interface is FAILED or INACTIVE. In that case, - * any IRE_CACHES that exists should be marked with - * IRE_MARK_HIDDEN. So, we don't really need to match below - * for IRE_MARK_HIDDEN. But we do so for consistency. + * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it + * is in fact hidden, to ensure the caller gets the right one. One + * exception: if the caller passed MATCH_IRE_IHANDLE, then they + * already know the identity of the given IRE_INTERFACE entry and + * there's no point trying to hide it from them. */ - if (!(match_flags & MATCH_IRE_MARK_HIDDEN) && - (ire->ire_marks & IRE_MARK_HIDDEN)) - return (B_FALSE); + if (ire->ire_marks & IRE_MARK_TESTHIDDEN) { + if (match_flags & MATCH_IRE_IHANDLE) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + + if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) + return (B_FALSE); + } if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { @@ -1288,7 +1299,7 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, */ if ((dst_ill->ill_usesrc_ifindex != 0) && (src_ipif = ipif_select_source_v6(dst_ill, addr, - RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, zoneid)) + B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid)) != NULL) { ip3dbg(("ire_match_args: src_ipif %p" " dst_ill %p", (void *)src_ipif, @@ -1326,20 +1337,20 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); } + /* - * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that - * somebody wants to send out on a particular interface which - * is given by ire_stq and hence use ire_stq to derive the ill - * value. ire_ipif for IRE_CACHES is just the - * means of getting a source address i.e ire_src_addr_v6 = - * ire->ire_ipif->ipif_src_addr_v6. 
+ * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to + * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means + * of getting a source address -- i.e., ire_src_addr_v6 == + * ire->ire_ipif->ipif_v6src_addr). ire_to_ill() handles this. + * + * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group. + * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for + * IPMP test traffic), then the ill must match exactly. */ - if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) { + if (match_flags & MATCH_IRE_ILL) { ire_ill = ire_to_ill(ire); - if (ire_ill != NULL) - ire_ill_group = ire_ill->ill_group; ipif_ill = ipif->ipif_ill; - ipif_ill_group = ipif_ill->ill_group; } /* No ire_addr_v6 bits set past the mask */ @@ -1357,17 +1368,14 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, &ipif->ipif_v6src_addr)) && ((!(match_flags & MATCH_IRE_IPIF)) || (ire->ire_ipif == ipif)) && - ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) || - (ire->ire_type != IRE_CACHE || - ire->ire_marks & IRE_MARK_HIDDEN)) && + ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) || + (ire->ire_marks & IRE_MARK_TESTHIDDEN)) && ((!(match_flags & MATCH_IRE_ILL)) || - (ire_ill == ipif_ill)) && + (ire_ill == ipif_ill || + (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) && + ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) && ((!(match_flags & MATCH_IRE_IHANDLE)) || (ire->ire_ihandle == ihandle)) && - ((!(match_flags & MATCH_IRE_ILL_GROUP)) || - (ire_ill == ipif_ill) || - (ire_ill_group != NULL && - ire_ill_group == ipif_ill_group)) && ((!(match_flags & MATCH_IRE_SECATTR)) || (!is_system_labeled()) || (tsol_ire_match_gwattr(ire, tsl) == 0))) { @@ -1391,8 +1399,7 @@ ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or * MATCH_IRE_ILL is set. 
*/ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && - (ipif == NULL)) + if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) return (NULL); /* @@ -1477,8 +1484,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or * MATCH_IRE_ILL is set. */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && - (ipif == NULL)) + if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) return (NULL); /* @@ -1661,8 +1667,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); - match_flags = MATCH_IRE_ILL_GROUP | - MATCH_IRE_SECATTR; + match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; rire = ire_ctable_lookup_v6(&gw_addr_v6, NULL, 0, ire->ire_ipif, zoneid, tsl, match_flags, ipst); @@ -1703,7 +1708,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, if (ire->ire_ipif != NULL) { ire_match_flags |= - MATCH_IRE_ILL_GROUP; + MATCH_IRE_ILL; } rire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, IRE_INTERFACE, @@ -1791,21 +1796,8 @@ found_ire_held: */ saved_ire = ire; - /* - * Currently MATCH_IRE_ILL is never used with - * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while - * sending out packets as MATCH_IRE_ILL is used only - * for communicating with on-link hosts. We can't assert - * that here as RTM_GET calls this function with - * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE. - * We have already used the MATCH_IRE_ILL in determining - * the right prefix route at this point. To match the - * behavior of how we locate routes while sending out - * packets, we don't want to use MATCH_IRE_ILL below - * while locating the interface route. 
- */ if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; @@ -1958,9 +1950,7 @@ ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway, } /* - * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers - * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get - * to the hidden ones. + * Lookup cache. * * In general the zoneid has to match (where ALL_ZONES match all of them). * But for IRE_LOCAL we also need to handle the case where L2 should @@ -1968,8 +1958,7 @@ ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway, * Ethernet drivers nor Ethernet hardware loops back packets sent to their * own MAC address. This loopback is needed when the normal * routes (ignoring IREs with different zoneids) would send out the packet on - * the same ill (or ill group) as the ill with which this IRE_LOCAL is - * associated. + * the same ill as the ill with which this IRE_LOCAL is associated. * * Earlier versions of this code always matched an IRE_LOCAL independently of * the zoneid. We preserve that earlier behavior when @@ -1986,7 +1975,7 @@ ire_cache_lookup_v6(const in6_addr_t *addr, zoneid_t zoneid, ipst->ips_ip6_cache_table_size)]; rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) { - if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN)) + if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) continue; if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, addr)) { /* @@ -2125,13 +2114,8 @@ ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire) ASSERT(cire != NULL && pire != NULL); match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - /* - * ip_newroute_v6 calls ire_ftable_lookup with MATCH_IRE_ILL only - * for on-link hosts. We should never be here for onlink. - * Thus, use MATCH_IRE_ILL_GROUP. 
- */ if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; /* * We know that the mask of the interface ire equals cire->ire_cmask. * (When ip_newroute_v6() created 'cire' for an on-link destn. it set @@ -2168,7 +2152,7 @@ ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire) */ match_flags = MATCH_IRE_TYPE; if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; mutex_enter(&pire->ire_lock); gw_addr = pire->ire_gateway_addr_v6; @@ -2210,24 +2194,30 @@ ire_t * ipif_to_ire_v6(const ipif_t *ipif) { ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF; + + /* + * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN + * so that they aren't accidentally returned. However, if the + * caller's ipif is on an ill under IPMP, there's no need to hide 'em. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; ASSERT(ipif->ipif_isv6); if (ipif->ipif_ire_type == IRE_LOOPBACK) { ire = ire_ctable_lookup_v6(&ipif->ipif_v6lcl_addr, NULL, - IRE_LOOPBACK, ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ipst); + IRE_LOOPBACK, ipif, ALL_ZONES, NULL, match_flags, ipst); } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { /* In this case we need to lookup destination address. 
*/ ire = ire_ftable_lookup_v6(&ipif->ipif_v6pp_dst_addr, &ipv6_all_ones, NULL, IRE_INTERFACE, ipif, NULL, ALL_ZONES, - 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF | - MATCH_IRE_MASK), ipst); + 0, NULL, (match_flags | MATCH_IRE_MASK), ipst); } else { ire = ire_ftable_lookup_v6(&ipif->ipif_v6subnet, &ipif->ipif_v6net_mask, NULL, IRE_INTERFACE, ipif, NULL, - ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF | - MATCH_IRE_MASK), ipst); + ALL_ZONES, 0, NULL, (match_flags | MATCH_IRE_MASK), ipst); } return (ire); } @@ -2296,7 +2286,7 @@ ire_multirt_need_resolve_v6(const in6_addr_t *v6dstp, const ts_label_t *tsl, continue; if (!IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, v6dstp)) continue; - if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN)) + if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) continue; unres_cnt--; } @@ -2434,7 +2424,7 @@ ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, continue; if (cire->ire_marks & (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) + IRE_MARK_TESTHIDDEN)) continue; if (cire->ire_gw_secattr != NULL && @@ -2635,8 +2625,7 @@ ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, &cire->ire_addr_v6, &v6dst)) continue; if (cire->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) + IRE_MARK_CONDEMNED) continue; if (cire->ire_gw_secattr != NULL && @@ -2845,8 +2834,7 @@ ip6_ctable_lookup_impl(ire_ctable_args_t *margs) ire_t *ire; ip_stack_t *ipst = margs->ict_ipst; - if ((margs->ict_flags & - (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && + if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (margs->ict_ipif == NULL)) { return (NULL); } diff --git a/usr/src/uts/common/inet/ip/ip6_rts.c b/usr/src/uts/common/inet/ip/ip6_rts.c index 7d2ddd5c04..dcf429c8ba 100644 --- a/usr/src/uts/common/inet/ip/ip6_rts.c +++ b/usr/src/uts/common/inet/ip/ip6_rts.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. */ @@ -38,8 +38,6 @@ * @(#)rtsock.c 8.6 (Berkeley) 2/11/95 */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains routines that processes routing socket requests. */ @@ -216,5 +214,5 @@ ip_rts_change_v6(int type, const in6_addr_t *dst_addr, rtm->rtm_errno = error; rtm->rtm_flags |= RTF_DONE; rtm->rtm_addrs = rtm_addrs; - rts_queue_input(mp, NULL, AF_INET6, ipst); + rts_queue_input(mp, NULL, AF_INET6, RTSQ_ALL, ipst); } diff --git a/usr/src/uts/common/inet/ip/ip_ftable.c b/usr/src/uts/common/inet/ip/ip_ftable.c index 4fa3c7a74d..31f83c842d 100644 --- a/usr/src/uts/common/inet/ip/ip_ftable.c +++ b/usr/src/uts/common/inet/ip/ip_ftable.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,7 +67,6 @@ #include <net/pfkeyv2.h> #include <inet/ipsec_info.h> #include <inet/sadb.h> -#include <sys/kmem.h> #include <inet/tcp.h> #include <inet/ipclassifier.h> #include <sys/zone.h> @@ -159,8 +158,7 @@ ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, * ire_match_args() will dereference ipif MATCH_IRE_SRC or * MATCH_IRE_ILL is set. */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && - (ipif == NULL)) + if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) return (NULL); (void) memset(&rdst, 0, sizeof (rdst)); @@ -290,28 +288,16 @@ found_ire_held: */ save_ire = ire; + if (ire->ire_ipif != NULL) + match_flags |= MATCH_IRE_ILL; + /* - * Currently MATCH_IRE_ILL is never used with - * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while - * sending out packets as MATCH_IRE_ILL is used only - * for communicating with on-link hosts. We can't assert - * that here as RTM_GET calls this function with - * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE. 
- * We have already used the MATCH_IRE_ILL in determining - * the right prefix route at this point. To match the - * behavior of how we locate routes while sending out - * packets, we don't want to use MATCH_IRE_ILL below - * while locating the interface route. - * * ire_ftable_lookup may end up with an incomplete IRE_CACHE * entry for the gateway (i.e., one for which the * ire_nce->nce_state is not yet ND_REACHABLE). If the caller * has specified MATCH_IRE_COMPLETE, such entries will not * be returned; instead, we return the IF_RESOLVER ire. */ - if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; - ire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, 0, ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst); DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire, @@ -532,7 +518,7 @@ ire_ftable_lookup_simple(ipaddr_t addr, } } if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; ire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, 0, ire->ire_ipif, NULL, zoneid, NULL, match_flags, ipst); @@ -678,13 +664,11 @@ ire_lookup_multi(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst) * Make sure we follow ire_ipif. * * We need to determine the interface route through - * which the gateway will be reached. We don't really - * care which interface is picked if the interface is - * part of a group. + * which the gateway will be reached. */ if (ire->ire_ipif != NULL) { ipif = ire->ire_ipif; - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; } switch (ire->ire_type) { @@ -854,40 +838,26 @@ ire_get_next_default_ire(ire_t *ire, ire_t *ire_origin) } static ipif_t * -ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, ill_t *dst_ill, +ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, int zoneid, ushort_t *marks) { ipif_t *src_ipif; - ip_stack_t *ipst = dst_ill->ill_ipst; + ill_t *ill = ire->ire_ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; /* - * Pick the best source address from dst_ill. 
+ * Pick the best source address from ill. * - * 1) If it is part of a multipathing group, we would - * like to spread the inbound packets across different - * interfaces. ipif_select_source picks a random source - * across the different ills in the group. - * - * 2) If it is not part of a multipathing group, we try - * to pick the source address from the destination + * 1) Try to pick the source address from the destination * route. Clustering assumes that when we have multiple * prefixes hosted on an interface, the prefix of the * source address matches the prefix of the destination * route. We do this only if the address is not * DEPRECATED. * - * 3) If the conn is in a different zone than the ire, we + * 2) If the conn is in a different zone than the ire, we * need to pick a source address from the right zone. - * - * NOTE : If we hit case (1) above, the prefix of the source - * address picked may not match the prefix of the - * destination routes prefix as ipif_select_source - * does not look at "dst" while picking a source - * address. - * If we want the same behavior as (2), we will need - * to change the behavior of ipif_select_source. */ - if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { /* * The RTF_SETSRC flag is set in the parent ire (sire). 
@@ -899,13 +869,10 @@ ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, ill_t *dst_ill, return (src_ipif); } *marks |= IRE_MARK_USESRC_CHECK; - if ((dst_ill->ill_group != NULL) || + if (IS_IPMP(ill) || (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || - (dst_ill->ill_usesrc_ifindex != 0)) { - src_ipif = ipif_select_source(dst_ill, dst, zoneid); - if (src_ipif == NULL) - return (NULL); - + (ill->ill_usesrc_ifindex != 0)) { + src_ipif = ipif_select_source(ill, dst, zoneid); } else { src_ipif = ire->ire_ipif; ASSERT(src_ipif != NULL); @@ -1071,18 +1038,20 @@ create_irecache: sire->ire_last_used_time = lbolt; } - /* Obtain dst_ill */ - dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill); + dst_ill = ire->ire_ipif->ipif_ill; + if (IS_IPMP(dst_ill)) + dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp); + else + ill_refhold(dst_ill); + if (dst_ill == NULL) { - ip2dbg(("ire_forward no dst ill; ire 0x%p\n", - (void *)ire)); + ip2dbg(("ire_forward no dst ill; ire 0x%p\n", (void *)ire)); goto icmp_err_ret; } ASSERT(src_ipif == NULL); /* Now obtain the src_ipif */ - src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill, - zoneid, &ire_marks); + src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks); if (src_ipif == NULL) goto icmp_err_ret; @@ -1254,18 +1223,13 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action, ire_t *sire = NULL, *save_ire; ill_t *dst_ill = NULL; int error; - zoneid_t zoneid; + zoneid_t zoneid = GLOBAL_ZONEID; ipif_t *src_ipif = NULL; mblk_t *res_mp; ushort_t ire_marks = 0; - zoneid = GLOBAL_ZONEID; - - ire = ire_ftable_lookup_simple(dst, &sire, zoneid, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE, ipst); - + MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE, ipst); if (ire == NULL) { ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst); goto icmp_err_ret; @@ -1288,9 +1252,7 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action, * nexthop router, 
just hand over the cache entry * and we are done. */ - if (ire->ire_type & IRE_CACHE) { - /* * If we are using this ire cache entry as a * gateway to forward packets, chances are we @@ -1334,18 +1296,21 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action, UPDATE_OB_PKT_COUNT(sire); } - /* Obtain dst_ill */ - dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill); + dst_ill = ire->ire_ipif->ipif_ill; + if (IS_IPMP(dst_ill)) + dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp); + else + ill_refhold(dst_ill); /* for symmetry */ + if (dst_ill == NULL) { - ip2dbg(("ire_forward no dst ill; ire 0x%p\n", + ip2dbg(("ire_forward_simple: no dst ill; ire 0x%p\n", (void *)ire)); goto icmp_err_ret; } ASSERT(src_ipif == NULL); /* Now obtain the src_ipif */ - src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill, - zoneid, &ire_marks); + src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks); if (src_ipif == NULL) goto icmp_err_ret; @@ -1720,33 +1685,24 @@ ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE| MATCH_IRE_RJ_BHOLE| - MATCH_IRE_SECATTR); + MATCH_IRE_SECATTR | MATCH_IRE_ILL); /* * If supplied ifindex is non-null, the only valid - * nexthop is one off of the interface or group corresponding + * nexthop is one off of the interface corresponding * to the specified ifindex. 
*/ ill = ill_lookup_on_ifindex(ifindex, B_FALSE, NULL, NULL, NULL, NULL, ipst); if (ill != NULL) { - match_flags |= MATCH_IRE_ILL; + supplied_ipif = ipif_get_next_ipif(NULL, ill); } else { - /* Fallback to group names if hook_emulation set */ - if (ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_ifindex(ifindex, - B_FALSE, ipst); - } - if (ill == NULL) { - ip1dbg(("ipfil_sendpkt: Could not find" - " route to dst\n")); - value = ECOMM; - freemsg(mp); - goto discard; - } - match_flags |= MATCH_IRE_ILL_GROUP; + ip1dbg(("ipfil_sendpkt: Could not find" + " route to dst\n")); + value = ECOMM; + freemsg(mp); + goto discard; } - supplied_ipif = ipif_get_next_ipif(NULL, ill); ire = ire_route_lookup(dst, 0, 0, 0, supplied_ipif, &sire, zoneid, MBLK_GETLABEL(mp), match_flags, ipst); @@ -2325,9 +2281,9 @@ ire_round_robin(irb_t *irb_ptr, zoneid_t zoneid, ire_ftable_args_t *margs, * interested in routers that are * reachable through ipifs within our zone. */ - if (ire->ire_ipif != NULL) { - match_flags |= MATCH_IRE_ILL_GROUP; - } + if (ire->ire_ipif != NULL) + match_flags |= MATCH_IRE_ILL; + rire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, IRE_INTERFACE, ire->ire_ipif, NULL, zoneid, margs->ift_tsl, match_flags, ipst); diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index 0597245499..9771c87721 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -46,6 +46,7 @@ #include <sys/bitmap.h> #include <sys/cpuvar.h> #include <sys/time.h> +#include <sys/ctype.h> #include <sys/kmem.h> #include <sys/systm.h> #include <sys/param.h> @@ -61,10 +62,10 @@ #include <netinet/ip6.h> #include <netinet/icmp6.h> #include <netinet/igmp_var.h> -#include <sys/strsun.h> #include <sys/policy.h> #include <sys/ethernet.h> #include <sys/callb.h> +#include <sys/md5.h> #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ #include <inet/mi.h> @@ -85,7 +86,6 @@ #include <inet/tun.h> #include <inet/sctp_ip.h> #include <inet/ip_netinfo.h> -#include <inet/mib2.h> #include <net/pfkeyv2.h> #include <inet/ipsec_info.h> @@ -93,7 +93,6 @@ #include <inet/ipsec_impl.h> #include <sys/iphada.h> - #include <netinet/igmp.h> #include <inet/ip_listutils.h> #include <inet/ipclassifier.h> @@ -158,7 +157,7 @@ static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, static void ipsq_delete(ipsq_t *); static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type, - boolean_t initialize); + boolean_t initialize, boolean_t insert); static void ipif_check_bcast_ires(ipif_t *test_ipif); static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep); static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, @@ -169,7 +168,6 @@ static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); static void ipif_free(ipif_t *ipif); static void ipif_free_tail(ipif_t *ipif); static void ipif_mtu_change(ire_t *ire, char *ipif_arg); -static void ipif_multicast_down(ipif_t *ipif); static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif); static void ipif_set_default(ipif_t *ipif); static int ipif_set_values(queue_t *q, mblk_t *mp, @@ -179,8 +177,7 @@ static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int 
*error, ip_stack_t *); -static int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp); -static void ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp); +static void ipif_update_other_ipifs(ipif_t *old_ipif); static int ill_alloc_ppa(ill_if_t *, ill_t *); static int ill_arp_off(ill_t *ill); @@ -192,33 +189,18 @@ static void ill_down(ill_t *ill); static void ill_downi(ire_t *ire, char *ill_arg); static void ill_free_mib(ill_t *ill); static void ill_glist_delete(ill_t *); -static boolean_t ill_has_usable_ipif(ill_t *); -static int ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int); -static void ill_nominate_bcast_rcv(ill_group_t *illgrp); -static void ill_phyint_free(ill_t *ill); static void ill_phyint_reinit(ill_t *ill); static void ill_set_nce_router_flags(ill_t *, boolean_t); static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *); -static void ill_signal_ipsq_ills(ipsq_t *, boolean_t); -static boolean_t ill_split_ipsq(ipsq_t *cur_sq); -static void ill_stq_cache_delete(ire_t *, char *); - -static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *); -static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *); -static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - in6_addr_t *); -static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - ipaddr_t *); -static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *); -static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - in6_addr_t *); -static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - ipaddr_t *); - +static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid; +static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid; +static ip_v6mapinfo_func_t ip_ether_v6mapinfo, ip_ib_v6mapinfo; +static ip_v4mapinfo_func_t ip_ether_v4mapinfo, ip_ib_v4mapinfo; static void ipif_save_ire(ipif_t *, ire_t *); static void ipif_remove_ire(ipif_t *, ire_t *); static void ip_cgtp_bcast_add(ire_t *, 
ire_t *, ip_stack_t *); static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *); +static void phyint_free(phyint_t *); /* * Per-ill IPsec capabilities management. @@ -250,18 +232,14 @@ static void ill_capability_ack_thr(void *); static void ill_capability_lso_enable(ill_t *); static void ill_capability_send(ill_t *, mblk_t *); -static void illgrp_cache_delete(ire_t *, char *); -static void illgrp_delete(ill_t *ill); -static void illgrp_reset_schednext(ill_t *ill); - static ill_t *ill_prev_usesrc(ill_t *); static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); static void ill_disband_usesrc_group(ill_t *); static void conn_cleanup_stale_ire(conn_t *, caddr_t); #ifdef DEBUG -static void ill_trace_cleanup(const ill_t *); -static void ipif_trace_cleanup(const ipif_t *); +static void ill_trace_cleanup(const ill_t *); +static void ipif_trace_cleanup(const ipif_t *); #endif /* @@ -491,6 +469,7 @@ static nv_t ipif_nv_tbl[] = { { PHYI_STANDBY, "STANDBY" }, { PHYI_INACTIVE, "INACTIVE" }, { PHYI_OFFLINE, "OFFLINE" }, + { PHYI_IPMP, "IPMP" } }; static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; @@ -508,7 +487,8 @@ static ip_m_t ip_m_tbl[] = { ip_ether_v6intfid }, { DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo, ip_ib_v6intfid }, - { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL}, + { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL }, + { SUNW_DL_IPMP, IFT_OTHER, NULL, NULL, ip_ipmp_v6intfid }, { DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid } }; @@ -529,14 +509,6 @@ static ipif_t ipif_zero; */ uint_t ill_no_arena = 12; /* Setable in /etc/system */ -static uint_t -ipif_rand(ip_stack_t *ipst) -{ - ipst->ips_ipif_src_random = ipst->ips_ipif_src_random * 1103515245 + - 12345; - return ((ipst->ips_ipif_src_random >> 16) & 0x7fff); -} - /* * Allocate per-interface mibs. * Returns true if ok. False otherwise. @@ -623,7 +595,7 @@ ill_allocate_mibs(ill_t *ill) * (Always called as writer.) 
*/ mblk_t * -ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr) +ill_arp_alloc(ill_t *ill, const uchar_t *template, caddr_t addr) { arc_t *arc = (arc_t *)template; char *cp; @@ -669,17 +641,69 @@ ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr) } mblk_t * -ipif_area_alloc(ipif_t *ipif) +ipif_area_alloc(ipif_t *ipif, uint_t optflags) { - return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template, - (char *)&ipif->ipif_lcl_addr)); + caddr_t addr; + mblk_t *mp; + area_t *area; + uchar_t *areap; + ill_t *ill = ipif->ipif_ill; + + if (ill->ill_isv6) { + ASSERT(ill->ill_flags & ILLF_XRESOLV); + addr = (caddr_t)&ipif->ipif_v6lcl_addr; + areap = (uchar_t *)&ip6_area_template; + } else { + addr = (caddr_t)&ipif->ipif_lcl_addr; + areap = (uchar_t *)&ip_area_template; + } + + if ((mp = ill_arp_alloc(ill, areap, addr)) == NULL) + return (NULL); + + /* + * IPMP requires that the hardware address be included in all + * AR_ENTRY_ADD requests so that ARP can deduce the arl to send on. + * If there are no active underlying ills in the group (and thus no + * hardware address), DAD will be deferred until an underlying ill + * becomes active. 
+ */ + if (IS_IPMP(ill)) { + if ((ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { + freemsg(mp); + return (NULL); + } + } else { + ill_refhold(ill); + } + + area = (area_t *)mp->b_rptr; + area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR; + area->area_flags |= optflags; + area->area_hw_addr_length = ill->ill_phys_addr_length; + bcopy(ill->ill_phys_addr, mp->b_rptr + area->area_hw_addr_offset, + area->area_hw_addr_length); + + ill_refrele(ill); + return (mp); } mblk_t * ipif_ared_alloc(ipif_t *ipif) { - return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template, - (char *)&ipif->ipif_lcl_addr)); + caddr_t addr; + uchar_t *aredp; + + if (ipif->ipif_ill->ill_isv6) { + ASSERT(ipif->ipif_ill->ill_flags & ILLF_XRESOLV); + addr = (caddr_t)&ipif->ipif_v6lcl_addr; + aredp = (uchar_t *)&ip6_ared_template; + } else { + addr = (caddr_t)&ipif->ipif_lcl_addr; + aredp = (uchar_t *)&ip_ared_template; + } + + return (ill_arp_alloc(ipif->ipif_ill, aredp, addr)); } mblk_t * @@ -689,6 +713,19 @@ ill_ared_alloc(ill_t *ill, ipaddr_t addr) (char *)&addr)); } +mblk_t * +ill_arie_alloc(ill_t *ill, const char *grifname, const void *template) +{ + mblk_t *mp = ill_arp_alloc(ill, template, 0); + arie_t *arie; + + if (mp != NULL) { + arie = (arie_t *)mp->b_rptr; + (void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ); + } + return (mp); +} + /* * Completely vaporize a lower level tap and all associated interfaces. * ill_delete is called only out of ip_close when the device control @@ -751,6 +788,12 @@ ill_delete(ill_t *ill) ip_purge_allmulti(ill); /* + * If the ill being deleted is under IPMP, boot it out of the illgrp. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_leave_illgrp(ill); + + /* * ill_down will arrange to blow off any IRE's dependent on this * ILL, and shut down fragmentation reassembly. */ @@ -890,8 +933,19 @@ ill_delete_tail(ill_t *ill) * ill references. */ ASSERT(ilm_walk_ill(ill) == 0); + /* - * Take us out of the list of ILLs. 
ill_glist_delete -> ill_phyint_free + * If this ill is an IPMP meta-interface, blow away the illgrp. This + * is safe to do because the illgrp has already been unlinked from the + * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it. + */ + if (IS_IPMP(ill)) { + ipmp_illgrp_destroy(ill->ill_grp); + ill->ill_grp = NULL; + } + + /* + * Take us out of the list of ILLs. ill_glist_delete -> phyint_free * could free the phyint. No more reference to the phyint after this * point. */ @@ -1139,7 +1193,7 @@ ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id) * Add the pending mp to the list. There can be only 1 pending mp * in the list. Any exclusive ioctl that needs to wait for a response * from another module or driver needs to use this function to set - * the ipsq_pending_mp to the ioctl mblk and wait for the response from + * the ipx_pending_mp to the ioctl mblk and wait for the response from * the other module/driver. This is also used while waiting for the * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. */ @@ -1147,19 +1201,19 @@ boolean_t ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, int waitfor) { - ipsq_t *ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; + ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop; ASSERT(IAM_WRITER_IPIF(ipif)); ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); - ASSERT(ipsq->ipsq_pending_mp == NULL); + ASSERT(ipx->ipx_pending_mp == NULL); /* * The caller may be using a different ipif than the one passed into * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT - * that `ipsq_current_ipif == ipif'. + * that `ipx_current_ipif == ipif'. 
*/ - ASSERT(ipsq->ipsq_current_ipif != NULL); + ASSERT(ipx->ipx_current_ipif != NULL); /* * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls, @@ -1180,8 +1234,8 @@ ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, if (connp->conn_state_flags & CONN_CLOSING) return (B_FALSE); } - mutex_enter(&ipsq->ipsq_lock); - ipsq->ipsq_pending_ipif = ipif; + mutex_enter(&ipx->ipx_lock); + ipx->ipx_pending_ipif = ipif; /* * Note down the queue in b_queue. This will be returned by * ipsq_pending_mp_get. Caller will then use these values to restart @@ -1189,38 +1243,40 @@ ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, */ add_mp->b_next = NULL; add_mp->b_queue = q; - ipsq->ipsq_pending_mp = add_mp; - ipsq->ipsq_waitfor = waitfor; + ipx->ipx_pending_mp = add_mp; + ipx->ipx_waitfor = waitfor; + mutex_exit(&ipx->ipx_lock); if (connp != NULL) connp->conn_oper_pending_ill = ipif->ipif_ill; - mutex_exit(&ipsq->ipsq_lock); + return (B_TRUE); } /* - * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp + * Retrieve the ipx_pending_mp and return it. There can be only 1 mp * queued in the list. 
*/ mblk_t * ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) { mblk_t *curr = NULL; + ipxop_t *ipx = ipsq->ipsq_xop; - mutex_enter(&ipsq->ipsq_lock); *connpp = NULL; - if (ipsq->ipsq_pending_mp == NULL) { - mutex_exit(&ipsq->ipsq_lock); + mutex_enter(&ipx->ipx_lock); + if (ipx->ipx_pending_mp == NULL) { + mutex_exit(&ipx->ipx_lock); return (NULL); } /* There can be only 1 such excl message */ - curr = ipsq->ipsq_pending_mp; - ASSERT(curr != NULL && curr->b_next == NULL); - ipsq->ipsq_pending_ipif = NULL; - ipsq->ipsq_pending_mp = NULL; - ipsq->ipsq_waitfor = 0; - mutex_exit(&ipsq->ipsq_lock); + curr = ipx->ipx_pending_mp; + ASSERT(curr->b_next == NULL); + ipx->ipx_pending_ipif = NULL; + ipx->ipx_pending_mp = NULL; + ipx->ipx_waitfor = 0; + mutex_exit(&ipx->ipx_lock); if (CONN_Q(curr->b_queue)) { /* @@ -1237,7 +1293,7 @@ ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) } /* - * Cleanup the ioctl mp queued in ipsq_pending_mp + * Cleanup the ioctl mp queued in ipx_pending_mp * - Called in the ill_delete path * - Called in the M_ERROR or M_HANGUP path on the ill. * - Called in the conn close path. @@ -1246,48 +1302,41 @@ boolean_t ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) { mblk_t *mp; - ipsq_t *ipsq; + ipxop_t *ipx; queue_t *q; ipif_t *ipif; ASSERT(IAM_WRITER_ILL(ill)); - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); + ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; + /* - * If connp is null, unconditionally clean up the ipsq_pending_mp. + * If connp is null, unconditionally clean up the ipx_pending_mp. * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl * even if it is meant for another ill, since we have to enqueue - * a new mp now in ipsq_pending_mp to complete the ipif_down. + * a new mp now in ipx_pending_mp to complete the ipif_down. * If connp is non-null we are called from the conn close path. 
*/ - mp = ipsq->ipsq_pending_mp; + mutex_enter(&ipx->ipx_lock); + mp = ipx->ipx_pending_mp; if (mp == NULL || (connp != NULL && mp->b_queue != CONNP_TO_WQ(connp))) { - mutex_exit(&ipsq->ipsq_lock); + mutex_exit(&ipx->ipx_lock); return (B_FALSE); } - /* Now remove from the ipsq_pending_mp */ - ipsq->ipsq_pending_mp = NULL; + /* Now remove from the ipx_pending_mp */ + ipx->ipx_pending_mp = NULL; q = mp->b_queue; mp->b_next = NULL; mp->b_prev = NULL; mp->b_queue = NULL; - /* If MOVE was in progress, clear the move_in_progress fields also. */ - ill = ipsq->ipsq_pending_ipif->ipif_ill; - if (ill->ill_move_in_progress) { - ILL_CLEAR_MOVE(ill); - } else if (ill->ill_up_ipifs) { - ill_group_cleanup(ill); - } - - ipif = ipsq->ipsq_pending_ipif; - ipsq->ipsq_pending_ipif = NULL; - ipsq->ipsq_waitfor = 0; - ipsq->ipsq_current_ipif = NULL; - ipsq->ipsq_current_ioctl = 0; - ipsq->ipsq_current_done = B_TRUE; - mutex_exit(&ipsq->ipsq_lock); + ipif = ipx->ipx_pending_ipif; + ipx->ipx_pending_ipif = NULL; + ipx->ipx_waitfor = 0; + ipx->ipx_current_ipif = NULL; + ipx->ipx_current_ioctl = 0; + ipx->ipx_current_done = B_TRUE; + mutex_exit(&ipx->ipx_lock); if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { if (connp == NULL) { @@ -1437,7 +1486,7 @@ conn_ioctl_cleanup(conn_t *connp) * Is any exclusive ioctl pending ? If so clean it up. If the * ioctl has not yet started, the mp is pending in the list headed by * ipsq_xopq_head. If the ioctl has started the mp could be present in - * ipsq_pending_mp. If the ioctl timed out in the streamhead but + * ipx_pending_mp. If the ioctl timed out in the streamhead but * is currently executing now the mp is not queued anywhere but * conn_oper_pending_ill is null. The conn close will wait * till the conn_ref drops to zero. @@ -1468,9 +1517,9 @@ conn_ioctl_cleanup(conn_t *connp) ill_waiter_dcr(ill); /* * Check whether this ioctl has started and is - * pending now in ipsq_pending_mp. 
If it is not - * found there then check whether this ioctl has - * not even started and is in the ipsq_xopq list. + * pending. If it is not found there then check + * whether this ioctl has not even started and is in + * the ipsq_xopq list. */ if (!ipsq_pending_mp_cleanup(ill, connp)) ipsq_xopq_mp_cleanup(ill, connp); @@ -1506,16 +1555,11 @@ conn_cleanup_ill(conn_t *connp, caddr_t arg) if (connp->conn_multicast_ill == ill) { /* Revert to late binding */ connp->conn_multicast_ill = NULL; - connp->conn_orig_multicast_ifindex = 0; } if (connp->conn_incoming_ill == ill) connp->conn_incoming_ill = NULL; if (connp->conn_outgoing_ill == ill) connp->conn_outgoing_ill = NULL; - if (connp->conn_outgoing_pill == ill) - connp->conn_outgoing_pill = NULL; - if (connp->conn_nofailover_ill == ill) - connp->conn_nofailover_ill = NULL; if (connp->conn_dhcpinit_ill == ill) { connp->conn_dhcpinit_ill = NULL; ASSERT(ill->ill_dhcpinit != 0); @@ -1524,11 +1568,11 @@ conn_cleanup_ill(conn_t *connp, caddr_t arg) if (connp->conn_ire_cache != NULL) { ire = connp->conn_ire_cache; /* - * ip_newroute creates IRE_CACHE with ire_stq coming from - * interface X and ipif coming from interface Y, if interface - * X and Y are part of the same IPMPgroup. Thus whenever - * interface X goes down, remove all references to it by - * checking both on ire_ipif and ire_stq. + * Source address selection makes it possible for IRE_CACHE + * entries to be created with ire_stq coming from interface X + * and ipif coming from interface Y. Thus whenever interface + * X goes down, remove all references to it by checking both + * on ire_ipif and ire_stq. */ if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || (ire->ire_type == IRE_CACHE && @@ -1601,14 +1645,10 @@ ill_down(ill_t *ill) ip_stack_t *ipst = ill->ill_ipst; /* Blow off any IREs dependent on this ILL. 
*/ - ire_walk(ill_downi, (char *)ill, ipst); + ire_walk(ill_downi, ill, ipst); /* Remove any conn_*_ill depending on this ill */ ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst); - - if (ill->ill_group != NULL) { - illgrp_delete(ill); - } } /* @@ -1621,9 +1661,9 @@ ill_downi(ire_t *ire, char *ill_arg) ill_t *ill = (ill_t *)ill_arg; /* - * ip_newroute creates IRE_CACHE with ire_stq coming from - * interface X and ipif coming from interface Y, if interface - * X and Y are part of the same IPMP group. Thus whenever interface + * Source address selection makes it possible for IRE_CACHE + * entries to be created with ire_stq coming from interface X + * and ipif coming from interface Y. Thus whenever interface * X goes down, remove all references to it by checking both * on ire_ipif and ire_stq. */ @@ -3696,16 +3736,39 @@ nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, } /* - * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an - * IPMP group, make sure all ill's in the group adopt the new policy. Send - * up RTS_IFINFO routing socket messages for each interface whose flags we - * change. + * Helper function for ill_forward_set(). + */ +static void +ill_forward_set_on_ill(ill_t *ill, boolean_t enable) +{ + ip_stack_t *ipst = ill->ill_ipst; + + ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); + + ip1dbg(("ill_forward_set: %s %s forwarding on %s", + (enable ? "Enabling" : "Disabling"), + (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); + mutex_enter(&ill->ill_lock); + if (enable) + ill->ill_flags |= ILLF_ROUTER; + else + ill->ill_flags &= ~ILLF_ROUTER; + mutex_exit(&ill->ill_lock); + if (ill->ill_isv6) + ill_set_nce_router_flags(ill, enable); + /* Notify routing socket listeners of this change. */ + ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); +} + +/* + * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing + * socket messages for each interface whose flags we change. 
*/ int ill_forward_set(ill_t *ill, boolean_t enable) { - ill_group_t *illgrp; - ip_stack_t *ipst = ill->ill_ipst; + ipmp_illgrp_t *illg; + ip_stack_t *ipst = ill->ill_ipst; ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); @@ -3716,47 +3779,23 @@ ill_forward_set(ill_t *ill, boolean_t enable) if (IS_LOOPBACK(ill)) return (EINVAL); - /* - * If the ill is in an IPMP group, set the forwarding policy on all - * members of the group to the same value. - */ - illgrp = ill->ill_group; - if (illgrp != NULL) { - ill_t *tmp_ill; + if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { + /* + * Update all of the interfaces in the group. + */ + illg = ill->ill_grp; + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) + ill_forward_set_on_ill(ill, enable); - for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL; - tmp_ill = tmp_ill->ill_group_next) { - ip1dbg(("ill_forward_set: %s %s forwarding on %s", - (enable ? "Enabling" : "Disabling"), - (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"), - tmp_ill->ill_name)); - mutex_enter(&tmp_ill->ill_lock); - if (enable) - tmp_ill->ill_flags |= ILLF_ROUTER; - else - tmp_ill->ill_flags &= ~ILLF_ROUTER; - mutex_exit(&tmp_ill->ill_lock); - if (tmp_ill->ill_isv6) - ill_set_nce_router_flags(tmp_ill, enable); - /* Notify routing socket listeners of this change. */ - ip_rts_ifmsg(tmp_ill->ill_ipif); - } - } else { - ip1dbg(("ill_forward_set: %s %s forwarding on %s", - (enable ? "Enabling" : "Disabling"), - (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); - mutex_enter(&ill->ill_lock); - if (enable) - ill->ill_flags |= ILLF_ROUTER; - else - ill->ill_flags &= ~ILLF_ROUTER; - mutex_exit(&ill->ill_lock); - if (ill->ill_isv6) - ill_set_nce_router_flags(ill, enable); - /* Notify routing socket listeners of this change. */ - ip_rts_ifmsg(ill->ill_ipif); + /* + * Update the IPMP meta-interface. 
+ */ + ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); + return (0); } + ill_forward_set_on_ill(ill, enable); return (0); } @@ -3772,7 +3811,12 @@ ill_set_nce_router_flags(ill_t *ill, boolean_t enable) nce_t *nce; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE); + /* + * NOTE: we're called separately for each ill in an illgrp, + * so don't match across the illgrp. + */ + nce = ndp_lookup_v6(ill, B_FALSE, &ipif->ipif_v6lcl_addr, + B_FALSE); if (nce != NULL) { mutex_enter(&nce->nce_lock); if (enable) @@ -3928,36 +3972,45 @@ ill_next(ill_walk_context_t *ctx, ill_t *lastill) } /* - * Check interface name for correct format which is name+ppa. - * name can contain characters and digits, the right most digits - * make up the ppa number. use of octal is not allowed, name must contain - * a ppa, return pointer to the start of ppa. - * In case of error return NULL. + * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+ + * The final number (PPA) must not have any leading zeros. Upon success, a + * pointer to the start of the PPA is returned; otherwise NULL is returned. */ static char * ill_get_ppa_ptr(char *name) { - int namelen = mi_strlen(name); + int namelen = strlen(name); + int end_ndx = namelen - 1; + int ppa_ndx, i; - int len = namelen; + /* + * Check that the first character is [a-zA-Z], and that the last + * character is [0-9]. + */ + if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx])) + return (NULL); - name += len; - while (len > 0) { - name--; - if (*name < '0' || *name > '9') + /* + * Set `ppa_ndx' to the PPA start, and check for leading zeroes. 
+ */ + for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--) + if (!isdigit(name[ppa_ndx - 1])) break; - len--; - } - /* empty string, all digits, or no trailing digits */ - if (len == 0 || len == (int)namelen) + if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx) return (NULL); - name++; - /* check for attempted use of octal */ - if (*name == '0' && len != (int)namelen - 1) - return (NULL); - return (name); + /* + * Check that the intermediate characters are [a-zA-Z0-9._] + */ + for (i = 1; i < ppa_ndx; i++) { + if (!isalpha(name[i]) && !isdigit(name[i]) && + name[i] != '.' && name[i] != '_') { + return (NULL); + } + } + + return (name + ppa_ndx); } /* @@ -4037,8 +4090,10 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, } else if (ILL_CAN_WAIT(ill, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -4102,6 +4157,7 @@ static void ill_glist_delete(ill_t *ill) { ip_stack_t *ipst; + phyint_t *phyi; if (ill == NULL) return; @@ -4139,8 +4195,41 @@ ill_glist_delete(ill_t *ill) ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name, ill->ill_name_length); - ill_phyint_free(ill); + ASSERT(ill->ill_phyint != NULL); + phyi = ill->ill_phyint; + ill->ill_phyint = NULL; + + /* + * ill_init allocates a phyint always to store the copy + * of flags relevant to phyint. At that point in time, we could + * not assign the name and hence phyint_illv4/v6 could not be + * initialized. Later in ipif_set_values, we assign the name to + * the ill, at which point in time we assign phyint_illv4/v6. + * Thus we don't rely on phyint_illv6 to be initialized always. 
+ */ + if (ill->ill_flags & ILLF_IPV6) + phyi->phyint_illv6 = NULL; + else + phyi->phyint_illv4 = NULL; + + if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + return; + } + + /* + * There are no ills left on this phyint; pull it out of the phyint + * avl trees, and free it. + */ + if (phyi->phyint_ifindex > 0) { + avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + phyi); + avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, + phyi); + } rw_exit(&ipst->ips_ill_g_lock); + + phyint_free(phyi); } /* @@ -4367,30 +4456,32 @@ ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) return (0); } -/* Initialize the per phyint (per IPMP group) ipsq used for serialization */ +/* Initialize the per phyint ipsq used for serialization */ static boolean_t -ipsq_init(ill_t *ill) +ipsq_init(ill_t *ill, boolean_t enter) { ipsq_t *ipsq; + ipxop_t *ipx; - /* Init the ipsq and impicitly enter as writer */ - ill->ill_phyint->phyint_ipsq = - kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); - if (ill->ill_phyint->phyint_ipsq == NULL) + if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL) return (B_FALSE); - ipsq = ill->ill_phyint->phyint_ipsq; - ipsq->ipsq_phyint_list = ill->ill_phyint; - ill->ill_phyint->phyint_ipsq_next = NULL; + + ill->ill_phyint->phyint_ipsq = ipsq; + ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop; + ipx->ipx_ipsq = ipsq; + ipsq->ipsq_next = ipsq; + ipsq->ipsq_phyint = ill->ill_phyint; mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); - ipsq->ipsq_refs = 1; - ipsq->ipsq_writer = curthread; - ipsq->ipsq_reentry_cnt = 1; + mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0); ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ + if (enter) { + ipx->ipx_writer = curthread; + ipx->ipx_forced = B_FALSE; + ipx->ipx_reentry_cnt = 1; #ifdef DEBUG - ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack, - IPSQ_STACK_DEPTH); + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); #endif - 
(void) strcpy(ipsq->ipsq_name, ill->ill_name); + } return (B_TRUE); } @@ -4468,7 +4559,7 @@ ill_init(queue_t *q, ill_t *ill) ill->ill_ppa = UINT_MAX; ill->ill_fastpath_list = &ill->ill_fastpath_list; - if (!ipsq_init(ill)) { + if (!ipsq_init(ill, B_TRUE)) { freemsg(info_mp); mi_free(frag_ptr); mi_free(ill->ill_phyint); @@ -4589,29 +4680,16 @@ loopback_kstat_update(kstat_t *ksp, int rw) } /* - * Has ifindex been plumbed already. - * Compares both phyint_ifindex and phyint_group_ifindex. + * Has ifindex been plumbed already? */ static boolean_t phyint_exists(uint_t index, ip_stack_t *ipst) { - phyint_t *phyi; - ASSERT(index != 0); ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - /* - * Indexes are stored in the phyint - a common structure - * to both IPv4 and IPv6. - */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - if (phyi->phyint_ifindex == index || - phyi->phyint_group_ifindex == index) - return (B_TRUE); - } - return (B_FALSE); + + return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + &index, NULL) != NULL); } /* Pick a unique ifindex */ @@ -4675,9 +4753,9 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, { ill_t *ill; ipif_t *ipif; + ipsq_t *ipsq; kstat_named_t *kn; boolean_t isloopback; - ipsq_t *old_ipsq; in6_addr_t ov6addr; isloopback = mi_strcmp(name, ipif_loopback_name) == 0; @@ -4761,16 +4839,10 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, ill->ill_net_type = IRE_LOOPBACK; /* Initialize the ipsq */ - if (!ipsq_init(ill)) + if (!ipsq_init(ill, B_FALSE)) goto done; - ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL; - ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--; - ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0); -#ifdef DEBUG - ill->ill_phyint->phyint_ipsq->ipsq_depth = 0; -#endif - ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE); + ipif 
= ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE); if (ipif == NULL) goto done; @@ -4807,7 +4879,7 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, ill->ill_frag_free_num_pkts = 0; ill->ill_last_frag_clean_time = 0; - old_ipsq = ill->ill_phyint->phyint_ipsq; + ipsq = ill->ill_phyint->phyint_ipsq; if (ill_glist_insert(ill, "lo", isv6) != 0) cmn_err(CE_PANIC, "cannot insert loopback interface"); @@ -4824,13 +4896,11 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, sctp_update_ipif_addr(ipif, ov6addr); /* - * If the ipsq was changed in ill_phyint_reinit free the old ipsq. + * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs. + * If so, free our original one. */ - if (old_ipsq != ill->ill_phyint->phyint_ipsq) { - /* Loopback ills aren't in any IPMP group */ - ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP)); - ipsq_delete(old_ipsq); - } + if (ipsq != ill->ill_phyint->phyint_ipsq) + ipsq_delete(ipsq); /* * Delay this till the ipif is allocated as ipif_allocate @@ -4871,12 +4941,10 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, done: if (ill != NULL) { if (ill->ill_phyint != NULL) { - ipsq_t *ipsq; - ipsq = ill->ill_phyint->phyint_ipsq; if (ipsq != NULL) { - ipsq->ipsq_ipst = NULL; - kmem_free(ipsq, sizeof (ipsq_t)); + ipsq->ipsq_phyint = NULL; + ipsq_delete(ipsq); } mi_free(ill->ill_phyint); } @@ -4954,9 +5022,11 @@ ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, } else if (ILL_CAN_WAIT(ill, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); rw_exit(&ipst->ips_ill_g_lock); mutex_exit(&ill->ill_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (err != NULL) @@ -5294,6 +5364,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) dl_info_ack_t *dlia; ip_m_t *ipm; dl_qos_cl_sel1_t *sel1; + int min_mtu; 
ASSERT(IAM_WRITER_ILL(ill)); @@ -5336,7 +5407,14 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) ill->ill_bcast_addr_length = brdcst_addr_length; ill->ill_phys_addr_length = phys_addr_length; ill->ill_sap_length = sap_length; - ill->ill_max_frag = dlia->dl_max_sdu; + + /* + * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU, + * but we must ensure a minimum IP MTU is used since other bits of + * IP will fly apart otherwise. + */ + min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; + ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); ill->ill_max_mtu = ill->ill_max_frag; ill->ill_type = ipm->ip_m_type; @@ -5358,7 +5436,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) * the wakeup. */ (void) ipif_allocate(ill, 0, IRE_LOCAL, - dlia->dl_provider_style == DL_STYLE2 ? B_FALSE : B_TRUE); + dlia->dl_provider_style != DL_STYLE2, B_TRUE); mutex_enter(&ill->ill_lock); ASSERT(ill->ill_dlpi_style_set == 0); ill->ill_dlpi_style_set = 1; @@ -5397,8 +5475,13 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) /* * Free ill_resolver_mp and ill_bcast_mp as things could have * changed now. + * + * NOTE: The IPMP meta-interface is special-cased because it starts + * with no underlying interfaces (and thus an unknown broadcast + * address length), but we enforce that an interface is broadcast- + * capable as part of allowing it to join a group. 
*/ - if (ill->ill_bcast_addr_length == 0) { + if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) { if (ill->ill_resolver_mp != NULL) freemsg(ill->ill_resolver_mp); if (ill->ill_bcast_mp != NULL) @@ -5451,6 +5534,11 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) if (!ill->ill_isv6) ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; } + + /* For IPMP, PHYI_IPMP should already be set by ipif_allocate() */ + if (ill->ill_mactype == SUNW_DL_IPMP) + ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP); + /* By default an interface does not support any CoS marking */ ill->ill_flags &= ~ILLF_COS_ENABLED; @@ -5552,16 +5640,18 @@ ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) } /* - * Find any non-virtual, not condemned, and up multicast capable interface - * given an IP instance and zoneid. Order of preference is: + * Find a multicast-capable ipif given an IP instance and zoneid. + * The ipif must be up, and its ill must be multicast-capable, not + * condemned, not an underlying interface in an IPMP group, and + * not a VNI interface. Order of preference: * - * 1. normal - * 1.1 normal, but deprecated - * 2. point to point - * 2.1 point to point, but deprecated - * 3. link local - * 3.1 link local, but deprecated - * 4. loopback. + * 1a. normal + * 1b. normal, but deprecated + * 2a. point to point + * 2b. point to point, but deprecated + * 3a. link local + * 3b. link local, but deprecated + * 4. loopback. 
*/ ipif_t * ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) @@ -5580,7 +5670,7 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) for (; ill != NULL; ill = ill_next(&ctx, ill)) { mutex_enter(&ill->ill_lock); - if (IS_VNI(ill) || !ILL_CAN_LOOKUP(ill) || + if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || !ILL_CAN_LOOKUP(ill) || !(ill->ill_flags & ILLF_MULTICAST)) { mutex_exit(&ill->ill_lock); continue; @@ -5736,10 +5826,12 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -5761,15 +5853,12 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, } /* - * Look for an ipif with the specified address. For point-point links - * we look for matches on either the destination address and the local - * address, but we ignore the check on the local address if IPIF_UNNUMBERED - * is set. - * Matches on a specific ill if match_ill is set. + * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). 
*/ -ipif_t * -ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, - mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +static ipif_t * +ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, boolean_t match_illgrp, + zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, + ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; @@ -5788,7 +5877,8 @@ ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, repeat: ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (match_ill != NULL && ill != match_ill) { + if (match_ill != NULL && ill != match_ill && + (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { continue; } GRAB_CONN_LOCK(q); @@ -5817,10 +5907,12 @@ repeat: } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -5894,11 +5986,40 @@ ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) } /* + * Lookup an ipif with the specified address. For point-to-point links we + * look for matches on either the destination address or the local address, + * but we skip the local address check if IPIF_UNNUMBERED is set. If the + * `match_ill' argument is non-NULL, the lookup is restricted to that ill + * (or illgrp if `match_ill' is in an IPMP group). + */ +ipif_t * +ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, + mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +{ + return (ipif_lookup_addr_common(addr, match_ill, B_TRUE, zoneid, q, mp, + func, error, ipst)); +} + +/* + * Special abbreviated version of ipif_lookup_addr() that doesn't match + * `match_ill' across the IPMP group. 
This function is only needed in some + * corner-cases; almost everything should use ipif_lookup_addr(). + */ +static ipif_t * +ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) +{ + ASSERT(match_ill != NULL); + return (ipif_lookup_addr_common(addr, match_ill, B_FALSE, ALL_ZONES, + NULL, NULL, NULL, NULL, ipst)); +} + +/* * Look for an ipif with the specified address. For point-point links * we look for matches on either the destination address and the local * address, but we ignore the check on the local address if IPIF_UNNUMBERED * is set. - * Matches on a specific ill if match_ill is set. + * If the `match_ill' argument is non-NULL, the lookup is restricted to that + * ill (or illgrp if `match_ill' is in an IPMP group). * Return the zoneid for the ipif which matches. ALL_ZONES if no match. */ zoneid_t @@ -5918,7 +6039,8 @@ ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) repeat: ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (match_ill != NULL && ill != match_ill) { + if (match_ill != NULL && ill != match_ill && + !IS_IN_SAME_ILLGRP(ill, match_ill)) { continue; } mutex_enter(&ill->ill_lock); @@ -6008,7 +6130,7 @@ ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) /* * The callers of this function wants to know the * interface on which they have to send the replies - * back. For IRE_CACHES that have ire_stq and ire_ipif + * back. For IREs that have ire_stq and ire_ipif * derived from different ills, we really don't care * what we return here. */ @@ -6109,30 +6231,6 @@ ipif_is_freeable(ipif_t *ipif) } /* - * This func does not prevent refcnt from increasing. But if - * the caller has taken steps to that effect, then this func - * can be used to determine whether the ipifs marked with IPIF_MOVING - * have become quiescent and can be moved in a failover/failback. 
- */ -static ipif_t * -ill_quiescent_to_move(ill_t *ill) -{ - ipif_t *ipif; - - ASSERT(MUTEX_HELD(&ill->ill_lock)); - - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ipif->ipif_state_flags & IPIF_MOVING) { - if (ipif->ipif_refcnt != 0 || - !IPIF_DOWN_OK(ipif)) { - return (ipif); - } - } - } - return (NULL); -} - -/* * The ipif/ill/ire has been refreled. Do the tail processing. * Determine if the ipif or ill in question has become quiescent and if so * wakeup close and/or restart any queued pending ioctl that is waiting @@ -6144,87 +6242,61 @@ ipif_ill_refrele_tail(ill_t *ill) mblk_t *mp; conn_t *connp; ipsq_t *ipsq; + ipxop_t *ipx; ipif_t *ipif; dl_notify_ind_t *dlindp; ASSERT(MUTEX_HELD(&ill->ill_lock)); - if ((ill->ill_state_flags & ILL_CONDEMNED) && - ill_is_freeable(ill)) { - /* ill_close may be waiting */ + if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) { + /* ip_modclose() may be waiting */ cv_broadcast(&ill->ill_cv); } - /* ipsq can't change because ill_lock is held */ ipsq = ill->ill_phyint->phyint_ipsq; - if (ipsq->ipsq_waitfor == 0) { - /* Not waiting for anything, just return. */ - mutex_exit(&ill->ill_lock); - return; - } - ASSERT(ipsq->ipsq_pending_mp != NULL && - ipsq->ipsq_pending_ipif != NULL); - /* - * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF. - * Last ipif going down needs to down the ill, so ill_ire_cnt must - * be zero for restarting an ioctl that ends up downing the ill. - */ - ipif = ipsq->ipsq_pending_ipif; - if (ipif->ipif_ill != ill) { - /* The ioctl is pending on some other ill. 
*/ - mutex_exit(&ill->ill_lock); - return; - } + mutex_enter(&ipsq->ipsq_lock); + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */ + goto unlock; + + ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL); + + ipif = ipx->ipx_pending_ipif; + if (ipif->ipif_ill != ill) /* wait is for another ill; bail */ + goto unlock; - switch (ipsq->ipsq_waitfor) { + switch (ipx->ipx_waitfor) { case IPIF_DOWN: - if (!ipif_is_quiescent(ipif)) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ipif_is_quiescent(ipif)) + goto unlock; break; case IPIF_FREE: - if (!ipif_is_freeable(ipif)) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ipif_is_freeable(ipif)) + goto unlock; break; - case ILL_DOWN: - if (!ill_is_quiescent(ill)) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ill_is_quiescent(ill)) + goto unlock; break; case ILL_FREE: /* - * case ILL_FREE arises only for loopback. otherwise ill_delete - * waits synchronously in ip_close, and no message is queued in - * ipsq_pending_mp at all in this case + * ILL_FREE is only for loopback; normal ill teardown waits + * synchronously in ip_modclose() without using ipx_waitfor, + * handled by the cv_broadcast() at the top of this function. 
*/ - if (!ill_is_freeable(ill)) { - mutex_exit(&ill->ill_lock); - return; - } - break; - - case ILL_MOVE_OK: - if (ill_quiescent_to_move(ill) != NULL) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ill_is_freeable(ill)) + goto unlock; break; default: - cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n", - (void *)ipsq, ipsq->ipsq_waitfor); + cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n", + (void *)ipsq, ipx->ipx_waitfor); } - /* - * Incr refcnt for the qwriter_ip call below which - * does a refrele - */ - ill_refhold_locked(ill); + ill_refhold_locked(ill); /* for qwriter_ip() call below */ + mutex_exit(&ipx->ipx_lock); mp = ipsq_pending_mp_get(ipsq, &connp); + mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); ASSERT(mp != NULL); @@ -6249,6 +6321,7 @@ ipif_ill_refrele_tail(ill_t *ill) return; default: ASSERT(0); + ill_refrele(ill); } break; @@ -6268,6 +6341,11 @@ ipif_ill_refrele_tail(ill_t *ill) cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " "db_type %d\n", (void *)mp, mp->b_datap->db_type); } + return; +unlock: + mutex_exit(&ipsq->ipsq_lock); + mutex_exit(&ipx->ipx_lock); + mutex_exit(&ill->ill_lock); } #ifdef DEBUG @@ -6902,10 +6980,23 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, ipif = ipif_arg; if (ipif_arg != NULL) match_flags |= MATCH_IRE_ILL; +again: gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); - if (gw_ire == NULL) + if (gw_ire == NULL) { + /* + * With IPMP, we allow host routes to influence in.mpathd's + * target selection. However, if the test addresses are on + * their own network, the above lookup will fail since the + * underlying IRE_INTERFACEs are marked hidden. So allow + * hidden test IREs to be found and try again. 
+ */ + if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) { + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + goto again; + } return (ENETUNREACH); + } /* * We create one of three types of IREs as a result of this request @@ -7355,9 +7446,11 @@ void ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, ill_t *pending_ill) { - conn_t *connp = NULL; + conn_t *connp; + ipxop_t *ipx = ipsq->ipsq_xop; ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); + ASSERT(MUTEX_HELD(&ipx->ipx_lock)); ASSERT(func != NULL); mp->b_queue = q; @@ -7366,14 +7459,14 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, switch (type) { case CUR_OP: - if (ipsq->ipsq_mptail != NULL) { - ASSERT(ipsq->ipsq_mphead != NULL); - ipsq->ipsq_mptail->b_next = mp; + if (ipx->ipx_mptail != NULL) { + ASSERT(ipx->ipx_mphead != NULL); + ipx->ipx_mptail->b_next = mp; } else { - ASSERT(ipsq->ipsq_mphead == NULL); - ipsq->ipsq_mphead = mp; + ASSERT(ipx->ipx_mphead == NULL); + ipx->ipx_mphead = mp; } - ipsq->ipsq_mptail = mp; + ipx->ipx_mptail = mp; break; case NEW_OP: @@ -7385,6 +7478,15 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, ipsq->ipsq_xopq_mphead = mp; } ipsq->ipsq_xopq_mptail = mp; + ipx->ipx_ipsq_queued = B_TRUE; + break; + + case SWITCH_OP: + ASSERT(ipsq->ipsq_swxop != NULL); + /* only one switch operation is currently allowed */ + ASSERT(ipsq->ipsq_switch_mp == NULL); + ipsq->ipsq_switch_mp = mp; + ipx->ipx_ipsq_queued = B_TRUE; break; default: cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); @@ -7392,55 +7494,273 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, if (CONN_Q(q) && pending_ill != NULL) { connp = Q_TO_CONN(q); - ASSERT(MUTEX_HELD(&connp->conn_lock)); connp->conn_oper_pending_ill = pending_ill; } } /* - * Return the mp at the head of the ipsq. After emptying the ipsq - * look at the next ioctl, if this ioctl is complete. Otherwise - * return, we will resume when we complete the current ioctl. 
- * The current ioctl will wait till it gets a response from the - * driver below. + * Dequeue the next message that requested exclusive access to this IPSQ's + * xop. Specifically: + * + * 1. If we're still processing the current operation on `ipsq', then + * dequeue the next message for the operation (from ipx_mphead), or + * return NULL if there are no queued messages for the operation. + * These messages are queued via CUR_OP to qwriter_ip() and friends. + * + * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is + * not set) see if the ipsq has requested an xop switch. If so, switch + * `ipsq' to a different xop. Xop switches only happen when joining or + * leaving IPMP groups and require a careful dance -- see the comments + * in-line below for details. If we're leaving a group xop or if we're + * joining a group xop and become writer on it, then we proceed to (3). + * Otherwise, we return NULL and exit the xop. + * + * 3. For each IPSQ in the xop, return any switch operation stored on + * ipsq_switch_mp (set via SWITCH_OP); these must be processed before + * any other messages queued on the IPSQ. Otherwise, dequeue the next + * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead. + * Note that if the phyint tied to `ipsq' is not using IPMP there will + * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for + * each phyint in the group, including the IPMP meta-interface phyint. */ static mblk_t * ipsq_dq(ipsq_t *ipsq) { + ill_t *illv4, *illv6; mblk_t *mp; + ipsq_t *xopipsq; + ipsq_t *leftipsq = NULL; + ipxop_t *ipx; + phyint_t *phyi = ipsq->ipsq_phyint; + ip_stack_t *ipst = ipsq->ipsq_ipst; + boolean_t emptied = B_FALSE; - ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); + /* + * Grab all the locks we need in the defined order (ill_g_lock -> + * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next. + */ + rw_enter(&ipst->ips_ill_g_lock, + ipsq->ipsq_swxop != NULL ? 
RW_WRITER : RW_READER); + mutex_enter(&ipsq->ipsq_lock); + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); - mp = ipsq->ipsq_mphead; - if (mp != NULL) { - ipsq->ipsq_mphead = mp->b_next; - if (ipsq->ipsq_mphead == NULL) - ipsq->ipsq_mptail = NULL; - mp->b_next = NULL; - return (mp); + /* + * Dequeue the next message associated with the current exclusive + * operation, if any. + */ + if ((mp = ipx->ipx_mphead) != NULL) { + ipx->ipx_mphead = mp->b_next; + if (ipx->ipx_mphead == NULL) + ipx->ipx_mptail = NULL; + mp->b_next = (void *)ipsq; + goto out; } - if (ipsq->ipsq_current_ipif != NULL) - return (NULL); - mp = ipsq->ipsq_xopq_mphead; - if (mp != NULL) { - ipsq->ipsq_xopq_mphead = mp->b_next; - if (ipsq->ipsq_xopq_mphead == NULL) - ipsq->ipsq_xopq_mptail = NULL; - mp->b_next = NULL; - return (mp); + + if (ipx->ipx_current_ipif != NULL) + goto empty; + + if (ipsq->ipsq_swxop != NULL) { + /* + * The exclusive operation that is now being completed has + * requested a switch to a different xop. This happens + * when an interface joins or leaves an IPMP group. Joins + * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()). + * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb + * (phyint_free()), or interface plumb for an ill type + * not in the IPMP group (ip_rput_dlpi_writer()). + * + * Xop switches are not allowed on the IPMP meta-interface. + */ + ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP)); + ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); + DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq); + + if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) { + /* + * We're switching back to our own xop, so we have two + * xop's to drain/exit: our own, and the group xop + * that we are leaving. + * + * First, pull ourselves out of the group ipsq list. + * This is safe since we're writer on ill_g_lock. 
+ */ + ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop); + + xopipsq = ipx->ipx_ipsq; + while (xopipsq->ipsq_next != ipsq) + xopipsq = xopipsq->ipsq_next; + + xopipsq->ipsq_next = ipsq->ipsq_next; + ipsq->ipsq_next = ipsq; + ipsq->ipsq_xop = ipsq->ipsq_swxop; + ipsq->ipsq_swxop = NULL; + + /* + * Second, prepare to exit the group xop. The actual + * ipsq_exit() is done at the end of this function + * since we cannot hold any locks across ipsq_exit(). + * Note that although we drop the group's ipx_lock, no + * threads can proceed since we're still ipx_writer. + */ + leftipsq = xopipsq; + mutex_exit(&ipx->ipx_lock); + + /* + * Third, set ipx to point to our own xop (which was + * inactive and therefore can be entered). + */ + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + ASSERT(ipx->ipx_writer == NULL); + ASSERT(ipx->ipx_current_ipif == NULL); + } else { + /* + * We're switching from our own xop to a group xop. + * The requestor of the switch must ensure that the + * group xop cannot go away (e.g. by ensuring the + * phyint associated with the xop cannot go away). + * + * If we can become writer on our new xop, then we'll + * do the drain. Otherwise, the current writer of our + * new xop will do the drain when it exits. + * + * First, splice ourselves into the group IPSQ list. + * This is safe since we're writer on ill_g_lock. + */ + ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); + + xopipsq = ipsq->ipsq_swxop->ipx_ipsq; + while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq) + xopipsq = xopipsq->ipsq_next; + + xopipsq->ipsq_next = ipsq; + ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq; + ipsq->ipsq_xop = ipsq->ipsq_swxop; + ipsq->ipsq_swxop = NULL; + + /* + * Second, exit our own xop, since it's now unused. + * This is safe since we've got the only reference. 
+ */ + ASSERT(ipx->ipx_writer == curthread); + ipx->ipx_writer = NULL; + VERIFY(--ipx->ipx_reentry_cnt == 0); + ipx->ipx_ipsq_queued = B_FALSE; + mutex_exit(&ipx->ipx_lock); + + /* + * Third, set ipx to point to our new xop, and check + * if we can become writer on it. If we cannot, then + * the current writer will drain the IPSQ group when + * it exits. Our ipsq_xop is guaranteed to be stable + * because we're still holding ipsq_lock. + */ + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + if (ipx->ipx_writer != NULL || + ipx->ipx_current_ipif != NULL) { + goto out; + } + } + + /* + * Fourth, become writer on our new ipx before we continue + * with the drain. Note that we never dropped ipsq_lock + * above, so no other thread could've raced with us to + * become writer first. Also, we're holding ipx_lock, so + * no other thread can examine the ipx right now. + */ + ASSERT(ipx->ipx_current_ipif == NULL); + ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); + VERIFY(ipx->ipx_reentry_cnt++ == 0); + ipx->ipx_writer = curthread; + ipx->ipx_forced = B_FALSE; +#ifdef DEBUG + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); +#endif } - return (NULL); + + xopipsq = ipsq; + do { + /* + * So that other operations operate on a consistent and + * complete phyint, a switch message on an IPSQ must be + * handled prior to any other operations on that IPSQ. + */ + if ((mp = xopipsq->ipsq_switch_mp) != NULL) { + xopipsq->ipsq_switch_mp = NULL; + ASSERT(mp->b_next == NULL); + mp->b_next = (void *)xopipsq; + goto out; + } + + if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) { + xopipsq->ipsq_xopq_mphead = mp->b_next; + if (xopipsq->ipsq_xopq_mphead == NULL) + xopipsq->ipsq_xopq_mptail = NULL; + mp->b_next = (void *)xopipsq; + goto out; + } + } while ((xopipsq = xopipsq->ipsq_next) != ipsq); +empty: + /* + * There are no messages. Further, we are holding ipx_lock, hence no + * new messages can end up on any IPSQ in the xop. 
+ */ + ipx->ipx_writer = NULL; + ipx->ipx_forced = B_FALSE; + VERIFY(--ipx->ipx_reentry_cnt == 0); + ipx->ipx_ipsq_queued = B_FALSE; + emptied = B_TRUE; +#ifdef DEBUG + ipx->ipx_depth = 0; +#endif +out: + mutex_exit(&ipx->ipx_lock); + mutex_exit(&ipsq->ipsq_lock); + + /* + * If we completely emptied the xop, then wake up any threads waiting + * to enter any of the IPSQ's associated with it. + */ + if (emptied) { + xopipsq = ipsq; + do { + if ((phyi = xopipsq->ipsq_phyint) == NULL) + continue; + + illv4 = phyi->phyint_illv4; + illv6 = phyi->phyint_illv6; + + GRAB_ILL_LOCKS(illv4, illv6); + if (illv4 != NULL) + cv_broadcast(&illv4->ill_cv); + if (illv6 != NULL) + cv_broadcast(&illv6->ill_cv); + RELEASE_ILL_LOCKS(illv4, illv6); + } while ((xopipsq = xopipsq->ipsq_next) != ipsq); + } + rw_exit(&ipst->ips_ill_g_lock); + + /* + * Now that all locks are dropped, exit the IPSQ we left. + */ + if (leftipsq != NULL) + ipsq_exit(leftipsq); + + return (mp); } /* * Enter the ipsq corresponding to ill, by waiting synchronously till * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq * will have to drain completely before ipsq_enter returns success. - * ipsq_current_ipif will be set if some exclusive ioctl is in progress, - * and the ipsq_exit logic will start the next enqueued ioctl after - * completion of the current ioctl. If 'force' is used, we don't wait - * for the enqueued ioctls. This is needed when a conn_close wants to + * ipx_current_ipif will be set if some exclusive op is in progress, + * and the ipsq_exit logic will start the next enqueued op after + * completion of the current op. If 'force' is used, we don't wait + * for the enqueued ops. This is needed when a conn_close wants to * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb * of an ill can also use this option. But we dont' use it currently. 
*/ @@ -7449,13 +7769,16 @@ boolean_t ipsq_enter(ill_t *ill, boolean_t force, int type) { ipsq_t *ipsq; + ipxop_t *ipx; boolean_t waited_enough = B_FALSE; /* - * Holding the ill_lock prevents <ill-ipsq> assocs from changing. - * Since the <ill-ipsq> assocs could change while we wait for the - * writer, it is easier to wait on a fixed global rather than try to - * cv_wait on a changing ipsq. + * Note that the relationship between ill and ipsq is fixed as long as + * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the + * relationship between the IPSQ and xop cannot change. However, + * since we cannot hold ipsq_lock across the cv_wait(), it may change + * while we're waiting. We wait on ill_cv and rely on ipsq_exit() + * waking up all ills in the xop when it becomes available. */ mutex_enter(&ill->ill_lock); for (;;) { @@ -7466,34 +7789,35 @@ ipsq_enter(ill_t *ill, boolean_t force, int type) ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); - if (ipsq->ipsq_writer == NULL && - (type == CUR_OP || ipsq->ipsq_current_ipif == NULL || - waited_enough)) { + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + + if (ipx->ipx_writer == NULL && (type == CUR_OP || + ipx->ipx_current_ipif == NULL || waited_enough)) break; - } else if (ipsq->ipsq_writer != NULL) { + + if (!force || ipx->ipx_writer != NULL) { + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); cv_wait(&ill->ill_cv, &ill->ill_lock); } else { + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); - if (force) { - (void) cv_timedwait(&ill->ill_cv, - &ill->ill_lock, - lbolt + ENTER_SQ_WAIT_TICKS); - waited_enough = B_TRUE; - continue; - } else { - cv_wait(&ill->ill_cv, &ill->ill_lock); - } + (void) cv_timedwait(&ill->ill_cv, + &ill->ill_lock, lbolt + ENTER_SQ_WAIT_TICKS); + waited_enough = B_TRUE; } } - ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL); - ASSERT(ipsq->ipsq_reentry_cnt == 0); - ipsq->ipsq_writer = curthread; - ipsq->ipsq_reentry_cnt++; + 
ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); + ASSERT(ipx->ipx_reentry_cnt == 0); + ipx->ipx_writer = curthread; + ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); + ipx->ipx_reentry_cnt++; #ifdef DEBUG - ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IPSQ_STACK_DEPTH); + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); #endif + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); return (B_TRUE); @@ -7513,14 +7837,13 @@ ill_perim_exit(ill_t *ill) /* * The ipsq_t (ipsq) is the synchronization data structure used to serialize - * certain critical operations like plumbing (i.e. most set ioctls), - * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP - * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per - * IPMP group. The ipsq serializes exclusive ioctls issued by applications - * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple - * threads executing in the ipsq. Responses from the driver pertain to the - * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated - * as part of bringing up the interface) and are enqueued in ipsq_mphead. + * certain critical operations like plumbing (i.e. most set ioctls), multicast + * joins, igmp/mld timers, etc. There is one ipsq per phyint. The ipsq + * serializes exclusive ioctls issued by applications on a per ipsq basis in + * ipsq_xopq_mphead. It also protects against multiple threads executing in + * the ipsq. Responses from the driver pertain to the current ioctl (say a + * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing + * up the interface) and are enqueued in ipx_mphead. * * If a thread does not want to reenter the ipsq when it is already writer, * it must make sure that the specified reentry point to be called later @@ -7528,29 +7851,33 @@ ill_perim_exit(ill_t *ill) * point must never ever try to enter the ipsq again. 
Otherwise it can lead * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example. * When the thread that is currently exclusive finishes, it (ipsq_exit) - * dequeues the requests waiting to become exclusive in ipsq_mphead and calls - * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit + * dequeues the requests waiting to become exclusive in ipx_mphead and calls + * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next * ioctl if the current ioctl has completed. If the current ioctl is still * in progress it simply returns. The current ioctl could be waiting for - * a response from another module (arp_ or the driver or could be waiting for - * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp - * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the + * a response from another module (arp or the driver) or could be waiting for + * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp + * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the * execution of the ioctl and ipsq_exit does not start the next ioctl unless - * ipsq_current_ipif is clear which happens only on ioctl completion. + * ipx_current_ipif is NULL which happens only once the ioctl is complete and + * all associated DLPI operations have completed. */ /* - * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of - * ipif or ill can be specified). The caller ensures ipif or ill is valid by - * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued - * completion. + * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif' + * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ + * on success, or NULL on failure. The caller ensures ipif/ill is valid by + * refholding it as necessary. 
If the IPSQ cannot be entered and `func' is + * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ + * can be entered. If `func' is NULL, then `q' and `mp' are ignored. */ ipsq_t * ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, boolean_t reentry_ok) { ipsq_t *ipsq; + ipxop_t *ipx; /* Only 1 of ipif or ill can be specified */ ASSERT((ipif != NULL) ^ (ill != NULL)); @@ -7558,13 +7885,15 @@ ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, ill = ipif->ipif_ill; /* - * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock - * ipsq of an ill can't change when ill_lock is held. + * lock ordering: conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. + * ipx of an ipsq can't change when ipsq_lock is held. */ GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); /* * 1. Enter the ipsq if we are already writer and reentry is ok. @@ -7572,30 +7901,32 @@ ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, * 'func' nor any of its callees must ever attempt to enter the ipsq * again. Otherwise it can lead to an infinite loop * 2. Enter the ipsq if there is no current writer and this attempted - * entry is part of the current ioctl or operation + * entry is part of the current operation * 3. 
Enter the ipsq if there is no current writer and this is a new - * ioctl (or operation) and the ioctl (or operation) queue is - * empty and there is no ioctl (or operation) currently in progress + * operation and the operation queue is empty and there is no + * operation currently in progress */ - if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) || - (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL && - ipsq->ipsq_current_ipif == NULL))) || - (ipsq->ipsq_writer == curthread && reentry_ok)) { + if ((ipx->ipx_writer == curthread && reentry_ok) || + (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && + !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL)))) { /* Success. */ - ipsq->ipsq_reentry_cnt++; - ipsq->ipsq_writer = curthread; + ipx->ipx_reentry_cnt++; + ipx->ipx_writer = curthread; + ipx->ipx_forced = B_FALSE; + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); RELEASE_CONN_LOCK(q); #ifdef DEBUG - ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, - IPSQ_STACK_DEPTH); + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); #endif return (ipsq); } - ipsq_enq(ipsq, q, mp, func, type, ill); + if (func != NULL) + ipsq_enq(ipsq, q, mp, func, type, ill); + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); RELEASE_CONN_LOCK(q); @@ -7630,188 +7961,58 @@ qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, } /* - * If there are more than ILL_GRP_CNT ills in a group, - * we use kmem alloc'd buffers, else use the stack - */ -#define ILL_GRP_CNT 14 -/* - * Drain the ipsq, if there are messages on it, and then leave the ipsq. - * Called by a thread that is currently exclusive on this ipsq. + * Exit the specified IPSQ. If this is the final exit on it then drain it + * prior to exiting. Caller must be writer on the specified IPSQ. 
*/ void ipsq_exit(ipsq_t *ipsq) { + mblk_t *mp; + ipsq_t *mp_ipsq; queue_t *q; - mblk_t *mp; - ipsq_func_t func; - int next; - ill_t **ill_list = NULL; - size_t ill_list_size = 0; - int cnt = 0; - boolean_t need_ipsq_free = B_FALSE; - ip_stack_t *ipst = ipsq->ipsq_ipst; + phyint_t *phyi; + ipsq_func_t func; ASSERT(IAM_WRITER_IPSQ(ipsq)); - mutex_enter(&ipsq->ipsq_lock); - ASSERT(ipsq->ipsq_reentry_cnt >= 1); - if (ipsq->ipsq_reentry_cnt != 1) { - ipsq->ipsq_reentry_cnt--; - mutex_exit(&ipsq->ipsq_lock); + + ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); + if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { + ipsq->ipsq_xop->ipx_reentry_cnt--; return; } - mp = ipsq_dq(ipsq); - while (mp != NULL) { -again: - mutex_exit(&ipsq->ipsq_lock); - func = (ipsq_func_t)mp->b_prev; - q = (queue_t *)mp->b_queue; - mp->b_prev = NULL; - mp->b_queue = NULL; - - /* - * If 'q' is an conn queue, it is valid, since we did a - * a refhold on the connp, at the start of the ioctl. - * If 'q' is an ill queue, it is valid, since close of an - * ill will clean up the 'ipsq'. - */ - (*func)(ipsq, q, mp, NULL); - - mutex_enter(&ipsq->ipsq_lock); + for (;;) { + phyi = ipsq->ipsq_phyint; mp = ipsq_dq(ipsq); - } - - mutex_exit(&ipsq->ipsq_lock); - - /* - * Need to grab the locks in the right order. Need to - * atomically check (under ipsq_lock) that there are no - * messages before relinquishing the ipsq. Also need to - * atomically wakeup waiters on ill_cv while holding ill_lock. - * Holding ill_g_lock ensures that ipsq list of ills is stable. - * If we need to call ill_split_ipsq and change <ill-ipsq> we need - * to grab ill_g_lock as writer. - */ - rw_enter(&ipst->ips_ill_g_lock, - ipsq->ipsq_split ? RW_WRITER : RW_READER); + mp_ipsq = (mp == NULL) ? 
NULL : (ipsq_t *)mp->b_next; - /* ipsq_refs can't change while ill_g_lock is held as reader */ - if (ipsq->ipsq_refs != 0) { - /* At most 2 ills v4/v6 per phyint */ - cnt = ipsq->ipsq_refs << 1; - ill_list_size = cnt * sizeof (ill_t *); /* - * If memory allocation fails, we will do the split - * the next time ipsq_exit is called for whatever reason. - * As long as the ipsq_split flag is set the need to - * split is remembered. + * If we've changed to a new IPSQ, and the phyint associated + * with the old one has gone away, free the old IPSQ. Note + * that this cannot happen while the IPSQ is in a group. */ - ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); - if (ill_list != NULL) - cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt); - } - mutex_enter(&ipsq->ipsq_lock); - mp = ipsq_dq(ipsq); - if (mp != NULL) { - /* oops, some message has landed up, we can't get out */ - if (ill_list != NULL) - ill_unlock_ills(ill_list, cnt); - rw_exit(&ipst->ips_ill_g_lock); - if (ill_list != NULL) - kmem_free(ill_list, ill_list_size); - ill_list = NULL; - ill_list_size = 0; - cnt = 0; - goto again; - } + if (mp_ipsq != ipsq && phyi == NULL) { + ASSERT(ipsq->ipsq_next == ipsq); + ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); + ipsq_delete(ipsq); + } - /* - * Split only if no ioctl is pending and if memory alloc succeeded - * above. - */ - if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL && - ill_list != NULL) { - /* - * No new ill can join this ipsq since we are holding the - * ill_g_lock. Hence ill_split_ipsq can safely traverse the - * ipsq. ill_split_ipsq may fail due to memory shortage. - * If so we will retry on the next ipsq_exit. - */ - ipsq->ipsq_split = ill_split_ipsq(ipsq); - } + if (mp == NULL) + break; - /* - * We are holding the ipsq lock, hence no new messages can - * land up on the ipsq, and there are no messages currently. - * Now safe to get out. Wake up waiters and relinquish ipsq - * atomically while holding ill locks. 
- */ - ipsq->ipsq_writer = NULL; - ipsq->ipsq_reentry_cnt--; - ASSERT(ipsq->ipsq_reentry_cnt == 0); -#ifdef DEBUG - ipsq->ipsq_depth = 0; -#endif - mutex_exit(&ipsq->ipsq_lock); - /* - * For IPMP this should wake up all ills in this ipsq. - * We need to hold the ill_lock while waking up waiters to - * avoid missed wakeups. But there is no need to acquire all - * the ill locks and then wakeup. If we have not acquired all - * the locks (due to memory failure above) ill_signal_ipsq_ills - * wakes up ills one at a time after getting the right ill_lock - */ - ill_signal_ipsq_ills(ipsq, ill_list != NULL); - if (ill_list != NULL) - ill_unlock_ills(ill_list, cnt); - if (ipsq->ipsq_refs == 0) - need_ipsq_free = B_TRUE; - rw_exit(&ipst->ips_ill_g_lock); - if (ill_list != 0) - kmem_free(ill_list, ill_list_size); + q = mp->b_queue; + func = (ipsq_func_t)mp->b_prev; + ipsq = mp_ipsq; + mp->b_next = mp->b_prev = NULL; + mp->b_queue = NULL; - if (need_ipsq_free) { /* - * Free the ipsq. ipsq_refs can't increase because ipsq can't be - * looked up. ipsq can be looked up only thru ill or phyint - * and there are no ills/phyint on this ipsq. + * If 'q' is an conn queue, it is valid, since we did a + * a refhold on the conn at the start of the ioctl. + * If 'q' is an ill queue, it is valid, since close of an + * ill will clean up its IPSQ. */ - ipsq_delete(ipsq); - } - - /* - * Now that we're outside the IPSQ, start any IGMP/MLD timers. We - * can't start these inside the IPSQ since e.g. igmp_start_timers() -> - * untimeout() (inside the IPSQ, waiting for an executing timeout to - * finish) could deadlock with igmp_timeout_handler() -> ipsq_enter() - * (executing the timeout, waiting to get inside the IPSQ). - * - * However, there is one exception to the above: if this thread *is* - * the IGMP/MLD timeout handler thread, then we must not start its - * timer until the current handler is done. 
- */ - mutex_enter(&ipst->ips_igmp_timer_lock); - if (curthread != ipst->ips_igmp_timer_thread) { - next = ipst->ips_igmp_deferred_next; - ipst->ips_igmp_deferred_next = INFINITY; - mutex_exit(&ipst->ips_igmp_timer_lock); - - if (next != INFINITY) - igmp_start_timers(next, ipst); - } else { - mutex_exit(&ipst->ips_igmp_timer_lock); - } - - mutex_enter(&ipst->ips_mld_timer_lock); - if (curthread != ipst->ips_mld_timer_thread) { - next = ipst->ips_mld_deferred_next; - ipst->ips_mld_deferred_next = INFINITY; - mutex_exit(&ipst->ips_mld_timer_lock); - - if (next != INFINITY) - mld_start_timers(next, ipst); - } else { - mutex_exit(&ipst->ips_mld_timer_lock); + (*func)(ipsq, q, mp, NULL); } } @@ -7822,15 +8023,17 @@ again: void ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) { + ipxop_t *ipx = ipsq->ipsq_xop; + ASSERT(IAM_WRITER_IPSQ(ipsq)); + ASSERT(ipx->ipx_current_ipif == NULL); + ASSERT(ipx->ipx_current_ioctl == 0); - mutex_enter(&ipsq->ipsq_lock); - ASSERT(ipsq->ipsq_current_ipif == NULL); - ASSERT(ipsq->ipsq_current_ioctl == 0); - ipsq->ipsq_current_done = B_FALSE; - ipsq->ipsq_current_ipif = ipif; - ipsq->ipsq_current_ioctl = ioccmd; - mutex_exit(&ipsq->ipsq_lock); + ipx->ipx_current_done = B_FALSE; + ipx->ipx_current_ioctl = ioccmd; + mutex_enter(&ipx->ipx_lock); + ipx->ipx_current_ipif = ipif; + mutex_exit(&ipx->ipx_lock); } /* @@ -7844,17 +8047,18 @@ ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) void ipsq_current_finish(ipsq_t *ipsq) { - ipif_t *ipif = ipsq->ipsq_current_ipif; + ipxop_t *ipx = ipsq->ipsq_xop; t_uscalar_t dlpi_pending = DL_PRIM_INVAL; + ipif_t *ipif = ipx->ipx_current_ipif; ASSERT(IAM_WRITER_IPSQ(ipsq)); /* - * For SIOCSLIFREMOVEIF, the ipif has been already been blown away + * For SIOCLIFREMOVEIF, the ipif has been already been blown away * (but in that case, IPIF_CHANGING will already be clear and no * pending DLPI messages can remain). 
*/ - if (ipsq->ipsq_current_ioctl != SIOCLIFREMOVEIF) { + if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) { ill_t *ill = ipif->ipif_ill; mutex_enter(&ill->ill_lock); @@ -7863,12 +8067,14 @@ ipsq_current_finish(ipsq_t *ipsq) mutex_exit(&ill->ill_lock); } - mutex_enter(&ipsq->ipsq_lock); - ipsq->ipsq_current_ioctl = 0; - ipsq->ipsq_current_done = B_TRUE; - if (dlpi_pending == DL_PRIM_INVAL) - ipsq->ipsq_current_ipif = NULL; - mutex_exit(&ipsq->ipsq_lock); + ASSERT(!ipx->ipx_current_done); + ipx->ipx_current_done = B_TRUE; + ipx->ipx_current_ioctl = 0; + if (dlpi_pending == DL_PRIM_INVAL) { + mutex_enter(&ipx->ipx_lock); + ipx->ipx_current_ipif = NULL; + mutex_exit(&ipx->ipx_lock); + } } /* @@ -7884,123 +8090,38 @@ ipsq_flush(ill_t *ill) mblk_t *prev; mblk_t *mp; mblk_t *mp_next; - ipsq_t *ipsq; + ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; ASSERT(IAM_WRITER_ILL(ill)); - ipsq = ill->ill_phyint->phyint_ipsq; + /* * Flush any messages sent up by the driver. */ - mutex_enter(&ipsq->ipsq_lock); - for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) { + mutex_enter(&ipx->ipx_lock); + for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) { mp_next = mp->b_next; q = mp->b_queue; if (q == ill->ill_rq || q == ill->ill_wq) { - /* Remove the mp from the ipsq */ + /* dequeue mp */ if (prev == NULL) - ipsq->ipsq_mphead = mp->b_next; + ipx->ipx_mphead = mp->b_next; else prev->b_next = mp->b_next; - if (ipsq->ipsq_mptail == mp) { + if (ipx->ipx_mptail == mp) { ASSERT(mp_next == NULL); - ipsq->ipsq_mptail = prev; + ipx->ipx_mptail = prev; } inet_freemsg(mp); } else { prev = mp; } } - mutex_exit(&ipsq->ipsq_lock); + mutex_exit(&ipx->ipx_lock); (void) ipsq_pending_mp_cleanup(ill, NULL); ipsq_xopq_mp_cleanup(ill, NULL); ill_pending_mp_cleanup(ill); } -/* ARGSUSED */ -int -ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *ifreq) -{ - ill_t *ill; - struct lifreq *lifr = (struct lifreq *)ifreq; - 
boolean_t isv6; - conn_t *connp; - ip_stack_t *ipst; - - connp = Q_TO_CONN(q); - ipst = connp->conn_netstack->netstack_ip; - isv6 = connp->conn_af_isv6; - /* - * Set original index. - * Failover and failback move logical interfaces - * from one physical interface to another. The - * original index indicates the parent of a logical - * interface, in other words, the physical interface - * the logical interface will be moved back to on - * failback. - */ - - /* - * Don't allow the original index to be changed - * for non-failover addresses, autoconfigured - * addresses, or IPv6 link local addresses. - */ - if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != NULL) || - (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) { - return (EINVAL); - } - /* - * The new original index must be in use by some - * physical interface. - */ - ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL, - NULL, NULL, ipst); - if (ill == NULL) - return (ENXIO); - ill_refrele(ill); - - ipif->ipif_orig_ifindex = lifr->lifr_index; - /* - * When this ipif gets failed back, don't - * preserve the original id, as it is no - * longer applicable. - */ - ipif->ipif_orig_ipifid = 0; - /* - * For IPv4, change the original index of any - * multicast addresses associated with the - * ipif to the new value. - */ - if (!isv6) { - ilm_t *ilm; - - mutex_enter(&ipif->ipif_ill->ill_lock); - for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_ipif == ipif) { - ilm->ilm_orig_ifindex = lifr->lifr_index; - } - } - mutex_exit(&ipif->ipif_ill->ill_lock); - } - return (0); -} - -/* ARGSUSED */ -int -ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *ifreq) -{ - struct lifreq *lifr = (struct lifreq *)ifreq; - - /* - * Get the original interface index i.e the one - * before FAILOVER if it ever happened. 
- */ - lifr->lifr_index = ipif->ipif_orig_ifindex; - return (0); -} - /* * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls, * refhold and return the associated ipif @@ -8087,8 +8208,6 @@ int ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, cmd_info_t *ci, ipsq_func_t func) { - sin_t *sin; - sin6_t *sin6; char *name; struct ifreq *ifr; struct lifreq *lifr; @@ -8132,9 +8251,8 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, * be trusted. */ ifr->ifr_name[IFNAMSIZ - 1] = '\0'; - sin = (sin_t *)&ifr->ifr_addr; name = ifr->ifr_name; - ci->ci_sin = sin; + ci->ci_sin = (sin_t *)&ifr->ifr_addr; ci->ci_sin6 = NULL; ci->ci_lifr = (struct lifreq *)ifr; } else { @@ -8148,14 +8266,8 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, */ lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; name = lifr->lifr_name; - sin = (sin_t *)&lifr->lifr_addr; - sin6 = (sin6_t *)&lifr->lifr_addr; - if (ipip->ipi_cmd == SIOCSLIFGROUPNAME) { - (void) strncpy(ci->ci_groupname, lifr->lifr_groupname, - LIFNAMSIZ); - } - ci->ci_sin = sin; - ci->ci_sin6 = sin6; + ci->ci_sin = (sin_t *)&lifr->lifr_addr; + ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr; ci->ci_lifr = lifr; } @@ -8181,21 +8293,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, if (ipif == NULL) { if (err == EINPROGRESS) return (err); - if (ipip->ipi_cmd == SIOCLIFFAILOVER || - ipip->ipi_cmd == SIOCLIFFAILBACK) { - /* - * Need to try both v4 and v6 since this - * ioctl can come down either v4 or v6 - * socket. The lifreq.lifr_family passed - * down by this ioctl is AF_UNSPEC. - */ - ipif = ipif_lookup_on_name(name, - mi_strlen(name), B_FALSE, &exists, !isv6, - zoneid, (connp == NULL) ? 
q : - CONNP_TO_WQ(connp), mp, func, &err, ipst); - if (err == EINPROGRESS) - return (err); - } err = 0; /* Ensure we don't use it below */ } } @@ -8221,15 +8318,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, if (ipif == NULL) return (ENXIO); - /* - * Allow only GET operations if this ipif has been created - * temporarily due to a MOVE operation. - */ - if (ipif->ipif_replace_zero && !(ipip->ipi_flags & IPI_REPL)) { - ipif_refrele(ipif); - return (EINVAL); - } - ci->ci_ipif = ipif; return (0); } @@ -8247,15 +8335,15 @@ ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); - - while (ill != NULL) { + for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill)) + continue; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (ipif->ipif_zoneid == zoneid || ipif->ipif_zoneid == ALL_ZONES) numifs++; } - ill = ill_next(&ctx, ill); } rw_exit(&ipst->ips_ill_g_lock); return (numifs); @@ -8283,6 +8371,9 @@ ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) ill = ILL_START_WALK_ALL(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) + continue; + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if ((ipif->ipif_flags & IPIF_NOXMIT) && @@ -8491,6 +8582,8 @@ ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill)) + continue; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (zoneid != ipif->ipif_zoneid && @@ -8760,6 +8853,9 @@ ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ill_first(list, list, &ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if 
(IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP)) + continue; + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if ((ipif->ipif_flags & IPIF_NOXMIT) && @@ -8795,6 +8891,7 @@ ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name)); + lifr->lifr_type = ill->ill_type; if (ipif->ipif_isv6) { sin6 = (sin6_t *)&lifr->lifr_addr; *sin6 = sin6_null; @@ -8828,23 +8925,6 @@ lif_copydone: return (0); } -/* ARGSUSED */ -int -ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin, - queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) -{ - ip_stack_t *ipst; - - if (q->q_next == NULL) - ipst = CONNQ_TO_IPST(q); - else - ipst = ILLQ_TO_IPST(q); - - /* Existence of b_cont->b_cont checked in ip_wput_nondata */ - ipst->ips_ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr; - return (0); -} - static void ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) { @@ -9038,8 +9118,7 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); } else { src_ipif = ipif_select_source_v6(dst_ill, - daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, - zoneid); + daddr, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); } if (src_ipif == NULL) goto next_dst; @@ -9325,10 +9404,14 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, struct arpreq *ar; struct xarpreq *xar; int flags, alength; - char *lladdr; - ip_stack_t *ipst; + uchar_t *lladdr; + ire_t *ire; + ip_stack_t *ipst; ill_t *ill = ipif->ipif_ill; + ill_t *proxy_ill = NULL; + ipmp_arpent_t *entp = NULL; boolean_t if_arp_ioctl = B_FALSE; + boolean_t proxyarp = B_FALSE; ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); connp = Q_TO_CONN(q); @@ -9340,7 +9423,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ar = NULL; flags = xar->xarp_flags; - lladdr = LLADDR(&xar->xarp_ha); + lladdr = (uchar_t *)LLADDR(&xar->xarp_ha); if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 
0); /* * Validate against user's link layer address length @@ -9359,7 +9442,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, xar = NULL; flags = ar->arp_flags; - lladdr = ar->arp_ha.sa_data; + lladdr = (uchar_t *)ar->arp_ha.sa_data; /* * Theoretically, the sa_family could tell us what link * layer type this operation is trying to deal with. By @@ -9379,6 +9462,51 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } } + ipaddr = sin->sin_addr.s_addr; + + /* + * IPMP ARP special handling: + * + * 1. Since ARP mappings must appear consistent across the group, + * prohibit changing ARP mappings on the underlying interfaces. + * + * 2. Since ARP mappings for IPMP data addresses are maintained by + * IP itself, prohibit changing them. + * + * 3. For proxy ARP, use a functioning hardware address in the group, + * provided one exists. If one doesn't, just add the entry as-is; + * ipmp_illgrp_refresh_arpent() will refresh it if things change. + */ + if (IS_UNDER_IPMP(ill)) { + if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP) + return (EPERM); + } + if (IS_IPMP(ill)) { + ipmp_illgrp_t *illg = ill->ill_grp; + + switch (ipip->ipi_cmd) { + case SIOCSARP: + case SIOCSXARP: + proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength); + if (proxy_ill != NULL) { + proxyarp = B_TRUE; + if (!ipmp_ill_is_active(proxy_ill)) + proxy_ill = ipmp_illgrp_next_ill(illg); + if (proxy_ill != NULL) + lladdr = proxy_ill->ill_phys_addr; + } + /* FALLTHRU */ + case SIOCDARP: + case SIOCDXARP: + ire = ire_ctable_lookup(ipaddr, 0, IRE_LOCAL, NULL, + ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); + if (ire != NULL) { + ire_refrele(ire); + return (EPERM); + } + } + } + /* * We are going to pass up to ARP a packet chain that looks * like: @@ -9400,8 +9528,6 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (ENOMEM); } - ipaddr = sin->sin_addr.s_addr; - mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, (caddr_t)&ipaddr); if (mp2 == 
NULL) { @@ -9481,6 +9607,30 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, area->area_flags |= ACE_F_AUTHORITY; /* + * If this is a permanent AR_ENTRY_ADD on the IPMP interface, track it + * so that IP can update ARP as the active ills in the group change. + */ + if (IS_IPMP(ill) && area->area_cmd == AR_ENTRY_ADD && + (area->area_flags & ACE_F_PERMANENT)) { + entp = ipmp_illgrp_create_arpent(ill->ill_grp, mp2, proxyarp); + + /* + * The second part of the conditional below handles a corner + * case: if this is proxy ARP and the IPMP group has no active + * interfaces, we can't send the request to ARP now since it + * won't be able to build an ACE. So we return success and + * notify ARP about the proxy ARP entry once an interface + * becomes active. + */ + if (entp == NULL || (proxyarp && proxy_ill == NULL)) { + mp2->b_cont = NULL; + inet_freemsg(mp1); + inet_freemsg(pending_mp); + return (entp == NULL ? ENOMEM : 0); + } + } + + /* * Before sending 'mp' to ARP, we have to clear the b_next * and b_prev. Otherwise if STREAMS encounters such a message * in freemsg(), (because ARP can close any time) it can cause @@ -9497,7 +9647,12 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, mutex_enter(&connp->conn_lock); mutex_enter(&ill->ill_lock); /* conn has not yet started closing, hence this can't fail */ - VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0); + if (ipip->ipi_flags & IPI_WR) { + VERIFY(ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), + pending_mp, 0) != 0); + } else { + VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0); + } mutex_exit(&ill->ill_lock); mutex_exit(&connp->conn_lock); @@ -9506,6 +9661,13 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * M_IOCACK, and will be handed to ip_sioctl_iocack() for completion. */ putnext(ill->ill_rq, mp1); + + /* + * If we created an IPMP ARP entry, mark that we've notified ARP. 
+ */ + if (entp != NULL) + ipmp_illgrp_mark_arpent(ill->ill_grp, entp); + return (EINPROGRESS); } @@ -9564,55 +9726,114 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, mp, func, &err, ipst); if (ipif == NULL) return (err); - if (ipif->ipif_id != 0 || - ipif->ipif_net_type != IRE_IF_RESOLVER) { + if (ipif->ipif_id != 0) { ipif_refrele(ipif); return (ENXIO); } } else { /* - * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with sdl_nlen == - * 0: use the IP address to figure out the ill. In the IPMP - * case, a simple forwarding table lookup will return the - * IRE_IF_RESOLVER for the first interface in the group, which - * might not be the interface on which the requested IP - * address was resolved due to the ill selection algorithm - * (see ip_newroute_get_dst_ill()). So we do a cache table - * lookup first: if the IRE cache entry for the IP address is - * still there, it will contain the ill pointer for the right - * interface, so we use that. If the cache entry has been - * flushed, we fall back to the forwarding table lookup. This - * should be rare enough since IRE cache entries have a longer - * life expectancy than ARP cache entries. + * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen + * of 0: use the IP address to find the ipif. If the IP + * address is an IPMP test address, ire_ftable_lookup() will + * find the wrong ill, so we first do an ipif_lookup_addr(). 
*/ - ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL, - ipst); - if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || - ((ill = ire_to_ill(ire)) == NULL) || - (ill->ill_net_type != IRE_IF_RESOLVER)) { - if (ire != NULL) - ire_refrele(ire); - ire = ire_ftable_lookup(sin->sin_addr.s_addr, - 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, - NULL, MATCH_IRE_TYPE, ipst); + ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES, + CONNP_TO_WQ(connp), mp, func, &err, ipst); + if (ipif == NULL) { + ire = ire_ftable_lookup(sin->sin_addr.s_addr, 0, 0, + IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, NULL, + MATCH_IRE_TYPE, ipst); if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) { - if (ire != NULL) ire_refrele(ire); return (ENXIO); } + ipif = ill->ill_ipif; + ipif_refhold(ipif); + ire_refrele(ire); } - ASSERT(ire != NULL && ill != NULL); - ipif = ill->ill_ipif; - ipif_refhold(ipif); - ire_refrele(ire); } + + if (ipif->ipif_net_type != IRE_IF_RESOLVER) { + ipif_refrele(ipif); + return (ENXIO); + } + ci->ci_sin = sin; ci->ci_ipif = ipif; return (0); } /* + * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the + * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is + * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it + * up and thus an ill can join that illgrp. + * + * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than + * open()/close() primarily because close() is not allowed to fail or block + * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason + * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure + * symmetric behavior (e.g., doing an I_PLINK after and I_PUNLINK undoes the + * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts + * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent + * state if I_UNLINK didn't occur. 
+ * + * Note that for each plumb/unplumb operation, we may end up here more than + * once because of the way ifconfig works. However, it's OK to link the same + * illgrp more than once, or unlink an illgrp that's already unlinked. + */ +static int +ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) +{ + int err; + ip_stack_t *ipst = ill->ill_ipst; + + ASSERT(IS_IPMP(ill)); + ASSERT(IAM_WRITER_ILL(ill)); + + switch (ioccmd) { + case I_LINK: + return (ENOTSUP); + + case I_PLINK: + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp); + rw_exit(&ipst->ips_ipmp_lock); + break; + + case I_PUNLINK: + /* + * Require all UP ipifs be brought down prior to unlinking the + * illgrp so any associated IREs (and other state) is torched. + */ + if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) + return (EBUSY); + + /* + * NOTE: We hold ipmp_lock across the unlink to prevent a race + * with an SIOCSLIFGROUPNAME request from an ill trying to + * join this group. Specifically: ills trying to join grab + * ipmp_lock and bump a "pending join" counter checked by + * ipmp_illgrp_unlink_grp(). During the unlink no new pending + * joins can occur (since we have ipmp_lock). Once we drop + * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not + * find the illgrp (since we unlinked it) and will return + * EAFNOSUPPORT. This will then take them back through the + * IPMP meta-interface plumbing logic in ifconfig, and thus + * back through I_PLINK above. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + err = ipmp_illgrp_unlink_grp(ill->ill_grp); + rw_exit(&ipst->ips_ipmp_lock); + return (err); + default: + break; + } + return (0); +} + +/* * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also * atomically set/clear the muxids. Also complete the ioctl by acking or * naking it. 
Note that the code is structured such that the link type, @@ -9697,7 +9918,7 @@ ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) if (ipsq == NULL) { ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, - NEW_OP, B_TRUE); + NEW_OP, B_FALSE); if (ipsq == NULL) { ill_refrele(ill); return; @@ -9728,6 +9949,11 @@ ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) err = EINVAL; goto done; } + + if (IS_IPMP(ill) && + (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) + goto done; + ill->ill_arp_muxid = islink ? li->l_index : 0; } else { /* @@ -9763,6 +9989,7 @@ static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, struct linkblk *li, boolean_t doconsist) { + int err = 0; ill_t *ill; queue_t *ipwq, *dwq; const char *name; @@ -9796,7 +10023,7 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, if (ipsq == NULL) { ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, - NEW_OP, B_TRUE); + NEW_OP, B_FALSE); if (ipsq == NULL) return (EINPROGRESS); entered_ipsq = B_TRUE; @@ -9811,12 +10038,14 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, */ if ((islink && ill->ill_ip_muxid != 0) || (!islink && ill->ill_arp_muxid != 0)) { - if (entered_ipsq) - ipsq_exit(ipsq); - return (EINVAL); + err = EINVAL; + goto done; } } + if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) + goto done; + /* * As part of I_{P}LINKing, stash the number of downstream modules and * the read queue of the module immediately below IP in the ill. 
@@ -9853,11 +10082,11 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, ill_capability_reset(ill, B_FALSE); } ipsq_current_finish(ipsq); - +done: if (entered_ipsq) ipsq_exit(ipsq); - return (0); + return (err); } /* @@ -10124,8 +10353,9 @@ nak: } /* ip_wput hands off ARP IOCTL responses to us */ +/* ARGSUSED3 */ void -ip_sioctl_iocack(queue_t *q, mblk_t *mp) +ip_sioctl_iocack(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) { struct arpreq *ar; struct xarpreq *xar; @@ -10136,7 +10366,6 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) struct iocblk *orig_iocp; ill_t *ill; conn_t *connp = NULL; - uint_t ioc_id; mblk_t *pending_mp; int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE; int *flagsp; @@ -10146,6 +10375,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) int err; ip_stack_t *ipst; + ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); ill = q->q_ptr; ASSERT(ill != NULL); ipst = ill->ill_ipst; @@ -10185,10 +10415,14 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) iocp = (struct iocblk *)mp->b_rptr; /* - * Pick out the originating queue based on the ioc_id. + * Find the pending message; if we're exclusive, it'll be on our IPSQ. + * Otherwise, we can find it from our ioc_id. */ - ioc_id = iocp->ioc_id; - pending_mp = ill_pending_mp_get(ill, &connp, ioc_id); + if (ipsq != NULL) + pending_mp = ipsq_pending_mp_get(ipsq, &connp); + else + pending_mp = ill_pending_mp_get(ill, &connp, iocp->ioc_id); + if (pending_mp == NULL) { ASSERT(connp == NULL); inet_freemsg(mp); @@ -10271,7 +10505,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) ire_refrele(ire); freemsg(mp); ip_ioctl_finish(q, orig_ioc_mp, - EINVAL, NO_COPYOUT, NULL); + EINVAL, NO_COPYOUT, ipsq); return; } *flagsp |= ATF_COM; @@ -10297,12 +10531,27 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) /* Ditch the internal IOCTL. 
*/ freemsg(mp); ire_refrele(ire); - ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); + ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); return; } } /* + * If this was a failed AR_ENTRY_ADD or a successful AR_ENTRY_DELETE + * on the IPMP meta-interface, ensure any ARP entries added in + * ip_sioctl_arp() are deleted. + */ + if (IS_IPMP(ill) && + ((iocp->ioc_error != 0 && iocp->ioc_cmd == AR_ENTRY_ADD) || + ((iocp->ioc_error == 0 && iocp->ioc_cmd == AR_ENTRY_DELETE)))) { + ipmp_illgrp_t *illg = ill->ill_grp; + ipmp_arpent_t *entp; + + if ((entp = ipmp_illgrp_lookup_arpent(illg, &addr)) != NULL) + ipmp_illgrp_destroy_arpent(illg, entp); + } + + /* * Delete the coresponding IRE_CACHE if any. * Reset the error if there was one (in case there was no entry * in arp.) @@ -10341,7 +10590,7 @@ errack: if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { err = iocp->ioc_error; freemsg(mp); - ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL); + ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, ipsq); return; } @@ -10355,7 +10604,7 @@ errack: sizeof (xar->xarp_ha.sdl_data)) { freemsg(mp); ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT, - NULL); + ipsq); return; } } @@ -10382,7 +10631,7 @@ errack: /* Ditch the internal IOCTL. */ freemsg(mp); /* Complete the original. */ - ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); + ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); } /* @@ -10397,7 +10646,7 @@ errack: * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. * - * Executed as a writer on the ill or ill group. + * Executed as a writer on the ill. * So no lock is needed to traverse the ipif chain, or examine the * phyint flags. 
*/ @@ -10423,7 +10672,6 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, boolean_t found_sep = B_FALSE; conn_t *connp; zoneid_t zoneid; - int orig_ifindex = 0; ip_stack_t *ipst = CONNQ_TO_IPST(q); ASSERT(q->q_next == NULL); @@ -10513,61 +10761,10 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, if (ipsq == NULL) return (EINPROGRESS); - /* - * If the interface is failed, inactive or offlined, look for a working - * interface in the ill group and create the ipif there. If we can't - * find a good interface, create the ipif anyway so that in.mpathd can - * move it to the first repaired interface. - */ - if ((ill->ill_phyint->phyint_flags & - (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && - ill->ill_phyint->phyint_groupname_len != 0) { - phyint_t *phyi; - char *groupname = ill->ill_phyint->phyint_groupname; - - /* - * We're looking for a working interface, but it doesn't matter - * if it's up or down; so instead of following the group lists, - * we look at each physical interface and compare the groupname. - * We're only interested in interfaces with IPv4 (resp. IPv6) - * plumbed when we're adding an IPv4 (resp. IPv6) ipif. - * Otherwise we create the ipif on the failed interface. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - phyi = avl_first(&ipst->ips_phyint_g_list-> - phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list-> - phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - if (phyi->phyint_groupname_len == 0) - continue; - ASSERT(phyi->phyint_groupname != NULL); - if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 && - !(phyi->phyint_flags & - (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && - (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) : - (phyi->phyint_illv4 != NULL))) { - break; - } - } - rw_exit(&ipst->ips_ill_g_lock); - - if (phyi != NULL) { - orig_ifindex = ill->ill_phyint->phyint_ifindex; - ill = (ill->ill_isv6 ? 
phyi->phyint_illv6 : - phyi->phyint_illv4); - } - } - - /* - * We are now exclusive on the ipsq, so an ill move will be serialized - * before or after us. - */ + /* We are now exclusive on the IPSQ */ ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ill->ill_move_in_progress == B_FALSE); - if (found_sep && orig_ifindex == 0) { + if (found_sep) { /* Now see if there is an IPIF with this unit number. */ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -10580,14 +10777,11 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, /* * We use IRE_LOCAL for lo0:1 etc. for "receive only" use - * of lo0. We never come here when we plumb lo0:0. It - * happens in ipif_lookup_on_name. - * The specified unit number is ignored when we create the ipif on a - * different interface. However, we save it in ipif_orig_ipifid below so - * that the ipif fails back to the right position. - */ - if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ? - id : -1, IRE_LOCAL, B_TRUE)) == NULL) { + * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name() + * instead. + */ + if ((ipif = ipif_allocate(ill, found_sep ? 
id : -1, IRE_LOCAL, + B_TRUE, B_TRUE)) == NULL) { err = ENOBUFS; goto done; } @@ -10604,14 +10798,6 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); } - /* Set ifindex and unit number for failback */ - if (err == 0 && orig_ifindex != 0) { - ipif->ipif_orig_ifindex = orig_ifindex; - if (found_sep) { - ipif->ipif_orig_ipifid = id; - } - } - done: ipsq_exit(ipsq); return (err); @@ -10672,7 +10858,6 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill_delete(ill); mutex_enter(&connp->conn_lock); mutex_enter(&ill->ill_lock); - ASSERT(ill->ill_group == NULL); /* Are any references to this ill active */ if (ill_is_freeable(ill)) { @@ -10693,14 +10878,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } } - /* - * We are exclusive on the ipsq, so an ill move will be serialized - * before or after us. - */ - ASSERT(ill->ill_move_in_progress == B_FALSE); - if (ipif->ipif_id == 0) { - ipsq_t *ipsq; /* Find based on address */ @@ -10712,35 +10890,15 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, sin6 = (sin6_t *)sin; /* We are a writer, so we should be able to lookup */ - ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, - ill, ALL_ZONES, NULL, NULL, NULL, NULL, ipst); - if (ipif == NULL) { - /* - * Maybe the address in on another interface in - * the same IPMP group? We check this below. - */ - ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, - NULL, ALL_ZONES, NULL, NULL, NULL, NULL, - ipst); - } + ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill, + ipst); } else { - ipaddr_t addr; - if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); - addr = sin->sin_addr.s_addr; /* We are a writer, so we should be able to lookup */ - ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL, - NULL, NULL, NULL, ipst); - if (ipif == NULL) { - /* - * Maybe the address in on another interface in - * the same IPMP group? We check this below. 
- */ - ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES, - NULL, NULL, NULL, NULL, ipst); - } + ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill, + ipst); } if (ipif == NULL) { return (EADDRNOTAVAIL); @@ -10750,32 +10908,11 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * It is possible for a user to send an SIOCLIFREMOVEIF with * lifr_name of the physical interface but with an ip address * lifr_addr of a logical interface plumbed over it. - * So update ipsq_current_ipif once ipif points to the - * correct interface after doing ipif_lookup_addr(). + * So update ipx_current_ipif now that ipif points to the + * correct one. */ ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; - ASSERT(ipsq != NULL); - - mutex_enter(&ipsq->ipsq_lock); - ipsq->ipsq_current_ipif = ipif; - mutex_exit(&ipsq->ipsq_lock); - - /* - * When the address to be removed is hosted on a different - * interface, we check if the interface is in the same IPMP - * group as the specified one; if so we proceed with the - * removal. - * ill->ill_group is NULL when the ill is down, so we have to - * compare the group names instead. 
- */ - if (ipif->ipif_ill != ill && - (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 || - ill->ill_phyint->phyint_groupname_len == 0 || - mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname, - ill->ill_phyint->phyint_groupname) != 0)) { - ipif_refrele(ipif); - return (EADDRNOTAVAIL); - } + ipsq->ipsq_xop->ipx_current_ipif = ipif; /* This is a writer */ ipif_refrele(ipif); @@ -11072,7 +11209,7 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (need_dl_down) ill_dl_down(ill); if (need_arp_down) - ipif_arp_down(ipif); + ipif_resolver_down(ipif); return (err); } @@ -11272,9 +11409,9 @@ ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (need_dl_down) ill_dl_down(ill); - if (need_arp_down) - ipif_arp_down(ipif); + ipif_resolver_down(ipif); + return (err); } @@ -11323,144 +11460,8 @@ ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } /* - * part of ipmp, make this func return the active/inactive state and - * caller can set once atomically instead of multiple mutex_enter/mutex_exit - */ -/* - * This function either sets or clears the IFF_INACTIVE flag. - * - * As long as there are some addresses or multicast memberships on the - * IPv4 or IPv6 interface of the "phyi" that does not belong in here, we - * will consider it to be ACTIVE (clear IFF_INACTIVE) i.e the interface - * will be used for outbound packets. - * - * Caller needs to verify the validity of setting IFF_INACTIVE. 
- */ -static void -phyint_inactive(phyint_t *phyi) -{ - ill_t *ill_v4; - ill_t *ill_v6; - ipif_t *ipif; - ilm_t *ilm; - - ill_v4 = phyi->phyint_illv4; - ill_v6 = phyi->phyint_illv6; - - /* - * No need for a lock while traversing the list since iam - * a writer - */ - if (ill_v4 != NULL) { - ASSERT(IAM_WRITER_ILL(ill_v4)); - for (ipif = ill_v4->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - for (ilm = ill_v4->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - } - if (ill_v6 != NULL) { - ill_v6 = phyi->phyint_illv6; - for (ipif = ill_v6->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - for (ilm = ill_v6->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - } - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags |= PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); -} - -/* - * This function is called only when the phyint flags change. Currently - * called from ip_sioctl_flags. We re-do the broadcast nomination so - * that we can select a good ill. 
- */ -static void -ip_redo_nomination(phyint_t *phyi) -{ - ill_t *ill_v4; - - ill_v4 = phyi->phyint_illv4; - - if (ill_v4 != NULL && ill_v4->ill_group != NULL) { - ASSERT(IAM_WRITER_ILL(ill_v4)); - if (ill_v4->ill_group->illgrp_ill_count > 1) - ill_nominate_bcast_rcv(ill_v4->ill_group); - } -} - -/* - * Heuristic to check if ill is INACTIVE. - * Checks if ill has an ipif with an usable ip address. - * - * Return values: - * B_TRUE - ill is INACTIVE; has no usable ipif - * B_FALSE - ill is not INACTIVE; ill has at least one usable ipif - */ -static boolean_t -ill_is_inactive(ill_t *ill) -{ - ipif_t *ipif; - - /* Check whether it is in an IPMP group */ - if (ill->ill_phyint->phyint_groupname == NULL) - return (B_FALSE); - - if (ill->ill_ipif_up_count == 0) - return (B_TRUE); - - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - uint64_t flags = ipif->ipif_flags; - - /* - * This ipif is usable if it is IPIF_UP and not a - * dedicated test address. A dedicated test address - * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED - * (note in particular that V6 test addresses are - * link-local data addresses and thus are marked - * IPIF_NOFAILOVER but not IPIF_DEPRECATED). - */ - if ((flags & IPIF_UP) && - ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) != - (IPIF_DEPRECATED|IPIF_NOFAILOVER))) - return (B_FALSE); - } - return (B_TRUE); -} - -/* - * Set interface flags. - * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, - * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST, - * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE. + * Set interface flags. Many flags require special handling (e.g., + * bringing the interface down); see below for details. * * NOTE : We really don't enforce that ipif_id zero should be used * for setting any flags other than IFF_LOGINT_FLAGS. 
This @@ -11478,17 +11479,16 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, { uint64_t turn_on; uint64_t turn_off; - int err; + int err = 0; phyint_t *phyi; ill_t *ill; - uint64_t intf_flags; + uint64_t intf_flags, cantchange_flags; boolean_t phyint_flags_modified = B_FALSE; uint64_t flags; struct ifreq *ifr; struct lifreq *lifr; boolean_t set_linklocal = B_FALSE; boolean_t zero_source = B_FALSE; - ip_stack_t *ipst; ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -11497,11 +11497,10 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill = ipif->ipif_ill; phyi = ill->ill_phyint; - ipst = ill->ill_ipst; if (ipip->ipi_cmd_type == IF_CMD) { ifr = (struct ifreq *)if_req; - flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); + flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); } else { lifr = (struct lifreq *)if_req; flags = lifr->lifr_flags; @@ -11524,25 +11523,60 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, flags |= intf_flags & ~0xFFFF; /* - * First check which bits will change and then which will - * go on and off + * Explicitly fail attempts to change flags that are always invalid on + * an IPMP meta-interface. */ - turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE; - if (!turn_on) + if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID)) + return (EINVAL); + + /* + * Check which flags will change; silently ignore flags which userland + * is not allowed to control. (Because these flags may change between + * SIOCGLIFFLAGS and SIOCSLIFFLAGS, and that's outside of userland's + * control, we need to silently ignore them rather than fail.) 
+ */ + cantchange_flags = IFF_CANTCHANGE; + if (IS_IPMP(ill)) + cantchange_flags |= IFF_IPMP_CANTCHANGE; + + turn_on = (flags ^ intf_flags) & ~cantchange_flags; + if (turn_on == 0) return (0); /* No change */ turn_off = intf_flags & turn_on; turn_on ^= turn_off; - err = 0; /* - * Don't allow any bits belonging to the logical interface - * to be set or cleared on the replacement ipif that was - * created temporarily during a MOVE. + * All test addresses must be IFF_DEPRECATED (to ensure source address + * selection avoids them) -- so force IFF_DEPRECATED on, and do not + * allow it to be turned off. */ - if (ipif->ipif_replace_zero && - ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) { + if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED && + (turn_on|intf_flags) & IFF_NOFAILOVER) return (EINVAL); + + if (turn_on & IFF_NOFAILOVER) { + turn_on |= IFF_DEPRECATED; + flags |= IFF_DEPRECATED; + } + + /* + * On underlying interfaces, only allow applications to manage test + * addresses -- otherwise, they may get confused when the address + * moves as part of being brought up. Likewise, prevent an + * application-managed test address from being converted to a data + * address. To prevent migration of administratively up addresses in + * the kernel, we don't allow them to be converted either. + */ + if (IS_UNDER_IPMP(ill)) { + const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF; + + if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER)) + return (EINVAL); + + if ((turn_off & IFF_NOFAILOVER) && + (flags & (appflags | IFF_UP | IFF_DUPLICATE))) + return (EINVAL); } /* @@ -11583,16 +11617,6 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } /* - * ILL cannot be part of a usesrc group and and IPMP group at the - * same time. 
No need to grab ill_g_usesrc_lock here, see - * synchronization notes in ip.c - */ - if (turn_on & PHYI_STANDBY && - ipif->ipif_ill->ill_usesrc_grp_next != NULL) { - return (EINVAL); - } - - /* * If we modify physical interface flags, we'll potentially need to * send up two routing socket messages for the changes (one for the * IPv4 ill, and another for the IPv6 ill). Note that here. @@ -11601,98 +11625,44 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, phyint_flags_modified = B_TRUE; /* - * If we are setting or clearing FAILED or STANDBY or OFFLINE, - * we need to flush the IRE_CACHES belonging to this ill. - * We handle this case here without doing the DOWN/UP dance - * like it is done for other flags. If some other flags are - * being turned on/off with FAILED/STANDBY/OFFLINE, the code - * below will handle it by bringing it down and then - * bringing it UP. + * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE + * (otherwise, we'd immediately use them, defeating standby). Also, + * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not + * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already + * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We + * also don't allow PHYI_STANDBY if VNI is enabled since its semantics + * will not be honored. */ - if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) { - ill_t *ill_v4, *ill_v6; - - ill_v4 = phyi->phyint_illv4; - ill_v6 = phyi->phyint_illv6; - + if (turn_on & PHYI_STANDBY) { /* - * First set the INACTIVE flag if needed. Then delete the ires. - * ire_add will atomically prevent creating new IRE_CACHEs - * unless hidden flag is set. - * PHYI_FAILED and PHYI_INACTIVE are exclusive + * No need to grab ill_g_usesrc_lock here; see the + * synchronization notes in ip.c. 
*/ - if ((turn_on & PHYI_FAILED) && - ((intf_flags & PHYI_STANDBY) || - !ipst->ips_ipmp_enable_failback)) { - /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */ - phyi->phyint_flags &= ~PHYI_INACTIVE; - } - if ((turn_off & PHYI_FAILED) && - ((intf_flags & PHYI_STANDBY) || - (!ipst->ips_ipmp_enable_failback && - ill_is_inactive(ill)))) { - phyint_inactive(phyi); - } - - if (turn_on & PHYI_STANDBY) { - /* - * We implicitly set INACTIVE only when STANDBY is set. - * INACTIVE is also set on non-STANDBY phyint when user - * disables FAILBACK using configuration file. - * Do not allow STANDBY to be set on such INACTIVE - * phyint - */ - if (phyi->phyint_flags & PHYI_INACTIVE) - return (EINVAL); - if (!(phyi->phyint_flags & PHYI_FAILED)) - phyint_inactive(phyi); - } - if (turn_off & PHYI_STANDBY) { - if (ipst->ips_ipmp_enable_failback) { - /* - * Reset PHYI_INACTIVE. - */ - phyi->phyint_flags &= ~PHYI_INACTIVE; - } else if (ill_is_inactive(ill) && - !(phyi->phyint_flags & PHYI_FAILED)) { - /* - * Need to set INACTIVE, when user sets - * STANDBY on a non-STANDBY phyint and - * later resets STANDBY - */ - phyint_inactive(phyi); - } + if (ill->ill_usesrc_grp_next != NULL || + intf_flags & PHYI_INACTIVE) + return (EINVAL); + if (!(flags & PHYI_FAILED)) { + flags |= PHYI_INACTIVE; + turn_on |= PHYI_INACTIVE; } - /* - * We should always send up a message so that the - * daemons come to know of it. Note that the zeroth - * interface can be down and the check below for IPIF_UP - * will not make sense as we are actually setting - * a phyint flag here. We assume that the ipif used - * is always the zeroth ipif. (ip_rts_ifmsg does not - * send up any message for non-zero ipifs). 
- */ - phyint_flags_modified = B_TRUE; + } - if (ill_v4 != NULL) { - ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, ill_stq_cache_delete, - (char *)ill_v4, ill_v4); - illgrp_reset_schednext(ill_v4); - } - if (ill_v6 != NULL) { - ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, ill_stq_cache_delete, - (char *)ill_v6, ill_v6); - illgrp_reset_schednext(ill_v6); - } + if (turn_off & PHYI_STANDBY) { + flags &= ~PHYI_INACTIVE; + turn_off |= PHYI_INACTIVE; } /* + * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both + * would end up on. + */ + if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) == + (PHYI_FAILED | PHYI_INACTIVE)) + return (EINVAL); + + /* * If ILLF_ROUTER changes, we need to change the ip forwarding - * status of the interface and, if the interface is part of an IPMP - * group, all other interfaces that are part of the same IPMP - * group. + * status of the interface. */ if ((turn_on | turn_off) & ILLF_ROUTER) (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); @@ -11718,33 +11688,31 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, mutex_exit(&ill->ill_phyint->phyint_lock); /* - * We do the broadcast and nomination here rather - * than waiting for a FAILOVER/FAILBACK to happen. In - * the case of FAILBACK from INACTIVE standby to the - * interface that has been repaired, PHYI_FAILED has not - * been cleared yet. If there are only two interfaces in - * that group, all we have is a FAILED and INACTIVE - * interface. If we do the nomination soon after a failback, - * the broadcast nomination code would select the - * INACTIVE interface for receiving broadcasts as FAILED is - * not yet cleared. As we don't want STANDBY/INACTIVE to - * receive broadcast packets, we need to redo nomination - * when the FAILED is cleared here. Thus, in general we - * always do the nomination here for FAILED, STANDBY - * and OFFLINE. 
+ * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the + * same to the kernel: if any of them has been set by + * userland, the interface cannot be used for data traffic. */ - if (((turn_on | turn_off) & - (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) { - ip_redo_nomination(phyi); + if ((turn_on|turn_off) & + (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { + ASSERT(!IS_IPMP(ill)); + /* + * It's possible the ill is part of an "anonymous" + * IPMP group rather than a real group. In that case, + * there are no other interfaces in the group and thus + * no need to call ipmp_phyint_refresh_active(). + */ + if (IS_UNDER_IPMP(ill)) + ipmp_phyint_refresh_active(phyi); } + if (phyint_flags_modified) { if (phyi->phyint_illv4 != NULL) { ip_rts_ifmsg(phyi->phyint_illv4-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } if (phyi->phyint_illv6 != NULL) { ip_rts_ifmsg(phyi->phyint_illv6-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } } return (0); @@ -11785,15 +11753,17 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } /* - * The only flag changes that we currently take specific action on - * is IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, - * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and - * IPIF_PREFERRED. This is done by bring the ipif down, changing - * the flags and bringing it back up again. + * The only flag changes that we currently take specific action on are + * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP, + * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and + * IPIF_NOFAILOVER. This is done by bring the ipif down, changing the + * flags and bringing it back up again. For IPIF_NOFAILOVER, the act + * of bringing it back up will trigger the address to be moved. 
*/ if ((turn_on|turn_off) & (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| - ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) { + ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| + IPIF_NOFAILOVER)) { /* * Taking this ipif down, make sure we have * valid net and subnet bcast ire's for other @@ -11822,9 +11792,8 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) { ill_t *ill; phyint_t *phyi; - uint64_t turn_on; - uint64_t turn_off; - uint64_t intf_flags; + uint64_t turn_on, turn_off; + uint64_t intf_flags, cantchange_flags; boolean_t phyint_flags_modified = B_FALSE; int err = 0; boolean_t set_linklocal = B_FALSE; @@ -11839,12 +11808,15 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) phyi = ill->ill_phyint; intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; - turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP); + cantchange_flags = IFF_CANTCHANGE | IFF_UP; + if (IS_IPMP(ill)) + cantchange_flags |= IFF_IPMP_CANTCHANGE; + turn_on = (flags ^ intf_flags) & ~cantchange_flags; turn_off = intf_flags & turn_on; turn_on ^= turn_off; - if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) + if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) phyint_flags_modified = B_TRUE; /* @@ -11870,9 +11842,6 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) mutex_exit(&ill->ill_lock); mutex_exit(&phyi->phyint_lock); - if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) - ip_redo_nomination(phyi); - if (set_linklocal) (void) ipif_setlinklocal(ipif); @@ -11881,12 +11850,29 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) else ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; + /* + * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to + * the kernel: if any of them has been set by userland, the interface + * cannot be used for data traffic. 
+ */ + if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { + ASSERT(!IS_IPMP(ill)); + /* + * It's possible the ill is part of an "anonymous" IPMP group + * rather than a real group. In that case, there are no other + * interfaces in the group and thus no need for us to call + * ipmp_phyint_refresh_active(). + */ + if (IS_UNDER_IPMP(ill)) + ipmp_phyint_refresh_active(phyi); + } + if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) { /* * XXX ipif_up really does not know whether a phyint flags * was modified or not. So, it sends up information on * only one routing sockets message. As we don't bring up - * the interface and also set STANDBY/FAILED simultaneously + * the interface and also set PHYI_ flags simultaneously * it should be okay. */ err = ipif_up(ipif, q, mp); @@ -11898,14 +11884,14 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) if (phyint_flags_modified) { if (phyi->phyint_illv4 != NULL) { ip_rts_ifmsg(phyi->phyint_illv4-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } if (phyi->phyint_illv6 != NULL) { ip_rts_ifmsg(phyi->phyint_illv6-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } } else { - ip_rts_ifmsg(ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); } /* * Update the flags in SCTP's IPIF list, ipif_up() will do @@ -12101,10 +12087,7 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * broadcast address makes sense. If it does, * there should be an IRE for it already. * Don't match on ipif, only on the ill - * since we are sharing these now. Don't use - * MATCH_IRE_ILL_GROUP as we are looking for - * the broadcast ire on this ill and each ill - * in the group has its own broadcast ire. + * since we are sharing these now. 
*/ ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif, ALL_ZONES, NULL, @@ -12302,9 +12285,16 @@ int ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *if_req) { - ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); + + /* + * Since no applications should ever be setting metrics on underlying + * interfaces, we explicitly fail to smoke 'em out. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + return (EINVAL); + /* * Set interface metric. We don't use this for * anything but we keep track of it in case it is @@ -12332,6 +12322,7 @@ ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* Get interface metric. */ ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); + if (ipip->ipi_cmd_type == IF_CMD) { struct ifreq *ifr; @@ -12766,13 +12757,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, nipif->ipif_state_flags |= IPIF_CHANGING; } - mutex_exit(&ill->ill_lock); - if (lir->lir_maxmtu != 0) { ill->ill_max_mtu = lir->lir_maxmtu; - ill->ill_mtu_userspecified = 1; + ill->ill_user_mtu = lir->lir_maxmtu; mtu_walk = B_TRUE; } + mutex_exit(&ill->ill_lock); if (lir->lir_reachtime != 0) ill->ill_reachable_time = lir->lir_reachtime; @@ -12821,6 +12811,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ILL_UNMARK_CHANGING(ill); mutex_exit(&ill->ill_lock); + /* + * Refresh IPMP meta-interface MTU if necessary. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_illgrp_refresh_mtu(ill->ill_grp); + return (0); } @@ -13032,13 +13028,117 @@ ipif_assign_seqid(ipif_t *ipif) } /* + * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are + * administratively down (i.e., no DAD), of the same type, and locked. Note + * that the clone is complete -- including the seqid -- and the expectation is + * that the caller will either free or overwrite `sipif' before it's unlocked. 
+ */ +static void +ipif_clone(const ipif_t *sipif, ipif_t *dipif) +{ + ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock)); + ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock)); + ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); + ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); + ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); + ASSERT(sipif->ipif_arp_del_mp == NULL); + ASSERT(dipif->ipif_arp_del_mp == NULL); + ASSERT(sipif->ipif_igmp_rpt == NULL); + ASSERT(dipif->ipif_igmp_rpt == NULL); + ASSERT(sipif->ipif_multicast_up == 0); + ASSERT(dipif->ipif_multicast_up == 0); + ASSERT(sipif->ipif_joined_allhosts == 0); + ASSERT(dipif->ipif_joined_allhosts == 0); + + dipif->ipif_mtu = sipif->ipif_mtu; + dipif->ipif_flags = sipif->ipif_flags; + dipif->ipif_metric = sipif->ipif_metric; + dipif->ipif_zoneid = sipif->ipif_zoneid; + dipif->ipif_v6subnet = sipif->ipif_v6subnet; + dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; + dipif->ipif_v6src_addr = sipif->ipif_v6src_addr; + dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; + dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; + dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; + + /* + * While dipif is down right now, it might've been up before. Since + * it's changing identity, its packet counters need to be reset. + */ + dipif->ipif_ib_pkt_count = 0; + dipif->ipif_ob_pkt_count = 0; + dipif->ipif_fo_pkt_count = 0; + + /* + * As per the comment atop the function, we assume that these sipif + * fields will be changed before sipif is unlocked. + */ + dipif->ipif_seqid = sipif->ipif_seqid; + dipif->ipif_saved_ire_mp = sipif->ipif_saved_ire_mp; + dipif->ipif_saved_ire_cnt = sipif->ipif_saved_ire_cnt; + dipif->ipif_state_flags = sipif->ipif_state_flags; +} + +/* + * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif' + * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin + * (unreferenced) ipif. 
Also, if `sipif' is used by the current xop, then + * transfer the xop to `dipif'. Requires that all ipifs are administratively + * down (i.e., no DAD), of the same type, and unlocked. + */ +static void +ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) +{ + ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq; + int ipx_current_ioctl; + + ASSERT(sipif != dipif); + ASSERT(sipif != virgipif); + + /* + * Grab all of the locks that protect the ipif in a defined order. + */ + GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); + if (sipif > dipif) { + mutex_enter(&sipif->ipif_saved_ire_lock); + mutex_enter(&dipif->ipif_saved_ire_lock); + } else { + mutex_enter(&dipif->ipif_saved_ire_lock); + mutex_enter(&sipif->ipif_saved_ire_lock); + } + + ipif_clone(sipif, dipif); + if (virgipif != NULL) { + ipif_clone(virgipif, sipif); + mi_free(virgipif); + } + + mutex_exit(&sipif->ipif_saved_ire_lock); + mutex_exit(&dipif->ipif_saved_ire_lock); + RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); + + /* + * Transfer ownership of the current xop, if necessary. + */ + if (ipsq->ipsq_xop->ipx_current_ipif == sipif) { + ASSERT(ipsq->ipsq_xop->ipx_pending_ipif == NULL); + ipx_current_ioctl = ipsq->ipsq_xop->ipx_current_ioctl; + ipsq_current_finish(ipsq); + ipsq_current_start(ipsq, dipif, ipx_current_ioctl); + } + + if (virgipif == NULL) + mi_free(sipif); +} + +/* * Insert the ipif, so that the list of ipifs on the ill will be sorted * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will * be inserted into the first space available in the list. The value of * ipif_id will then be set to the appropriate value for its position. 
*/ static int -ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) +ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock) { ill_t *ill; ipif_t *tipif; @@ -13056,12 +13156,11 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) /* * In the case of lo0:0 we already hold the ill_g_lock. * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> - * ipif_insert. Another such caller is ipif_move. + * ipif_insert. */ if (acquire_g_lock) rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - if (acquire_ill_lock) - mutex_enter(&ill->ill_lock); + mutex_enter(&ill->ill_lock); id = ipif->ipif_id; tipifp = &(ill->ill_ipif); if (id == -1) { /* need to find a real id */ @@ -13075,8 +13174,7 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) } /* limit number of logical interfaces */ if (id >= ipst->ips_ip_addrs_per_if) { - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); if (acquire_g_lock) rw_exit(&ipst->ips_ill_g_lock); return (-1); @@ -13091,8 +13189,7 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) tipifp = &(tipif->ipif_next); } } else { - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); if (acquire_g_lock) rw_exit(&ipst->ips_ill_g_lock); return (-1); @@ -13102,25 +13199,22 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) ipif->ipif_next = tipif; *tipifp = ipif; - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); if (acquire_g_lock) rw_exit(&ipst->ips_ill_g_lock); + return (0); } static void -ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) +ipif_remove(ipif_t *ipif) { ipif_t **ipifp; ill_t *ill = ipif->ipif_ill; ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock)); - if (acquire_ill_lock) - mutex_enter(&ill->ill_lock); - else - ASSERT(MUTEX_HELD(&ill->ill_lock)); + mutex_enter(&ill->ill_lock); ipifp = &ill->ill_ipif; for 
(; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { if (*ipifp == ipif) { @@ -13128,9 +13222,7 @@ ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) break; } } - - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); } /* @@ -13149,10 +13241,12 @@ ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) * second DL_INFO_ACK comes in from the driver. */ static ipif_t * -ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) +ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, + boolean_t insert) { ipif_t *ipif; - phyint_t *phyi; + phyint_t *phyi = ill->ill_phyint; + ip_stack_t *ipst = ill->ill_ipst; ip1dbg(("ipif_allocate(%s:%d ill %p)\n", ill->ill_name, id, (void *)ill)); @@ -13175,23 +13269,61 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) ipif->ipif_refcnt = 0; ipif->ipif_saved_ire_cnt = 0; - if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) { - mi_free(ipif); - return (NULL); + if (insert) { + if (ipif_insert(ipif, ire_type != IRE_LOOPBACK) != 0) { + mi_free(ipif); + return (NULL); + } + /* -1 id should have been replaced by real id */ + id = ipif->ipif_id; + ASSERT(id >= 0); } - /* -1 id should have been replaced by real id */ - id = ipif->ipif_id; - ASSERT(id >= 0); if (ill->ill_name[0] != '\0') ipif_assign_seqid(ipif); /* - * Keep a copy of original id in ipif_orig_ipifid. Failback - * will attempt to restore the original id. The SIOCSLIFOINDEX - * ioctl sets ipif_orig_ipifid to zero. + * If this is ipif zero, configure ill/phyint-wide information. + * Defer most configuration until we're guaranteed we're attached. */ - ipif->ipif_orig_ipifid = id; + if (id == 0) { + if (ill->ill_mactype == SUNW_DL_IPMP) { + /* + * Set PHYI_IPMP and also set PHYI_FAILED since there + * are no active interfaces. Similarly, PHYI_RUNNING + * isn't set until the group has an active interface. 
+ */ + mutex_enter(&phyi->phyint_lock); + phyi->phyint_flags |= (PHYI_IPMP | PHYI_FAILED); + mutex_exit(&phyi->phyint_lock); + + /* + * Create the illgrp (which must not exist yet because + * the zeroth ipif is created once per ill). However, + * do not not link it to the ipmp_grp_t until I_PLINK + * is called; see ip_sioctl_plink_ipmp() for details. + */ + if (ipmp_illgrp_create(ill) == NULL) { + if (insert) { + rw_enter(&ipst->ips_ill_g_lock, + RW_WRITER); + ipif_remove(ipif); + rw_exit(&ipst->ips_ill_g_lock); + } + mi_free(ipif); + return (NULL); + } + } else { + /* + * By default, PHYI_RUNNING is set when the zeroth + * ipif is created. For other ipifs, we don't touch + * it since DLPI notifications may have changed it. + */ + mutex_enter(&phyi->phyint_lock); + phyi->phyint_flags |= PHYI_RUNNING; + mutex_exit(&phyi->phyint_lock); + } + } /* * We grab the ill_lock and phyint_lock to protect the flag changes. @@ -13199,18 +13331,9 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) * ioctl completes and the IPIF_CHANGING flag is cleared. */ mutex_enter(&ill->ill_lock); - mutex_enter(&ill->ill_phyint->phyint_lock); - /* - * Set the running flag when logical interface zero is created. - * For subsequent logical interfaces, a DLPI link down - * notification message may have cleared the running flag to - * indicate the link is down, so we shouldn't just blindly set it. - */ - if (id == 0) - ill->ill_phyint->phyint_flags |= PHYI_RUNNING; + mutex_enter(&phyi->phyint_lock); + ipif->ipif_ire_type = ire_type; - phyi = ill->ill_phyint; - ipif->ipif_orig_ifindex = phyi->phyint_ifindex; if (ipif->ipif_isv6) { ill->ill_flags |= ILLF_IPV6; @@ -13238,14 +13361,18 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) * Don't set the interface flags etc. now, will do it in * ip_ll_subnet_defaults. 
*/ - if (!initialize) { - mutex_exit(&ill->ill_lock); - mutex_exit(&ill->ill_phyint->phyint_lock); - return (ipif); - } + if (!initialize) + goto out; + ipif->ipif_mtu = ill->ill_max_mtu; - if (ill->ill_bcast_addr_length != 0) { + /* + * NOTE: The IPMP meta-interface is special-cased because it starts + * with no underlying interfaces (and thus an unknown broadcast + * address length), but all interfaces that can be placed into an IPMP + * group are required to be broadcast-capable. + */ + if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) { /* * Later detect lack of DLPI driver multicast * capability by catching DL_ENABMULTI errors in @@ -13269,8 +13396,7 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) ill->ill_flags |= ILLF_NOARP; } if (ill->ill_phys_addr_length == 0) { - if (ill->ill_media && - ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { + if (ill->ill_mactype == SUNW_DL_VNI) { ipif->ipif_flags |= IPIF_NOXMIT; phyi->phyint_flags |= PHYI_VIRTUAL; } else { @@ -13285,8 +13411,9 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) } } } +out: + mutex_exit(&phyi->phyint_lock); mutex_exit(&ill->ill_lock); - mutex_exit(&ill->ill_phyint->phyint_lock); return (ipif); } @@ -13300,34 +13427,49 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) * for details. 
*/ void -ipif_arp_down(ipif_t *ipif) +ipif_resolver_down(ipif_t *ipif) { mblk_t *mp; ill_t *ill = ipif->ipif_ill; - ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); + ip1dbg(("ipif_resolver_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); ASSERT(IAM_WRITER_IPIF(ipif)); + if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) + return; + /* Delete the mapping for the local address */ mp = ipif->ipif_arp_del_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, mp); ipif->ipif_arp_del_mp = NULL; } /* + * Make IPMP aware of the deleted data address. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + + /* * If this is the last ipif that is going down and there are no * duplicate addresses we may yet attempt to re-probe, then we need to * clean up ARP completely. */ if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) { + /* + * If this was the last ipif on an IPMP interface, purge any + * IPMP ARP entries associated with it. 
+ */ + if (IS_IPMP(ill)) + ipmp_illgrp_refresh_arpent(ill->ill_grp); /* Send up AR_INTERFACE_DOWN message */ mp = ill->ill_arp_down_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, mp); @@ -13337,7 +13479,7 @@ ipif_arp_down(ipif_t *ipif) /* Tell ARP to delete the multicast mappings */ mp = ill->ill_arp_del_mapping_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, mp); @@ -13377,6 +13519,13 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) return (0); /* + * IPMP meta-interfaces don't have any inherent multicast mappings, + * and instead use the ones on the underlying interfaces. + */ + if (IS_IPMP(ill)) + return (0); + + /* * Delete the existing mapping from ARP. Normally ipif_down * -> ipif_arp_down should send this up to ARP. The only * reason we would find this when we are switching from @@ -13473,26 +13622,23 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) } /* - * Get the resolver set up for a new interface address. - * (Always called as writer.) - * Called both for IPv4 and IPv6 interfaces, - * though it only sets up the resolver for v6 - * if it's an xresolv interface (one using an external resolver). - * Honors ILLF_NOARP. - * The enumerated value res_act is used to tune the behavior. - * If set to Res_act_initial, then we set up all the resolver - * structures for a new interface. If set to Res_act_move, then - * we just send an AR_ENTRY_ADD message up to ARP for IPv4 - * interfaces; this is called by ip_rput_dlpi_writer() to handle - * asynchronous hardware address change notification. 
If set to - * Res_act_defend, then we tell ARP that it needs to send a single - * gratuitous message in defense of the address. + * Get the resolver set up for a new IP address. (Always called as writer.) + * Called both for IPv4 and IPv6 interfaces, though it only sets up the + * resolver for v6 if it's an ILLF_XRESOLV interface. Honors ILLF_NOARP. + * + * The enumerated value res_act tunes the behavior: + * * Res_act_initial: set up all the resolver structures for a new + * IP address. + * * Res_act_defend: tell ARP that it needs to send a single gratuitous + * ARP message in defense of the address. + * * Res_act_rebind: tell ARP to change the hardware address for an IP + * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif(). + * * Returns error on failure. */ int ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) { - caddr_t addr; mblk_t *arp_up_mp = NULL; mblk_t *arp_down_mp = NULL; mblk_t *arp_add_mp = NULL; @@ -13500,9 +13646,9 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) mblk_t *arp_add_mapping_mp = NULL; mblk_t *arp_del_mapping_mp = NULL; ill_t *ill = ipif->ipif_ill; - uchar_t *area_p = NULL; - uchar_t *ared_p = NULL; int err = ENOMEM; + boolean_t added_ipif = B_FALSE; + boolean_t publish; boolean_t was_dup; ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", @@ -13540,11 +13686,7 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) * External resolver for IPv6 */ ASSERT(res_act == Res_act_initial); - if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { - addr = (caddr_t)&ipif->ipif_v6lcl_addr; - area_p = (uchar_t *)&ip6_area_template; - ared_p = (uchar_t *)&ip6_ared_template; - } + publish = !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr); } else { /* * IPv4 arp case. 
If the ARP stream has already started @@ -13562,41 +13704,39 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) ill->ill_arp_bringup_pending = 1; mutex_exit(&ill->ill_lock); } - if (ipif->ipif_lcl_addr != INADDR_ANY) { - addr = (caddr_t)&ipif->ipif_lcl_addr; - area_p = (uchar_t *)&ip_area_template; - ared_p = (uchar_t *)&ip_ared_template; + publish = (ipif->ipif_lcl_addr != INADDR_ANY); + } + + if (IS_IPMP(ill) && publish) { + /* + * If we're here via ipif_up(), then the ipif won't be bound + * yet -- add it to the group, which will bind it if possible. + * (We would add it in ipif_up(), but deleting on failure + * there is gruesome.) If we're here via ipmp_ill_bind_ipif(), + * then the ipif has already been added to the group and we + * just need to use the binding. + */ + if (ipmp_ipif_bound_ill(ipif) == NULL) { + if (ipmp_illgrp_add_ipif(ill->ill_grp, ipif) == NULL) { + /* + * We couldn't bind the ipif to an ill yet, + * so we have nothing to publish. + */ + publish = B_FALSE; + } + added_ipif = B_TRUE; } } /* * Add an entry for the local address in ARP only if it - * is not UNNUMBERED and the address is not INADDR_ANY. + * is not UNNUMBERED and it is suitable for publishing. */ - if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) { - area_t *area; - - /* Now ask ARP to publish our address. */ - arp_add_mp = ill_arp_alloc(ill, area_p, addr); - if (arp_add_mp == NULL) - goto failed; - area = (area_t *)arp_add_mp->b_rptr; - if (res_act != Res_act_initial) { - /* - * Copy the new hardware address and length into - * arp_add_mp to be sent to ARP. 
- */ - area->area_hw_addr_length = ill->ill_phys_addr_length; - bcopy(ill->ill_phys_addr, - ((char *)area + area->area_hw_addr_offset), - area->area_hw_addr_length); - } - - area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | - ACE_F_MYADDR; - + if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && publish) { if (res_act == Res_act_defend) { - area->area_flags |= ACE_F_DEFEND; + arp_add_mp = ipif_area_alloc(ipif, ACE_F_DEFEND); + if (arp_add_mp == NULL) + goto failed; /* * If we're just defending our address now, then * there's no need to set up ARP multicast mappings. @@ -13605,17 +13745,18 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) goto done; } - if (res_act != Res_act_initial) - goto arp_setup_multicast; - /* - * Allocate an ARP deletion message so we know we can tell ARP - * when the interface goes down. + * Allocate an ARP add message and an ARP delete message (the + * latter is saved for use when the address goes down). */ - arp_del_mp = ill_arp_alloc(ill, ared_p, addr); - if (arp_del_mp == NULL) + if ((arp_add_mp = ipif_area_alloc(ipif, 0)) == NULL) + goto failed; + + if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) goto failed; + if (res_act != Res_act_initial) + goto arp_setup_multicast; } else { if (res_act != Res_act_initial) goto done; @@ -13624,14 +13765,11 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) * Need to bring up ARP or setup multicast mapping only * when the first interface is coming UP. */ - if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || - was_dup) { + if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0 || was_dup) goto done; - } /* - * Allocate an ARP down message (to be saved) and an ARP up - * message. + * Allocate an ARP down message (to be saved) and an ARP up message. */ arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0); if (arp_down_mp == NULL) @@ -13648,33 +13786,21 @@ arp_setup_multicast: /* * Setup the multicast mappings. 
This function initializes * ill_arp_del_mapping_mp also. This does not need to be done for - * IPv6. + * IPv6, or for the IPMP interface (since it has no link-layer). */ - if (!ill->ill_isv6) { + if (!ill->ill_isv6 && !IS_IPMP(ill)) { err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp); if (err != 0) goto failed; ASSERT(ill->ill_arp_del_mapping_mp != NULL); ASSERT(arp_add_mapping_mp != NULL); } - done: - if (arp_del_mp != NULL) { - ASSERT(ipif->ipif_arp_del_mp == NULL); - ipif->ipif_arp_del_mp = arp_del_mp; - } - if (arp_down_mp != NULL) { - ASSERT(ill->ill_arp_down_mp == NULL); - ill->ill_arp_down_mp = arp_down_mp; - } - if (arp_del_mapping_mp != NULL) { - ASSERT(ill->ill_arp_del_mapping_mp == NULL); - ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; - } if (arp_up_mp != NULL) { ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, arp_up_mp); + arp_up_mp = NULL; } if (arp_add_mp != NULL) { ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", @@ -13686,6 +13812,7 @@ done: if (!ill->ill_arp_extend) ipif->ipif_addr_ready = 1; putnext(ill->ill_rq, arp_add_mp); + arp_add_mp = NULL; } else { ipif->ipif_addr_ready = 1; } @@ -13693,29 +13820,40 @@ done: ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, arp_add_mapping_mp); + arp_add_mapping_mp = NULL; } - if (res_act != Res_act_initial) - return (0); - if (ill->ill_flags & ILLF_NOARP) - err = ill_arp_off(ill); - else - err = ill_arp_on(ill); - if (err != 0) { - ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err)); - freemsg(ipif->ipif_arp_del_mp); - freemsg(ill->ill_arp_down_mp); - freemsg(ill->ill_arp_del_mapping_mp); - ipif->ipif_arp_del_mp = NULL; - ill->ill_arp_down_mp = NULL; - ill->ill_arp_del_mapping_mp = NULL; - return (err); + if (res_act == Res_act_initial) { + if (ill->ill_flags & ILLF_NOARP) + err = ill_arp_off(ill); + else + err = ill_arp_on(ill); + if (err != 0) { + ip0dbg(("ipif_resolver_up: 
arp_on/off failed %d\n", + err)); + goto failed; + } } + + if (arp_del_mp != NULL) { + ASSERT(ipif->ipif_arp_del_mp == NULL); + ipif->ipif_arp_del_mp = arp_del_mp; + } + if (arp_down_mp != NULL) { + ASSERT(ill->ill_arp_down_mp == NULL); + ill->ill_arp_down_mp = arp_down_mp; + } + if (arp_del_mapping_mp != NULL) { + ASSERT(ill->ill_arp_del_mapping_mp == NULL); + ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; + } + return ((ill->ill_ipif_up_count != 0 || was_dup || ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS); - failed: ip1dbg(("ipif_resolver_up: FAILED\n")); + if (added_ipif) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); freemsg(arp_add_mp); freemsg(arp_del_mp); freemsg(arp_add_mapping_mp); @@ -13734,13 +13872,12 @@ ipif_arp_start_dad(ipif_t *ipif) { ill_t *ill = ipif->ipif_ill; mblk_t *arp_add_mp; - area_t *area; + /* ACE_F_UNVERIFIED restarts DAD */ if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || (ipif->ipif_flags & IPIF_UNNUMBERED) || ipif->ipif_lcl_addr == INADDR_ANY || - (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, - (char *)&ipif->ipif_lcl_addr)) == NULL) { + (arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) { /* * If we can't contact ARP for some reason, that's not really a * problem. 
Just send out the routing socket notification that @@ -13752,10 +13889,6 @@ ipif_arp_start_dad(ipif_t *ipif) return; } - /* Setting the 'unverified' flag restarts DAD */ - area = (area_t *)arp_add_mp->b_rptr; - area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | - ACE_F_UNVERIFIED; putnext(ill->ill_rq, arp_add_mp); } @@ -13764,7 +13897,8 @@ ipif_ndp_start_dad(ipif_t *ipif) { nce_t *nce; - nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE); + nce = ndp_lookup_v6(ipif->ipif_ill, B_TRUE, &ipif->ipif_v6lcl_addr, + B_FALSE); if (nce == NULL) return; @@ -13805,7 +13939,7 @@ ill_restart_dad(ill_t *ill, boolean_t went_up) */ if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || (!ill->ill_isv6 && !ill->ill_arp_extend)) { - ip_rts_ifmsg(ill->ill_ipif); + ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); return; } @@ -13838,8 +13972,10 @@ ill_restart_dad(ill_t *ill, boolean_t went_up) * we'll handle eventual routing socket * notification via DAD completion.) */ - if (ipif == ill->ill_ipif) - ip_rts_ifmsg(ill->ill_ipif); + if (ipif == ill->ill_ipif) { + ip_rts_ifmsg(ill->ill_ipif, + RTSQ_DEFAULT); + } } } else { /* @@ -13855,285 +13991,30 @@ ill_restart_dad(ill_t *ill, boolean_t went_up) * If we've torn down links, then notify the user right away. */ if (!went_up) - ip_rts_ifmsg(ill->ill_ipif); -} - -/* - * Wakeup all threads waiting to enter the ipsq, and sleeping - * on any of the ills in this ipsq. 
The ill_lock of the ill - * must be held so that waiters don't miss wakeups - */ -static void -ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock) -{ - phyint_t *phyint; - - phyint = ipsq->ipsq_phyint_list; - while (phyint != NULL) { - if (phyint->phyint_illv4) { - if (!caller_holds_lock) - mutex_enter(&phyint->phyint_illv4->ill_lock); - ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); - cv_broadcast(&phyint->phyint_illv4->ill_cv); - if (!caller_holds_lock) - mutex_exit(&phyint->phyint_illv4->ill_lock); - } - if (phyint->phyint_illv6) { - if (!caller_holds_lock) - mutex_enter(&phyint->phyint_illv6->ill_lock); - ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); - cv_broadcast(&phyint->phyint_illv6->ill_cv); - if (!caller_holds_lock) - mutex_exit(&phyint->phyint_illv6->ill_lock); - } - phyint = phyint->phyint_ipsq_next; - } -} - -static ipsq_t * -ipsq_create(char *groupname, ip_stack_t *ipst) -{ - ipsq_t *ipsq; - - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); - if (ipsq == NULL) { - return (NULL); - } - - if (groupname != NULL) - (void) strcpy(ipsq->ipsq_name, groupname); - else - ipsq->ipsq_name[0] = '\0'; - - mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL); - ipsq->ipsq_flags |= IPSQ_GROUP; - ipsq->ipsq_next = ipst->ips_ipsq_g_head; - ipst->ips_ipsq_g_head = ipsq; - ipsq->ipsq_ipst = ipst; /* No netstack_hold */ - return (ipsq); -} - -/* - * Return an ipsq correspoding to the groupname. If 'create' is true - * allocate a new ipsq if one does not exist. Usually an ipsq is associated - * uniquely with an IPMP group. However during IPMP groupname operations, - * multiple IPMP groups may be associated with a single ipsq. But no - * IPMP group can be associated with more than 1 ipsq at any time. 
- * For example - * Interfaces IPMP grpname ipsq ipsq_name ipsq_refs - * hme1, hme2 mpk17-84 ipsq1 mpk17-84 2 - * hme3, hme4 mpk17-85 ipsq2 mpk17-85 2 - * - * Now the command ifconfig hme3 group mpk17-84 results in the temporary - * status shown below during the execution of the above command. - * hme1, hme2, hme3, hme4 mpk17-84, mpk17-85 ipsq1 mpk17-84 4 - * - * After the completion of the above groupname command we return to the stable - * state shown below. - * hme1, hme2, hme3 mpk17-84 ipsq1 mpk17-84 3 - * hme4 mpk17-85 ipsq2 mpk17-85 1 - * - * Because of the above, we don't search based on the ipsq_name since that - * would miss the correct ipsq during certain windows as shown above. - * The ipsq_name is only used during split of an ipsq to return the ipsq to its - * natural state. - */ -static ipsq_t * -ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq, - ip_stack_t *ipst) -{ - ipsq_t *ipsq; - int group_len; - phyint_t *phyint; - - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - - group_len = strlen(groupname); - ASSERT(group_len != 0); - group_len++; - - for (ipsq = ipst->ips_ipsq_g_head; - ipsq != NULL; - ipsq = ipsq->ipsq_next) { - /* - * When an ipsq is being split, and ill_split_ipsq - * calls this function, we exclude it from being considered. - */ - if (ipsq == exclude_ipsq) - continue; - - /* - * Compare against the ipsq_name. The groupname change happens - * in 2 phases. The 1st phase merges the from group into - * the to group's ipsq, by calling ill_merge_groups and restarts - * the ioctl. The 2nd phase then locates the ipsq again thru - * ipsq_name. At this point the phyint_groupname has not been - * updated. - */ - if ((group_len == strlen(ipsq->ipsq_name) + 1) && - (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) { - /* - * Verify that an ipmp groupname is exactly - * part of 1 ipsq and is not found in any other - * ipsq. 
- */ - ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, ipst) == - NULL); - return (ipsq); - } - - /* - * Comparison against ipsq_name alone is not sufficient. - * In the case when groups are currently being - * merged, the ipsq could hold other IPMP groups temporarily. - * so we walk the phyint list and compare against the - * phyint_groupname as well. - */ - phyint = ipsq->ipsq_phyint_list; - while (phyint != NULL) { - if ((group_len == phyint->phyint_groupname_len) && - (bcmp(phyint->phyint_groupname, groupname, - group_len) == 0)) { - /* - * Verify that an ipmp groupname is exactly - * part of 1 ipsq and is not found in any other - * ipsq. - */ - ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, - ipst) == NULL); - return (ipsq); - } - phyint = phyint->phyint_ipsq_next; - } - } - if (create) - ipsq = ipsq_create(groupname, ipst); - return (ipsq); + ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); } static void ipsq_delete(ipsq_t *ipsq) { - ipsq_t *nipsq; - ipsq_t *pipsq = NULL; - ip_stack_t *ipst = ipsq->ipsq_ipst; - - /* - * We don't hold the ipsq lock, but we are sure no new - * messages can land up, since the ipsq_refs is zero. - * i.e. this ipsq is unnamed and no phyint or phyint group - * is associated with this ipsq. (Lookups are based on ill_name - * or phyint_groupname) - */ - ASSERT(ipsq->ipsq_refs == 0); - ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL); - ASSERT(ipsq->ipsq_pending_mp == NULL); - if (!(ipsq->ipsq_flags & IPSQ_GROUP)) { - /* - * This is not the ipsq of an IPMP group. - */ - ipsq->ipsq_ipst = NULL; - kmem_free(ipsq, sizeof (ipsq_t)); - return; - } - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - - /* - * Locate the ipsq before we can remove it from - * the singly linked list of ipsq's. 
- */ - for (nipsq = ipst->ips_ipsq_g_head; nipsq != NULL; - nipsq = nipsq->ipsq_next) { - if (nipsq == ipsq) { - break; - } - pipsq = nipsq; - } - - ASSERT(nipsq == ipsq); + ipxop_t *ipx = ipsq->ipsq_xop; - /* unlink ipsq from the list */ - if (pipsq != NULL) - pipsq->ipsq_next = ipsq->ipsq_next; - else - ipst->ips_ipsq_g_head = ipsq->ipsq_next; ipsq->ipsq_ipst = NULL; + ASSERT(ipsq->ipsq_phyint == NULL); + ASSERT(ipsq->ipsq_xop != NULL); + ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL); + ASSERT(ipx->ipx_pending_mp == NULL); kmem_free(ipsq, sizeof (ipsq_t)); - rw_exit(&ipst->ips_ill_g_lock); -} - -static void -ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp, - queue_t *q) -{ - ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock)); - ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL); - ASSERT(old_ipsq->ipsq_pending_ipif == NULL); - ASSERT(old_ipsq->ipsq_pending_mp == NULL); - ASSERT(current_mp != NULL); - - ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl, - NEW_OP, NULL); - - ASSERT(new_ipsq->ipsq_xopq_mptail != NULL && - new_ipsq->ipsq_xopq_mphead != NULL); - - /* - * move from old ipsq to the new ipsq. 
- */ - new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead; - if (old_ipsq->ipsq_xopq_mphead != NULL) - new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail; - - old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL; } -void -ill_group_cleanup(ill_t *ill) -{ - ill_t *ill_v4; - ill_t *ill_v6; - ipif_t *ipif; - - ill_v4 = ill->ill_phyint->phyint_illv4; - ill_v6 = ill->ill_phyint->phyint_illv6; - - if (ill_v4 != NULL) { - mutex_enter(&ill_v4->ill_lock); - for (ipif = ill_v4->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - IPIF_UNMARK_MOVING(ipif); - } - ill_v4->ill_up_ipifs = B_FALSE; - mutex_exit(&ill_v4->ill_lock); - } - - if (ill_v6 != NULL) { - mutex_enter(&ill_v6->ill_lock); - for (ipif = ill_v6->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - IPIF_UNMARK_MOVING(ipif); - } - ill_v6->ill_up_ipifs = B_FALSE; - mutex_exit(&ill_v6->ill_lock); - } -} -/* - * This function is called when an ill has had a change in its group status - * to bring up all the ipifs that were up before the change. - */ -int -ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) +static int +ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) { + int err; ipif_t *ipif; - ill_t *ill_v4; - ill_t *ill_v6; - ill_t *from_ill; - int err = 0; - ASSERT(IAM_WRITER_ILL(ill)); + if (ill == NULL) + return (0); /* * Except for ipif_state_flags and ill_state_flags the other @@ -14142,389 +14023,86 @@ ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) * even an ipif that was already down, in ill_down_ipifs. So we * just blindly clear the IPIF_CHANGING flag here on all ipifs. 
*/ - ill_v4 = ill->ill_phyint->phyint_illv4; - ill_v6 = ill->ill_phyint->phyint_illv6; - if (ill_v4 != NULL) { - ill_v4->ill_up_ipifs = B_TRUE; - for (ipif = ill_v4->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - mutex_enter(&ill_v4->ill_lock); - ipif->ipif_state_flags &= ~IPIF_CHANGING; - IPIF_UNMARK_MOVING(ipif); - mutex_exit(&ill_v4->ill_lock); - if (ipif->ipif_was_up) { - if (!(ipif->ipif_flags & IPIF_UP)) - err = ipif_up(ipif, q, mp); - ipif->ipif_was_up = B_FALSE; - if (err != 0) { - /* - * Can there be any other error ? - */ - ASSERT(err == EINPROGRESS); - return (err); - } - } - } - mutex_enter(&ill_v4->ill_lock); - ill_v4->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&ill_v4->ill_lock); - ill_v4->ill_up_ipifs = B_FALSE; - if (ill_v4->ill_move_in_progress) { - ASSERT(ill_v4->ill_move_peer != NULL); - ill_v4->ill_move_in_progress = B_FALSE; - from_ill = ill_v4->ill_move_peer; - from_ill->ill_move_in_progress = B_FALSE; - from_ill->ill_move_peer = NULL; - mutex_enter(&from_ill->ill_lock); - from_ill->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&from_ill->ill_lock); - if (ill_v6 == NULL) { - if (from_ill->ill_phyint->phyint_flags & - PHYI_STANDBY) { - phyint_inactive(from_ill->ill_phyint); - } - if (ill_v4->ill_phyint->phyint_flags & - PHYI_STANDBY) { - phyint_inactive(ill_v4->ill_phyint); - } - } - ill_v4->ill_move_peer = NULL; - } - } + ASSERT(IAM_WRITER_ILL(ill)); - if (ill_v6 != NULL) { - ill_v6->ill_up_ipifs = B_TRUE; - for (ipif = ill_v6->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - mutex_enter(&ill_v6->ill_lock); - ipif->ipif_state_flags &= ~IPIF_CHANGING; - IPIF_UNMARK_MOVING(ipif); - mutex_exit(&ill_v6->ill_lock); - if (ipif->ipif_was_up) { - if (!(ipif->ipif_flags & IPIF_UP)) - err = ipif_up(ipif, q, mp); - ipif->ipif_was_up = B_FALSE; - if (err != 0) { - /* - * Can there be any other error ? 
- */ - ASSERT(err == EINPROGRESS); - return (err); - } - } - } - mutex_enter(&ill_v6->ill_lock); - ill_v6->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&ill_v6->ill_lock); - ill_v6->ill_up_ipifs = B_FALSE; - if (ill_v6->ill_move_in_progress) { - ASSERT(ill_v6->ill_move_peer != NULL); - ill_v6->ill_move_in_progress = B_FALSE; - from_ill = ill_v6->ill_move_peer; - from_ill->ill_move_in_progress = B_FALSE; - from_ill->ill_move_peer = NULL; - mutex_enter(&from_ill->ill_lock); - from_ill->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&from_ill->ill_lock); - if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { - phyint_inactive(from_ill->ill_phyint); - } - if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) { - phyint_inactive(ill_v6->ill_phyint); + ill->ill_up_ipifs = B_TRUE; + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + mutex_enter(&ill->ill_lock); + ipif->ipif_state_flags &= ~IPIF_CHANGING; + mutex_exit(&ill->ill_lock); + if (ipif->ipif_was_up) { + if (!(ipif->ipif_flags & IPIF_UP)) + err = ipif_up(ipif, q, mp); + ipif->ipif_was_up = B_FALSE; + if (err != 0) { + ASSERT(err == EINPROGRESS); + return (err); } - ill_v6->ill_move_peer = NULL; } } + mutex_enter(&ill->ill_lock); + ill->ill_state_flags &= ~ILL_CHANGING; + mutex_exit(&ill->ill_lock); + ill->ill_up_ipifs = B_FALSE; return (0); } /* - * bring down all the approriate ipifs. + * This function is called to bring up all the ipifs that were up before + * bringing the ill down via ill_down_ipifs(). 
*/ -/* ARGSUSED */ -static void -ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover) +int +ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) { - ipif_t *ipif; + int err; ASSERT(IAM_WRITER_ILL(ill)); - /* - * Except for ipif_state_flags the other fields of the ipif/ill that - * are modified below are protected implicitly since we are a writer - */ - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER)) - continue; - /* - * Don't bring down the LINK LOCAL addresses as they are tied - * to physical interface and they don't move. Treat them as - * IPIF_NOFAILOVER. - */ - if (chk_nofailover && ill->ill_isv6 && - IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) - continue; - if (index == 0 || index == ipif->ipif_orig_ifindex) { - /* - * We go through the ipif_down logic even if the ipif - * is already down, since routes can be added based - * on down ipifs. Going through ipif_down once again - * will delete any IREs created based on these routes. - */ - if (ipif->ipif_flags & IPIF_UP) - ipif->ipif_was_up = B_TRUE; - /* - * If called with chk_nofailover true ipif is moving. - */ - mutex_enter(&ill->ill_lock); - if (chk_nofailover) { - ipif->ipif_state_flags |= - IPIF_MOVING | IPIF_CHANGING; - } else { - ipif->ipif_state_flags |= IPIF_CHANGING; - } - mutex_exit(&ill->ill_lock); - /* - * Need to re-create net/subnet bcast ires if - * they are dependent on ipif. 
- */ - if (!ipif->ipif_isv6) - ipif_check_bcast_ires(ipif); - (void) ipif_logical_down(ipif, NULL, NULL); - ipif_non_duplicate(ipif); - ipif_down_tail(ipif); - } - } -} - -#define IPSQ_INC_REF(ipsq, ipst) { \ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \ - (ipsq)->ipsq_refs++; \ -} + err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp); + if (err != 0) + return (err); -#define IPSQ_DEC_REF(ipsq, ipst) { \ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \ - (ipsq)->ipsq_refs--; \ - if ((ipsq)->ipsq_refs == 0) \ - (ipsq)->ipsq_name[0] = '\0'; \ + return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp)); } /* - * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to - * new_ipsq. + * Bring down any IPIF_UP ipifs on ill. */ static void -ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq, ip_stack_t *ipst) +ill_down_ipifs(ill_t *ill) { - phyint_t *phyint; - phyint_t *next_phyint; - - /* - * To change the ipsq of an ill, we need to hold the ill_g_lock as - * writer and the ill_lock of the ill in question. Also the dest - * ipsq can't vanish while we hold the ill_g_lock as writer. - */ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - - phyint = cur_ipsq->ipsq_phyint_list; - cur_ipsq->ipsq_phyint_list = NULL; - while (phyint != NULL) { - next_phyint = phyint->phyint_ipsq_next; - IPSQ_DEC_REF(cur_ipsq, ipst); - phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list; - new_ipsq->ipsq_phyint_list = phyint; - IPSQ_INC_REF(new_ipsq, ipst); - phyint->phyint_ipsq = new_ipsq; - phyint = next_phyint; - } -} - -#define SPLIT_SUCCESS 0 -#define SPLIT_NOT_NEEDED 1 -#define SPLIT_FAILED 2 - -int -ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry, - ip_stack_t *ipst) -{ - ipsq_t *newipsq = NULL; - - /* - * Assertions denote pre-requisites for changing the ipsq of - * a phyint - */ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - /* - * <ill-phyint> assocs can't change while ill_g_lock - * is held as writer. 
See ill_phyint_reinit() - */ - ASSERT(phyint->phyint_illv4 == NULL || - MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); - ASSERT(phyint->phyint_illv6 == NULL || - MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); - - if ((phyint->phyint_groupname_len != - (strlen(cur_ipsq->ipsq_name) + 1) || - bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name, - phyint->phyint_groupname_len) != 0)) { - /* - * Once we fail in creating a new ipsq due to memory shortage, - * don't attempt to create new ipsq again, based on another - * phyint, since we want all phyints belonging to an IPMP group - * to be in the same ipsq even in the event of mem alloc fails. - */ - newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry, - cur_ipsq, ipst); - if (newipsq == NULL) { - /* Memory allocation failure */ - return (SPLIT_FAILED); - } else { - /* ipsq_refs protected by ill_g_lock (writer) */ - IPSQ_DEC_REF(cur_ipsq, ipst); - phyint->phyint_ipsq = newipsq; - phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list; - newipsq->ipsq_phyint_list = phyint; - IPSQ_INC_REF(newipsq, ipst); - return (SPLIT_SUCCESS); - } - } - return (SPLIT_NOT_NEEDED); -} + ipif_t *ipif; -/* - * The ill locks of the phyint and the ill_g_lock (writer) must be held - * to do this split - */ -static int -ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, ip_stack_t *ipst) -{ - ipsq_t *newipsq; + ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); /* - * <ill-phyint> assocs can't change while ill_g_lock - * is held as writer. See ill_phyint_reinit() + * Except for ipif_state_flags the other fields of the ipif/ill that + * are modified below are protected implicitly since we are a writer */ - - ASSERT(phyint->phyint_illv4 == NULL || - MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); - ASSERT(phyint->phyint_illv6 == NULL || - MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); - - if (!ipsq_init((phyint->phyint_illv4 != NULL) ? 
- phyint->phyint_illv4: phyint->phyint_illv6)) { + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { /* - * ipsq_init failed due to no memory - * caller will use the same ipsq + * We go through the ipif_down logic even if the ipif + * is already down, since routes can be added based + * on down ipifs. Going through ipif_down once again + * will delete any IREs created based on these routes. */ - return (SPLIT_FAILED); - } - - /* ipsq_ref is protected by ill_g_lock (writer) */ - IPSQ_DEC_REF(cur_ipsq, ipst); - - /* - * This is a new ipsq that is unknown to the world. - * So we don't need to hold ipsq_lock, - */ - newipsq = phyint->phyint_ipsq; - newipsq->ipsq_writer = NULL; - newipsq->ipsq_reentry_cnt--; - ASSERT(newipsq->ipsq_reentry_cnt == 0); -#ifdef DEBUG - newipsq->ipsq_depth = 0; -#endif - - return (SPLIT_SUCCESS); -} + if (ipif->ipif_flags & IPIF_UP) + ipif->ipif_was_up = B_TRUE; -/* - * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to - * ipsq's representing their individual groups or themselves. Return - * whether split needs to be retried again later. - */ -static boolean_t -ill_split_ipsq(ipsq_t *cur_ipsq) -{ - phyint_t *phyint; - phyint_t *next_phyint; - int error; - boolean_t need_retry = B_FALSE; - ip_stack_t *ipst = cur_ipsq->ipsq_ipst; + mutex_enter(&ill->ill_lock); + ipif->ipif_state_flags |= IPIF_CHANGING; + mutex_exit(&ill->ill_lock); - phyint = cur_ipsq->ipsq_phyint_list; - cur_ipsq->ipsq_phyint_list = NULL; - while (phyint != NULL) { - next_phyint = phyint->phyint_ipsq_next; /* - * 'created' will tell us whether the callee actually - * created an ipsq. Lack of memory may force the callee - * to return without creating an ipsq. + * Need to re-create net/subnet bcast ires if + * they are dependent on ipif. 
*/ - if (phyint->phyint_groupname == NULL) { - error = ill_split_to_own_ipsq(phyint, cur_ipsq, ipst); - } else { - error = ill_split_to_grp_ipsq(phyint, cur_ipsq, - need_retry, ipst); - } - - switch (error) { - case SPLIT_FAILED: - need_retry = B_TRUE; - /* FALLTHRU */ - case SPLIT_NOT_NEEDED: - /* - * Keep it on the list. - */ - phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list; - cur_ipsq->ipsq_phyint_list = phyint; - break; - case SPLIT_SUCCESS: - break; - default: - ASSERT(0); - } - - phyint = next_phyint; - } - return (need_retry); -} - -/* - * given an ipsq 'ipsq' lock all ills associated with this ipsq. - * and return the ills in the list. This list will be - * needed to unlock all the ills later on by the caller. - * The <ill-ipsq> associations could change between the - * lock and unlock. Hence the unlock can't traverse the - * ipsq to get the list of ills. - */ -static int -ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max) -{ - int cnt = 0; - phyint_t *phyint; - ip_stack_t *ipst = ipsq->ipsq_ipst; - - /* - * The caller holds ill_g_lock to ensure that the ill memberships - * of the ipsq don't change - */ - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - - phyint = ipsq->ipsq_phyint_list; - while (phyint != NULL) { - if (phyint->phyint_illv4 != NULL) { - ASSERT(cnt < list_max); - list[cnt++] = phyint->phyint_illv4; - } - if (phyint->phyint_illv6 != NULL) { - ASSERT(cnt < list_max); - list[cnt++] = phyint->phyint_illv6; - } - phyint = phyint->phyint_ipsq_next; + if (!ipif->ipif_isv6) + ipif_check_bcast_ires(ipif); + (void) ipif_logical_down(ipif, NULL, NULL); + ipif_non_duplicate(ipif); + ipif_down_tail(ipif); } - ill_lock_ills(list, cnt); - return (cnt); } void @@ -14577,3504 +14155,251 @@ ill_unlock_ills(ill_t **list, int cnt) } /* - * Merge all the ills from 1 ipsq group into another ipsq group. - * The source ipsq group is specified by the ipsq associated with - * 'from_ill'. 
The destination ipsq group is specified by the ipsq - * associated with 'to_ill' or 'groupname' respectively. - * Note that ipsq itself does not have a reference count mechanism - * and functions don't look up an ipsq and pass it around. Instead - * functions pass around an ill or groupname, and the ipsq is looked - * up from the ill or groupname and the required operation performed - * atomically with the lookup on the ipsq. + * Redo source address selection. This is called when a + * non-NOLOCAL/DEPRECATED/ANYCAST ipif comes up. */ -static int -ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp, - queue_t *q) -{ - ipsq_t *old_ipsq; - ipsq_t *new_ipsq; - ill_t **ill_list; - int cnt; - size_t ill_list_size; - boolean_t became_writer_on_new_sq = B_FALSE; - ip_stack_t *ipst = from_ill->ill_ipst; - - ASSERT(to_ill == NULL || ipst == to_ill->ill_ipst); - /* Exactly 1 of 'to_ill' and groupname can be specified. */ - ASSERT((to_ill != NULL) ^ (groupname != NULL)); - - /* - * Need to hold ill_g_lock as writer and also the ill_lock to - * change the <ill-ipsq> assoc of an ill. Need to hold the - * ipsq_lock to prevent new messages from landing on an ipsq. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - - old_ipsq = from_ill->ill_phyint->phyint_ipsq; - if (groupname != NULL) - new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL, ipst); - else { - new_ipsq = to_ill->ill_phyint->phyint_ipsq; - } - - ASSERT(old_ipsq != NULL && new_ipsq != NULL); - - /* - * both groups are on the same ipsq. 
- */ - if (old_ipsq == new_ipsq) { - rw_exit(&ipst->ips_ill_g_lock); - return (0); - } - - cnt = old_ipsq->ipsq_refs << 1; - ill_list_size = cnt * sizeof (ill_t *); - ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); - if (ill_list == NULL) { - rw_exit(&ipst->ips_ill_g_lock); - return (ENOMEM); - } - cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt); - - /* Need ipsq lock to enque messages on new ipsq or to become writer */ - mutex_enter(&new_ipsq->ipsq_lock); - if ((new_ipsq->ipsq_writer == NULL && - new_ipsq->ipsq_current_ipif == NULL) || - (new_ipsq->ipsq_writer == curthread)) { - new_ipsq->ipsq_writer = curthread; - new_ipsq->ipsq_reentry_cnt++; - became_writer_on_new_sq = B_TRUE; - } - - /* - * We are holding ill_g_lock as writer and all the ill locks of - * the old ipsq. So the old_ipsq can't be looked up, and hence no new - * message can land up on the old ipsq even though we don't hold the - * ipsq_lock of the old_ipsq. Now move all messages to the newipsq. - */ - ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q); - - /* - * now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'. - * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq> - * assocs. till we release the ill_g_lock, and hence it can't vanish. - */ - ill_merge_ipsq(old_ipsq, new_ipsq, ipst); - - /* - * Mark the new ipsq as needing a split since it is currently - * being shared by more than 1 IPMP group. The split will - * occur at the end of ipsq_exit - */ - new_ipsq->ipsq_split = B_TRUE; - - /* Now release all the locks */ - mutex_exit(&new_ipsq->ipsq_lock); - ill_unlock_ills(ill_list, cnt); - rw_exit(&ipst->ips_ill_g_lock); - - kmem_free(ill_list, ill_list_size); - - /* - * If we succeeded in becoming writer on the new ipsq, then - * drain the new ipsq and start processing all enqueued messages - * including the current ioctl we are processing which is either - * a set groupname or failover/failback. 
- */ - if (became_writer_on_new_sq) - ipsq_exit(new_ipsq); - - /* - * syncq has been changed and all the messages have been moved. - */ - mutex_enter(&old_ipsq->ipsq_lock); - old_ipsq->ipsq_current_ipif = NULL; - old_ipsq->ipsq_current_ioctl = 0; - old_ipsq->ipsq_current_done = B_TRUE; - mutex_exit(&old_ipsq->ipsq_lock); - return (EINPROGRESS); -} - -/* - * Delete and add the loopback copy and non-loopback copy of - * the BROADCAST ire corresponding to ill and addr. Used to - * group broadcast ires together when ill becomes part of - * a group. - * - * This function is also called when ill is leaving the group - * so that the ires belonging to the group gets re-grouped. - */ -static void -ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr) -{ - ire_t *ire, *nire, *nire_next, *ire_head = NULL; - ire_t **ire_ptpn = &ire_head; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * The loopback and non-loopback IREs are inserted in the order in which - * they're found, on the basis that they are correctly ordered (loopback - * first). - */ - for (;;) { - ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, - ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - if (ire == NULL) - break; - - /* - * we are passing in KM_SLEEP because it is not easy to - * go back to a sane state in case of memory failure. - */ - nire = kmem_cache_alloc(ire_cache, KM_SLEEP); - ASSERT(nire != NULL); - bzero(nire, sizeof (ire_t)); - /* - * Don't use ire_max_frag directly since we don't - * hold on to 'ire' until we add the new ire 'nire' and - * we don't want the new ire to have a dangling reference - * to 'ire'. The ire_max_frag of a broadcast ire must - * be in sync with the ipif_mtu of the associate ipif. - * For eg. this happens as a result of SIOCSLIFNAME, - * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE inititated by - * the driver. 
A change in ire_max_frag triggered as - * as a result of path mtu discovery, or due to an - * IP_IOC_IRE_ADVISE_NOREPLY from the transport or due a - * route change -mtu command does not apply to broadcast ires. - * - * XXX We need a recovery strategy here if ire_init fails - */ - if (ire_init(nire, - (uchar_t *)&ire->ire_addr, - (uchar_t *)&ire->ire_mask, - (uchar_t *)&ire->ire_src_addr, - (uchar_t *)&ire->ire_gateway_addr, - ire->ire_stq == NULL ? &ip_loopback_mtu : - &ire->ire_ipif->ipif_mtu, - ire->ire_nce, - ire->ire_rfq, - ire->ire_stq, - ire->ire_type, - ire->ire_ipif, - ire->ire_cmask, - ire->ire_phandle, - ire->ire_ihandle, - ire->ire_flags, - &ire->ire_uinfo, - NULL, - NULL, - ipst) == NULL) { - cmn_err(CE_PANIC, "ire_init() failed"); - } - ire_delete(ire); - ire_refrele(ire); - - /* - * The newly created IREs are inserted at the tail of the list - * starting with ire_head. As we've just allocated them no one - * knows about them so it's safe. - */ - *ire_ptpn = nire; - ire_ptpn = &nire->ire_next; - } - - for (nire = ire_head; nire != NULL; nire = nire_next) { - int error; - ire_t *oire; - /* unlink the IRE from our list before calling ire_add() */ - nire_next = nire->ire_next; - nire->ire_next = NULL; - - /* ire_add adds the ire at the right place in the list */ - oire = nire; - error = ire_add(&nire, NULL, NULL, NULL, B_FALSE); - ASSERT(error == 0); - ASSERT(oire == nire); - ire_refrele(nire); /* Held in ire_add */ - } -} - -/* - * This function is usually called when an ill is inserted in - * a group and all the ipifs are already UP. As all the ipifs - * are already UP, the broadcast ires have already been created - * and been inserted. But, ire_add_v4 would not have grouped properly. - * We need to re-group for the benefit of ip_wput_ire which - * expects BROADCAST ires to be grouped properly to avoid sending - * more than one copy of the broadcast packet per group. 
- * - * NOTE : We don't check for ill_ipif_up_count to be non-zero here - * because when ipif_up_done ends up calling this, ires have - * already been added before illgrp_insert i.e before ill_group - * has been initialized. - */ -static void -ill_group_bcast_for_xmit(ill_t *ill) +void +ill_update_source_selection(ill_t *ill) { - ill_group_t *illgrp; ipif_t *ipif; - ipaddr_t addr; - ipaddr_t net_mask; - ipaddr_t subnet_netmask; - illgrp = ill->ill_group; + ASSERT(IAM_WRITER_ILL(ill)); /* - * This function is called even when an ill is deleted from - * the group. Hence, illgrp could be null. + * Underlying interfaces are only used for test traffic and thus + * should always send with their (deprecated) source addresses. */ - if (illgrp != NULL && illgrp->illgrp_ill_count == 1) + if (IS_UNDER_IPMP(ill)) return; - /* - * Delete all the BROADCAST ires matching this ill and add - * them back. This time, ire_add_v4 should take care of - * grouping them with others because ill is part of the - * group. - */ - ill_bcast_delete_and_add(ill, 0); - ill_bcast_delete_and_add(ill, INADDR_BROADCAST); - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - ill_bcast_delete_and_add(ill, addr); - ill_bcast_delete_and_add(ill, ~net_mask | addr); - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - ill_bcast_delete_and_add(ill, addr); - ill_bcast_delete_and_add(ill, ~subnet_netmask | addr); - } -} - -/* - * This function is called from illgrp_delete when ill is being deleted - * from the group. - * - * As ill is not there in the group anymore, any address belonging - * to this ill should be cleared of IRE_MARK_NORECV. 
- */ -static void -ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr) -{ - ire_t *ire; - irb_t *irb; - ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(ill->ill_group == NULL); - - ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, - ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - - if (ire != NULL) { - /* - * IPMP and plumbing operations are serialized on the ipsq, so - * no one will insert or delete a broadcast ire under our feet. - */ - irb = ire->ire_bucket; - rw_enter(&irb->irb_lock, RW_READER); - ire_refrele(ire); - - for (; ire != NULL; ire = ire->ire_next) { - if (ire->ire_addr != addr) - break; - if (ire_to_ill(ire) != ill) - continue; - - ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED)); - ire->ire_marks &= ~IRE_MARK_NORECV; - } - rw_exit(&irb->irb_lock); - } -} - -ire_t * -irep_insert(ill_group_t *illgrp, ipaddr_t addr, ire_t *ire, ire_t ***pirep) -{ - boolean_t first = B_TRUE; - ire_t *clear_ire = NULL; - ire_t *start_ire = NULL; - uint64_t match_flags; - uint64_t phyi_flags; - boolean_t fallback = B_FALSE; - - /* - * irb_lock must be held by the caller. - * Get to the first ire matching the address and the - * group. If the address does not match we are done - * as we could not find the IRE. If the address matches - * we should get to the first one matching the group. - */ - while (ire != NULL) { - if (ire->ire_addr != addr || - ire->ire_ipif->ipif_ill->ill_group == illgrp) { - break; - } - ire = ire->ire_next; - } - match_flags = PHYI_FAILED | PHYI_INACTIVE; - start_ire = ire; -redo: - while (ire != NULL && ire->ire_addr == addr && - ire->ire_ipif->ipif_ill->ill_group == illgrp) { - /* - * The first ire for any address within a group - * should always be the one with IRE_MARK_NORECV cleared - * so that ip_wput_ire can avoid searching for one. - * Note down the insertion point which will be used - * later. - */ - if (first && (*pirep == NULL)) - *pirep = ire->ire_ptpn; - /* - * PHYI_FAILED is set when the interface fails. 
- * This interface might have become good, but the - * daemon has not yet detected. We should still - * not receive on this. PHYI_OFFLINE should never - * be picked as this has been offlined and soon - * be removed. - */ - phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags; - if (phyi_flags & PHYI_OFFLINE) { - ire->ire_marks |= IRE_MARK_NORECV; - ire = ire->ire_next; - continue; - } - if (phyi_flags & match_flags) { - ire->ire_marks |= IRE_MARK_NORECV; - ire = ire->ire_next; - if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) == - PHYI_INACTIVE) { - fallback = B_TRUE; - } - continue; - } - if (first) { - /* - * We will move this to the front of the list later - * on. - */ - clear_ire = ire; - ire->ire_marks &= ~IRE_MARK_NORECV; - } else { - ire->ire_marks |= IRE_MARK_NORECV; - } - first = B_FALSE; - ire = ire->ire_next; - } - /* - * If we never nominated anybody, try nominating at least - * an INACTIVE, if we found one. Do it only once though. - */ - if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) && - fallback) { - match_flags = PHYI_FAILED; - ire = start_ire; - *pirep = NULL; - goto redo; - } - return (clear_ire); -} - -/* - * This function must be called only after the broadcast ires - * have been grouped together. For a given address addr, nominate - * only one of the ires whose interface is not FAILED or OFFLINE. - * - * This is also called when an ipif goes down, so that we can nominate - * a different ire with the same address for receiving. 
- */ -static void -ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr, ip_stack_t *ipst) -{ - irb_t *irb; - ire_t *ire; - ire_t *ire1; - ire_t *save_ire; - ire_t **irep = NULL; - ire_t *clear_ire = NULL; - ire_t *new_lb_ire; - ire_t *new_nlb_ire; - boolean_t new_lb_ire_used = B_FALSE; - boolean_t new_nlb_ire_used = B_FALSE; - boolean_t refrele_lb_ire = B_FALSE; - boolean_t refrele_nlb_ire = B_FALSE; - uint_t max_frag; - - ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES, - NULL, MATCH_IRE_TYPE, ipst); - /* - * We may not be able to find some ires if a previous - * ire_create failed. This happens when an ipif goes - * down and we are unable to create BROADCAST ires due - * to memory failure. Thus, we have to check for NULL - * below. This should handle the case for LOOPBACK, - * POINTOPOINT and interfaces with some POINTOPOINT - * logicals for which there are no BROADCAST ires. - */ - if (ire == NULL) - return; - /* - * Currently IRE_BROADCASTS are deleted when an ipif - * goes down which runs exclusively. Thus, setting - * IRE_MARK_RCVD should not race with ire_delete marking - * IRE_MARK_CONDEMNED. We grab the lock below just to - * be consistent with other parts of the code that walks - * a given bucket. - */ - save_ire = ire; - irb = ire->ire_bucket; - new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); - if (new_lb_ire == NULL) { - ire_refrele(ire); - return; - } - new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); - if (new_nlb_ire == NULL) { - ire_refrele(ire); - kmem_cache_free(ire_cache, new_lb_ire); - return; - } - IRB_REFHOLD(irb); - rw_enter(&irb->irb_lock, RW_WRITER); - clear_ire = irep_insert(illgrp, addr, ire, &irep); - - /* - * irep non-NULL indicates that we entered the while loop - * above. If clear_ire is at the insertion point, we don't - * have to do anything. clear_ire will be NULL if all the - * interfaces are failed. 
- * - * We cannot unlink and reinsert the ire at the right place - * in the list since there can be other walkers of this bucket. - * Instead we delete and recreate the ire - */ - if (clear_ire != NULL && irep != NULL && *irep != clear_ire) { - ire_t *clear_ire_stq = NULL; - ire_t *clr_ire = NULL; - ire_t *ire_next = NULL; - - if (clear_ire->ire_stq == NULL) - ire_next = clear_ire->ire_next; - - rw_exit(&irb->irb_lock); - - bzero(new_lb_ire, sizeof (ire_t)); - /* XXX We need a recovery strategy here. */ - if (ire_init(new_lb_ire, - (uchar_t *)&clear_ire->ire_addr, - (uchar_t *)&clear_ire->ire_mask, - (uchar_t *)&clear_ire->ire_src_addr, - (uchar_t *)&clear_ire->ire_gateway_addr, - &clear_ire->ire_max_frag, - NULL, /* let ire_nce_init derive the resolver info */ - clear_ire->ire_rfq, - clear_ire->ire_stq, - clear_ire->ire_type, - clear_ire->ire_ipif, - clear_ire->ire_cmask, - clear_ire->ire_phandle, - clear_ire->ire_ihandle, - clear_ire->ire_flags, - &clear_ire->ire_uinfo, - NULL, - NULL, - ipst) == NULL) - cmn_err(CE_PANIC, "ire_init() failed"); - - refrele_lb_ire = B_TRUE; - - if (ire_next != NULL && - ire_next->ire_stq != NULL && - ire_next->ire_addr == clear_ire->ire_addr && - ire_next->ire_ipif->ipif_ill == - clear_ire->ire_ipif->ipif_ill) { - clear_ire_stq = ire_next; - - bzero(new_nlb_ire, sizeof (ire_t)); - /* XXX We need a recovery strategy here. 
*/ - if (ire_init(new_nlb_ire, - (uchar_t *)&clear_ire_stq->ire_addr, - (uchar_t *)&clear_ire_stq->ire_mask, - (uchar_t *)&clear_ire_stq->ire_src_addr, - (uchar_t *)&clear_ire_stq->ire_gateway_addr, - &clear_ire_stq->ire_max_frag, - NULL, - clear_ire_stq->ire_rfq, - clear_ire_stq->ire_stq, - clear_ire_stq->ire_type, - clear_ire_stq->ire_ipif, - clear_ire_stq->ire_cmask, - clear_ire_stq->ire_phandle, - clear_ire_stq->ire_ihandle, - clear_ire_stq->ire_flags, - &clear_ire_stq->ire_uinfo, - NULL, - NULL, - ipst) == NULL) - cmn_err(CE_PANIC, "ire_init() failed"); - - refrele_nlb_ire = B_TRUE; - } - - rw_enter(&irb->irb_lock, RW_WRITER); - /* - * irb_lock was dropped across call to ire_init() due to - * lock ordering issue with ipst->ips_ndp{4,6}->ndp_g_lock - * mutex lock. Therefore irep could have changed. call - * irep_insert() to get the new insertion point (irep) and - * recheck all known conditions. - */ - irep = NULL; - clr_ire = irep_insert(illgrp, addr, save_ire, &irep); - if ((irep != NULL) && (*irep != clear_ire) && - (clr_ire == clear_ire)) { - if ((clear_ire_stq != NULL) && - (clr_ire->ire_next != clear_ire_stq)) - clear_ire_stq = NULL; - /* - * Delete the ire. We can't call ire_delete() since - * we are holding the bucket lock. We can't release the - * bucket lock since we can't allow irep to change. - * So just mark it CONDEMNED. - * The IRB_REFRELE will delete the ire from the list - * and do the refrele. - */ - clear_ire->ire_marks |= IRE_MARK_CONDEMNED; - irb->irb_marks |= IRB_MARK_CONDEMNED; - - if (clear_ire_stq != NULL && - clear_ire_stq->ire_nce != NULL) { - nce_fastpath_list_delete( - clear_ire_stq->ire_nce); - clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED; - } - - /* - * Also take care of otherfields like ib/ob pkt count - * etc. Need to dup them. 
- * ditto in ill_bcast_delete_and_add - */ - - /* Set the max_frag before adding the ire */ - max_frag = *new_lb_ire->ire_max_fragp; - new_lb_ire->ire_max_fragp = NULL; - new_lb_ire->ire_max_frag = max_frag; - - /* Add the new ire's. Insert at *irep */ - new_lb_ire->ire_bucket = clear_ire->ire_bucket; - ire1 = *irep; - if (ire1 != NULL) - ire1->ire_ptpn = &new_lb_ire->ire_next; - new_lb_ire->ire_next = ire1; - /* Link the new one in. */ - new_lb_ire->ire_ptpn = irep; - membar_producer(); - *irep = new_lb_ire; - new_lb_ire_used = B_TRUE; - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, - ire_stats_inserted); - new_lb_ire->ire_bucket->irb_ire_cnt++; - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), - new_lb_ire->ire_ipif, - (char *), "ire", (void *), new_lb_ire); - new_lb_ire->ire_ipif->ipif_ire_cnt++; - - if (clear_ire_stq != NULL) { - ill_t *ire_ill; - /* Set the max_frag before adding the ire */ - max_frag = *new_nlb_ire->ire_max_fragp; - new_nlb_ire->ire_max_fragp = NULL; - new_nlb_ire->ire_max_frag = max_frag; - - new_nlb_ire->ire_bucket = clear_ire->ire_bucket; - irep = &new_lb_ire->ire_next; - /* Add the new ire. Insert at *irep */ - ire1 = *irep; - if (ire1 != NULL) - ire1->ire_ptpn = &new_nlb_ire->ire_next; - new_nlb_ire->ire_next = ire1; - /* Link the new one in. 
*/ - new_nlb_ire->ire_ptpn = irep; - membar_producer(); - *irep = new_nlb_ire; - new_nlb_ire_used = B_TRUE; - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, - ire_stats_inserted); - new_nlb_ire->ire_bucket->irb_ire_cnt++; - DTRACE_PROBE3(ipif__incr__cnt, - (ipif_t *), new_nlb_ire->ire_ipif, - (char *), "ire", (void *), new_nlb_ire); - new_nlb_ire->ire_ipif->ipif_ire_cnt++; - DTRACE_PROBE3(ill__incr__cnt, - (ill_t *), new_nlb_ire->ire_stq->q_ptr, - (char *), "ire", (void *), new_nlb_ire); - ire_ill = (ill_t *)new_nlb_ire->ire_stq->q_ptr; - ire_ill->ill_ire_cnt++; - } - } - } - ire_refrele(save_ire); - rw_exit(&irb->irb_lock); - /* - * Since we dropped the irb_lock across call to ire_init() - * and rechecking known conditions, it is possible that - * the checks might fail, therefore undo the work done by - * ire_init() by calling ire_refrele() on the newly created ire. - */ - if (!new_lb_ire_used) { - if (refrele_lb_ire) { - ire_refrele(new_lb_ire); - } else { - kmem_cache_free(ire_cache, new_lb_ire); - } - } - if (!new_nlb_ire_used) { - if (refrele_nlb_ire) { - ire_refrele(new_nlb_ire); - } else { - kmem_cache_free(ire_cache, new_nlb_ire); - } - } - IRB_REFRELE(irb); -} - -/* - * Whenever an ipif goes down we have to renominate a different - * broadcast ire to receive. Whenever an ipif comes up, we need - * to make sure that we have only one nominated to receive. - */ -static void -ipif_renominate_bcast(ipif_t *ipif) -{ - ill_t *ill = ipif->ipif_ill; - ipaddr_t subnet_addr; - ipaddr_t net_addr; - ipaddr_t net_mask = 0; - ipaddr_t subnet_netmask; - ipaddr_t addr; - ill_group_t *illgrp; - ip_stack_t *ipst = ill->ill_ipst; - - illgrp = ill->ill_group; - /* - * If this is the last ipif going down, it might take - * the ill out of the group. In that case ipif_down -> - * illgrp_delete takes care of doing the nomination. - * ipif_down does not call for this case. 
- */ - ASSERT(illgrp != NULL); - - /* There could not have been any ires associated with this */ - if (ipif->ipif_subnet == 0) - return; - - ill_mark_bcast(illgrp, 0, ipst); - ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst); - - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - ill_mark_bcast(illgrp, addr, ipst); - - net_addr = ~net_mask | addr; - ill_mark_bcast(illgrp, net_addr, ipst); - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - ill_mark_bcast(illgrp, addr, ipst); - - subnet_addr = ~subnet_netmask | addr; - ill_mark_bcast(illgrp, subnet_addr, ipst); -} - -/* - * Whenever we form or delete ill groups, we need to nominate one set of - * BROADCAST ires for receiving in the group. - * - * 1) When ipif_up_done -> ilgrp_insert calls this function, BROADCAST ires - * have been added, but ill_ipif_up_count is 0. Thus, we don't assert - * for ill_ipif_up_count to be non-zero. This is the only case where - * ill_ipif_up_count is zero and we would still find the ires. - * - * 2) ip_sioctl_group_name/ifgrp_insert calls this function, at least one - * ipif is UP and we just have to do the nomination. - * - * 3) When ill_handoff_responsibility calls us, some ill has been removed - * from the group. So, we have to do the nomination. - * - * Because of (3), there could be just one ill in the group. But we have - * to nominate still as IRE_MARK_NORCV may have been marked on this. - * Thus, this function does not optimize when there is only one ill as - * it is not correct for (3). 
- */ -static void -ill_nominate_bcast_rcv(ill_group_t *illgrp) -{ - ill_t *ill; - ipif_t *ipif; - ipaddr_t subnet_addr; - ipaddr_t prev_subnet_addr = 0; - ipaddr_t net_addr; - ipaddr_t prev_net_addr = 0; - ipaddr_t net_mask = 0; - ipaddr_t subnet_netmask; - ipaddr_t addr; - ip_stack_t *ipst; - - /* - * When the last memeber is leaving, there is nothing to - * nominate. - */ - if (illgrp->illgrp_ill_count == 0) { - ASSERT(illgrp->illgrp_ill == NULL); - return; - } - - ill = illgrp->illgrp_ill; - ASSERT(!ill->ill_isv6); - ipst = ill->ill_ipst; - /* - * We assume that ires with same address and belonging to the - * same group, has been grouped together. Nominating a *single* - * ill in the group for sending and receiving broadcast is done - * by making sure that the first BROADCAST ire (which will be - * the one returned by ire_ctable_lookup for ip_rput and the - * one that will be used in ip_wput_ire) will be the one that - * will not have IRE_MARK_NORECV set. - * - * 1) ip_rput checks and discards packets received on ires marked - * with IRE_MARK_NORECV. Thus, we don't send up duplicate - * broadcast packets. We need to clear IRE_MARK_NORECV on the - * first ire in the group for every broadcast address in the group. - * ip_rput will accept packets only on the first ire i.e only - * one copy of the ill. - * - * 2) ip_wput_ire needs to send out just one copy of the broadcast - * packet for the whole group. It needs to send out on the ill - * whose ire has not been marked with IRE_MARK_NORECV. If it sends - * on the one marked with IRE_MARK_NORECV, ip_rput will accept - * the copy echoed back on other port where the ire is not marked - * with IRE_MARK_NORECV. - * - * Note that we just need to have the first IRE either loopback or - * non-loopback (either of them may not exist if ire_create failed - * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will - * always hit the first one and hence will always accept one copy. 
- * - * We have a broadcast ire per ill for all the unique prefixes - * hosted on that ill. As we don't have a way of knowing the - * unique prefixes on a given ill and hence in the whole group, - * we just call ill_mark_bcast on all the prefixes that exist - * in the group. For the common case of one prefix, the code - * below optimizes by remebering the last address used for - * markng. In the case of multiple prefixes, this will still - * optimize depending the order of prefixes. - * - * The only unique address across the whole group is 0.0.0.0 and - * 255.255.255.255 and thus we call only once. ill_mark_bcast enables - * the first ire in the bucket for receiving and disables the - * others. - */ - ill_mark_bcast(illgrp, 0, ipst); - ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst); - for (; ill != NULL; ill = ill->ill_group_next) { - - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (!(ipif->ipif_flags & IPIF_UP) || - ipif->ipif_subnet == 0) { - continue; - } - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - if (prev_net_addr == 0 || prev_net_addr != addr) { - ill_mark_bcast(illgrp, addr, ipst); - net_addr = ~net_mask | addr; - ill_mark_bcast(illgrp, net_addr, ipst); - } - prev_net_addr = addr; - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - if (prev_subnet_addr == 0 || - prev_subnet_addr != addr) { - ill_mark_bcast(illgrp, addr, ipst); - subnet_addr = ~subnet_netmask | addr; - ill_mark_bcast(illgrp, subnet_addr, ipst); - } - prev_subnet_addr = addr; - } - } -} - -/* - * This function is called while forming ill groups. - * - * Currently, we handle only allmulti groups. We want to join - * allmulti on only one of the ills in the groups. 
In future, - * when we have link aggregation, we may have to join normal - * multicast groups on multiple ills as switch does inbound load - * balancing. Following are the functions that calls this - * function : - * - * 1) ill_recover_multicast : Interface is coming back UP. - * When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6 - * will call ill_recover_multicast to recover all the multicast - * groups. We need to make sure that only one member is joined - * in the ill group. - * - * 2) ip_addmulti/ip_addmulti_v6 : ill groups has already been formed. - * Somebody is joining allmulti. We need to make sure that only one - * member is joined in the group. - * - * 3) illgrp_insert : If allmulti has already joined, we need to make - * sure that only one member is joined in the group. - * - * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving - * allmulti who we have nominated. We need to pick someother ill. - * - * 5) illgrp_delete : The ill we nominated is leaving the group, - * we need to pick a new ill to join the group. - * - * For (1), (2), (5) - we just have to check whether there is - * a good ill joined in the group. If we could not find any ills - * joined the group, we should join. - * - * For (4), the one that was nominated to receive, left the group. - * There could be nobody joined in the group when this function is - * called. - * - * For (3) - we need to explicitly check whether there are multiple - * ills joined in the group. - * - * For simplicity, we don't differentiate any of the above cases. We - * just leave the group if it is joined on any of them and join on - * the first good ill. - */ -int -ill_nominate_mcast_rcv(ill_group_t *illgrp) -{ - ilm_t *ilm; - ill_t *ill; - ill_t *fallback_inactive_ill = NULL; - ill_t *fallback_failed_ill = NULL; - int ret = 0; - - /* - * Leave the allmulti on all the ills and start fresh. 
- */ - for (ill = illgrp->illgrp_ill; ill != NULL; - ill = ill->ill_group_next) { - if (ill->ill_join_allmulti) - ill_leave_allmulti(ill); - } - - /* - * Choose a good ill. Fallback to inactive or failed if - * none available. We need to fallback to FAILED in the - * case where we have 2 interfaces in a group - where - * one of them is failed and another is a good one and - * the good one (not marked inactive) is leaving the group. - */ - for (ill = illgrp->illgrp_ill; ill != NULL; ill = ill->ill_group_next) { - if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE) - continue; - if (ill->ill_phyint->phyint_flags & PHYI_FAILED) { - fallback_failed_ill = ill; - continue; - } - if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) { - fallback_inactive_ill = ill; - continue; - } - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - ret = ill_join_allmulti(ill); - /* - * ill_join_allmulti() can fail because of - * memory failures so make sure we join at - * least on one ill. - */ - if (ill->ill_join_allmulti) - return (0); - } - } - } - if (ret != 0) { - /* - * If we tried nominating above and failed to do so, - * return error. We might have tried multiple times. - * But, return the latest error. - */ - return (ret); - } - if ((ill = fallback_inactive_ill) != NULL) { - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) - return (ill_join_allmulti(ill)); - } - } else if ((ill = fallback_failed_ill) != NULL) { - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) - return (ill_join_allmulti(ill)); - } - } - return (0); -} - -/* - * This function is called from illgrp_delete after it is - * deleted from the group to reschedule responsibilities - * to a different ill. 
- */ -static void -ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp) -{ - ilm_t *ilm; - ipif_t *ipif; - ipaddr_t subnet_addr; - ipaddr_t net_addr; - ipaddr_t net_mask = 0; - ipaddr_t subnet_netmask; - ipaddr_t addr; - ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(ill->ill_group == NULL); - /* - * Broadcast Responsibility: - * - * 1. If this ill has been nominated for receiving broadcast - * packets, we need to find a new one. Before we find a new - * one, we need to re-group the ires that are part of this new - * group (assumed by ill_nominate_bcast_rcv). We do this by - * calling ill_group_bcast_for_xmit(ill) which will do the right - * thing for us. - * - * 2. If this ill was not nominated for receiving broadcast - * packets, we need to clear the IRE_MARK_NORECV flag - * so that we continue to send up broadcast packets. - */ - if (!ill->ill_isv6) { - /* - * Case 1 above : No optimization here. Just redo the - * nomination. - */ - ill_group_bcast_for_xmit(ill); - ill_nominate_bcast_rcv(illgrp); - - /* - * Case 2 above : Lookup and clear IRE_MARK_NORECV. - */ - ill_clear_bcast_mark(ill, 0); - ill_clear_bcast_mark(ill, INADDR_BROADCAST); - - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (!(ipif->ipif_flags & IPIF_UP) || - ipif->ipif_subnet == 0) { - continue; - } - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - ill_clear_bcast_mark(ill, addr); - - net_addr = ~net_mask | addr; - ill_clear_bcast_mark(ill, net_addr); - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - ill_clear_bcast_mark(ill, addr); - - subnet_addr = ~subnet_netmask | addr; - ill_clear_bcast_mark(ill, subnet_addr); - } - } - - /* - * Multicast Responsibility. - * - * If we have joined allmulti on this one, find a new member - * in the group to join allmulti. 
As this ill is already part - * of allmulti, we don't have to join on this one. - * - * If we have not joined allmulti on this one, there is no - * responsibility to handoff. But we need to take new - * responsibility i.e, join allmulti on this one if we need - * to. - */ - if (ill->ill_join_allmulti) { - (void) ill_nominate_mcast_rcv(illgrp); - } else { - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - (void) ill_join_allmulti(ill); - break; - } - } - } - - /* - * We intentionally do the flushing of IRE_CACHES only matching - * on the ill and not on groups. Note that we are already deleted - * from the group. - * - * This will make sure that all IRE_CACHES whose stq is pointing - * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get - * deleted and IRE_CACHES that are not pointing at this ill will - * be left alone. - */ - ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, - illgrp_cache_delete, ill, ill); - - /* - * Some conn may have cached one of the IREs deleted above. By removing - * the ire reference, we clean up the extra reference to the ill held in - * ire->ire_stq. - */ - ipcl_walk(conn_cleanup_stale_ire, NULL, ipst); - - /* - * Re-do source address selection for all the members in the - * group, if they borrowed source address from one of the ipifs - * in this ill. - */ - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ill->ill_isv6) { - ipif_update_other_ipifs_v6(ipif, illgrp); - } else { - ipif_update_other_ipifs(ipif, illgrp); - } + if (ill->ill_isv6) + ipif_recreate_interface_routes_v6(NULL, ipif); + else + ipif_recreate_interface_routes(NULL, ipif); } } /* - * Delete the ill from the group. The caller makes sure that it is - * in a group and it okay to delete from the group. So, we always - * delete here. + * Finish the group join started in ip_sioctl_groupname(). 
*/ +/* ARGSUSED */ static void -illgrp_delete(ill_t *ill) -{ - ill_group_t *illgrp; - ill_group_t *tmpg; - ill_t *tmp_ill; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * Reset illgrp_ill_schednext if it was pointing at us. - * We need to do this before we set ill_group to NULL. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - mutex_enter(&ill->ill_lock); - - illgrp_reset_schednext(ill); - - illgrp = ill->ill_group; - - /* Delete the ill from illgrp. */ - if (illgrp->illgrp_ill == ill) { - illgrp->illgrp_ill = ill->ill_group_next; - } else { - tmp_ill = illgrp->illgrp_ill; - while (tmp_ill->ill_group_next != ill) { - tmp_ill = tmp_ill->ill_group_next; - ASSERT(tmp_ill != NULL); - } - tmp_ill->ill_group_next = ill->ill_group_next; - } - ill->ill_group = NULL; - ill->ill_group_next = NULL; - - illgrp->illgrp_ill_count--; - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - - /* - * As this ill is leaving the group, we need to hand off - * the responsibilities to the other ills in the group, if - * this ill had some responsibilities. 
- */ - - ill_handoff_responsibility(ill, illgrp); - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - - if (illgrp->illgrp_ill_count == 0) { - - ASSERT(illgrp->illgrp_ill == NULL); - if (ill->ill_isv6) { - if (illgrp == ipst->ips_illgrp_head_v6) { - ipst->ips_illgrp_head_v6 = illgrp->illgrp_next; - } else { - tmpg = ipst->ips_illgrp_head_v6; - while (tmpg->illgrp_next != illgrp) { - tmpg = tmpg->illgrp_next; - ASSERT(tmpg != NULL); - } - tmpg->illgrp_next = illgrp->illgrp_next; - } - } else { - if (illgrp == ipst->ips_illgrp_head_v4) { - ipst->ips_illgrp_head_v4 = illgrp->illgrp_next; - } else { - tmpg = ipst->ips_illgrp_head_v4; - while (tmpg->illgrp_next != illgrp) { - tmpg = tmpg->illgrp_next; - ASSERT(tmpg != NULL); - } - tmpg->illgrp_next = illgrp->illgrp_next; - } - } - mutex_destroy(&illgrp->illgrp_lock); - mi_free(illgrp); - } - rw_exit(&ipst->ips_ill_g_lock); - - /* - * Even though the ill is out of the group its not necessary - * to set ipsq_split as TRUE as the ipifs could be down temporarily - * We will split the ipsq when phyint_groupname is set to NULL. - */ - - /* - * Send a routing sockets message if we are deleting from - * groups with names. - */ - if (ill->ill_phyint->phyint_groupname_len != 0) - ip_rts_ifmsg(ill->ill_ipif); -} - -/* - * Re-do source address selection. This is normally called when - * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST - * ipif comes up. - */ -void -ill_update_source_selection(ill_t *ill) +ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) { - ipif_t *ipif; - - ASSERT(IAM_WRITER_ILL(ill)); - - if (ill->ill_group != NULL) - ill = ill->ill_group->illgrp_ill; - - for (; ill != NULL; ill = ill->ill_group_next) { - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ill->ill_isv6) - ipif_recreate_interface_routes_v6(NULL, ipif); - else - ipif_recreate_interface_routes(NULL, ipif); - } - } -} - -/* - * Insert ill in a group headed by illgrp_head. 
The caller can either - * pass a groupname in which case we search for a group with the - * same name to insert in or pass a group to insert in. This function - * would only search groups with names. - * - * NOTE : The caller should make sure that there is at least one ipif - * UP on this ill so that illgrp_scheduler can pick this ill - * for outbound packets. If ill_ipif_up_count is zero, we have - * already sent a DL_UNBIND to the driver and we don't want to - * send anymore packets. We don't assert for ipif_up_count - * to be greater than zero, because ipif_up_done wants to call - * this function before bumping up the ipif_up_count. See - * ipif_up_done() for details. - */ -int -illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname, - ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up) -{ - ill_group_t *illgrp; - ill_t *prev_ill; - phyint_t *phyi; + ill_t *ill = q->q_ptr; + phyint_t *phyi = ill->ill_phyint; + ipmp_grp_t *grp = phyi->phyint_grp; ip_stack_t *ipst = ill->ill_ipst; - ASSERT(ill->ill_group == NULL); - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - mutex_enter(&ill->ill_lock); - - if (groupname != NULL) { - /* - * Look for a group with a matching groupname to insert. - */ - for (illgrp = *illgrp_head; illgrp != NULL; - illgrp = illgrp->illgrp_next) { - - ill_t *tmp_ill; - - /* - * If we have an ill_group_t in the list which has - * no ill_t assigned then we must be in the process of - * removing this group. We skip this as illgrp_delete() - * will remove it from the list. - */ - if ((tmp_ill = illgrp->illgrp_ill) == NULL) { - ASSERT(illgrp->illgrp_ill_count == 0); - continue; - } - - ASSERT(tmp_ill->ill_phyint != NULL); - phyi = tmp_ill->ill_phyint; - /* - * Look at groups which has names only. - */ - if (phyi->phyint_groupname_len == 0) - continue; - /* - * Names are stored in the phyint common to both - * IPv4 and IPv6. 
- */ - if (mi_strcmp(phyi->phyint_groupname, - groupname) == 0) { - break; - } - } - } else { - /* - * If the caller passes in a NULL "grp_to_insert", we - * allocate one below and insert this singleton. - */ - illgrp = grp_to_insert; - } - - ill->ill_group_next = NULL; - - if (illgrp == NULL) { - illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t)); - if (illgrp == NULL) { - return (ENOMEM); - } - illgrp->illgrp_next = *illgrp_head; - *illgrp_head = illgrp; - illgrp->illgrp_ill = ill; - illgrp->illgrp_ill_count = 1; - ill->ill_group = illgrp; - /* - * Used in illgrp_scheduler to protect multiple threads - * from traversing the list. - */ - mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0); - } else { - ASSERT(ill->ill_net_type == - illgrp->illgrp_ill->ill_net_type); - ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type); - - /* Insert ill at tail of this group */ - prev_ill = illgrp->illgrp_ill; - while (prev_ill->ill_group_next != NULL) - prev_ill = prev_ill->ill_group_next; - prev_ill->ill_group_next = ill; - ill->ill_group = illgrp; - illgrp->illgrp_ill_count++; - /* - * Inherit group properties. Currently only forwarding - * is the property we try to keep the same with all the - * ills. When there are more, we will abstract this into - * a function. - */ - ill->ill_flags &= ~ILLF_ROUTER; - ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER); - } - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - - /* - * 1) When ipif_up_done() calls this function, ipif_up_count - * may be zero as it has not yet been bumped. But the ires - * have already been added. So, we do the nomination here - * itself. But, when ip_sioctl_groupname calls this, it checks - * for ill_ipif_up_count != 0. Thus we don't check for - * ill_ipif_up_count here while nominating broadcast ires for - * receive. 
- * - * 2) Similarly, we need to call ill_group_bcast_for_xmit here - * to group them properly as ire_add() has already happened - * in the ipif_up_done() case. For ip_sioctl_groupname/ifgrp_insert - * case, we need to do it here anyway. - */ - if (!ill->ill_isv6) { - ill_group_bcast_for_xmit(ill); - ill_nominate_bcast_rcv(illgrp); - } - - if (!ipif_is_coming_up) { - /* - * When ipif_up_done() calls this function, the multicast - * groups have not been joined yet. So, there is no point in - * nomination. ill_join_allmulti() will handle groups when - * ill_recover_multicast() is called from ipif_up_done() later. - */ - (void) ill_nominate_mcast_rcv(illgrp); - /* - * ipif_up_done calls ill_update_source_selection - * anyway. Moreover, we don't want to re-create - * interface routes while ipif_up_done() still has reference - * to them. Refer to ipif_up_done() for more details. - */ - ill_update_source_selection(ill); - } - - /* - * Send a routing sockets message if we are inserting into - * groups with names. - */ - if (groupname != NULL) - ip_rts_ifmsg(ill->ill_ipif); - return (0); -} - -/* - * Return the first phyint matching the groupname. There could - * be more than one when there are ill groups. - * - * If 'usable' is set, then we exclude ones that are marked with any of - * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE). - * Needs work: called only from ip_sioctl_groupname and from the ipmp/netinfo - * emulation of ipmp. - */ -phyint_t * -phyint_lookup_group(char *groupname, boolean_t usable, ip_stack_t *ipst) -{ - phyint_t *phyi; - - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - /* - * Group names are stored in the phyint - a common structure - * to both IPv4 and IPv6. 
- */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - if (phyi->phyint_groupname_len == 0) - continue; - /* - * Skip the ones that should not be used since the callers - * sometime use this for sending packets. - */ - if (usable && (phyi->phyint_flags & - (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE))) - continue; + /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */ + ASSERT(!IS_IPMP(ill) && grp != NULL); + ASSERT(IAM_WRITER_IPSQ(ipsq)); - ASSERT(phyi->phyint_groupname != NULL); - if (mi_strcmp(groupname, phyi->phyint_groupname) == 0) - return (phyi); + if (phyi->phyint_illv4 != NULL) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + VERIFY(grp->gr_pendv4-- > 0); + rw_exit(&ipst->ips_ipmp_lock); + ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4); } - return (NULL); -} - - -/* - * Return the first usable phyint matching the group index. By 'usable' - * we exclude ones that are marked ununsable with any of - * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE). - * - * Used only for the ipmp/netinfo emulation of ipmp. - */ -phyint_t * -phyint_lookup_group_ifindex(uint_t group_ifindex, ip_stack_t *ipst) -{ - phyint_t *phyi; - - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - - if (!ipst->ips_ipmp_hook_emulation) - return (NULL); - - /* - * Group indicies are stored in the phyint - a common structure - * to both IPv4 and IPv6. - */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - /* Ignore the ones that do not have a group */ - if (phyi->phyint_groupname_len == 0) - continue; - - ASSERT(phyi->phyint_group_ifindex != 0); - /* - * Skip the ones that should not be used since the callers - * sometime use this for sending packets. 
- */ - if (phyi->phyint_flags & - (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE)) - continue; - if (phyi->phyint_group_ifindex == group_ifindex) - return (phyi); + if (phyi->phyint_illv6 != NULL) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + VERIFY(grp->gr_pendv6-- > 0); + rw_exit(&ipst->ips_ipmp_lock); + ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6); } - return (NULL); + freemsg(mp); } /* - * MT notes on creation and deletion of IPMP groups - * - * Creation and deletion of IPMP groups introduce the need to merge or - * split the associated serialization objects i.e the ipsq's. Normally all - * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled - * an ill-pair(v4, v6) i.e. phyint would map to a single ipsq. However during - * the execution of the SIOCSLIFGROUPNAME command the picture changes. There - * is a need to change the <ill-ipsq> association and we have to operate on both - * the source and destination IPMP groups. For eg. attempting to set the - * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to - * handle 2 IPMP groups and 2 ipsqs. All the ills belonging to either of the - * source or destination IPMP group are mapped to a single ipsq for executing - * the SIOCSLIFGROUPNAME command. This is termed as a merge of the ipsq's. - * The <ill-ipsq> mapping is restored back to normal at a later point. This is - * termed as a split of the ipsq. The converse of the merge i.e. a split of the - * ipsq happens while unwinding from ipsq_exit. If at least 1 set groupname - * occurred on the ipsq, then the ipsq_split flag is set. This indicates the - * ipsq has to be examined for redoing the <ill-ipsq> associations. - * - * In the above example the ioctl handling code locates the current ipsq of hme0 - * which is ipsq(mpk17-84). It then enters the above ipsq immediately or - * eventually (after queueing the ioctl in ipsq(mpk17-84)). 
Then it locates - * the destination ipsq which is ipsq(mpk17-85) and merges the source ipsq into - * the destination ipsq. If the destination ipsq is not busy, it also enters - * the destination ipsq exclusively. Now the actual groupname setting operation - * can proceed. If the destination ipsq is busy, the operation is enqueued - * on the destination (merged) ipsq and will be handled in the unwind from - * ipsq_exit. - * - * To prevent other threads accessing the ill while the group name change is - * in progres, we bring down the ipifs which also removes the ill from the - * group. The group is changed in phyint and when the first ipif on the ill - * is brought up, the ill is inserted into the right IPMP group by - * illgrp_insert. + * Process an SIOCSLIFGROUPNAME request. */ /* ARGSUSED */ int ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) { - int i; - char *tmp; - int namelen; - ill_t *ill = ipif->ipif_ill; - ill_t *ill_v4, *ill_v6; - int err = 0; - phyint_t *phyi; - phyint_t *phyi_tmp; - struct lifreq *lifr; - mblk_t *mp1; - char *groupname; - ipsq_t *ipsq; + struct lifreq *lifr = ifreq; + ill_t *ill = ipif->ipif_ill; ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(IAM_WRITER_IPIF(ipif)); - - /* Existance verified in ip_wput_nondata */ - mp1 = mp->b_cont->b_cont; - lifr = (struct lifreq *)mp1->b_rptr; - groupname = lifr->lifr_groupname; - - if (ipif->ipif_id != 0) - return (EINVAL); - - phyi = ill->ill_phyint; - ASSERT(phyi != NULL); - - if (phyi->phyint_flags & PHYI_VIRTUAL) - return (EINVAL); - - tmp = groupname; - for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++) - ; - - if (i == LIFNAMSIZ) { - /* no null termination */ - return (EINVAL); - } + phyint_t *phyi = ill->ill_phyint; + ipmp_grp_t *grp = phyi->phyint_grp; + mblk_t *ipsq_mp; + int err = 0; /* - * Calculate the namelen exclusive of the null - * termination character. + * Note that phyint_grp can only change here, where we're exclusive. 
*/ - namelen = tmp - groupname; - - ill_v4 = phyi->phyint_illv4; - ill_v6 = phyi->phyint_illv6; + ASSERT(IAM_WRITER_ILL(ill)); - /* - * ILL cannot be part of a usesrc group and and IPMP group at the - * same time. No need to grab the ill_g_usesrc_lock here, see - * synchronization notes in ip.c - */ - if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) { + if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL || + (phyi->phyint_flags & PHYI_VIRTUAL)) return (EINVAL); - } - - /* - * mark the ill as changing. - * this should queue all new requests on the syncq. - */ - GRAB_ILL_LOCKS(ill_v4, ill_v6); - - if (ill_v4 != NULL) - ill_v4->ill_state_flags |= ILL_CHANGING; - if (ill_v6 != NULL) - ill_v6->ill_state_flags |= ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - - if (namelen == 0) { - /* - * Null string means remove this interface from the - * existing group. - */ - if (phyi->phyint_groupname_len == 0) { - /* - * Never was in a group. - */ - err = 0; - goto done; - } - - /* - * IPv4 or IPv6 may be temporarily out of the group when all - * the ipifs are down. Thus, we need to check for ill_group to - * be non-NULL. 
- */ - if (ill_v4 != NULL && ill_v4->ill_group != NULL) { - ill_down_ipifs(ill_v4, mp, 0, B_FALSE); - mutex_enter(&ill_v4->ill_lock); - if (!ill_is_quiescent(ill_v4)) { - /* - * ipsq_pending_mp_add will not fail since - * connp is NULL - */ - (void) ipsq_pending_mp_add(NULL, - ill_v4->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v4->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v4->ill_lock); - } - - if (ill_v6 != NULL && ill_v6->ill_group != NULL) { - ill_down_ipifs(ill_v6, mp, 0, B_FALSE); - mutex_enter(&ill_v6->ill_lock); - if (!ill_is_quiescent(ill_v6)) { - (void) ipsq_pending_mp_add(NULL, - ill_v6->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v6->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v6->ill_lock); - } - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - GRAB_ILL_LOCKS(ill_v4, ill_v6); - mutex_enter(&phyi->phyint_lock); - ASSERT(phyi->phyint_groupname != NULL); - mi_free(phyi->phyint_groupname); - phyi->phyint_groupname = NULL; - phyi->phyint_groupname_len = 0; - - /* Restore the ifindex used to be the per interface one */ - phyi->phyint_group_ifindex = 0; - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - mutex_exit(&phyi->phyint_lock); - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - rw_exit(&ipst->ips_ill_g_lock); - err = ill_up_ipifs(ill, q, mp); - /* - * set the split flag so that the ipsq can be split - */ - mutex_enter(&phyi->phyint_ipsq->ipsq_lock); - phyi->phyint_ipsq->ipsq_split = B_TRUE; - mutex_exit(&phyi->phyint_ipsq->ipsq_lock); + lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0'; - } else { - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - /* Are we inserting in the same group ? */ - if (mi_strcmp(groupname, - phyi->phyint_groupname) == 0) { - err = 0; - goto done; - } - } + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - /* - * Merge ipsq for the group's. 
- * This check is here as multiple groups/ills might be - * sharing the same ipsq. - * If we have to merege than the operation is restarted - * on the new ipsq. - */ - ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL, ipst); - if (phyi->phyint_ipsq != ipsq) { - rw_exit(&ipst->ips_ill_g_lock); - err = ill_merge_groups(ill, NULL, groupname, mp, q); - goto done; - } - /* - * Running exclusive on new ipsq. - */ - - ASSERT(ipsq != NULL); - ASSERT(ipsq->ipsq_writer == curthread); - - /* - * Check whether the ill_type and ill_net_type matches before - * we allocate any memory so that the cleanup is easier. - * - * We can't group dissimilar ones as we can't load spread - * packets across the group because of potential link-level - * header differences. - */ - phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst); - if (phyi_tmp != NULL) { - if ((ill_v4 != NULL && - phyi_tmp->phyint_illv4 != NULL) && - ((ill_v4->ill_net_type != - phyi_tmp->phyint_illv4->ill_net_type) || - (ill_v4->ill_type != - phyi_tmp->phyint_illv4->ill_type))) { - mutex_enter(&phyi->phyint_ipsq->ipsq_lock); - phyi->phyint_ipsq->ipsq_split = B_TRUE; - mutex_exit(&phyi->phyint_ipsq->ipsq_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (EINVAL); - } - if ((ill_v6 != NULL && - phyi_tmp->phyint_illv6 != NULL) && - ((ill_v6->ill_net_type != - phyi_tmp->phyint_illv6->ill_net_type) || - (ill_v6->ill_type != - phyi_tmp->phyint_illv6->ill_type))) { - mutex_enter(&phyi->phyint_ipsq->ipsq_lock); - phyi->phyint_ipsq->ipsq_split = B_TRUE; - mutex_exit(&phyi->phyint_ipsq->ipsq_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (EINVAL); - } - } - - rw_exit(&ipst->ips_ill_g_lock); - - /* - * bring down all v4 ipifs. - */ - if (ill_v4 != NULL) { - ill_down_ipifs(ill_v4, mp, 0, B_FALSE); - } - - /* - * bring down all v6 ipifs. - */ - if (ill_v6 != NULL) { - ill_down_ipifs(ill_v6, mp, 0, B_FALSE); - } - - /* - * make sure all ipifs are down and there are no active - * references. 
Call to ipsq_pending_mp_add will not fail - * since connp is NULL. - */ - if (ill_v4 != NULL) { - mutex_enter(&ill_v4->ill_lock); - if (!ill_is_quiescent(ill_v4)) { - (void) ipsq_pending_mp_add(NULL, - ill_v4->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v4->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v4->ill_lock); - } - - if (ill_v6 != NULL) { - mutex_enter(&ill_v6->ill_lock); - if (!ill_is_quiescent(ill_v6)) { - (void) ipsq_pending_mp_add(NULL, - ill_v6->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v6->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v6->ill_lock); - } - - /* - * allocate including space for null terminator - * before we insert. - */ - tmp = (char *)mi_alloc(namelen + 1, BPRI_MED); - if (tmp == NULL) - return (ENOMEM); - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - GRAB_ILL_LOCKS(ill_v4, ill_v6); - mutex_enter(&phyi->phyint_lock); - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - mi_free(phyi->phyint_groupname); - } - - /* - * setup the new group name. - */ - phyi->phyint_groupname = tmp; - bcopy(groupname, phyi->phyint_groupname, namelen + 1); - phyi->phyint_groupname_len = namelen + 1; - - if (ipst->ips_ipmp_hook_emulation) { - /* - * If the group already exists we use the existing - * group_ifindex, otherwise we pick a new index here. - */ - if (phyi_tmp != NULL) { - phyi->phyint_group_ifindex = - phyi_tmp->phyint_group_ifindex; - } else { - /* XXX We need a recovery strategy here. */ - if (!ip_assign_ifindex( - &phyi->phyint_group_ifindex, ipst)) - cmn_err(CE_PANIC, - "ip_assign_ifindex() failed"); - } - } - /* - * Select whether the netinfo and hook use the per-interface - * or per-group ifindex. 
- */ - if (ipst->ips_ipmp_hook_emulation) - phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex; - else - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - - if (ipst->ips_ipmp_hook_emulation && - phyi_tmp != NULL) { - /* First phyint in group - group PLUMB event */ - ill_nic_event_plumb(ill, B_TRUE); - } - mutex_exit(&phyi->phyint_lock); - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - rw_exit(&ipst->ips_ill_g_lock); - - err = ill_up_ipifs(ill, q, mp); - } - -done: /* - * normally ILL_CHANGING is cleared in ill_up_ipifs. + * If the name hasn't changed, there's nothing to do. */ - if (err != EINPROGRESS) { - GRAB_ILL_LOCKS(ill_v4, ill_v6); - if (ill_v4 != NULL) - ill_v4->ill_state_flags &= ~ILL_CHANGING; - if (ill_v6 != NULL) - ill_v6->ill_state_flags &= ~ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - } - return (err); -} + if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0) + goto unlock; -/* ARGSUSED */ -int -ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, - mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) -{ - ill_t *ill; - phyint_t *phyi; - struct lifreq *lifr; - mblk_t *mp1; - - /* Existence verified in ip_wput_nondata */ - mp1 = mp->b_cont->b_cont; - lifr = (struct lifreq *)mp1->b_rptr; - ill = ipif->ipif_ill; - phyi = ill->ill_phyint; - - lifr->lifr_groupname[0] = '\0'; /* - * ill_group may be null if all the interfaces - * are down. But still, the phyint should always - * hold the name. - */ - if (phyi->phyint_groupname_len != 0) { - bcopy(phyi->phyint_groupname, lifr->lifr_groupname, - phyi->phyint_groupname_len); - } - - return (0); -} - - -typedef struct conn_move_s { - ill_t *cm_from_ill; - ill_t *cm_to_ill; - int cm_ifindex; -} conn_move_t; - -/* - * ipcl_walk function for moving conn_multicast_ill for a given ill. 
- */ -static void -conn_move(conn_t *connp, caddr_t arg) -{ - conn_move_t *connm; - int ifindex; - int i; - ill_t *from_ill; - ill_t *to_ill; - ilg_t *ilg; - ilm_t *ret_ilm; - - connm = (conn_move_t *)arg; - ifindex = connm->cm_ifindex; - from_ill = connm->cm_from_ill; - to_ill = connm->cm_to_ill; - - /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. */ - - /* All multicast fields protected by conn_lock */ - mutex_enter(&connp->conn_lock); - ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); - if ((connp->conn_outgoing_ill == from_ill) && - (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) { - connp->conn_outgoing_ill = to_ill; - connp->conn_incoming_ill = to_ill; - } - - /* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */ - - if ((connp->conn_multicast_ill == from_ill) && - (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) { - connp->conn_multicast_ill = connm->cm_to_ill; - } - - /* - * Change the ilg_ill to point to the new one. This assumes - * ilm_move_v6 has moved the ilms to new_ill and the driver - * has been told to receive packets on this interface. - * ilm_move_v6 FAILBACKS all the ilms successfully always. - * But when doing a FAILOVER, it might fail with ENOMEM and so - * some ilms may not have moved. We check to see whether - * the ilms have moved to to_ill. We can't check on from_ill - * as in the process of moving, we could have split an ilm - * in to two - which has the same orig_ifindex and v6group. + * Handle requests to rename an IPMP meta-interface. * - * For IPv4, ilg_ipif moves implicitly. The code below really - * does not do anything for IPv4 as ilg_ill is NULL for IPv4. 
- */ - for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { - ilg = &connp->conn_ilg[i]; - if ((ilg->ilg_ill == from_ill) && - (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) { - /* ifindex != 0 indicates failback */ - if (ifindex != 0) { - connp->conn_ilg[i].ilg_ill = to_ill; - continue; - } - - mutex_enter(&to_ill->ill_lock); - ret_ilm = ilm_lookup_ill_index_v6(to_ill, - &ilg->ilg_v6group, ilg->ilg_orig_ifindex, - connp->conn_zoneid); - mutex_exit(&to_ill->ill_lock); - - if (ret_ilm != NULL) - connp->conn_ilg[i].ilg_ill = to_ill; - } + * Note that creation of the IPMP meta-interface is handled in + * userland through the standard plumbing sequence. As part of the + * plumbing the IPMP meta-interface, its initial groupname is set to + * the name of the interface (see ipif_set_values_tail()). + */ + if (IS_IPMP(ill)) { + err = ipmp_grp_rename(grp, lifr->lifr_groupname); + goto unlock; } - mutex_exit(&connp->conn_lock); -} - -static void -conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex) -{ - conn_move_t connm; - ip_stack_t *ipst = from_ill->ill_ipst; - - connm.cm_from_ill = from_ill; - connm.cm_to_ill = to_ill; - connm.cm_ifindex = ifindex; - - ipcl_walk(conn_move, (caddr_t)&connm, ipst); -} - -/* - * ilm has been moved from from_ill to to_ill. - * Send DL_DISABMULTI_REQ to ill and DL_ENABMULTI_REQ on to_ill. - * appropriately. - * - * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because - * the code there de-references ipif_ill to get the ill to - * send multicast requests. It does not work as ipif is on its - * move and already moved when this function is called. - * Thus, we need to use from_ill and to_ill send down multicast - * requests. - */ -static void -ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill) -{ - ipif_t *ipif; - ilm_t *ilm; /* - * See whether we need to send down DL_ENABMULTI_REQ on - * to_ill as ilm has just been added. + * Handle requests to add or remove an IP interface from a group. 
*/ - ASSERT(IAM_WRITER_ILL(to_ill)); - ASSERT(IAM_WRITER_ILL(from_ill)); - - ILM_WALKER_HOLD(to_ill); - for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - - if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED)) - continue; - /* - * no locks held, ill/ipif cannot dissappear as long - * as we are writer. - */ - ipif = to_ill->ill_ipif; + if (lifr->lifr_groupname[0] != '\0') { /* add */ /* - * No need to hold any lock as we are the writer and this - * can only be changed by a writer. + * Moves are handled by first removing the interface from + * its existing group, and then adding it to another group. + * So, fail if it's already in a group. */ - ilm->ilm_is_new = B_FALSE; - - if (to_ill->ill_net_type != IRE_IF_RESOLVER || - ipif->ipif_flags & IPIF_POINTOPOINT) { - ip1dbg(("ilm_send_multicast_reqs: to_ill not " - "resolver\n")); - continue; /* Must be IRE_IF_NORESOLVER */ - } - - if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { - ip1dbg(("ilm_send_multicast_reqs: " - "to_ill MULTI_BCAST\n")); - goto from; + if (IS_UNDER_IPMP(ill)) { + err = EALREADY; + goto unlock; } - if (to_ill->ill_isv6) - mld_joingroup(ilm); - else - igmp_joingroup(ilm); - - if (to_ill->ill_ipif_up_count == 0) { - /* - * Nobody there. All multicast addresses will be - * re-joined when we get the DL_BIND_ACK bringing the - * interface up. - */ - ilm->ilm_notify_driver = B_FALSE; - ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n")); - goto from; + grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst); + if (grp == NULL) { + err = ENOENT; + goto unlock; } /* - * For allmulti address, we want to join on only one interface. - * Checking for ilm_numentries_v6 is not correct as you may - * find an ilm with zero address on to_ill, but we may not - * have nominated to_ill for receiving. 
Thus, if we have - * nominated from_ill (ill_join_allmulti is set), nominate - * only if to_ill is not already nominated (to_ill normally - * should not have been nominated if "from_ill" has already - * been nominated. As we don't prevent failovers from happening - * across groups, we don't assert). + * Check if the phyint and its ills are suitable for + * inclusion into the group. */ - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - /* - * There is no need to hold ill locks as we are - * writer on both ills and when ill_join_allmulti() - * is called the thread is always a writer. - */ - if (from_ill->ill_join_allmulti && - !to_ill->ill_join_allmulti) { - (void) ill_join_allmulti(to_ill); - } - } else if (ilm->ilm_notify_driver) { - - /* - * This is a newly moved ilm so we need to tell the - * driver about the new group. There can be more than - * one ilm's for the same group in the list each with a - * different orig_ifindex. We have to inform the driver - * once. In ilm_move_v[4,6] we only set the flag - * ilm_notify_driver for the first ilm. - */ - - (void) ip_ll_send_enabmulti_req(to_ill, - &ilm->ilm_v6addr); - } - - ilm->ilm_notify_driver = B_FALSE; + if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0) + goto unlock; /* - * See whether we need to send down DL_DISABMULTI_REQ on - * from_ill as ilm has just been removed. + * Checks pass; join the group, and enqueue the remaining + * illgrp joins for when we've become part of the group xop + * and are exclusive across its IPSQs. Since qwriter_ip() + * requires an mblk_t to scribble on, and since `mp' will be + * freed as part of completing the ioctl, allocate another. 
*/ -from: - ipif = from_ill->ill_ipif; - if (from_ill->ill_net_type != IRE_IF_RESOLVER || - ipif->ipif_flags & IPIF_POINTOPOINT) { - ip1dbg(("ilm_send_multicast_reqs: " - "from_ill not resolver\n")); - continue; /* Must be IRE_IF_NORESOLVER */ - } - - if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { - ip1dbg(("ilm_send_multicast_reqs: " - "from_ill MULTI_BCAST\n")); - continue; - } - - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - if (from_ill->ill_join_allmulti) - ill_leave_allmulti(from_ill); - } else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) { - (void) ip_ll_send_disabmulti_req(from_ill, - &ilm->ilm_v6addr); + if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) { + err = ENOMEM; + goto unlock; } - } - ILM_WALKER_RELE(to_ill); -} - -/* - * This function is called when all multicast memberships needs - * to be moved from "from_ill" to "to_ill" for IPv6. This function is - * called only once unlike the IPv4 counterpart where it is called after - * every logical interface is moved. The reason is due to multicast - * memberships are joined using an interface address in IPv4 while in - * IPv6, interface index is used. - */ -static void -ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex) -{ - ilm_t *ilm; - ilm_t *ilm_next; - ilm_t *new_ilm; - ilm_t **ilmp; - int count; - char buf[INET6_ADDRSTRLEN]; - in6_addr_t ipv6_snm = ipv6_solicited_node_mcast; - ip_stack_t *ipst = from_ill->ill_ipst; - - ASSERT(MUTEX_HELD(&to_ill->ill_lock)); - ASSERT(MUTEX_HELD(&from_ill->ill_lock)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - if (ifindex == 0) { /* - * Form the solicited node mcast address which is used later. + * Before we drop ipmp_lock, bump gr_pend* to ensure that the + * IPMP meta-interface ills needed by `phyi' cannot go away + * before ip_join_illgrps() is called back. See the comments + * in ip_sioctl_plink_ipmp() for more. 
*/ - ipif_t *ipif; - - ipif = from_ill->ill_ipif; - ASSERT(ipif->ipif_id == 0); - - ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; - } - - ilmp = &from_ill->ill_ilm; - for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { - ilm_next = ilm->ilm_next; - - if (ilm->ilm_flags & ILM_DELETED) { - ilmp = &ilm->ilm_next; - continue; - } + if (phyi->phyint_illv4 != NULL) + grp->gr_pendv4++; + if (phyi->phyint_illv6 != NULL) + grp->gr_pendv6++; - new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr, - ilm->ilm_orig_ifindex, ilm->ilm_zoneid); - ASSERT(ilm->ilm_orig_ifindex != 0); - if (ilm->ilm_orig_ifindex == ifindex) { - /* - * We are failing back multicast memberships. - * If the same ilm exists in to_ill, it means somebody - * has joined the same group there e.g. ff02::1 - * is joined within the kernel when the interfaces - * came UP. - */ - ASSERT(ilm->ilm_ipif == NULL); - if (new_ilm != NULL) { - new_ilm->ilm_refcnt += ilm->ilm_refcnt; - if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || - !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { - new_ilm->ilm_is_new = B_TRUE; - } - } else { - /* - * check if we can just move the ilm - */ - if (from_ill->ill_ilm_walker_cnt != 0) { - /* - * We have walkers we cannot move - * the ilm, so allocate a new ilm, - * this (old) ilm will be marked - * ILM_DELETED at the end of the loop - * and will be freed when the - * last walker exits. - */ - new_ilm = (ilm_t *)mi_zalloc - (sizeof (ilm_t)); - if (new_ilm == NULL) { - ip0dbg(("ilm_move_v6: " - "FAILBACK of IPv6" - " multicast address %s : " - "from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)), - from_ill->ill_name, - to_ill->ill_name)); - - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - /* - * we don't want new_ilm linked to - * ilm's filter list. - */ - new_ilm->ilm_filter = NULL; - } else { - /* - * No walkers we can move the ilm. - * lets take it out of the list. 
- */ - *ilmp = ilm->ilm_next; - ilm->ilm_next = NULL; - DTRACE_PROBE3(ill__decr__cnt, - (ill_t *), from_ill, - (char *), "ilm", (void *), ilm); - ASSERT(from_ill->ill_ilm_cnt > 0); - from_ill->ill_ilm_cnt--; - - new_ilm = ilm; - } + rw_exit(&ipst->ips_ipmp_lock); - /* - * if this is the first ilm for the group - * set ilm_notify_driver so that we notify the - * driver in ilm_send_multicast_reqs. - */ - if (ilm_lookup_ill_v6(to_ill, - &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill, - (char *), "ilm", (void *), new_ilm); - new_ilm->ilm_ill = to_ill; - to_ill->ill_ilm_cnt++; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - /* - * set the flag so that mld_joingroup is - * called in ilm_send_multicast_reqs(). - */ - new_ilm->ilm_is_new = B_TRUE; - } - goto bottom; - } else if (ifindex != 0) { - /* - * If this is FAILBACK (ifindex != 0) and the ifindex - * has not matched above, look at the next ilm. - */ - ilmp = &ilm->ilm_next; - continue; - } - /* - * If we are here, it means ifindex is 0. Failover - * everything. - * - * We need to handle solicited node mcast address - * and all_nodes mcast address differently as they - * are joined witin the kenrel (ipif_multicast_up) - * and potentially from the userland. We are called - * after the ipifs of from_ill has been moved. - * If we still find ilms on ill with solicited node - * mcast address or all_nodes mcast address, it must - * belong to the UP interface that has not moved e.g. - * ipif_id 0 with the link local prefix does not move. - * We join this on the new ill accounting for all the - * userland memberships so that applications don't - * see any failure. - * - * We need to make sure that we account only for the - * solicited node and all node multicast addresses - * that was brought UP on these. 
In the case of - * a failover from A to B, we might have ilms belonging - * to A (ilm_orig_ifindex pointing at A) on B accounting - * for the membership from the userland. If we are failing - * over from B to C now, we will find the ones belonging - * to A on B. These don't account for the ill_ipif_up_count. - * They just move from B to C. The check below on - * ilm_orig_ifindex ensures that. - */ - if ((ilm->ilm_orig_ifindex == - from_ill->ill_phyint->phyint_ifindex) && - (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) || - IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, - &ilm->ilm_v6addr))) { - ASSERT(ilm->ilm_refcnt > 0); - count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count; - /* - * For indentation reasons, we are not using a - * "else" here. - */ - if (count == 0) { - ilmp = &ilm->ilm_next; - continue; - } - ilm->ilm_refcnt -= count; - if (new_ilm != NULL) { - /* - * Can find one with the same - * ilm_orig_ifindex, if we are failing - * over to a STANDBY. This happens - * when somebody wants to join a group - * on a STANDBY interface and we - * internally join on a different one. - * If we had joined on from_ill then, a - * failover now will find a new ilm - * with this index. 
- */ - ip1dbg(("ilm_move_v6: FAILOVER, found" - " new ilm on %s, group address %s\n", - to_ill->ill_name, - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)))); - new_ilm->ilm_refcnt += count; - if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || - !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { - new_ilm->ilm_is_new = B_TRUE; - } - } else { - new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); - if (new_ilm == NULL) { - ip0dbg(("ilm_move_v6: FAILOVER of IPv6" - " multicast address %s : from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)), from_ill->ill_name, - to_ill->ill_name)); - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - new_ilm->ilm_filter = NULL; - new_ilm->ilm_refcnt = count; - new_ilm->ilm_timer = INFINITY; - new_ilm->ilm_rtx.rtx_timer = INFINITY; - new_ilm->ilm_is_new = B_TRUE; - /* - * If the to_ill has not joined this - * group we need to tell the driver in - * ill_send_multicast_reqs. - */ - if (ilm_lookup_ill_v6(to_ill, - &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - new_ilm->ilm_ill = to_ill; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill, - (char *), "ilm", (void *), new_ilm); - to_ill->ill_ilm_cnt++; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - ASSERT(new_ilm->ilm_ipif == NULL); - } - if (ilm->ilm_refcnt == 0) { - goto bottom; - } else { - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); - ilmp = &ilm->ilm_next; - } - continue; - } else { - /* - * ifindex = 0 means, move everything pointing at - * from_ill. We are doing this becuase ill has - * either FAILED or became INACTIVE. - * - * As we would like to move things later back to - * from_ill, we want to retain the identity of this - * ilm. Thus, we don't blindly increment the reference - * count on the ilms matching the address alone. We - * need to match on the ilm_orig_index also. 
new_ilm - * was obtained by matching ilm_orig_index also. - */ - if (new_ilm != NULL) { - /* - * This is possible only if a previous restore - * was incomplete i.e restore to - * ilm_orig_ifindex left some ilms because - * of some failures. Thus when we are failing - * again, we might find our old friends there. - */ - ip1dbg(("ilm_move_v6: FAILOVER, found new ilm" - " on %s, group address %s\n", - to_ill->ill_name, - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)))); - new_ilm->ilm_refcnt += ilm->ilm_refcnt; - if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || - !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { - new_ilm->ilm_is_new = B_TRUE; - } - } else { - if (from_ill->ill_ilm_walker_cnt != 0) { - new_ilm = (ilm_t *) - mi_zalloc(sizeof (ilm_t)); - if (new_ilm == NULL) { - ip0dbg(("ilm_move_v6: " - "FAILOVER of IPv6" - " multicast address %s : " - "from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)), - from_ill->ill_name, - to_ill->ill_name)); - - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - new_ilm->ilm_filter = NULL; - } else { - *ilmp = ilm->ilm_next; - DTRACE_PROBE3(ill__decr__cnt, - (ill_t *), from_ill, - (char *), "ilm", (void *), ilm); - ASSERT(from_ill->ill_ilm_cnt > 0); - from_ill->ill_ilm_cnt--; - - new_ilm = ilm; - } - /* - * If the to_ill has not joined this - * group we need to tell the driver in - * ill_send_multicast_reqs. - */ - if (ilm_lookup_ill_v6(to_ill, - &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - ASSERT(ilm->ilm_ipif == NULL); - new_ilm->ilm_ill = to_ill; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill, - (char *), "ilm", (void *), new_ilm); - to_ill->ill_ilm_cnt++; - new_ilm->ilm_is_new = B_TRUE; - } - - } - -bottom: - /* - * Revert multicast filter state to (EXCLUDE, NULL). - * new_ilm->ilm_is_new should already be set if needed. 
- */ - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); + ipmp_phyint_join_grp(phyi, grp); + ill_refhold(ill); + qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps, + SWITCH_OP, B_FALSE); + return (0); + } else { /* - * We allocated/got a new ilm, free the old one. + * Request to remove the interface from a group. If the + * interface is not in a group, this trivially succeeds. */ - if (new_ilm != ilm) { - if (from_ill->ill_ilm_walker_cnt == 0) { - *ilmp = ilm->ilm_next; - - ASSERT(ilm->ilm_ipif == NULL); /* ipv6 */ - DTRACE_PROBE3(ill__decr__cnt, (ill_t *), - from_ill, (char *), "ilm", (void *), ilm); - ASSERT(from_ill->ill_ilm_cnt > 0); - from_ill->ill_ilm_cnt--; - - ilm_inactive(ilm); /* frees this ilm */ - - } else { - ilm->ilm_flags |= ILM_DELETED; - from_ill->ill_ilm_cleanup_reqd = 1; - ilmp = &ilm->ilm_next; - } - } + rw_exit(&ipst->ips_ipmp_lock); + if (IS_UNDER_IPMP(ill)) + ipmp_phyint_leave_grp(phyi); + return (0); } +unlock: + rw_exit(&ipst->ips_ipmp_lock); + return (err); } /* - * Move all the multicast memberships to to_ill. Called when - * an ipif moves from "from_ill" to "to_ill". This function is slightly - * different from IPv6 counterpart as multicast memberships are associated - * with ills in IPv6. This function is called after every ipif is moved - * unlike IPv6, where it is moved only once. + * Process an SIOCGLIFBINDING request. 
*/ -static void -ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif) -{ - ilm_t *ilm; - ilm_t *ilm_next; - ilm_t *new_ilm; - ilm_t **ilmp; - ip_stack_t *ipst = from_ill->ill_ipst; - - ASSERT(MUTEX_HELD(&to_ill->ill_lock)); - ASSERT(MUTEX_HELD(&from_ill->ill_lock)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - - ilmp = &from_ill->ill_ilm; - for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { - ilm_next = ilm->ilm_next; - - if (ilm->ilm_flags & ILM_DELETED) { - ilmp = &ilm->ilm_next; - continue; - } - - ASSERT(ilm->ilm_ipif != NULL); - - if (ilm->ilm_ipif != ipif) { - ilmp = &ilm->ilm_next; - continue; - } - - if (V4_PART_OF_V6(ilm->ilm_v6addr) == - htonl(INADDR_ALLHOSTS_GROUP)) { - new_ilm = ilm_lookup_ipif(ipif, - V4_PART_OF_V6(ilm->ilm_v6addr)); - if (new_ilm != NULL) { - new_ilm->ilm_refcnt += ilm->ilm_refcnt; - /* - * We still need to deal with the from_ill. - */ - new_ilm->ilm_is_new = B_TRUE; - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); - ASSERT(ilm->ilm_ipif == ipif); - ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0); - if (from_ill->ill_ilm_walker_cnt == 0) { - DTRACE_PROBE3(ill__decr__cnt, - (ill_t *), from_ill, - (char *), "ilm", (void *), ilm); - ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0); - } - goto delete_ilm; - } - /* - * If we could not find one e.g. ipif is - * still down on to_ill, we add this ilm - * on ill_new to preserve the reference - * count. - */ - } - /* - * When ipifs move, ilms always move with it - * to the NEW ill. Thus we should never be - * able to find ilm till we really move it here. 
- */ - ASSERT(ilm_lookup_ipif(ipif, - V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL); - - if (from_ill->ill_ilm_walker_cnt != 0) { - new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); - if (new_ilm == NULL) { - char buf[INET6_ADDRSTRLEN]; - ip0dbg(("ilm_move_v4: FAILBACK of IPv4" - " multicast address %s : " - "from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET, - &ilm->ilm_v6addr, buf, - sizeof (buf)), - from_ill->ill_name, - to_ill->ill_name)); - - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ipif, - (char *), "ilm", (void *), ilm); - new_ilm->ilm_ipif->ipif_ilm_cnt++; - /* We don't want new_ilm linked to ilm's filter list */ - new_ilm->ilm_filter = NULL; - } else { - /* Remove from the list */ - *ilmp = ilm->ilm_next; - new_ilm = ilm; - } - - /* - * If we have never joined this group on the to_ill - * make sure we tell the driver. - */ - if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr, - ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - new_ilm->ilm_is_new = B_TRUE; - - /* - * Revert multicast filter state to (EXCLUDE, NULL) - */ - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); - - /* - * Delete only if we have allocated a new ilm. 
- */ - if (new_ilm != ilm) { -delete_ilm: - if (from_ill->ill_ilm_walker_cnt == 0) { - /* Remove from the list */ - *ilmp = ilm->ilm_next; - ilm->ilm_next = NULL; - DTRACE_PROBE3(ipif__decr__cnt, - (ipif_t *), ilm->ilm_ipif, - (char *), "ilm", (void *), ilm); - ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0); - ilm->ilm_ipif->ipif_ilm_cnt--; - ilm_inactive(ilm); - } else { - ilm->ilm_flags |= ILM_DELETED; - from_ill->ill_ilm_cleanup_reqd = 1; - ilmp = &ilm->ilm_next; - } - } - } -} - -static uint_t -ipif_get_id(ill_t *ill, uint_t id) -{ - uint_t unit; - ipif_t *tipif; - boolean_t found = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * During failback, we want to go back to the same id - * instead of the smallest id so that the original - * configuration is maintained. id is non-zero in that - * case. - */ - if (id != 0) { - /* - * While failing back, if we still have an ipif with - * MAX_ADDRS_PER_IF, it means this will be replaced - * as soon as we return from this function. It was - * to set to MAX_ADDRS_PER_IF by the caller so that - * we can choose the smallest id. Thus we return zero - * in that case ignoring the hint. - */ - if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF) - return (0); - for (tipif = ill->ill_ipif; tipif != NULL; - tipif = tipif->ipif_next) { - if (tipif->ipif_id == id) { - found = B_TRUE; - break; - } - } - /* - * If somebody already plumbed another logical - * with the same id, we won't be able to find it. 
- */ - if (!found) - return (id); - } - for (unit = 0; unit <= ipst->ips_ip_addrs_per_if; unit++) { - found = B_FALSE; - for (tipif = ill->ill_ipif; tipif != NULL; - tipif = tipif->ipif_next) { - if (tipif->ipif_id == unit) { - found = B_TRUE; - break; - } - } - if (!found) - break; - } - return (unit); -} - /* ARGSUSED */ -static int -ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp, - ipif_t **rep_ipif_ptr) +int +ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, + ip_ioctl_cmd_t *ipip, void *ifreq) { - ill_t *from_ill; - ipif_t *rep_ipif; - uint_t unit; - int err = 0; - ipif_t *to_ipif; - struct iocblk *iocp; - boolean_t failback_cmd; - boolean_t remove_ipif; - int rc; - ip_stack_t *ipst; - - ASSERT(IAM_WRITER_ILL(to_ill)); - ASSERT(IAM_WRITER_IPIF(ipif)); - - iocp = (struct iocblk *)mp->b_rptr; - failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK); - remove_ipif = B_FALSE; - - from_ill = ipif->ipif_ill; - ipst = from_ill->ill_ipst; - - ASSERT(MUTEX_HELD(&to_ill->ill_lock)); - ASSERT(MUTEX_HELD(&from_ill->ill_lock)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - - /* - * Don't move LINK LOCAL addresses as they are tied to - * physical interface. - */ - if (from_ill->ill_isv6 && - IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) { - ipif->ipif_was_up = B_FALSE; - IPIF_UNMARK_MOVING(ipif); - return (0); - } - - /* - * We set the ipif_id to maximum so that the search for - * ipif_id will pick the lowest number i.e 0 in the - * following 2 cases : - * - * 1) We have a replacement ipif at the head of to_ill. - * We can't remove it yet as we can exceed ip_addrs_per_if - * on to_ill and hence the MOVE might fail. We want to - * remove it only if we could move the ipif. Thus, by - * setting it to the MAX value, we make the search in - * ipif_get_id return the zeroth id. - * - * 2) When DR pulls out the NIC and re-plumbs the interface, - * we might just have a zero address plumbed on the ipif - * with zero id in the case of IPv4. 
We remove that while - * doing the failback. We want to remove it only if we - * could move the ipif. Thus, by setting it to the MAX - * value, we make the search in ipif_get_id return the - * zeroth id. - * - * Both (1) and (2) are done only when when we are moving - * an ipif (either due to failover/failback) which originally - * belonged to this interface i.e the ipif_orig_ifindex is - * the same as to_ill's ifindex. This is needed so that - * FAILOVER from A -> B ( A failed) followed by FAILOVER - * from B -> A (B is being removed from the group) and - * FAILBACK from A -> B restores the original configuration. - * Without the check for orig_ifindex, the second FAILOVER - * could make the ipif belonging to B replace the A's zeroth - * ipif and the subsequent failback re-creating the replacement - * ipif again. - * - * NOTE : We created the replacement ipif when we did a - * FAILOVER (See below). We could check for FAILBACK and - * then look for replacement ipif to be removed. But we don't - * want to do that because we wan't to allow the possibility - * of a FAILOVER from A -> B (which creates the replacement ipif), - * followed by a *FAILOVER* from B -> A instead of a FAILBACK - * from B -> A. - */ - to_ipif = to_ill->ill_ipif; - if ((to_ill->ill_phyint->phyint_ifindex == - ipif->ipif_orig_ifindex) && - to_ipif->ipif_replace_zero) { - ASSERT(to_ipif->ipif_id == 0); - remove_ipif = B_TRUE; - to_ipif->ipif_id = MAX_ADDRS_PER_IF; - } - /* - * Find the lowest logical unit number on the to_ill. - * If we are failing back, try to get the original id - * rather than the lowest one so that the original - * configuration is maintained. - * - * XXX need a better scheme for this. 
- */ - if (failback_cmd) { - unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid); - } else { - unit = ipif_get_id(to_ill, 0); - } - - /* Reset back to zero in case we fail below */ - if (to_ipif->ipif_id == MAX_ADDRS_PER_IF) - to_ipif->ipif_id = 0; + ill_t *bound_ill; + struct lifreq *lifr = ifreq; - if (unit == ipst->ips_ip_addrs_per_if) { - ipif->ipif_was_up = B_FALSE; - IPIF_UNMARK_MOVING(ipif); + if (!IS_IPMP(ipif->ipif_ill)) return (EINVAL); - } - - /* - * ipif is ready to move from "from_ill" to "to_ill". - * - * 1) If we are moving ipif with id zero, create a - * replacement ipif for this ipif on from_ill. If this fails - * fail the MOVE operation. - * - * 2) Remove the replacement ipif on to_ill if any. - * We could remove the replacement ipif when we are moving - * the ipif with id zero. But what if somebody already - * unplumbed it ? Thus we always remove it if it is present. - * We want to do it only if we are sure we are going to - * move the ipif to to_ill which is why there are no - * returns due to error till ipif is linked to to_ill. - * Note that the first ipif that we failback will always - * be zero if it is present. - */ - if (ipif->ipif_id == 0) { - ipaddr_t inaddr_any = INADDR_ANY; - rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED); - if (rep_ipif == NULL) { - ipif->ipif_was_up = B_FALSE; - IPIF_UNMARK_MOVING(ipif); - return (ENOMEM); - } - *rep_ipif = ipif_zero; - /* - * Before we put the ipif on the list, store the addresses - * as mapped addresses as some of the ioctls e.g SIOCGIFADDR - * assumes so. This logic is not any different from what - * ipif_allocate does. 
- */ - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6lcl_addr); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6src_addr); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6subnet); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6net_mask); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6brd_addr); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6pp_dst_addr); - /* - * We mark IPIF_NOFAILOVER so that this can never - * move. - */ - rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER; - rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE; - rep_ipif->ipif_replace_zero = B_TRUE; - mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL, - MUTEX_DEFAULT, NULL); - rep_ipif->ipif_id = 0; - rep_ipif->ipif_ire_type = ipif->ipif_ire_type; - rep_ipif->ipif_ill = from_ill; - rep_ipif->ipif_orig_ifindex = - from_ill->ill_phyint->phyint_ifindex; - /* Insert at head */ - rep_ipif->ipif_next = from_ill->ill_ipif; - from_ill->ill_ipif = rep_ipif; - /* - * We don't really care to let apps know about - * this interface. - */ - } - - if (remove_ipif) { - /* - * We set to a max value above for this case to get - * id zero. ASSERT that we did get one. - */ - ASSERT((to_ipif->ipif_id == 0) && (unit == 0)); - rep_ipif = to_ipif; - to_ill->ill_ipif = rep_ipif->ipif_next; - rep_ipif->ipif_next = NULL; - /* - * If some apps scanned and find this interface, - * it is time to let them know, so that they can - * delete it. - */ - - *rep_ipif_ptr = rep_ipif; - } - - /* Get it out of the ILL interface list. */ - ipif_remove(ipif, B_FALSE); - - /* Assign the new ill */ - ipif->ipif_ill = to_ill; - ipif->ipif_id = unit; - /* id has already been checked */ - rc = ipif_insert(ipif, B_FALSE, B_FALSE); - ASSERT(rc == 0); - /* Let SCTP update its list */ - sctp_move_ipif(ipif, from_ill, to_ill); - /* - * Handle the failover and failback of ipif_t between - * ill_t that have differing maximum mtu values. 
- */ - if (ipif->ipif_mtu > to_ill->ill_max_mtu) { - if (ipif->ipif_saved_mtu == 0) { - /* - * As this ipif_t is moving to an ill_t - * that has a lower ill_max_mtu, its - * ipif_mtu needs to be saved so it can - * be restored during failback or during - * failover to an ill_t which has a - * higher ill_max_mtu. - */ - ipif->ipif_saved_mtu = ipif->ipif_mtu; - ipif->ipif_mtu = to_ill->ill_max_mtu; - } else { - /* - * The ipif_t is, once again, moving to - * an ill_t that has a lower maximum mtu - * value. - */ - ipif->ipif_mtu = to_ill->ill_max_mtu; - } - } else if (ipif->ipif_mtu < to_ill->ill_max_mtu && - ipif->ipif_saved_mtu != 0) { - /* - * The mtu of this ipif_t had to be reduced - * during an earlier failover; this is an - * opportunity for it to be increased (either as - * part of another failover or a failback). - */ - if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) { - ipif->ipif_mtu = ipif->ipif_saved_mtu; - ipif->ipif_saved_mtu = 0; - } else { - ipif->ipif_mtu = to_ill->ill_max_mtu; - } + if ((bound_ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { + lifr->lifr_binding[0] = '\0'; + return (0); } - /* - * We preserve all the other fields of the ipif including - * ipif_saved_ire_mp. The routes that are saved here will - * be recreated on the new interface and back on the old - * interface when we move back. - */ - ASSERT(ipif->ipif_arp_del_mp == NULL); - - return (err); -} - -static int -ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp, - int ifindex, ipif_t **rep_ipif_ptr) -{ - ipif_t *mipif; - ipif_t *ipif_next; - int err; - - /* - * We don't really try to MOVE back things if some of the - * operations fail. The daemon will take care of moving again - * later on. 
- */ - for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) { - ipif_next = mipif->ipif_next; - if (!(mipif->ipif_flags & IPIF_NOFAILOVER) && - (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) { - - err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr); - - /* - * When the MOVE fails, it is the job of the - * application to take care of this properly - * i.e try again if it is ENOMEM. - */ - if (mipif->ipif_ill != from_ill) { - /* - * ipif has moved. - * - * Move the multicast memberships associated - * with this ipif to the new ill. For IPv6, we - * do it once after all the ipifs are moved - * (in ill_move) as they are not associated - * with ipifs. - * - * We need to move the ilms as the ipif has - * already been moved to a new ill even - * in the case of errors. Neither - * ilm_free(ipif) will find the ilm - * when somebody unplumbs this ipif nor - * ilm_delete(ilm) will be able to find the - * ilm, if we don't move now. - */ - if (!from_ill->ill_isv6) - ilm_move_v4(from_ill, to_ill, mipif); - } - - if (err != 0) - return (err); - } - } + (void) strlcpy(lifr->lifr_binding, bound_ill->ill_name, LIFNAMSIZ); + ill_refrele(bound_ill); return (0); } -static int -ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp) -{ - int ifindex; - int err; - struct iocblk *iocp; - ipif_t *ipif; - ipif_t *rep_ipif_ptr = NULL; - ipif_t *from_ipif = NULL; - boolean_t check_rep_if = B_FALSE; - ip_stack_t *ipst = from_ill->ill_ipst; - - iocp = (struct iocblk *)mp->b_rptr; - if (iocp->ioc_cmd == SIOCLIFFAILOVER) { - /* - * Move everything pointing at from_ill to to_ill. - * We acheive this by passing in 0 as ifindex. - */ - ifindex = 0; - } else { - /* - * Move everything pointing at from_ill whose original - * ifindex of connp, ipif, ilm points at to_ill->ill_index. - * We acheive this by passing in ifindex rather than 0. - * Multicast vifs, ilgs move implicitly because ipifs move. 
- */ - ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK); - ifindex = to_ill->ill_phyint->phyint_ifindex; - } - - /* - * Determine if there is at least one ipif that would move from - * 'from_ill' to 'to_ill'. If so, it is possible that the replacement - * ipif (if it exists) on the to_ill would be consumed as a result of - * the move, in which case we need to quiesce the replacement ipif also. - */ - for (from_ipif = from_ill->ill_ipif; from_ipif != NULL; - from_ipif = from_ipif->ipif_next) { - if (((ifindex == 0) || - (ifindex == from_ipif->ipif_orig_ifindex)) && - !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) { - check_rep_if = B_TRUE; - break; - } - } - - ill_down_ipifs(from_ill, mp, ifindex, B_TRUE); - - GRAB_ILL_LOCKS(from_ill, to_ill); - if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) { - (void) ipsq_pending_mp_add(NULL, ipif, q, - mp, ILL_MOVE_OK); - RELEASE_ILL_LOCKS(from_ill, to_ill); - return (EINPROGRESS); - } - - /* Check if the replacement ipif is quiescent to delete */ - if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif, - (iocp->ioc_cmd == SIOCLIFFAILBACK))) { - to_ill->ill_ipif->ipif_state_flags |= - IPIF_MOVING | IPIF_CHANGING; - if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) { - (void) ipsq_pending_mp_add(NULL, ipif, q, - mp, ILL_MOVE_OK); - RELEASE_ILL_LOCKS(from_ill, to_ill); - return (EINPROGRESS); - } - } - RELEASE_ILL_LOCKS(from_ill, to_ill); - - ASSERT(!MUTEX_HELD(&to_ill->ill_lock)); - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - GRAB_ILL_LOCKS(from_ill, to_ill); - err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr); - - /* ilm_move is done inside ipif_move for IPv4 */ - if (err == 0 && from_ill->ill_isv6) - ilm_move_v6(from_ill, to_ill, ifindex); - - RELEASE_ILL_LOCKS(from_ill, to_ill); - rw_exit(&ipst->ips_ill_g_lock); - - /* - * send rts messages and multicast messages. 
- */ - if (rep_ipif_ptr != NULL) { - if (rep_ipif_ptr->ipif_recovery_id != 0) { - (void) untimeout(rep_ipif_ptr->ipif_recovery_id); - rep_ipif_ptr->ipif_recovery_id = 0; - } - ip_rts_ifmsg(rep_ipif_ptr); - ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr); -#ifdef DEBUG - ipif_trace_cleanup(rep_ipif_ptr); -#endif - mi_free(rep_ipif_ptr); - } - - conn_move_ill(from_ill, to_ill, ifindex); - - return (err); -} - /* - * Used to extract arguments for FAILOVER/FAILBACK ioctls. - * Also checks for the validity of the arguments. - * Note: We are already exclusive inside the from group. - * It is upto the caller to release refcnt on the to_ill's. + * Process an SIOCGLIFGROUPNAME request. */ -static int -ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4, - ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6) +/* ARGSUSED */ +int +ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, + ip_ioctl_cmd_t *ipip, void *ifreq) { - int dst_index; - ipif_t *ipif_v4, *ipif_v6; - struct lifreq *lifr; - mblk_t *mp1; - boolean_t exists; - sin_t *sin; - int err = 0; - ip_stack_t *ipst; + ipmp_grp_t *grp; + struct lifreq *lifr = ifreq; + ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - if (CONN_Q(q)) - ipst = CONNQ_TO_IPST(q); + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL) + lifr->lifr_groupname[0] = '\0'; else - ipst = ILLQ_TO_IPST(q); - - if ((mp1 = mp->b_cont) == NULL) - return (EPROTO); - - if ((mp1 = mp1->b_cont) == NULL) - return (EPROTO); - - lifr = (struct lifreq *)mp1->b_rptr; - sin = (sin_t *)&lifr->lifr_addr; - - /* - * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6 - * specific operations. - */ - if (sin->sin_family != AF_UNSPEC) - return (EINVAL); - - /* - * Get ipif with id 0. We are writer on the from ill. So we can pass - * NULLs for the last 4 args and we know the lookup won't fail - * with EINPROGRESS. 
- */ - ipif_v4 = ipif_lookup_on_name(lifr->lifr_name, - mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE, - ALL_ZONES, NULL, NULL, NULL, NULL, ipst); - ipif_v6 = ipif_lookup_on_name(lifr->lifr_name, - mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE, - ALL_ZONES, NULL, NULL, NULL, NULL, ipst); - - if (ipif_v4 == NULL && ipif_v6 == NULL) - return (ENXIO); - - if (ipif_v4 != NULL) { - ASSERT(ipif_v4->ipif_refcnt != 0); - if (ipif_v4->ipif_id != 0) { - err = EINVAL; - goto done; - } - - ASSERT(IAM_WRITER_IPIF(ipif_v4)); - *ill_from_v4 = ipif_v4->ipif_ill; - } - - if (ipif_v6 != NULL) { - ASSERT(ipif_v6->ipif_refcnt != 0); - if (ipif_v6->ipif_id != 0) { - err = EINVAL; - goto done; - } - - ASSERT(IAM_WRITER_IPIF(ipif_v6)); - *ill_from_v6 = ipif_v6->ipif_ill; - } - - err = 0; - dst_index = lifr->lifr_movetoindex; - *ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE, - q, mp, ip_process_ioctl, &err, ipst); - if (err != 0) { - /* - * A move may be in progress, EINPROGRESS looking up the "to" - * ill means changes already done to the "from" ipsq need to - * be undone to avoid potential deadlocks. - * - * ENXIO will usually be because there is only v6 on the ill, - * that's not treated as an error unless an ENXIO is also - * seen when looking up the v6 "to" ill. - * - * If EINPROGRESS, the mp has been enqueued and can not be - * used to look up the v6 "to" ill, but a preemptive clean - * up of changes to the v6 "from" ipsq is done. 
- */ - if (err == EINPROGRESS) { - if (*ill_from_v4 != NULL) { - ill_t *from_ill; - ipsq_t *from_ipsq; - - from_ill = ipif_v4->ipif_ill; - from_ipsq = from_ill->ill_phyint->phyint_ipsq; - - mutex_enter(&from_ipsq->ipsq_lock); - from_ipsq->ipsq_current_ipif = NULL; - mutex_exit(&from_ipsq->ipsq_lock); - } - if (*ill_from_v6 != NULL) { - ill_t *from_ill; - ipsq_t *from_ipsq; - - from_ill = ipif_v6->ipif_ill; - from_ipsq = from_ill->ill_phyint->phyint_ipsq; - - mutex_enter(&from_ipsq->ipsq_lock); - from_ipsq->ipsq_current_ipif = NULL; - mutex_exit(&from_ipsq->ipsq_lock); - } - goto done; - } - ASSERT(err == ENXIO); - err = 0; - } - - *ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE, - q, mp, ip_process_ioctl, &err, ipst); - if (err != 0) { - /* - * A move may be in progress, EINPROGRESS looking up the "to" - * ill means changes already done to the "from" ipsq need to - * be undone to avoid potential deadlocks. - */ - if (err == EINPROGRESS) { - if (*ill_from_v6 != NULL) { - ill_t *from_ill; - ipsq_t *from_ipsq; - - from_ill = ipif_v6->ipif_ill; - from_ipsq = from_ill->ill_phyint->phyint_ipsq; - - mutex_enter(&from_ipsq->ipsq_lock); - from_ipsq->ipsq_current_ipif = NULL; - mutex_exit(&from_ipsq->ipsq_lock); - } - goto done; - } - ASSERT(err == ENXIO); - - /* Both v4 and v6 lookup failed */ - if (*ill_to_v4 == NULL) { - err = ENXIO; - goto done; - } - err = 0; - } - - /* - * If we have something to MOVE i.e "from" not NULL, - * "to" should be non-NULL. - */ - if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) || - (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) { - err = EINVAL; - } - -done: - if (ipif_v4 != NULL) - ipif_refrele(ipif_v4); - if (ipif_v6 != NULL) - ipif_refrele(ipif_v6); - return (err); + (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ); + rw_exit(&ipst->ips_ipmp_lock); + return (0); } /* - * FAILOVER and FAILBACK are modelled as MOVE operations. 
- * - * We don't check whether the MOVE is within the same group or - * not, because this ioctl can be used as a generic mechanism - * to failover from interface A to B, though things will function - * only if they are really part of the same group. Moreover, - * all ipifs may be down and hence temporarily out of the group. - * - * ipif's that need to be moved are first brought down; V4 ipifs are brought - * down first and then V6. For each we wait for the ipif's to become quiescent. - * Bringing down the ipifs ensures that all ires pointing to these ipifs's - * have been deleted and there are no active references. Once quiescent the - * ipif's are moved and brought up on the new ill. - * - * Normally the source ill and destination ill belong to the same IPMP group - * and hence the same ipsq_t. In the event they don't belong to the same - * same group the two ipsq's are first merged into one ipsq - that of the - * to_ill. The multicast memberships on the source and destination ill cannot - * change during the move operation since multicast joins/leaves also have to - * execute on the same ipsq and are hence serialized. + * Process an SIOCGLIFGROUPINFO request. */ /* ARGSUSED */ int -ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *ifreq) +ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp, + ip_ioctl_cmd_t *ipip, void *dummy) { - ill_t *ill_to_v4 = NULL; - ill_t *ill_to_v6 = NULL; - ill_t *ill_from_v4 = NULL; - ill_t *ill_from_v6 = NULL; - int err = 0; - - /* - * setup from and to ill's, we can get EINPROGRESS only for - * to_ill's. - */ - err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6, - &ill_to_v4, &ill_to_v6); - - if (err != 0) { - ip0dbg(("ip_sioctl_move: extract args failed\n")); - goto done; - } - - /* - * nothing to do. - */ - if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) { - goto done; - } - - /* - * nothing to do. 
- */ - if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) { - goto done; - } - - /* - * Mark the ill as changing. - * ILL_CHANGING flag is cleared when the ipif's are brought up - * in ill_up_ipifs in case of error they are cleared below. - */ - - GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); - if (ill_from_v4 != NULL) - ill_from_v4->ill_state_flags |= ILL_CHANGING; - if (ill_from_v6 != NULL) - ill_from_v6->ill_state_flags |= ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); - - /* - * Make sure that both src and dst are - * in the same syncq group. If not make it happen. - * We are not holding any locks because we are the writer - * on the from_ipsq and we will hold locks in ill_merge_groups - * to protect to_ipsq against changing. - */ - if (ill_from_v4 != NULL) { - if (ill_from_v4->ill_phyint->phyint_ipsq != - ill_to_v4->ill_phyint->phyint_ipsq) { - err = ill_merge_groups(ill_from_v4, ill_to_v4, - NULL, mp, q); - goto err_ret; - - } - ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock)); - } else { - - if (ill_from_v6->ill_phyint->phyint_ipsq != - ill_to_v6->ill_phyint->phyint_ipsq) { - err = ill_merge_groups(ill_from_v6, ill_to_v6, - NULL, mp, q); - goto err_ret; - - } - ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock)); - } - - /* - * Now that the ipsq's have been merged and we are the writer - * lets mark to_ill as changing as well. - */ - - GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); - if (ill_to_v4 != NULL) - ill_to_v4->ill_state_flags |= ILL_CHANGING; - if (ill_to_v6 != NULL) - ill_to_v6->ill_state_flags |= ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); - - /* - * Its ok for us to proceed with the move even if - * ill_pending_mp is non null on one of the from ill's as the reply - * should not be looking at the ipif, it should only care about the - * ill itself. - */ - - /* - * lets move ipv4 first. 
- */ - if (ill_from_v4 != NULL) { - ASSERT(IAM_WRITER_ILL(ill_to_v4)); - ill_from_v4->ill_move_in_progress = B_TRUE; - ill_to_v4->ill_move_in_progress = B_TRUE; - ill_to_v4->ill_move_peer = ill_from_v4; - ill_from_v4->ill_move_peer = ill_to_v4; - err = ill_move(ill_from_v4, ill_to_v4, q, mp); - } - - /* - * Now lets move ipv6. - */ - if (err == 0 && ill_from_v6 != NULL) { - ASSERT(IAM_WRITER_ILL(ill_to_v6)); - ill_from_v6->ill_move_in_progress = B_TRUE; - ill_to_v6->ill_move_in_progress = B_TRUE; - ill_to_v6->ill_move_peer = ill_from_v6; - ill_from_v6->ill_move_peer = ill_to_v6; - err = ill_move(ill_from_v6, ill_to_v6, q, mp); - } - -err_ret: - /* - * EINPROGRESS means we are waiting for the ipif's that need to be - * moved to become quiescent. - */ - if (err == EINPROGRESS) { - goto done; - } - - /* - * if err is set ill_up_ipifs will not be called - * lets clear the flags. - */ - - GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); - GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); - /* - * Some of the clearing may be redundant. But it is simple - * not making any extra checks. - */ - if (ill_from_v6 != NULL) { - ill_from_v6->ill_move_in_progress = B_FALSE; - ill_from_v6->ill_move_peer = NULL; - ill_from_v6->ill_state_flags &= ~ILL_CHANGING; - } - if (ill_from_v4 != NULL) { - ill_from_v4->ill_move_in_progress = B_FALSE; - ill_from_v4->ill_move_peer = NULL; - ill_from_v4->ill_state_flags &= ~ILL_CHANGING; - } - if (ill_to_v6 != NULL) { - ill_to_v6->ill_move_in_progress = B_FALSE; - ill_to_v6->ill_move_peer = NULL; - ill_to_v6->ill_state_flags &= ~ILL_CHANGING; - } - if (ill_to_v4 != NULL) { - ill_to_v4->ill_move_in_progress = B_FALSE; - ill_to_v4->ill_move_peer = NULL; - ill_to_v4->ill_state_flags &= ~ILL_CHANGING; - } - - /* - * Check for setting INACTIVE, if STANDBY is set and FAILED is not set. - * Do this always to maintain proper state i.e even in case of errors. - * As phyint_inactive looks at both v4 and v6 interfaces, - * we need not call on both v4 and v6 interfaces. 
- */ - if (ill_from_v4 != NULL) { - if ((ill_from_v4->ill_phyint->phyint_flags & - (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { - phyint_inactive(ill_from_v4->ill_phyint); - } - } else if (ill_from_v6 != NULL) { - if ((ill_from_v6->ill_phyint->phyint_flags & - (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { - phyint_inactive(ill_from_v6->ill_phyint); - } - } - - if (ill_to_v4 != NULL) { - if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) { - ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; - } - } else if (ill_to_v6 != NULL) { - if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) { - ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; - } - } - - RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); - RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); - -no_err: - /* - * lets bring the interfaces up on the to_ill. - */ - if (err == 0) { - err = ill_up_ipifs(ill_to_v4 == NULL ? ill_to_v6:ill_to_v4, - q, mp); - } - - if (err == 0) { - if (ill_from_v4 != NULL && ill_to_v4 != NULL) - ilm_send_multicast_reqs(ill_from_v4, ill_to_v4); + lifgroupinfo_t *lifgr; + ipmp_grp_t *grp; + ip_stack_t *ipst = CONNQ_TO_IPST(q); - if (ill_from_v6 != NULL && ill_to_v6 != NULL) - ilm_send_multicast_reqs(ill_from_v6, ill_to_v6); - } -done: + /* ip_wput_nondata() verified mp->b_cont->b_cont */ + lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr; + lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0'; - if (ill_to_v4 != NULL) { - ill_refrele(ill_to_v4); - } - if (ill_to_v6 != NULL) { - ill_refrele(ill_to_v6); + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) { + rw_exit(&ipst->ips_ipmp_lock); + return (ENOENT); } - - return (err); + ipmp_grp_info(grp, lifgr); + rw_exit(&ipst->ips_ipmp_lock); + return (0); } static void @@ -18167,10 +14492,9 @@ ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) * we only wait for the ACK of the DL_UNBIND_REQ. 
*/ mutex_enter(&ill->ill_lock); - if (!(ill->ill_state_flags & ILL_CONDEMNED) || - (prim == DL_UNBIND_REQ)) { + if (!(ill->ill_state_flags & ILL_CONDEMNED) || (prim == DL_UNBIND_REQ)) ill->ill_dlpi_pending = prim; - } + mutex_exit(&ill->ill_lock); putnext(ill->ill_wq, mp); } @@ -18324,6 +14648,7 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim) { mblk_t *mp; ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; + ipxop_t *ipx = ipsq->ipsq_xop; ASSERT(IAM_WRITER_IPSQ(ipsq)); mutex_enter(&ill->ill_lock); @@ -18336,12 +14661,11 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim) if ((mp = ill->ill_dlpi_deferred) == NULL) { ill->ill_dlpi_pending = DL_PRIM_INVAL; - - mutex_enter(&ipsq->ipsq_lock); - if (ipsq->ipsq_current_done) - ipsq->ipsq_current_ipif = NULL; - mutex_exit(&ipsq->ipsq_lock); - + if (ipx->ipx_current_done) { + mutex_enter(&ipx->ipx_lock); + ipx->ipx_current_ipif = NULL; + mutex_exit(&ipx->ipx_lock); + } cv_signal(&ill->ill_cv); mutex_exit(&ill->ill_lock); return; @@ -18379,7 +14703,7 @@ conn_delete_ire(conn_t *connp, caddr_t arg) } /* - * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number + * Some operations (e.g., ipif_down()) conditionally delete a number * of IREs. Those IREs may have been previously cached in the conn structure. * This ipcl_walk() walker function releases all references to such IREs based * on the condemned flag. @@ -18403,7 +14727,6 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) /* * Take down a specific interface, but don't lose any information about it. - * Also delete interface from its interface group (ifgrp). * (Always called as writer.) * This function goes through the down sequence even if the interface is * already down. There are 2 reasons. @@ -18501,7 +14824,7 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) * For eg. bind, and route operations (Eg. route add / delete) cannot return * failure if the ipif is currently undergoing an exclusive operation, and * hence pass the flag. 
The mblk is then enqueued in the ipsq and the operation - * is restarted by ipsq_exit() when the currently exclusive ioctl completes. + * is restarted by ipsq_exit() when the current exclusive operation completes. * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't * change while the ill_lock is held. Before dropping the ill_lock we acquire @@ -18522,7 +14845,6 @@ int ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) { ill_t *ill = ipif->ipif_ill; - phyint_t *phyi; conn_t *connp; boolean_t success; boolean_t ipif_was_up = B_FALSE; @@ -18569,20 +14891,7 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) } /* - * Before we delete the ill from the group (if any), we need - * to make sure that we delete all the routes dependent on - * this and also any ipifs dependent on this ipif for - * source address. We need to do before we delete from - * the group because - * - * 1) ipif_down_delete_ire de-references ill->ill_group. - * - * 2) ipif_update_other_ipifs needs to walk the whole group - * for re-doing source address selection. Note that - * ipif_select_source[_v6] called from - * ipif_update_other_ipifs[_v6] will not pick this ipif - * because we have already marked down here i.e cleared - * IPIF_UP. + * Delete all IRE's pointing at this ipif or its source address. */ if (ipif->ipif_isv6) { ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, @@ -18592,6 +14901,17 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) ipst); } + if (ipif_was_up && ill->ill_ipif_up_count == 0) { + /* + * Since the interface is now down, it may have just become + * inactive. Note that this needs to be done even for a + * lll_logical_down(), or ARP entries will not get correctly + * restored when the interface comes back up. 
+ */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + } + /* * Cleaning up the conn_ire_cache or conns must be done only after the * ires have been deleted above. Otherwise a thread could end up @@ -18609,53 +14929,9 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) * entries for such ipifs. */ if (ipif->ipif_isv6) - ipif_update_other_ipifs_v6(ipif, ill->ill_group); + ipif_update_other_ipifs_v6(ipif); else - ipif_update_other_ipifs(ipif, ill->ill_group); - - if (ipif_was_up) { - /* - * Check whether it is last ipif to leave this group. - * If this is the last ipif to leave, we should remove - * this ill from the group as ipif_select_source will not - * be able to find any useful ipifs if this ill is selected - * for load balancing. - * - * For nameless groups, we should call ifgrp_delete if this - * belongs to some group. As this ipif is going down, we may - * need to reconstruct groups. - */ - phyi = ill->ill_phyint; - /* - * If the phyint_groupname_len is 0, it may or may not - * be in the nameless group. If the phyint_groupname_len is - * not 0, then this ill should be part of some group. - * As we always insert this ill in the group if - * phyint_groupname_len is not zero when the first ipif - * comes up (in ipif_up_done), it should be in a group - * when the namelen is not 0. - * - * NOTE : When we delete the ill from the group,it will - * blow away all the IRE_CACHES pointing either at this ipif or - * ill_wq (illgrp_cache_delete does this). Thus, no IRES - * should be pointing at this ill. - */ - ASSERT(phyi->phyint_groupname_len == 0 || - (phyi->phyint_groupname != NULL && ill->ill_group != NULL)); - - if (phyi->phyint_groupname_len != 0) { - if (ill->ill_ipif_up_count == 0) - illgrp_delete(ill); - } - - /* - * If we have deleted some of the broadcast ires associated - * with this ipif, we need to re-nominate somebody else if - * the ires that we deleted were the nominated ones. 
- */ - if (ill->ill_group != NULL && !ill->ill_isv6) - ipif_renominate_bcast(ipif); - } + ipif_update_other_ipifs(ipif); /* * neighbor-discovery or arp entries for this interface. @@ -18734,17 +15010,12 @@ ipif_down_tail(ipif_t *ipif) ill->ill_logical_down = 0; /* - * Have to be after removing the routes in ipif_down_delete_ire. + * Has to be after removing the routes in ipif_down_delete_ire. */ - if (ipif->ipif_isv6) { - if (ill->ill_flags & ILLF_XRESOLV) - ipif_arp_down(ipif); - } else { - ipif_arp_down(ipif); - } + ipif_resolver_down(ipif); - ip_rts_ifmsg(ipif); - ip_rts_newaddrmsg(RTM_DELETE, 0, ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); + ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT); } /* @@ -18804,39 +15075,11 @@ static void ipif_down_delete_ire(ire_t *ire, char *ipif_arg) { ipif_t *ipif = (ipif_t *)ipif_arg; - ill_t *ire_ill; - ill_t *ipif_ill; ASSERT(IAM_WRITER_IPIF(ipif)); if (ire->ire_ipif == NULL) return; - /* - * For IPv4, we derive source addresses for an IRE from ipif's - * belonging to the same IPMP group as the IRE's outgoing - * interface. If an IRE's outgoing interface isn't in the - * same IPMP group as a particular ipif, then that ipif - * couldn't have been used as a source address for this IRE. - * - * For IPv6, source addresses are only restricted to the IPMP group - * if the IRE is for a link-local address or a multicast address. - * Otherwise, source addresses for an IRE can be chosen from - * interfaces other than the the outgoing interface for that IRE. - * - * For source address selection details, see ipif_select_source() - * and ipif_select_source_v6(). - */ - if (ire->ire_ipversion == IPV4_VERSION || - IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) || - IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { - ire_ill = ire->ire_ipif->ipif_ill; - ipif_ill = ipif->ipif_ill; - - if (ire_ill->ill_group != ipif_ill->ill_group) { - return; - } - } - if (ire->ire_ipif != ipif) { /* * Look for a matching source address. 
@@ -18875,83 +15118,53 @@ void ill_ipif_cache_delete(ire_t *ire, char *ill_arg) { ill_t *ill = (ill_t *)ill_arg; - ill_t *ipif_ill; ASSERT(IAM_WRITER_ILL(ill)); - /* - * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. - * Hence this should be IRE_CACHE. - */ ASSERT(ire->ire_type == IRE_CACHE); /* - * We are called for IRE_CACHES whose ire_ipif matches ill. - * We are only interested in IRE_CACHES that has borrowed - * the source address from ill_arg e.g. ipif_up_done[_v6] - * for which we need to look at ire_ipif->ipif_ill match - * with ill. + * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches + * ill, but we only want to delete the IRE if ire_ipif matches. */ ASSERT(ire->ire_ipif != NULL); - ipif_ill = ire->ire_ipif->ipif_ill; - if (ipif_ill == ill || (ill->ill_group != NULL && - ipif_ill->ill_group == ill->ill_group)) { + if (ill == ire->ire_ipif->ipif_ill) ire_delete(ire); - } } /* - * Delete all the ire whose stq references ill_arg. + * Delete all the IREs whose ire_stq's reference `ill_arg'. IPMP uses this + * instead of ill_ipif_cache_delete() because ire_ipif->ipif_ill references + * the IPMP ill. */ -static void +void ill_stq_cache_delete(ire_t *ire, char *ill_arg) { ill_t *ill = (ill_t *)ill_arg; - ill_t *ire_ill; ASSERT(IAM_WRITER_ILL(ill)); - /* - * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. - * Hence this should be IRE_CACHE. - */ ASSERT(ire->ire_type == IRE_CACHE); /* - * We are called for IRE_CACHES whose ire_stq and ire_ipif - * matches ill. We are only interested in IRE_CACHES that - * has ire_stq->q_ptr pointing at ill_arg. Thus we do the - * filtering here. + * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches + * ill, but we only want to delete the IRE if ire_stq matches. */ - ire_ill = (ill_t *)ire->ire_stq->q_ptr; - - if (ire_ill == ill) + if (ire->ire_stq->q_ptr == ill_arg) ire_delete(ire); } /* - * This is called when an ill leaves the group. 
We want to delete - * all IRE_CACHES whose stq is pointing at ill_wq or ire_ipif is - * pointing at ill. + * Delete all broadcast IREs with a source address on `ill_arg'. */ static void -illgrp_cache_delete(ire_t *ire, char *ill_arg) +ill_broadcast_delete(ire_t *ire, char *ill_arg) { - ill_t *ill = (ill_t *)ill_arg; + ill_t *ill = (ill_t *)ill_arg; ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ill->ill_group == NULL); - /* - * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. - * Hence this should be IRE_CACHE. - */ - ASSERT(ire->ire_type == IRE_CACHE); - /* - * We are called for IRE_CACHES whose ire_stq and ire_ipif - * matches ill. We are interested in both. - */ - ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) || - (ire->ire_ipif->ipif_ill == ill)); + ASSERT(ire->ire_type == IRE_BROADCAST); - ire_delete(ire); + if (ire->ire_ipif->ipif_ill == ill) + ire_delete(ire); } /* @@ -18997,13 +15210,12 @@ ipif_free(ipif_t *ipif) rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); /* Remove pointers to this ill in the multicast routing tables */ reset_mrt_vif_ipif(ipif); + /* If necessary, clear the cached source ipif rotor. */ + if (ipif->ipif_ill->ill_src_ipif == ipif) + ipif->ipif_ill->ill_src_ipif = NULL; rw_exit(&ipst->ips_ill_g_lock); } -/* - * Warning: this is not the only function that calls mi_free on an ipif_t. See - * also ill_move(). - */ static void ipif_free_tail(ipif_t *ipif) { @@ -19036,7 +15248,7 @@ ipif_free_tail(ipif_t *ipif) sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); /* Get it out of the ILL interface list. 
*/ - ipif_remove(ipif, B_TRUE); + ipif_remove(ipif); rw_exit(&ipst->ips_ill_g_lock); mutex_destroy(&ipif->ipif_saved_ire_lock); @@ -19208,8 +15420,10 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); ill_refrele(ill); @@ -19244,7 +15458,7 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, ire_type = IRE_LOOPBACK; else ire_type = IRE_LOCAL; - ipif = ipif_allocate(ill, id, ire_type, B_TRUE); + ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE); if (ipif != NULL) ipif_refhold_locked(ipif); else if (error != NULL) @@ -19342,65 +15556,62 @@ ill_mtu_change(ire_t *ire, char *ill_arg) void ipif_multicast_up(ipif_t *ipif) { - int err, index; + int err; ill_t *ill; ASSERT(IAM_WRITER_IPIF(ipif)); ill = ipif->ipif_ill; - index = ill->ill_phyint->phyint_ifindex; ip1dbg(("ipif_multicast_up\n")); if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) return; if (ipif->ipif_isv6) { + in6_addr_t v6allmc = ipv6_all_hosts_mcast; + in6_addr_t v6solmc = ipv6_solicited_node_mcast; + + v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; + if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) return; - /* Join the all hosts multicast address */ ip1dbg(("ipif_multicast_up - addmulti\n")); + /* - * Passing B_TRUE means we have to join the multicast - * membership on this interface even though this is - * FAILED. If we join on a different one in the group, - * we will not be able to delete the membership later - * as we currently don't track where we join when we - * join within the kernel unlike applications where - * we have ilg/ilg_orig_index. See ip_addmulti_v6 - * for more on this. 
+ * Join the all hosts multicast address. We skip this for + * underlying IPMP interfaces since they should be invisible. */ - err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index, - ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); - if (err != 0) { - ip0dbg(("ipif_multicast_up: " - "all_hosts_mcast failed %d\n", - err)); - return; + if (!IS_UNDER_IPMP(ill)) { + err = ip_addmulti_v6(&v6allmc, ill, ipif->ipif_zoneid, + ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); + if (err != 0) { + ip0dbg(("ipif_multicast_up: " + "all_hosts_mcast failed %d\n", err)); + return; + } + ipif->ipif_joined_allhosts = 1; } + /* * Enable multicast for the solicited node multicast address */ if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { - in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; - - ipv6_multi.s6_addr32[3] |= - ipif->ipif_v6lcl_addr.s6_addr32[3]; - - err = ip_addmulti_v6(&ipv6_multi, ill, index, - ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, - NULL); + err = ip_addmulti_v6(&v6solmc, ill, ipif->ipif_zoneid, + ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); if (err != 0) { ip0dbg(("ipif_multicast_up: solicited MC" " failed %d\n", err)); - (void) ip_delmulti_v6(&ipv6_all_hosts_mcast, - ill, ill->ill_phyint->phyint_ifindex, - ipif->ipif_zoneid, B_TRUE, B_TRUE); + if (ipif->ipif_joined_allhosts) { + (void) ip_delmulti_v6(&v6allmc, ill, + ipif->ipif_zoneid, B_TRUE, B_TRUE); + ipif->ipif_joined_allhosts = 0; + } return; } } } else { - if (ipif->ipif_lcl_addr == INADDR_ANY) + if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) return; /* Join the all hosts multicast address */ @@ -19420,7 +15631,7 @@ ipif_multicast_up(ipif_t *ipif) * (Explicit memberships are blown away in ill_leave_multicast() when the * ill is brought down.) */ -static void +void ipif_multicast_down(ipif_t *ipif) { int err; @@ -19444,19 +15655,18 @@ ipif_multicast_down(ipif_t *ipif) } /* - * Leave the all hosts multicast address. 
Similar to ip_addmulti_v6, - * we should look for ilms on this ill rather than the ones that have - * been failed over here. They are here temporarily. As - * ipif_multicast_up has joined on this ill, we should delete only - * from this ill. + * Leave the all-hosts multicast address. */ - err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, - ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, - B_TRUE, B_TRUE); - if (err != 0) { - ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n", - err)); + if (ipif->ipif_joined_allhosts) { + err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, + ipif->ipif_zoneid, B_TRUE, B_TRUE); + if (err != 0) { + ip0dbg(("ipif_multicast_down: all_hosts_mcast " + "failed %d\n", err)); + } + ipif->ipif_joined_allhosts = 0; } + /* * Disable multicast for the solicited node multicast address */ @@ -19467,9 +15677,7 @@ ipif_multicast_down(ipif_t *ipif) ipif->ipif_v6lcl_addr.s6_addr32[3]; err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, - ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, B_TRUE, B_TRUE); - if (err != 0) { ip0dbg(("ipif_multicast_down: sol MC failed %d\n", err)); @@ -19683,9 +15891,8 @@ ipif_set_default(ipif_t *ipif) * Return 0 if this address can be used as local address without causing * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address * is already up on a different ill, and EADDRINUSE if it's up on the same ill. - * Special checks are needed to allow the same IPv6 link-local address - * on different ills. - * TODO: allowing the same site-local address on different ill's. + * Note that the same IPv6 link-local address is allowed as long as the ills + * are not on the same link. 
*/ int ip_addr_availability_check(ipif_t *new_ipif) @@ -19717,30 +15924,26 @@ ip_addr_availability_check(ipif_t *new_ipif) ipif = ipif->ipif_next) { if ((ipif == new_ipif) || !(ipif->ipif_flags & IPIF_UP) || - (ipif->ipif_flags & IPIF_UNNUMBERED)) + (ipif->ipif_flags & IPIF_UNNUMBERED) || + !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, + &our_v6addr)) continue; - if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, - &our_v6addr)) { - if (new_ipif->ipif_flags & IPIF_POINTOPOINT) - new_ipif->ipif_flags |= IPIF_UNNUMBERED; - else if (ipif->ipif_flags & IPIF_POINTOPOINT) - ipif->ipif_flags |= IPIF_UNNUMBERED; - else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) && - new_ipif->ipif_ill != ill) - continue; - else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) && - new_ipif->ipif_ill != ill) - continue; - else if (new_ipif->ipif_zoneid != - ipif->ipif_zoneid && - ipif->ipif_zoneid != ALL_ZONES && - IS_LOOPBACK(ill)) - continue; - else if (new_ipif->ipif_ill == ill) - return (EADDRINUSE); - else - return (EADDRNOTAVAIL); - } + + if (new_ipif->ipif_flags & IPIF_POINTOPOINT) + new_ipif->ipif_flags |= IPIF_UNNUMBERED; + else if (ipif->ipif_flags & IPIF_POINTOPOINT) + ipif->ipif_flags |= IPIF_UNNUMBERED; + else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) || + IN6_IS_ADDR_SITELOCAL(&our_v6addr)) && + !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill)) + continue; + else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid && + ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill)) + continue; + else if (new_ipif->ipif_ill == ill) + return (EADDRINUSE); + else + return (EADDRNOTAVAIL); } } @@ -19753,13 +15956,15 @@ ip_addr_availability_check(ipif_t *new_ipif) * When the routine returns EINPROGRESS then mp has been consumed and * the ioctl will be acked from ip_rput_dlpi. 
*/ -static int +int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) { - ill_t *ill = ipif->ipif_ill; - boolean_t isv6 = ipif->ipif_isv6; - int err = 0; - boolean_t success; + ill_t *ill = ipif->ipif_ill; + boolean_t isv6 = ipif->ipif_isv6; + int err = 0; + boolean_t success; + uint_t ipif_orig_id; + ip_stack_t *ipst = ill->ill_ipst; ASSERT(IAM_WRITER_IPIF(ipif)); @@ -19769,6 +15974,123 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) if (ipif->ipif_flags & IPIF_UP) return (EALREADY); + /* + * If this is a request to bring up a data address on an interface + * under IPMP, then move the address to its IPMP meta-interface and + * try to bring it up. One complication is that the zeroth ipif for + * an ill is special, in that every ill always has one, and that code + * throughout IP deferences ill->ill_ipif without holding any locks. + */ + if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) && + (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) { + ipif_t *stubipif = NULL, *moveipif = NULL; + ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); + + /* + * The ipif being brought up should be quiesced. If it's not, + * something has gone amiss and we need to bail out. (If it's + * quiesced, we know it will remain so via IPIF_CHANGING.) + */ + mutex_enter(&ill->ill_lock); + if (!ipif_is_quiescent(ipif)) { + mutex_exit(&ill->ill_lock); + return (EINVAL); + } + mutex_exit(&ill->ill_lock); + + /* + * If we're going to need to allocate ipifs, do it prior + * to starting the move (and grabbing locks). + */ + if (ipif->ipif_id == 0) { + moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, + B_FALSE); + stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, + B_FALSE); + if (moveipif == NULL || stubipif == NULL) { + mi_free(moveipif); + mi_free(stubipif); + return (ENOMEM); + } + } + + /* + * Grab or transfer the ipif to move. During the move, keep + * ill_g_lock held to prevent any ill walker threads from + * seeing things in an inconsistent state. 
+ */ + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + if (ipif->ipif_id != 0) { + ipif_remove(ipif); + } else { + ipif_transfer(ipif, moveipif, stubipif); + ipif = moveipif; + } + + /* + * Place the ipif on the IPMP ill. If the zeroth ipif on + * the IPMP ill is a stub (0.0.0.0 down address) then we + * replace that one. Otherwise, pick the next available slot. + */ + ipif->ipif_ill = ipmp_ill; + ipif_orig_id = ipif->ipif_id; + + if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) { + ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL); + ipif = ipmp_ill->ill_ipif; + } else { + ipif->ipif_id = -1; + if (ipif_insert(ipif, B_FALSE) != 0) { + /* + * No more available ipif_id's -- put it back + * on the original ill and fail the operation. + * Since we're writer on the ill, we can be + * sure our old slot is still available. + */ + ipif->ipif_id = ipif_orig_id; + ipif->ipif_ill = ill; + if (ipif_orig_id == 0) { + ipif_transfer(ipif, ill->ill_ipif, + NULL); + } else { + VERIFY(ipif_insert(ipif, B_FALSE) == 0); + } + rw_exit(&ipst->ips_ill_g_lock); + return (ENOMEM); + } + } + rw_exit(&ipst->ips_ill_g_lock); + + /* + * Tell SCTP that the ipif has moved. Note that even if we + * had to allocate a new ipif, the original sequence id was + * preserved and therefore SCTP won't know. + */ + sctp_move_ipif(ipif, ill, ipmp_ill); + + /* + * If the ipif being brought up was on slot zero, then we + * first need to bring up the placeholder we stuck there. In + * ip_rput_dlpi_writer(), ip_arp_done(), or the recursive call + * to ipif_up() itself, if we successfully bring up the + * placeholder, we'll check ill_move_ipif and bring it up too. + */ + if (ipif_orig_id == 0) { + ASSERT(ill->ill_move_ipif == NULL); + ill->ill_move_ipif = ipif; + if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0) + ASSERT(ill->ill_move_ipif == NULL); + if (err != EINPROGRESS) + ill->ill_move_ipif = NULL; + return (err); + } + + /* + * Bring it up on the IPMP ill. 
+ */ + return (ipif_up(ipif, q, mp)); + } + /* Skip arp/ndp for any loopback interface. */ if (ill->ill_wq != NULL) { conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; @@ -19798,7 +16120,6 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) */ ASSERT(connp != NULL || !CONN_Q(q)); - ASSERT(ipsq->ipsq_pending_mp == NULL); if (connp != NULL) mutex_enter(&connp->conn_lock); mutex_enter(&ill->ill_lock); @@ -19810,27 +16131,25 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) return (EINTR); /* - * Crank up IPv6 neighbor discovery - * Unlike ARP, this should complete when - * ipif_ndp_up returns. However, for - * ILLF_XRESOLV interfaces we also send a - * AR_INTERFACE_UP to the external resolver. - * That ioctl will complete in ip_rput. + * Crank up the resolver. For IPv6, this cranks up the + * external resolver if one is configured, but even if an + * external resolver isn't configured, it must be called to + * reset DAD state. For IPv6, if an external resolver is not + * being used, ipif_resolver_up() will never return + * EINPROGRESS, so we can always call ipif_ndp_up() here. + * Note that if an external resolver is being used, there's no + * need to call ipif_ndp_up() since it will do nothing. */ - if (isv6) { - err = ipif_ndp_up(ipif); - if (err != 0) { - if (err != EINPROGRESS) - mp = ipsq_pending_mp_get(ipsq, &connp); - return (err); - } - } - /* Now, ARP */ err = ipif_resolver_up(ipif, Res_act_initial); if (err == EINPROGRESS) { - /* We will complete it in ip_arp_done */ + /* We will complete it in ip_arp_done() */ return (err); } + + if (isv6 && err == 0) + err = ipif_ndp_up(ipif, B_TRUE); + + ASSERT(err != EINPROGRESS); mp = ipsq_pending_mp_get(ipsq, &connp); ASSERT(mp != NULL); if (err != 0) @@ -19843,7 +16162,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); ipif->ipif_addr_ready = 1; } - return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); + + err = isv6 ? 
ipif_up_done_v6(ipif) : ipif_up_done(ipif); + if (err == 0 && ill->ill_move_ipif != NULL) { + ipif = ill->ill_move_ipif; + ill->ill_move_ipif = NULL; + return (ipif_up(ipif, q, mp)); + } + return (err); } /* @@ -19939,13 +16265,6 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) return (EINPROGRESS); bad: ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); - /* - * We don't have to check for possible removal from illgrp - * as we have not yet inserted in illgrp. For groups - * without names, this ipif is still not UP and hence - * this could not have possibly had any influence in forming - * groups. - */ freemsg(bind_mp); freemsg(unbind_mp); @@ -19974,12 +16293,10 @@ ipif_up_done(ipif_t *ipif) ipif_t *tmp_ipif; boolean_t flush_ire_cache = B_TRUE; int err = 0; - phyint_t *phyi; ire_t **ipif_saved_irep = NULL; int ipif_saved_ire_cnt; int cnt; boolean_t src_ipif_held = B_FALSE; - boolean_t ire_added = B_FALSE; boolean_t loopback = B_FALSE; ip_stack_t *ipst = ill->ill_ipst; @@ -20010,7 +16327,7 @@ ipif_up_done(ipif_t *ipif) break; } if (flush_ire_cache) - ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE, + ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); /* @@ -20044,7 +16361,9 @@ ipif_up_done(ipif_t *ipif) ipif->ipif_ire_type = IRE_LOCAL; } - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) { + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) || + ((ipif->ipif_flags & IPIF_DEPRECATED) && + !(ipif->ipif_flags & IPIF_NOFAILOVER))) { /* * Can't use our source address. 
Select a different * source address for the IRE_INTERFACE and IRE_LOCAL @@ -20189,11 +16508,9 @@ ipif_up_done(ipif_t *ipif) } /* - * Need to atomically check for ip_addr_availablity_check - * under ip_addr_avail_lock, and if it fails got bad, and remove - * from group also.The ill_g_lock is grabbed as reader - * just to make sure no new ills or new ipifs are being added - * to the system while we are checking the uniqueness of addresses. + * Need to atomically check for IP address availability under + * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new + * ills or new ipifs can be added while we are checking availability. */ rw_enter(&ipst->ips_ill_g_lock, RW_READER); mutex_enter(&ipst->ips_ip_addr_avail_lock); @@ -20227,13 +16544,6 @@ ipif_up_done(ipif_t *ipif) /* * Add in all newly created IREs. ire_create_bcast() has * already checked for duplicates of the IRE_BROADCAST type. - * We want to add before we call ifgrp_insert which wants - * to know whether IRE_IF_RESOLVER exists or not. - * - * NOTE : We refrele the ire though we may branch to "bad" - * later on where we do ire_delete. This is okay - * because nobody can delete it as we are running - * exclusively. */ for (irep1 = irep; irep1 > ire_array; ) { irep1--; @@ -20243,44 +16553,6 @@ ipif_up_done(ipif_t *ipif) */ (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); } - ire_added = B_TRUE; - /* - * Form groups if possible. - * - * If we are supposed to be in a ill_group with a name, insert it - * now as we know that at least one ipif is UP. Otherwise form - * nameless groups. - * - * If ip_enable_group_ifs is set and ipif address is not 0, insert - * this ipif into the appropriate interface group, or create a - * new one. If this is already in a nameless group, we try to form - * a bigger group looking at other ills potentially sharing this - * ipif's prefix. 
- */ - phyi = ill->ill_phyint; - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - if (ill->ill_ipif_up_count == 1) { - ASSERT(ill->ill_group == NULL); - err = illgrp_insert(&ipst->ips_illgrp_head_v4, ill, - phyi->phyint_groupname, NULL, B_TRUE); - if (err != 0) { - ip1dbg(("ipif_up_done: illgrp allocation " - "failed, error %d\n", err)); - goto bad; - } - } - ASSERT(ill->ill_group != NULL); - } - - /* - * When this is part of group, we need to make sure that - * any broadcast ires created because of this ipif coming - * UP gets marked/cleared with IRE_MARK_NORECV appropriately - * so that we don't receive duplicate broadcast packets. - */ - if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0) - ipif_renominate_bcast(ipif); /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; @@ -20331,19 +16603,30 @@ ipif_up_done(ipif_t *ipif) */ ill_recover_multicast(ill); } - /* Join the allhosts multicast address */ - ipif_multicast_up(ipif); - if (!loopback) { + if (ill->ill_ipif_up_count == 1) { + /* + * Since the interface is now up, it may now be active. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + /* - * See whether anybody else would benefit from the - * new ipif that we added. We call this always rather - * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST - * ipif is for the benefit of illgrp_insert (done above) - * which does not do source address selection as it does - * not want to re-create interface routes that we are - * having reference to it here. + * If this is an IPMP interface, we may now be able to + * establish ARP entries. */ + if (IS_IPMP(ill)) + ipmp_illgrp_refresh_arpent(ill->ill_grp); + } + + /* Join the allhosts multicast address */ + ipif_multicast_up(ipif); + + /* + * See if anybody else would benefit from our new ipif. 
+ */ + if (!loopback && + !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { ill_update_source_selection(ill); } @@ -20386,27 +16669,11 @@ ipif_up_done(ipif_t *ipif) bad: ip1dbg(("ipif_up_done: FAILED \n")); - /* - * We don't have to bother removing from ill groups because - * - * 1) For groups with names, we insert only when the first ipif - * comes up. In that case if it fails, it will not be in any - * group. So, we need not try to remove for that case. - * - * 2) For groups without names, either we tried to insert ipif_ill - * in a group as singleton or found some other group to become - * a bigger group. For the former, if it fails we don't have - * anything to do as ipif_ill is not in the group and for the - * latter, there are no failures in illgrp_insert/illgrp_delete - * (ENOMEM can't occur for this. Check ifgrp_insert). - */ + while (irep > ire_array) { irep--; - if (*irep != NULL) { + if (*irep != NULL) ire_delete(*irep); - if (ire_added) - ire_refrele(*irep); - } } (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); @@ -20417,7 +16684,7 @@ bad: if (src_ipif_held) ipif_refrele(src_ipif); - ipif_arp_down(ipif); + ipif_resolver_down(ipif); return (err); } @@ -20493,119 +16760,6 @@ ill_arp_on(ill_t *ill) } /* - * Called after either deleting ill from the group or when setting - * FAILED or STANDBY on the interface. - */ -static void -illgrp_reset_schednext(ill_t *ill) -{ - ill_group_t *illgrp; - ill_t *save_ill; - - ASSERT(IAM_WRITER_ILL(ill)); - /* - * When called from illgrp_delete, ill_group will be non-NULL. - * But when called from ip_sioctl_flags, it could be NULL if - * somebody is setting FAILED/INACTIVE on some interface which - * is not part of a group. - */ - illgrp = ill->ill_group; - if (illgrp == NULL) - return; - if (illgrp->illgrp_ill_schednext != ill) - return; - - illgrp->illgrp_ill_schednext = NULL; - save_ill = ill; - /* - * Choose a good ill to be the next one for - * outbound traffic. 
As the flags FAILED/STANDBY is - * not yet marked when called from ip_sioctl_flags, - * we check for ill separately. - */ - for (ill = illgrp->illgrp_ill; ill != NULL; - ill = ill->ill_group_next) { - if ((ill != save_ill) && - !(ill->ill_phyint->phyint_flags & - (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) { - illgrp->illgrp_ill_schednext = ill; - return; - } - } -} - -/* - * Given an ill, find the next ill in the group to be scheduled. - * (This should be called by ip_newroute() before ire_create().) - * The passed in ill may be pulled out of the group, after we have picked - * up a different outgoing ill from the same group. However ire add will - * atomically check this. - */ -ill_t * -illgrp_scheduler(ill_t *ill) -{ - ill_t *retill; - ill_group_t *illgrp; - int illcnt; - int i; - uint64_t flags; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * We don't use a lock to check for the ill_group. If this ill - * is currently being inserted we may end up just returning this - * ill itself. That is ok. - */ - if (ill->ill_group == NULL) { - ill_refhold(ill); - return (ill); - } - - /* - * Grab the ill_g_lock as reader to make sure we are dealing with - * a set of stable ills. No ill can be added or deleted or change - * group while we hold the reader lock. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if ((illgrp = ill->ill_group) == NULL) { - rw_exit(&ipst->ips_ill_g_lock); - ill_refhold(ill); - return (ill); - } - - illcnt = illgrp->illgrp_ill_count; - mutex_enter(&illgrp->illgrp_lock); - retill = illgrp->illgrp_ill_schednext; - - if (retill == NULL) - retill = illgrp->illgrp_ill; - - /* - * We do a circular search beginning at illgrp_ill_schednext - * or illgrp_ill. We don't check the flags against the ill lock - * since it can change anytime. The ire creation will be atomic - * and will fail if the ill is FAILED or OFFLINE. 
- */ - for (i = 0; i < illcnt; i++) { - flags = retill->ill_phyint->phyint_flags; - - if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && - ILL_CAN_LOOKUP(retill)) { - illgrp->illgrp_ill_schednext = retill->ill_group_next; - ill_refhold(retill); - break; - } - retill = retill->ill_group_next; - if (retill == NULL) - retill = illgrp->illgrp_ill; - } - mutex_exit(&illgrp->illgrp_lock); - rw_exit(&ipst->ips_ill_g_lock); - - return (i == illcnt ? NULL : retill); -} - -/* * Checks for availbility of a usable source address (if there is one) when the * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note * this selection is done regardless of the destination. @@ -20654,11 +16808,26 @@ ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) } /* - * Determine the best source address given a destination address and an ill. - * Prefers non-deprecated over deprecated but will return a deprecated - * address if there is no other choice. If there is a usable source address - * on the interface pointed to by ill_usesrc_ifindex then that is given - * first preference. + * IP source address type, sorted from worst to best. For a given type, + * always prefer IP addresses on the same subnet. All-zones addresses are + * suboptimal because they pose problems with unlabeled destinations. + */ +typedef enum { + IPIF_NONE, + IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ + IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ + IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ + IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ + IPIF_DIFFNET, /* normal and different subnet */ + IPIF_SAMENET /* normal and same subnet */ +} ipif_type_t; + +/* + * Pick the optimal ipif on `ill' for sending to destination `dst' from zone + * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t + * enumeration, and return the highest-rated ipif. 
If there's a tie, we pick + * the first one, unless IPMP is used in which case we round-robin among them; + * see below for more. * * Returns NULL if there is no suitable source address for the ill. * This only occurs when there is no valid source address for the ill. @@ -20666,17 +16835,13 @@ ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) ipif_t * ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) { - ipif_t *ipif; - ipif_t *ipif_dep = NULL; /* Fallback to deprecated */ - ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE]; - int index = 0; - boolean_t wrapped = B_FALSE; - boolean_t same_subnet_only = B_FALSE; - boolean_t ipif_same_found, ipif_other_found; - boolean_t specific_found; - ill_t *till, *usill = NULL; + ill_t *usill = NULL; + ill_t *ipmp_ill = NULL; + ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; + ipif_type_t type, best_type; tsol_tpc_t *src_rhtp, *dst_rhtp; - ip_stack_t *ipst = ill->ill_ipst; + ip_stack_t *ipst = ill->ill_ipst; + boolean_t samenet; if (ill->ill_usesrc_ifindex != 0) { usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, @@ -20688,6 +16853,17 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) } /* + * Test addresses should never be used for source address selection, + * so if we were passed one, switch to the IPMP meta-interface. + */ + if (IS_UNDER_IPMP(ill)) { + if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) + ill = ipmp_ill; /* Select source from IPMP ill */ + else + return (NULL); + } + + /* * If we're dealing with an unlabeled destination on a labeled system, * make sure that we ignore source addresses that are incompatible with * the destination's default label. That destination's default label @@ -20705,7 +16881,7 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) } /* - * Holds the ill_g_lock as reader. This makes sure that no ipif/ill + * Hold the ill_g_lock as reader. This makes sure that no ipif/ill * can be deleted. But an ipif/ill can get CONDEMNED any time. 
* After selecting the right ipif, under ill_lock make sure ipif is * not condemned, and increment refcnt. If ipif is CONDEMNED, @@ -20713,190 +16889,117 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) * but not under a lock. */ rw_enter(&ipst->ips_ill_g_lock, RW_READER); - retry: - till = ill; - ipif_arr[0] = NULL; + /* + * For source address selection, we treat the ipif list as circular + * and continue until we get back to where we started. This allows + * IPMP to vary source address selection (which improves inbound load + * spreading) by caching its last ending point and starting from + * there. NOTE: we don't have to worry about ill_src_ipif changing + * ills since that can't happen on the IPMP ill. + */ + start_ipif = ill->ill_ipif; + if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) + start_ipif = ill->ill_src_ipif; - if (till->ill_group != NULL) - till = till->ill_group->illgrp_ill; + ipif = start_ipif; + best_ipif = NULL; + best_type = IPIF_NONE; + do { + if ((next_ipif = ipif->ipif_next) == NULL) + next_ipif = ill->ill_ipif; - /* - * Choose one good source address from each ill across the group. - * If possible choose a source address in the same subnet as - * the destination address. - * - * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE - * This is okay because of the following. - * - * If PHYI_FAILED is set and we still have non-deprecated - * addresses, it means the addresses have not yet been - * failed over to a different interface. We potentially - * select them to create IRE_CACHES, which will be later - * flushed when the addresses move over. - * - * If PHYI_INACTIVE is set and we still have non-deprecated - * addresses, it means either the user has configured them - * or PHYI_INACTIVE has not been cleared after the addresses - * been moved over. For the former, in.mpathd does a failover - * when the interface becomes INACTIVE and hence we should - * not find them. 
Once INACTIVE is set, we don't allow them - * to create logical interfaces anymore. For the latter, a - * flush will happen when INACTIVE is cleared which will - * flush the IRE_CACHES. - * - * If PHYI_OFFLINE is set, all the addresses will be failed - * over soon. We potentially select them to create IRE_CACHEs, - * which will be later flushed when the addresses move over. - * - * NOTE : As ipif_select_source is called to borrow source address - * for an ipif that is part of a group, source address selection - * will be re-done whenever the group changes i.e either an - * insertion/deletion in the group. - * - * Fill ipif_arr[] with source addresses, using these rules: - * - * 1. At most one source address from a given ill ends up - * in ipif_arr[] -- that is, at most one of the ipif's - * associated with a given ill ends up in ipif_arr[]. - * - * 2. If there is at least one non-deprecated ipif in the - * IPMP group with a source address on the same subnet as - * our destination, then fill ipif_arr[] only with - * source addresses on the same subnet as our destination. - * Note that because of (1), only the first - * non-deprecated ipif found with a source address - * matching the destination ends up in ipif_arr[]. - * - * 3. Otherwise, fill ipif_arr[] with non-deprecated source - * addresses not in the same subnet as our destination. - * Again, because of (1), only the first off-subnet source - * address will be chosen. - * - * 4. If there are no non-deprecated ipifs, then just use - * the source address associated with the last deprecated - * one we find that happens to be on the same subnet, - * otherwise the first one not in the same subnet. 
- */ - specific_found = B_FALSE; - for (; till != NULL; till = till->ill_group_next) { - ipif_same_found = B_FALSE; - ipif_other_found = B_FALSE; - for (ipif = till->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) - continue; - /* Always skip NOLOCAL and ANYCAST interfaces */ - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) - continue; - if (!(ipif->ipif_flags & IPIF_UP) || - !ipif->ipif_addr_ready) - continue; - if (ipif->ipif_zoneid != zoneid && - ipif->ipif_zoneid != ALL_ZONES) - continue; - /* - * Interfaces with 0.0.0.0 address are allowed to be UP, - * but are not valid as source addresses. - */ - if (ipif->ipif_lcl_addr == INADDR_ANY) - continue; + if (!IPIF_CAN_LOOKUP(ipif)) + continue; + /* Always skip NOLOCAL and ANYCAST interfaces */ + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) + continue; + if (!(ipif->ipif_flags & IPIF_UP) || !ipif->ipif_addr_ready) + continue; + if (ipif->ipif_zoneid != zoneid && + ipif->ipif_zoneid != ALL_ZONES) + continue; - /* - * Check compatibility of local address for - * destination's default label if we're on a labeled - * system. Incompatible addresses can't be used at - * all. - */ - if (dst_rhtp != NULL) { - boolean_t incompat; + /* + * Interfaces with 0.0.0.0 address are allowed to be UP, but + * are not valid as source addresses. + */ + if (ipif->ipif_lcl_addr == INADDR_ANY) + continue; - src_rhtp = find_tpc(&ipif->ipif_lcl_addr, - IPV4_VERSION, B_FALSE); - if (src_rhtp == NULL) - continue; - incompat = - src_rhtp->tpc_tp.host_type != SUN_CIPSO || - src_rhtp->tpc_tp.tp_doi != - dst_rhtp->tpc_tp.tp_doi || - (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, - &src_rhtp->tpc_tp.tp_sl_range_cipso) && - !blinlset(&dst_rhtp->tpc_tp.tp_def_label, - src_rhtp->tpc_tp.tp_sl_set_cipso)); - TPC_RELE(src_rhtp); - if (incompat) - continue; - } + /* + * Check compatibility of local address for destination's + * default label if we're on a labeled system. 
Incompatible + * addresses can't be used at all. + */ + if (dst_rhtp != NULL) { + boolean_t incompat; - /* - * We prefer not to use all all-zones addresses, if we - * can avoid it, as they pose problems with unlabeled - * destinations. - */ - if (ipif->ipif_zoneid != ALL_ZONES) { - if (!specific_found && - (!same_subnet_only || - (ipif->ipif_net_mask & dst) == - ipif->ipif_subnet)) { - index = 0; - specific_found = B_TRUE; - ipif_other_found = B_FALSE; - } - } else { - if (specific_found) - continue; - } - if (ipif->ipif_flags & IPIF_DEPRECATED) { - if (ipif_dep == NULL || - (ipif->ipif_net_mask & dst) == - ipif->ipif_subnet) - ipif_dep = ipif; + src_rhtp = find_tpc(&ipif->ipif_lcl_addr, + IPV4_VERSION, B_FALSE); + if (src_rhtp == NULL) + continue; + incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO || + src_rhtp->tpc_tp.tp_doi != + dst_rhtp->tpc_tp.tp_doi || + (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, + &src_rhtp->tpc_tp.tp_sl_range_cipso) && + !blinlset(&dst_rhtp->tpc_tp.tp_def_label, + src_rhtp->tpc_tp.tp_sl_set_cipso)); + TPC_RELE(src_rhtp); + if (incompat) continue; - } - if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) { - /* found a source address in the same subnet */ - if (!same_subnet_only) { - same_subnet_only = B_TRUE; - index = 0; - } - ipif_same_found = B_TRUE; - } else { - if (same_subnet_only || ipif_other_found) - continue; - ipif_other_found = B_TRUE; - } - ipif_arr[index++] = ipif; - if (index == MAX_IPIF_SELECT_SOURCE) { - wrapped = B_TRUE; - index = 0; - } - if (ipif_same_found) - break; } - } - if (ipif_arr[0] == NULL) { - ipif = ipif_dep; - } else { - if (wrapped) - index = MAX_IPIF_SELECT_SOURCE; - ipif = ipif_arr[ipif_rand(ipst) % index]; - ASSERT(ipif != NULL); - } + samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet); - if (ipif != NULL) { + if (ipif->ipif_flags & IPIF_DEPRECATED) { + type = samenet ? IPIF_SAMENET_DEPRECATED : + IPIF_DIFFNET_DEPRECATED; + } else if (ipif->ipif_zoneid == ALL_ZONES) { + type = samenet ? 
IPIF_SAMENET_ALLZONES : + IPIF_DIFFNET_ALLZONES; + } else { + type = samenet ? IPIF_SAMENET : IPIF_DIFFNET; + } + + if (type > best_type) { + best_type = type; + best_ipif = ipif; + if (best_type == IPIF_SAMENET) + break; /* can't get better */ + } + } while ((ipif = next_ipif) != start_ipif); + + if ((ipif = best_ipif) != NULL) { mutex_enter(&ipif->ipif_ill->ill_lock); if (!IPIF_CAN_LOOKUP(ipif)) { mutex_exit(&ipif->ipif_ill->ill_lock); goto retry; } ipif_refhold_locked(ipif); + + /* + * For IPMP, update the source ipif rotor to the next ipif, + * provided we can look it up. (We must not use it if it's + * IPIF_CONDEMNED since we may have grabbed ill_g_lock after + * ipif_free() checked ill_src_ipif.) + */ + if (IS_IPMP(ill) && ipif != NULL) { + next_ipif = ipif->ipif_next; + if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif)) + ill->ill_src_ipif = next_ipif; + else + ill->ill_src_ipif = NULL; + } mutex_exit(&ipif->ipif_ill->ill_lock); } rw_exit(&ipst->ips_ill_g_lock); if (usill != NULL) ill_refrele(usill); + if (ipmp_ill != NULL) + ill_refrele(ipmp_ill); if (dst_rhtp != NULL) TPC_RELE(dst_rhtp); @@ -20929,8 +17032,7 @@ retry: * ipif_update_other_ipifs calls us. * * If old_ipif is NULL, just redo the source address selection - * if needed. This happens when illgrp_insert or ipif_up_done - * calls us. + * if needed. This happens when ipif_up_done calls us. */ static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) @@ -21064,49 +17166,31 @@ ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) /* * This old_ipif is going away. * - * Determine if any other ipif's is using our address as + * Determine if any other ipif's are using our address as * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or * IPIF_DEPRECATED). * Find the IRE_INTERFACE for such ipifs and recreate them * to use an different source address following the rules in * ipif_up_done. 
- * - * This function takes an illgrp as an argument so that illgrp_delete - * can call this to update source address even after deleting the - * old_ipif->ipif_ill from the ill group. */ static void -ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp) +ipif_update_other_ipifs(ipif_t *old_ipif) { - ipif_t *ipif; - ill_t *ill; + ipif_t *ipif; + ill_t *ill; char buf[INET6_ADDRSTRLEN]; ASSERT(IAM_WRITER_IPIF(old_ipif)); - ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif)); ill = old_ipif->ipif_ill; - ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", - ill->ill_name, - inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, - buf, sizeof (buf)))); - /* - * If this part of a group, look at all ills as ipif_select_source - * borrows source address across all the ills in the group. - */ - if (illgrp != NULL) - ill = illgrp->illgrp_ill; - - for (; ill != NULL; ill = ill->ill_group_next) { - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (ipif == old_ipif) - continue; + ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", ill->ill_name, + inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, buf, sizeof (buf)))); - ipif_recreate_interface_routes(old_ipif, ipif); - } + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if (ipif == old_ipif) + continue; + ipif_recreate_interface_routes(old_ipif, ipif); } } @@ -21117,8 +17201,7 @@ if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, { /* * ill_phyint_reinit merged the v4 and v6 into a single - * ipsq. Could also have become part of a ipmp group in the - * process, and we might not have been able to complete the + * ipsq. We might not have been able to complete the * operation in ipif_set_values, if we could not become * exclusive. If so restart it here. */ @@ -21171,6 +17254,48 @@ ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, } /* + * Refresh all IRE_BROADCAST entries associated with `ill' to ensure the + * minimum (but complete) set exist. 
This is necessary when adding or + * removing an interface to/from an IPMP group, since interfaces in an + * IPMP group use the IRE_BROADCAST entries for the IPMP group (whenever + * its test address subnets overlap with IPMP data addresses). It's also + * used to refresh the IRE_BROADCAST entries associated with the IPMP + * interface when the nominated broadcast interface changes. + */ +void +ill_refresh_bcast(ill_t *ill) +{ + ire_t *ire_array[12]; /* max ipif_create_bcast_ires() can create */ + ire_t **irep; + ipif_t *ipif; + + ASSERT(!ill->ill_isv6); + ASSERT(IAM_WRITER_ILL(ill)); + + /* + * Remove any old broadcast IREs. + */ + ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_BROADCAST, + ill_broadcast_delete, ill, ill); + + /* + * Create new ones for any ipifs that are up and broadcast-capable. + */ + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if ((ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST)) != + (IPIF_UP|IPIF_BROADCAST)) + continue; + + irep = ipif_create_bcast_ires(ipif, ire_array); + while (irep-- > ire_array) { + (void) ire_add(irep, NULL, NULL, NULL, B_FALSE); + if (*irep != NULL) + ire_refrele(*irep); + } + } +} + +/* * Create any IRE_BROADCAST entries for `ipif', and store those entries in * `irep'. Returns a pointer to the next free `irep' entry (just like * ire_check_and_create_bcast()). @@ -21433,10 +17558,33 @@ ipif_check_bcast_ires(ipif_t *test_ipif) /* * Walk through all the ipifs that will be affected by the dying IREs, - * and recreate the IREs as necessary. + * and recreate the IREs as necessary. Note that all interfaces in an + * IPMP illgrp share the same broadcast IREs, and thus the entire + * illgrp must be walked, starting with the IPMP meta-interface (so + * that broadcast IREs end up on it whenever possible). 
*/ + if (IS_UNDER_IPMP(ill)) + ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); + irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); + if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { + ipmp_illgrp_t *illg = ill->ill_grp; + + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { + for (i = 0; i < BCAST_COUNT; i++) { + if (bireinfo[i].bi_willdie && + !bireinfo[i].bi_haverep) + break; + } + if (i == BCAST_COUNT) + break; + + irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); + } + } + /* * Scan through the set of broadcast IREs and see if there are any * that we need to replace that have not yet been replaced. If so, @@ -21528,7 +17676,7 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* * If there's another ill already with the requested name, ensure - * that it's of the same type. Otherwise, ill_phyint_reinit() will + * that it's of the same type. Otherwise, ill_phyint_reinit() will * fuse together two unrelated ills, which will cause chaos. */ ipst = ill->ill_ipst; @@ -21620,8 +17768,7 @@ ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, { /* * ill_phyint_reinit merged the v4 and v6 into a single - * ipsq. Could also have become part of a ipmp group in the - * process, and we might not have been able to complete the + * ipsq. We might not have been able to complete the * slifname in ipif_set_values, if we could not become * exclusive. If so restart it here */ @@ -21665,85 +17812,6 @@ ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, return (ipif); } -typedef struct conn_change_s { - uint_t cc_old_ifindex; - uint_t cc_new_ifindex; -} conn_change_t; - -/* - * ipcl_walk function for changing interface index. 
- */ -static void -conn_change_ifindex(conn_t *connp, caddr_t arg) -{ - conn_change_t *connc; - uint_t old_ifindex; - uint_t new_ifindex; - int i; - ilg_t *ilg; - - connc = (conn_change_t *)arg; - old_ifindex = connc->cc_old_ifindex; - new_ifindex = connc->cc_new_ifindex; - - if (connp->conn_orig_bound_ifindex == old_ifindex) - connp->conn_orig_bound_ifindex = new_ifindex; - - if (connp->conn_orig_multicast_ifindex == old_ifindex) - connp->conn_orig_multicast_ifindex = new_ifindex; - - for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { - ilg = &connp->conn_ilg[i]; - if (ilg->ilg_orig_ifindex == old_ifindex) - ilg->ilg_orig_ifindex = new_ifindex; - } -} - -/* - * Walk all the ipifs and ilms on this ill and change the orig_ifindex - * to new_index if it matches the old_index. - * - * Failovers typically happen within a group of ills. But somebody - * can remove an ill from the group after a failover happened. If - * we are setting the ifindex after this, we potentially need to - * look at all the ills rather than just the ones in the group. - * We cut down the work by looking at matching ill_net_types - * and ill_types as we could not possibly grouped them together. 
- */ -static void -ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc) -{ - ill_t *ill; - ipif_t *ipif; - uint_t old_ifindex; - uint_t new_ifindex; - ilm_t *ilm; - ill_walk_context_t ctx; - ip_stack_t *ipst = ill_orig->ill_ipst; - - old_ifindex = connc->cc_old_ifindex; - new_ifindex = connc->cc_new_ifindex; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill = ILL_START_WALK_ALL(&ctx, ipst); - for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if ((ill_orig->ill_net_type != ill->ill_net_type) || - (ill_orig->ill_type != ill->ill_type)) { - continue; - } - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_orig_ifindex == old_ifindex) - ipif->ipif_orig_ifindex = new_ifindex; - } - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (ilm->ilm_orig_ifindex == old_ifindex) - ilm->ilm_orig_ifindex = new_ifindex; - } - } - rw_exit(&ipst->ips_ill_g_lock); -} - /* * We first need to ensure that the new index is unique, and * then carry the change across both v4 and v6 ill representation @@ -21755,13 +17823,10 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) { ill_t *ill; - ill_t *ill_other; phyint_t *phyi; - int old_index; - conn_change_t connc; struct ifreq *ifr = (struct ifreq *)ifreq; struct lifreq *lifr = (struct lifreq *)ifreq; - uint_t index; + uint_t old_index, index; ill_t *ill_v4; ill_t *ill_v6; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; @@ -21773,31 +17838,15 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* * Only allow on physical interface. Also, index zero is illegal. - * - * Need to check for PHYI_FAILED and PHYI_INACTIVE - * - * 1) If PHYI_FAILED is set, a failover could have happened which - * implies a possible failback might have to happen. As failback - * depends on the old index, we should fail setting the index. 
- * - * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that - * any addresses or multicast memberships are failed over to - * a non-STANDBY interface. As failback depends on the old - * index, we should fail setting the index for this case also. - * - * 3) If PHYI_OFFLINE is set, a possible failover has happened. - * Be consistent with PHYI_FAILED and fail the ioctl. */ ill = ipif->ipif_ill; phyi = ill->ill_phyint; - if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) || - ipif->ipif_id != 0 || index == 0) { + if (ipif->ipif_id != 0 || index == 0) { return (EINVAL); } - old_index = phyi->phyint_ifindex; /* If the index is not changing, no work to do */ - if (old_index == index) + if (phyi->phyint_ifindex == index) return (0); /* @@ -21816,31 +17865,17 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (EBUSY); } - /* - * The new index is unused. Set it in the phyint. - * Locate the other ill so that we can send a routing - * sockets message. - */ - if (ill->ill_isv6) { - ill_other = phyi->phyint_illv4; - } else { - ill_other = phyi->phyint_illv6; - } - + /* The new index is unused. Set it in the phyint. */ + old_index = phyi->phyint_ifindex; phyi->phyint_ifindex = index; /* Update SCTP's ILL list */ sctp_ill_reindex(ill, old_index); - connc.cc_old_ifindex = old_index; - connc.cc_new_ifindex = index; - ip_change_ifindex(ill, &connc); - ipcl_walk(conn_change_ifindex, (caddr_t)&connc, ipst); - /* Send the routing sockets message */ - ip_rts_ifmsg(ipif); - if (ill_other != NULL) - ip_rts_ifmsg(ill_other->ill_ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); + if (ILL_OTHER(ill)) + ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); return (0); } @@ -22038,6 +18073,45 @@ ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, B_TRUE)); } +/* + * Return the number of addresses on `ill' with one or more of the values + * in `set' set and all of the values in `clear' clear. 
+ */ +static uint_t +ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear) +{ + ipif_t *ipif; + uint_t cnt = 0; + + ASSERT(IAM_WRITER_ILL(ill)); + + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) + if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear)) + cnt++; + + return (cnt); +} + +/* + * Return the number of migratable addresses on `ill' that are under + * application control. + */ +uint_t +ill_appaddr_cnt(const ill_t *ill) +{ + return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF, + IPIF_NOFAILOVER)); +} + +/* + * Return the number of point-to-point addresses on `ill'. + */ +uint_t +ill_ptpaddr_cnt(const ill_t *ill) +{ + return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0)); +} + /* ARGSUSED */ int ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, @@ -22158,7 +18232,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; int err = 0, ret; uint_t ifindex; - phyint_t *us_phyint, *us_cli_phyint; ipsq_t *ipsq = NULL; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; @@ -22167,19 +18240,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ASSERT(CONN_Q(q)); isv6 = (Q_TO_CONN(q))->conn_af_isv6; - us_cli_phyint = usesrc_cli_ill->ill_phyint; - - ASSERT(us_cli_phyint != NULL); - - /* - * If the client ILL is being used for IPMP, abort. - * Note, this can be done before ipsq_try_enter since we are already - * exclusive on this ILL - */ - if ((us_cli_phyint->phyint_groupname != NULL) || - (us_cli_phyint->phyint_flags & PHYI_STANDBY)) { - return (EINVAL); - } ifindex = lifr->lifr_index; if (ifindex == 0) { @@ -22198,15 +18258,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (err); } - /* - * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP - * group nor can either of the interfaces be used for standy. 
So - * to guarantee mutual exclusion with ip_sioctl_flags (which sets - * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname) - * we need to be exclusive on the ipsq belonging to the usesrc_ill. - * We are already exlusive on this ipsq i.e ipsq corresponding to - * the usesrc_cli_ill - */ ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, NEW_OP, B_TRUE); if (ipsq == NULL) { @@ -22215,11 +18266,19 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, goto done; } - /* Check if the usesrc_ill is used for IPMP */ - us_phyint = usesrc_ill->ill_phyint; - if ((us_phyint->phyint_groupname != NULL) || - (us_phyint->phyint_flags & PHYI_STANDBY)) { - err = EINVAL; + /* USESRC isn't currently supported with IPMP */ + if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { + err = ENOTSUP; + goto done; + } + + /* + * USESRC isn't compatible with the STANDBY flag. (STANDBY is only + * used by IPMP underlying interfaces, but someone might think it's + * more general and try to use it independently with VNI.) + */ + if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { + err = ENOTSUP; goto done; } @@ -22372,79 +18431,45 @@ ill_phyint_compare_name(const void *name_ptr, const void *phyip) return (-1); return (0); } + /* - * This function is called from ill_delete when the ill is being - * unplumbed. We remove the reference from the phyint and we also - * free the phyint when there are no more references to it. + * This function is called on the unplumb path via ill_glist_delete() when + * there are no ills left on the phyint and thus the phyint can be freed. 
*/ static void -ill_phyint_free(ill_t *ill) +phyint_free(phyint_t *phyi) { - phyint_t *phyi; - phyint_t *next_phyint; - ipsq_t *cur_ipsq; - ip_stack_t *ipst = ill->ill_ipst; + ip_stack_t *ipst = PHYINT_TO_IPST(phyi); - ASSERT(ill->ill_phyint != NULL); + ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - phyi = ill->ill_phyint; - ill->ill_phyint = NULL; /* - * ill_init allocates a phyint always to store the copy - * of flags relevant to phyint. At that point in time, we could - * not assign the name and hence phyint_illv4/v6 could not be - * initialized. Later in ipif_set_values, we assign the name to - * the ill, at which point in time we assign phyint_illv4/v6. - * Thus we don't rely on phyint_illv6 to be initialized always. + * If this phyint was an IPMP meta-interface, blow away the group. + * This is safe to do because all of the illgrps have already been + * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us. + * If we're cleaning up as a result of failed initialization, + * phyint_grp may be NULL. */ - if (ill->ill_flags & ILLF_IPV6) { - phyi->phyint_illv6 = NULL; - } else { - phyi->phyint_illv4 = NULL; + if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + ipmp_grp_destroy(phyi->phyint_grp); + phyi->phyint_grp = NULL; + rw_exit(&ipst->ips_ipmp_lock); } - /* - * ipif_down removes it from the group when the last ipif goes - * down. - */ - ASSERT(ill->ill_group == NULL); - - if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) - return; /* - * Make sure this phyint was put in the list. + * If this interface was under IPMP, take it out of the group. 
*/ - if (phyi->phyint_ifindex > 0) { - avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi); - avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, - phyi); - } + if (phyi->phyint_grp != NULL) + ipmp_phyint_leave_grp(phyi); + /* - * remove phyint from the ipsq list. + * Delete the phyint and disassociate its ipsq. The ipsq itself + * will be freed in ipsq_exit(). */ - cur_ipsq = phyi->phyint_ipsq; - if (phyi == cur_ipsq->ipsq_phyint_list) { - cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next; - } else { - next_phyint = cur_ipsq->ipsq_phyint_list; - while (next_phyint != NULL) { - if (next_phyint->phyint_ipsq_next == phyi) { - next_phyint->phyint_ipsq_next = - phyi->phyint_ipsq_next; - break; - } - next_phyint = next_phyint->phyint_ipsq_next; - } - ASSERT(next_phyint != NULL); - } - IPSQ_DEC_REF(cur_ipsq, ipst); + phyi->phyint_ipsq->ipsq_phyint = NULL; + phyi->phyint_name[0] = '\0'; - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - mi_free(phyi->phyint_groupname); - } mi_free(phyi); } @@ -22464,7 +18489,6 @@ ill_phyint_reinit(ill_t *ill) phyint_t *phyi; avl_index_t where = 0; ill_t *ill_other = NULL; - ipsq_t *ipsq; ip_stack_t *ipst = ill->ill_ipst; ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); @@ -22476,6 +18500,11 @@ ill_phyint_reinit(ill_t *ill) phyi_old->phyint_illv4 == NULL)); ASSERT(phyi_old->phyint_ifindex == 0); + /* + * Now that our ill has a name, set it in the phyint. + */ + (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ); + phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, ill->ill_name, &where); @@ -22497,8 +18526,7 @@ ill_phyint_reinit(ill_t *ill) * we are initializing IPv4. */ if (phyi != NULL) { - ill_other = (isv6) ? phyi->phyint_illv4 : - phyi->phyint_illv6; + ill_other = (isv6) ? 
phyi->phyint_illv4 : phyi->phyint_illv6; ASSERT(ill_other->ill_phyint != NULL); ASSERT((isv6 && !ill_other->ill_isv6) || (!isv6 && ill_other->ill_isv6)); @@ -22517,26 +18545,15 @@ ill_phyint_reinit(ill_t *ill) ASSERT(phyi->phyint_illv4 == NULL); phyi->phyint_illv4 = ill; } - /* - * This is a new ill, currently undergoing SLIFNAME - * So we could not have joined an IPMP group until now. - */ - ASSERT(phyi_old->phyint_ipsq_next == NULL && - phyi_old->phyint_groupname == NULL); /* - * This phyi_old is going away. Decref ipsq_refs and - * assert it is zero. The ipsq itself will be freed in - * ipsq_exit + * Delete the old phyint and make its ipsq eligible + * to be freed in ipsq_exit(). */ - ipsq = phyi_old->phyint_ipsq; - IPSQ_DEC_REF(ipsq, ipst); - ASSERT(ipsq->ipsq_refs == 0); - /* Get the singleton phyint out of the ipsq list */ - ASSERT(phyi_old->phyint_ipsq_next == NULL); - ipsq->ipsq_phyint_list = NULL; phyi_old->phyint_illv4 = NULL; phyi_old->phyint_illv6 = NULL; + phyi_old->phyint_ipsq->ipsq_phyint = NULL; + phyi_old->phyint_name[0] = '\0'; mi_free(phyi_old); } else { mutex_enter(&ill->ill_lock); @@ -22551,9 +18568,6 @@ ill_phyint_reinit(ill_t *ill) if (!phyint_assign_ifindex(phyi, ipst)) cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); - /* No IPMP group yet, thus the hook uses the ifindex */ - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, (void *)phyi, where); @@ -22571,13 +18585,6 @@ ill_phyint_reinit(ill_t *ill) ill->ill_phyint = phyi; /* - * Keep the index on ipif_orig_index to be used by FAILOVER. - * We do this here as when the first ipif was allocated, - * ipif_allocate does not know the right interface index. 
- */ - - ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex; - /* * Now that the phyint's ifindex has been assigned, complete the * remaining */ @@ -22606,45 +18613,14 @@ ill_phyint_reinit(ill_t *ill) */ if (ill->ill_name_length <= 2 || ill->ill_name[0] != 'l' || ill->ill_name[1] != 'o') { - /* - * Generate nic plumb event for ill_name even if - * ipmp_hook_emulation is set. That avoids generating events - * for the ill_names should ipmp_hook_emulation be turned on - * later. - */ - ill_nic_event_plumb(ill, B_FALSE); + ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name, + ill->ill_name_length); } RELEASE_ILL_LOCKS(ill, ill_other); mutex_exit(&phyi->phyint_lock); } /* - * Allocate a NE_PLUMB nic info event and store in the ill. - * If 'group' is set we do it for the group name, otherwise the ill name. - * It will be sent when we leave the ipsq. - */ -void -ill_nic_event_plumb(ill_t *ill, boolean_t group) -{ - phyint_t *phyi = ill->ill_phyint; - char *name; - int namelen; - - ASSERT(MUTEX_HELD(&ill->ill_lock)); - - if (group) { - ASSERT(phyi->phyint_groupname_len != 0); - namelen = phyi->phyint_groupname_len; - name = phyi->phyint_groupname; - } else { - namelen = ill->ill_name_length; - name = ill->ill_name; - } - - ill_nic_event_dispatch(ill, 0, NE_PLUMB, name, namelen); -} - -/* * Notify any downstream modules of the name of this interface. * An M_IOCTL is used even though we don't expect a successful reply. * Any reply message from the driver (presumably an M_IOCNAK) will @@ -22686,8 +18662,9 @@ ip_ifname_notify(ill_t *ill, queue_t *q) static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) { - int err; + int err; ip_stack_t *ipst = ill->ill_ipst; + phyint_t *phyi = ill->ill_phyint; /* Set the obsolete NDD per-interface forwarding name. 
*/ err = ill_set_ndd_name(ill); @@ -22696,6 +18673,34 @@ ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) err); } + /* + * Now that ill_name is set, the configuration for the IPMP + * meta-interface can be performed. + */ + if (IS_IPMP(ill)) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + /* + * If phyi->phyint_grp is NULL, then this is the first IPMP + * meta-interface and we need to create the IPMP group. + */ + if (phyi->phyint_grp == NULL) { + /* + * If someone has renamed another IPMP group to have + * the same name as our interface, bail. + */ + if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) { + rw_exit(&ipst->ips_ipmp_lock); + return (EEXIST); + } + phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi); + if (phyi->phyint_grp == NULL) { + rw_exit(&ipst->ips_ipmp_lock); + return (ENOMEM); + } + } + rw_exit(&ipst->ips_ipmp_lock); + } + /* Tell downstream modules where they are. */ ip_ifname_notify(ill, q); @@ -22966,10 +18971,10 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) /* * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. */ - if (ipsq->ipsq_current_ipif == NULL) + if (ipsq->ipsq_xop->ipx_current_ipif == NULL) ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); else - ASSERT(ipsq->ipsq_current_ipif == ipif); + ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif); error = ipif_set_values_tail(ill, ipif, mp, q); ipsq_exit(ipsq); @@ -22986,18 +18991,8 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) void ipif_init(ip_stack_t *ipst) { - hrtime_t hrt; int i; - /* - * Can't call drv_getparm here as it is too early in the boot. - * As we use ipif_src_random just for picking a different - * source address everytime, this need not be really random. 
- */ - hrt = gethrtime(); - ipst->ips_ipif_src_random = - ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff); - for (i = 0; i < MAX_G_HEADS; i++) { ipst->ips_ill_g_heads[i].ill_g_list_head = (ill_if_t *)&ipst->ips_ill_g_heads[i]; @@ -23023,7 +19018,11 @@ ipif_init(ip_stack_t *ipst) * match is found to take care of such rare network configurations like - * le0: 129.146.1.1/16 * le1: 129.146.2.2/24 - * It is used only by SO_DONTROUTE at the moment. + * + * This is used by SO_DONTROUTE and IP_NEXTHOP. Since neither of those are + * supported on underlying interfaces in an IPMP group, underlying interfaces + * are ignored when looking up a match. (If we didn't ignore them, we'd + * risk using a test address as a source for outgoing traffic.) */ ipif_t * ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) @@ -23038,6 +19037,8 @@ ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill)) + continue; mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -23660,30 +19661,76 @@ ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa, * Knows about IEEE 802 and IEEE EUI-64 mappings. 
*/ static boolean_t -ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) +ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr) { char *addr; - if (phys_length != ETHERADDRL) + if (ill->ill_phys_addr_length != ETHERADDRL) return (B_FALSE); /* Form EUI-64 like address */ addr = (char *)&v6addr->s6_addr32[2]; - bcopy((char *)phys_addr, addr, 3); + bcopy(ill->ill_phys_addr, addr, 3); addr[0] ^= 0x2; /* Toggle Universal/Local bit */ addr[3] = (char)0xff; addr[4] = (char)0xfe; - bcopy((char *)phys_addr + 3, addr + 5, 3); + bcopy(ill->ill_phys_addr + 3, addr + 5, 3); return (B_TRUE); } /* ARGSUSED */ static boolean_t -ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) +ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr) { return (B_FALSE); } +typedef struct ipmp_ifcookie { + uint32_t ic_hostid; + char ic_ifname[LIFNAMSIZ]; + char ic_zonename[ZONENAME_MAX]; +} ipmp_ifcookie_t; + +/* + * Construct a pseudo-random interface ID for the IPMP interface that's both + * predictable and (almost) guaranteed to be unique. + */ +static boolean_t +ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) +{ + zone_t *zp; + uint8_t *addr; + uchar_t hash[16]; + ulong_t hostid; + MD5_CTX ctx; + ipmp_ifcookie_t ic = { 0 }; + + ASSERT(IS_IPMP(ill)); + + (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); + ic.ic_hostid = htonl((uint32_t)hostid); + + (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ); + + if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) { + (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX); + zone_rele(zp); + } + + MD5Init(&ctx); + MD5Update(&ctx, &ic, sizeof (ic)); + MD5Final(hash, &ctx); + + /* + * Map the hash to an interface ID per the basic approach in RFC3041. 
+ */ + addr = &v6addr->s6_addr8[8]; + bcopy(hash + 8, addr, sizeof (uint64_t)); + addr[0] &= ~0x2; /* set local bit */ + + return (B_TRUE); +} + /* ARGSUSED */ static boolean_t ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, @@ -23739,14 +19786,14 @@ ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, * Derive IPoIB interface id from the link layer address. */ static boolean_t -ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) +ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) { char *addr; - if (phys_length != 20) + if (ill->ill_phys_addr_length != 20) return (B_FALSE); addr = (char *)&v6addr->s6_addr32[2]; - bcopy(phys_addr + 12, addr, 8); + bcopy(ill->ill_phys_addr + 12, addr, 8); /* * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit * in the globally assigned EUI-64 GUID to 1, in violation of IEEE @@ -23863,6 +19910,7 @@ ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) *ipifp = NULL; return (B_FALSE); } + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (!IPIF_CAN_LOOKUP(ipif)) continue; @@ -23897,71 +19945,9 @@ ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) } /* - * Same as ipif_lookup_zoneid() but looks at all the ills in the same group. - */ -boolean_t -ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) -{ - ill_t *illg; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * We look at the passed-in ill first without grabbing ill_g_lock. - */ - if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) { - return (B_TRUE); - } - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if (ill->ill_group == NULL) { - /* ill not in a group */ - rw_exit(&ipst->ips_ill_g_lock); - return (B_FALSE); - } - - /* - * There's no ipif in the zone on ill, however ill is part of an IPMP - * group. We need to look for an ipif in the zone on all the ills in the - * group. 
- */ - illg = ill->ill_group->illgrp_ill; - do { - /* - * We don't call ipif_lookup_zoneid() on ill as we already know - * that it's not there. - */ - if (illg != ill && - ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) { - break; - } - } while ((illg = illg->ill_group_next) != NULL); - rw_exit(&ipst->ips_ill_g_lock); - return (illg != NULL); -} - -/* - * Check if this ill is only being used to send ICMP probes for IPMP - */ -boolean_t -ill_is_probeonly(ill_t *ill) -{ - /* - * Check if the interface is FAILED, or INACTIVE - */ - if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE)) - return (B_TRUE); - - return (B_FALSE); -} - -/* * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id) * If a pointer to an ipif_t is returned then the caller will need to do * an ill_refrele(). - * - * If there is no real interface which matches the ifindex, then it looks - * for a group that has a matching index. In the case of a group match the - * lifidx must be zero. We don't need emulate the logical interfaces - * since IP Filter's use of netinfo doesn't use that. */ ipif_t * ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, @@ -23972,18 +19958,8 @@ ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, ipst); - - if (ill == NULL) { - /* Fallback to group names only if hook_emulation set */ - if (!ipst->ips_ipmp_hook_emulation) - return (NULL); - - if (lifidx != 0) - return (NULL); - ill = ill_group_lookup_on_ifindex(ifindex, isv6, ipst); - if (ill == NULL) - return (NULL); - } + if (ill == NULL) + return (NULL); mutex_enter(&ill->ill_lock); if (ill->ill_state_flags & ILL_CONDEMNED) { @@ -24059,7 +20035,7 @@ ill_set_phys_addr(ill_t *ill, mblk_t *mp) * If we can quiesce the ill, then set the address. If not, then * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 
*/ - ill_down_ipifs(ill, NULL, 0, B_FALSE); + ill_down_ipifs(ill); mutex_enter(&ill->ill_lock); if (!ill_is_quiescent(ill)) { /* call cannot fail since `conn_t *' argument is NULL */ @@ -24283,10 +20259,7 @@ ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event, if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL) goto fail; - if (event == NE_UNPLUMB) - info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex; - else - info->hnei_event.hne_nic = ill->ill_phyint->phyint_hook_ifindex; + info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex; info->hnei_event.hne_lif = lif; info->hnei_event.hne_event = event; info->hnei_event.hne_protocol = ill->ill_isv6 ? @@ -24323,8 +20296,8 @@ fail: void ipif_up_notify(ipif_t *ipif) { - ip_rts_ifmsg(ipif); - ip_rts_newaddrmsg(RTM_ADD, 0, ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); + ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT); sctp_update_ipif(ipif, SCTP_IPIF_UP); ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id), NE_LIF_UP, NULL, 0); diff --git a/usr/src/uts/common/inet/ip/ip_ire.c b/usr/src/uts/common/inet/ip/ip_ire.c index 405cb653d5..52a7e74806 100644 --- a/usr/src/uts/common/inet/ip/ip_ire.c +++ b/usr/src/uts/common/inet/ip/ip_ire.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -31,6 +31,7 @@ #include <sys/types.h> #include <sys/stream.h> #include <sys/stropts.h> +#include <sys/strsun.h> #include <sys/ddi.h> #include <sys/cmn_err.h> #include <sys/policy.h> @@ -61,7 +62,6 @@ #include <net/pfkeyv2.h> #include <inet/ipsec_info.h> #include <inet/sadb.h> -#include <sys/kmem.h> #include <inet/tcp.h> #include <inet/ipclassifier.h> #include <sys/zone.h> @@ -220,11 +220,6 @@ struct kmem_cache *rt_entry_cache; * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is * to be ignored when walking the ires using ire_next. * - * IRE_MARK_HIDDEN signifies that the ire is a special ire typically for the - * benefit of in.mpathd which needs to probe interfaces for failures. Normal - * applications should not be seeing this ire and hence this ire is ignored - * in most cases in the search using ire_next. - * * Zones note: * Walking IREs within a given zone also walks certain ires in other * zones. This is done intentionally. IRE walks with a specified @@ -1235,10 +1230,9 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) { irb_t *irb; boolean_t drop = B_FALSE; - /* LINTED : set but not used in function */ boolean_t mctl_present; mblk_t *first_mp = NULL; - mblk_t *save_mp = NULL; + mblk_t *data_mp = NULL; ire_t *dst_ire; ipha_t *ipha; ip6_t *ip6h; @@ -1258,27 +1252,16 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) * we resolve an IPv6 address with an IPv4 ire * or vice versa. */ + EXTRACT_PKT_MP(mp, first_mp, mctl_present); + data_mp = mp; + mp = first_mp; if (ire->ire_ipversion == IPV4_VERSION) { - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - ipha = (ipha_t *)mp->b_rptr; - save_mp = mp; - mp = first_mp; - + ipha = (ipha_t *)data_mp->b_rptr; dst_ire = ire_cache_lookup(ipha->ipha_dst, ire->ire_zoneid, MBLK_GETLABEL(mp), ipst); } else { ASSERT(ire->ire_ipversion == IPV6_VERSION); - /* - * Get a pointer to the beginning of the IPv6 header. - * Ignore leading IPsec control mblks. 
- */ - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - } - ip6h = (ip6_t *)mp->b_rptr; - save_mp = mp; - mp = first_mp; + ip6h = (ip6_t *)data_mp->b_rptr; dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst, ire->ire_zoneid, MBLK_GETLABEL(mp), ipst); } @@ -1330,10 +1313,8 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) * is over: we just drop the packet. */ if (ire->ire_flags & RTF_MULTIRT) { - if (save_mp) { - save_mp->b_prev = NULL; - save_mp->b_next = NULL; - } + data_mp->b_prev = NULL; + data_mp->b_next = NULL; MULTIRT_DEBUG_UNTAG(mp); freemsg(mp); } else { @@ -1355,9 +1336,31 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) (CONN_Q(q) ? Q_TO_CONN(q) : NULL), ire->ire_zoneid, ipst); } else { + int minlen = sizeof (ip6i_t) + IPV6_HDR_LEN; + ASSERT(ire->ire_ipversion == IPV6_VERSION); - ip_newroute_v6(q, mp, &ip6h->ip6_dst, NULL, - NULL, ire->ire_zoneid, ipst); + + /* + * If necessary, skip over the ip6i_t to find + * the header with the actual source address. 
+ */ + if (ip6h->ip6_nxt == IPPROTO_RAW) { + if (MBLKL(data_mp) < minlen && + pullupmsg(data_mp, -1) == 0) { + ip1dbg(("ire_add_then_send: " + "cannot pullupmsg ip6i\n")); + if (mctl_present) + freeb(first_mp); + ire_refrele(ire); + return; + } + ASSERT(MBLKL(data_mp) >= IPV6_HDR_LEN); + ip6h = (ip6_t *)(data_mp->b_rptr + + sizeof (ip6i_t)); + } + ip_newroute_v6(q, mp, &ip6h->ip6_dst, + &ip6h->ip6_src, NULL, ire->ire_zoneid, + ipst); } } @@ -1680,7 +1683,9 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, { ire_t *ire; uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + boolean_t prefer; + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; /* * No broadcast IREs for the LOOPBACK interface @@ -1690,21 +1695,26 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, (ipif->ipif_flags & IPIF_NOXMIT)) return (irep); - /* If this would be a duplicate, don't bother. */ + /* + * If this new IRE would be a duplicate, only prefer it if one of + * the following is true: + * + * 1. The existing one has IPIF_DEPRECATED|IPIF_LOCAL|IPIF_ANYCAST + * set and the new one has all of those clear. + * + * 2. The existing one corresponds to an underlying ILL in an IPMP + * group and the new one corresponds to an IPMP group interface. + */ if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif, ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) { - /* - * We look for non-deprecated (and non-anycast, non-nolocal) - * ipifs as the best choice. ipifs with check_flags matching - * (deprecated, etc) are used only if non-deprecated ipifs - * are not available. 
if the existing ire's ipif is deprecated - * and the new ipif is non-deprecated, switch to the new ipif - */ - if ((!(ire->ire_ipif->ipif_flags & check_flags)) || - (ipif->ipif_flags & check_flags)) { + prefer = ((ire->ire_ipif->ipif_flags & check_flags) && + !(ipif->ipif_flags & check_flags)) || + (IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && IS_IPMP(ill)); + if (!prefer) { ire_refrele(ire); return (irep); } + /* * Bcast ires exist in pairs. Both have to be deleted, * Since we are exclusive we can make the above assertion. @@ -1716,10 +1726,7 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, ire_delete(ire); ire_refrele(ire); } - - irep = ire_create_bcast(ipif, addr, irep); - - return (irep); + return (ire_create_bcast(ipif, addr, irep)); } uint_t ip_loopback_mtu = IP_LOOPBACK_MTU; @@ -1733,6 +1740,22 @@ ire_t ** ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) { ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + ill_t *ill = ipif->ipif_ill; + + ASSERT(IAM_WRITER_IPIF(ipif)); + + if (IS_IPMP(ill)) { + /* + * Broadcast IREs for the IPMP meta-interface use the + * nominated broadcast interface to send and receive packets. + * If there's no nominated interface, send the packets down to + * the IPMP stub driver, which will discard them. If the + * nominated broadcast interface changes, ill_refresh_bcast() + * will refresh the broadcast IREs. 
+ */ + if ((ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + ill = ipif->ipif_ill; + } *irep++ = ire_create( (uchar_t *)&addr, /* dest addr */ @@ -1741,8 +1764,8 @@ ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) NULL, /* no gateway */ &ipif->ipif_mtu, /* max frag */ NULL, /* no src nce */ - ipif->ipif_rq, /* recv-from queue */ - ipif->ipif_wq, /* send-to queue */ + ill->ill_rq, /* recv-from queue */ + ill->ill_wq, /* send-to queue */ IRE_BROADCAST, ipif, 0, @@ -1761,7 +1784,7 @@ ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) NULL, /* no gateway */ &ip_loopback_mtu, /* max frag size */ NULL, /* no src_nce */ - ipif->ipif_rq, /* recv-from queue */ + ill->ill_rq, /* recv-from queue */ NULL, /* no send-to queue */ IRE_BROADCAST, /* Needed for fanout in wput */ ipif, @@ -2049,32 +2072,23 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, { ill_t *ire_stq_ill = NULL; ill_t *ire_ipif_ill = NULL; - ill_group_t *ire_ill_group = NULL; ASSERT(match_flags != 0 || zoneid != ALL_ZONES); /* - * MATCH_IRE_ILL/MATCH_IRE_ILL_GROUP : We match both on ill - * pointed by ire_stq and ire_ipif. Only in the case of - * IRE_CACHEs can ire_stq and ire_ipif be pointing to - * different ills. But we want to keep this function generic - * enough for future use. So, we always try to match on both. - * The only caller of this function ire_walk_ill_tables, will - * call "func" after we return from this function. We expect - * "func" to do the right filtering of ires in this case. - * - * NOTE : In the case of MATCH_IRE_ILL_GROUP, groups - * pointed by ire_stq and ire_ipif should always be the same. - * So, we just match on only one of them. + * MATCH_IRE_ILL: We match both on ill pointed by ire_stq and + * ire_ipif. Only in the case of IRE_CACHEs can ire_stq and + * ire_ipif be pointing to different ills. But we want to keep + * this function generic enough for future use. So, we always + * try to match on both. 
The only caller of this function + * ire_walk_ill_tables, will call "func" after we return from + * this function. We expect "func" to do the right filtering + * of ires in this case. */ - if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) { + if (match_flags & MATCH_IRE_ILL) { if (ire->ire_stq != NULL) - ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr; + ire_stq_ill = ire->ire_stq->q_ptr; if (ire->ire_ipif != NULL) ire_ipif_ill = ire->ire_ipif->ipif_ill; - if (ire_stq_ill != NULL) - ire_ill_group = ire_stq_ill->ill_group; - if ((ire_ill_group == NULL) && (ire_ipif_ill != NULL)) - ire_ill_group = ire_ipif_ill->ill_group; } if (zoneid != ALL_ZONES) { @@ -2115,7 +2129,7 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, ipif_t *src_ipif; src_ipif = ipif_select_source_v6(ire_stq_ill, - &ire->ire_addr_v6, RESTRICT_TO_NONE, + &ire->ire_addr_v6, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); if (src_ipif != NULL) { @@ -2143,9 +2157,9 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, ire_t *rire; ire_match_flags |= MATCH_IRE_TYPE; - if (ire->ire_ipif != NULL) { - ire_match_flags |= MATCH_IRE_ILL_GROUP; - } + if (ire->ire_ipif != NULL) + ire_match_flags |= MATCH_IRE_ILL; + if (ire->ire_ipversion == IPV4_VERSION) { rire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, IRE_INTERFACE, ire->ire_ipif, NULL, @@ -2169,11 +2183,8 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, if (((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & ire_type)) && ((!(match_flags & MATCH_IRE_ILL)) || - (ire_stq_ill == ill || ire_ipif_ill == ill)) && - ((!(match_flags & MATCH_IRE_ILL_GROUP)) || - (ire_stq_ill == ill) || (ire_ipif_ill == ill) || - (ire_ill_group != NULL && - ire_ill_group == ill->ill_group))) { + (ire_stq_ill == ill || ire_ipif_ill == ill || + ire_ipif_ill != NULL && IS_IN_SAME_ILLGRP(ire_ipif_ill, ill)))) { return (B_TRUE); } return (B_FALSE); @@ -2221,8 +2232,7 @@ ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, 
pfv_t func, boolean_t ret; struct rtfuncarg rtfarg; - ASSERT((!(match_flags & (MATCH_IRE_ILL | - MATCH_IRE_ILL_GROUP))) || (ill != NULL)); + ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL)); ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0)); /* * Optimize by not looking at the forwarding table if there @@ -2399,32 +2409,26 @@ ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp, } /* - * IPMP flag settings happen without taking the exclusive route - * in ip_sioctl_flags. So we need to make an atomic check here - * for FAILED/OFFLINE/INACTIVE flags or if it has hit the - * FAILBACK=no case. + * Don't allow IRE's to be created on changing ill's. Also, since + * IPMP flags can be set on an ill without quiescing it, if we're not + * a writer on stq_ill, check that the flags still allow IRE creation. */ if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) { if (stq_ill->ill_state_flags & ILL_CHANGING) { ill = stq_ill; error = EAGAIN; - } else if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) || - (ill_is_probeonly(stq_ill) && - !(ire->ire_marks & IRE_MARK_HIDDEN))) { - error = EINVAL; + } else if (IS_UNDER_IPMP(stq_ill)) { + mutex_enter(&stq_ill->ill_phyint->phyint_lock); + if (!ipmp_ill_is_active(stq_ill) && + !(ire->ire_marks & IRE_MARK_TESTHIDDEN)) { + error = EINVAL; + } + mutex_exit(&stq_ill->ill_phyint->phyint_lock); } - goto done; + if (error != 0) + goto done; } - /* - * We don't check for OFFLINE/FAILED in this case because - * the source address selection logic (ipif_select_source) - * may still select a source address from such an ill. The - * assumption is that these addresses will be moved by in.mpathd - * soon. (i.e. this is a race). However link local addresses - * will not move and hence ipif_select_source_v6 tries to avoid - * FAILED ills. 
Please see ipif_select_source_v6 for more info - */ if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) && (ipif_ill->ill_state_flags & ILL_CHANGING)) { ill = ipif_ill; @@ -2444,8 +2448,10 @@ done: if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) { ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); ire_atomic_end(irb_ptr, ire); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); error = EINPROGRESS; } else if (error != 0) { @@ -2502,39 +2508,7 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, ire = ire1; } if (ire->ire_stq != NULL) - stq_ill = (ill_t *)ire->ire_stq->q_ptr; - - if (ire->ire_type == IRE_CACHE) { - /* - * If this interface is FAILED, or INACTIVE or has hit - * the FAILBACK=no case, we create IRE_CACHES marked - * HIDDEN for some special cases e.g. bind to - * IPIF_NOFAILOVER address etc. So, if this interface - * is FAILED/INACTIVE/hit FAILBACK=no case, and we are - * not creating hidden ires, we should not allow that. - * This happens because the state of the interface - * changed while we were waiting in ARP. If this is the - * daemon sending probes, the next probe will create - * HIDDEN ires and we will create an ire then. This - * cannot happen with NDP currently because IRE is - * never queued in NDP. But it can happen in the - * future when we have external resolvers with IPv6. - * If the interface gets marked with OFFLINE while we - * are waiting in ARP, don't add the ire. - */ - if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) || - (ill_is_probeonly(stq_ill) && - !(ire->ire_marks & IRE_MARK_HIDDEN))) { - /* - * We don't know whether it is a valid ipif or not. - * unless we do the check below. So, set it to NULL. 
- */ - ire->ire_ipif = NULL; - ire_delete(ire); - *irep = NULL; - return (EINVAL); - } - } + stq_ill = ire->ire_stq->q_ptr; if (stq_ill != NULL && ire->ire_type == IRE_CACHE && stq_ill->ill_net_type == IRE_IF_RESOLVER) { @@ -2573,12 +2547,12 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, rw_exit(&ipst->ips_ill_g_lock); if (ipif == NULL || (ipif->ipif_isv6 && + !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) && !IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, &ipif->ipif_v6src_addr)) || (!ipif->ipif_isv6 && ire->ire_src_addr != ipif->ipif_src_addr) || ire->ire_zoneid != ipif->ipif_zoneid) { - if (ipif != NULL) ipif_refrele(ipif); ire->ire_ipif = NULL; @@ -2587,20 +2561,7 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, return (EINVAL); } - ASSERT(ill != NULL); - /* - * If this group was dismantled while this packets was - * queued in ARP, don't add it here. - */ - if (ire->ire_ipif->ipif_ill->ill_group != ill->ill_group) { - /* We don't want ire_inactive bump stats for this */ - ipif_refrele(ipif); - ire->ire_ipif = NULL; - ire_delete(ire); - *irep = NULL; - return (EINVAL); - } /* * Since we didn't attach label security attributes to the @@ -2677,6 +2638,16 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, boolean_t need_refrele = B_FALSE; nce_t *nce; ip_stack_t *ipst = ire->ire_ipst; + uint_t marks = 0; + + /* + * IREs with source addresses hosted on interfaces that are under IPMP + * should be hidden so that applications don't accidentally end up + * sending packets with test addresses as their source addresses, or + * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here. 
+ */ + if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill)) + marks |= IRE_MARK_TESTHIDDEN; if (ire->ire_ipif != NULL) ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock)); @@ -2691,10 +2662,15 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, case IRE_HOST: ire->ire_mask = IP_HOST_MASK; ire->ire_masklen = IP_ABITS; + ire->ire_marks |= marks; if ((ire->ire_flags & RTF_SETSRC) == 0) ire->ire_src_addr = 0; break; case IRE_CACHE: + ire->ire_mask = IP_HOST_MASK; + ire->ire_masklen = IP_ABITS; + ire->ire_marks |= marks; + break; case IRE_BROADCAST: case IRE_LOCAL: case IRE_LOOPBACK: @@ -2702,15 +2678,14 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, ire->ire_masklen = IP_ABITS; break; case IRE_PREFIX: - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr = 0; - break; case IRE_DEFAULT: + ire->ire_marks |= marks; if ((ire->ire_flags & RTF_SETSRC) == 0) ire->ire_src_addr = 0; break; case IRE_IF_RESOLVER: case IRE_IF_NORESOLVER: + ire->ire_marks |= marks; break; default: ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n", @@ -2796,19 +2771,13 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, */ flags |= MATCH_IRE_IPIF; /* - * If we are creating hidden ires, make sure we search on - * this ill (MATCH_IRE_ILL) and a hidden ire, - * while we are searching for duplicates below. Otherwise we - * could potentially find an IRE on some other interface - * and it may not be a IRE marked with IRE_MARK_HIDDEN. We - * shouldn't do this as this will lead to an infinite loop - * (if we get to ip_wput again) eventually we need an hidden - * ire for this packet to go out. MATCH_IRE_ILL is explicitly - * done below. + * If we are creating a hidden IRE, make sure we search for + * hidden IREs when searching for duplicates below. + * Otherwise, we might find an IRE on some other interface + * that's not marked hidden. 
*/ - if (ire->ire_type == IRE_CACHE && - (ire->ire_marks & IRE_MARK_HIDDEN)) - flags |= (MATCH_IRE_MARK_HIDDEN); + if (ire->ire_marks & IRE_MARK_TESTHIDDEN) + flags |= MATCH_IRE_MARK_TESTHIDDEN; } if ((ire->ire_type & IRE_CACHETABLE) == 0) { irb_ptr = ire_get_bucket(ire); @@ -2927,7 +2896,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, * avoid a lookup in the caller again. If the callers * don't want to use it, they need to do a REFRELE. */ - ip1dbg(("found dup ire existing %p new %p", + ip1dbg(("found dup ire existing %p new %p\n", (void *)ire1, (void *)ire)); IRE_REFHOLD(ire1); ire_atomic_end(irb_ptr, ire); @@ -2948,6 +2917,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, return (0); } } + if (ire->ire_type & IRE_CACHE) { ASSERT(ire->ire_stq != NULL); nce = ndp_lookup_v4(ire_to_ill(ire), @@ -2999,17 +2969,9 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, } /* * Make it easy for ip_wput_ire() to hit multiple broadcast ires by - * grouping identical addresses together on the hash chain. We also - * don't want to send multiple copies out if there are two ills part - * of the same group. Thus we group the ires with same addr and same - * ill group together so that ip_wput_ire can easily skip all the - * ires with same addr and same group after sending the first copy. - * We do this only for IRE_BROADCASTs as ip_wput_ire is currently - * interested in such groupings only for broadcasts. - * - * NOTE : If the interfaces are brought up first and then grouped, - * illgrp_insert will handle it. We come here when the interfaces - * are already in group and we are bringing them UP. + * grouping identical addresses together on the hash chain. We do + * this only for IRE_BROADCASTs as ip_wput_ire is currently interested + * in such groupings only for broadcasts. * * Find the first entry that matches ire_addr. *irep will be null * if no match. 
@@ -3023,29 +2985,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, if (ire->ire_type == IRE_BROADCAST && *irep != NULL) { /* * We found some ire (i.e *irep) with a matching addr. We - * want to group ires with same addr and same ill group - * together. - * - * First get to the entry that matches our address and - * ill group i.e stop as soon as we find the first ire - * matching the ill group and address. If there is only - * an address match, we should walk and look for some - * group match. These are some of the possible scenarios : - * - * 1) There are no groups at all i.e all ire's ill_group - * are NULL. In that case we will essentially group - * all the ires with the same addr together. Same as - * the "else" block of this "if". - * - * 2) There are some groups and this ire's ill_group is - * NULL. In this case, we will first find the group - * that matches the address and a NULL group. Then - * we will insert the ire at the end of that group. - * - * 3) There are some groups and this ires's ill_group is - * non-NULL. In this case we will first find the group - * that matches the address and the ill_group. Then - * we will insert the ire at the end of that group. + * want to group ires with same addr. */ for (;;) { ire1 = *irep; @@ -3053,8 +2993,8 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, (ire1->ire_next->ire_addr != ire->ire_addr) || (ire1->ire_type != IRE_BROADCAST) || (ire1->ire_flags & RTF_MULTIRT) || - (ire1->ire_ipif->ipif_ill->ill_group == - ire->ire_ipif->ipif_ill->ill_group)) + (ire1->ire_ipif->ipif_ill->ill_grp == + ire->ire_ipif->ipif_ill->ill_grp)) break; irep = &ire1->ire_next; } @@ -3071,18 +3011,14 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, /* * Either we have hit the end of the list or the address - * did not match or the group *matched*. If we found - * a match on the group, skip to the end of the group. + * did not match. 
*/ while (*irep != NULL) { ire1 = *irep; if ((ire1->ire_addr != ire->ire_addr) || - (ire1->ire_type != IRE_BROADCAST) || - (ire1->ire_ipif->ipif_ill->ill_group != - ire->ire_ipif->ipif_ill->ill_group)) + (ire1->ire_type != IRE_BROADCAST)) break; - if (ire1->ire_ipif->ipif_ill->ill_group == NULL && - ire1->ire_ipif == ire->ire_ipif) { + if (ire1->ire_ipif == ire->ire_ipif) { irep = &ire1->ire_next; break; } @@ -3611,15 +3547,14 @@ ire_inactive(ire_t *ire) * The ipif that is associated with an ire is ire->ire_ipif and * hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call * ipif_ill_refrele_tail. Usually stq_ill is null or the same as - * ire->ire_ipif->ipif_ill. So nothing more needs to be done. Only - * in the case of IRE_CACHES when IPMP is used, stq_ill can be - * different. If this is different from ire->ire_ipif->ipif_ill and - * if the ill_ire_cnt on the stq_ill also has dropped to zero, we call + * ire->ire_ipif->ipif_ill. So nothing more needs to be done. + * However, for VNI or IPMP IRE entries, stq_ill can be different. + * If this is different from ire->ire_ipif->ipif_ill and if the + * ill_ire_cnt on the stq_ill also has dropped to zero, we call * ipif_ill_refrele_tail on the stq_ill. 
*/ - if (ire->ire_stq != NULL) - stq_ill = (ill_t *)ire->ire_stq->q_ptr; + stq_ill = ire->ire_stq->q_ptr; if (stq_ill == NULL || stq_ill == ill) { /* Optimize the most common case */ @@ -3881,26 +3816,27 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, { ill_t *ire_ill = NULL, *dst_ill; ill_t *ipif_ill = NULL; - ill_group_t *ire_ill_group = NULL; - ill_group_t *ipif_ill_group = NULL; ASSERT(ire->ire_ipversion == IPV4_VERSION); ASSERT((ire->ire_addr & ~ire->ire_mask) == 0); - ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) || + ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ipif != NULL && !ipif->ipif_isv6)); ASSERT(!(match_flags & MATCH_IRE_WQ) || wq != NULL); /* - * HIDDEN cache entries have to be looked up specifically with - * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set - * when the interface is FAILED or INACTIVE. In that case, - * any IRE_CACHES that exists should be marked with - * IRE_MARK_HIDDEN. So, we don't really need to match below - * for IRE_MARK_HIDDEN. But we do so for consistency. + * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it + * is in fact hidden, to ensure the caller gets the right one. One + * exception: if the caller passed MATCH_IRE_IHANDLE, then they + * already know the identity of the given IRE_INTERFACE entry and + * there's no point trying to hide it from them. 
*/ - if (!(match_flags & MATCH_IRE_MARK_HIDDEN) && - (ire->ire_marks & IRE_MARK_HIDDEN)) - return (B_FALSE); + if (ire->ire_marks & IRE_MARK_TESTHIDDEN) { + if (match_flags & MATCH_IRE_IHANDLE) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + + if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) + return (B_FALSE); + } /* * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option @@ -3994,19 +3930,18 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, } /* - * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that - * somebody wants to send out on a particular interface which - * is given by ire_stq and hence use ire_stq to derive the ill - * value. ire_ipif for IRE_CACHES is just the means of getting - * a source address i.e ire_src_addr = ire->ire_ipif->ipif_src_addr. - * ire_to_ill does the right thing for this. + * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to + * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means + * of getting a source address -- i.e., ire_src_addr == + * ire->ire_ipif->ipif_src_addr). ire_to_ill() handles this. + * + * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group. + * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for + * IPMP test traffic), then the ill must match exactly. 
*/ - if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) { + if (match_flags & MATCH_IRE_ILL) { ire_ill = ire_to_ill(ire); - if (ire_ill != NULL) - ire_ill_group = ire_ill->ill_group; ipif_ill = ipif->ipif_ill; - ipif_ill_group = ipif_ill->ill_group; } if ((ire->ire_addr == (addr & mask)) && @@ -4018,24 +3953,21 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, (ire->ire_src_addr == ipif->ipif_src_addr)) && ((!(match_flags & MATCH_IRE_IPIF)) || (ire->ire_ipif == ipif)) && - ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) || - (ire->ire_type != IRE_CACHE || - ire->ire_marks & IRE_MARK_HIDDEN)) && + ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) || + (ire->ire_marks & IRE_MARK_TESTHIDDEN)) && ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) || (ire->ire_type != IRE_CACHE || ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) && - ((!(match_flags & MATCH_IRE_ILL)) || - (ire_ill == ipif_ill)) && ((!(match_flags & MATCH_IRE_WQ)) || (ire->ire_stq == wq)) && + ((!(match_flags & MATCH_IRE_ILL)) || + (ire_ill == ipif_ill || + (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) && + ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) && ((!(match_flags & MATCH_IRE_IHANDLE)) || (ire->ire_ihandle == ihandle)) && ((!(match_flags & MATCH_IRE_MASK)) || (ire->ire_mask == mask)) && - ((!(match_flags & MATCH_IRE_ILL_GROUP)) || - (ire_ill == ipif_ill) || - (ire_ill_group != NULL && - ire_ill_group == ipif_ill_group)) && ((!(match_flags & MATCH_IRE_SECATTR)) || (!is_system_labeled()) || (tsol_ire_match_gwattr(ire, tsl) == 0))) { @@ -4060,8 +3992,7 @@ ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, * ire_match_args() will dereference ipif MATCH_IRE_SRC or * MATCH_IRE_ILL is set. 
*/ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && - (ipif == NULL)) + if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) return (NULL); /* @@ -4142,14 +4073,15 @@ ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif, /* * Check whether the IRE_LOCAL and the IRE potentially used to transmit - * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are part of - * the same ill group. + * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are identical + * or part of the same illgrp. (In the IPMP case, usually the two IREs + * will both belong to the IPMP ill, but exceptions are possible -- e.g. + * if IPMP test addresses are on their own subnet.) */ boolean_t -ire_local_same_ill_group(ire_t *ire_local, ire_t *xmit_ire) +ire_local_same_lan(ire_t *ire_local, ire_t *xmit_ire) { - ill_t *recv_ill, *xmit_ill; - ill_group_t *recv_group, *xmit_group; + ill_t *recv_ill, *xmit_ill; ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK)); ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE)); @@ -4160,20 +4092,11 @@ ire_local_same_ill_group(ire_t *ire_local, ire_t *xmit_ire) ASSERT(recv_ill != NULL); ASSERT(xmit_ill != NULL); - if (recv_ill == xmit_ill) - return (B_TRUE); - - recv_group = recv_ill->ill_group; - xmit_group = xmit_ill->ill_group; - - if (recv_group != NULL && recv_group == xmit_group) - return (B_TRUE); - - return (B_FALSE); + return (IS_ON_SAME_LAN(recv_ill, xmit_ill)); } /* - * Check if the IRE_LOCAL uses the same ill (group) as another route would use. + * Check if the IRE_LOCAL uses the same ill as another route would use. * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE, * then we don't allow this IRE_LOCAL to be used. 
*/ @@ -4183,17 +4106,16 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, { ire_t *alt_ire; boolean_t rval; + int flags; + + flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE; if (ire_local->ire_ipversion == IPV4_VERSION) { alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL, - NULL, zoneid, 0, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE, ipst); + NULL, zoneid, 0, tsl, flags, ipst); } else { - alt_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL, - 0, NULL, NULL, zoneid, 0, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE, ipst); + alt_ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, + NULL, zoneid, 0, tsl, flags, ipst); } if (alt_ire == NULL) @@ -4203,16 +4125,14 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, ire_refrele(alt_ire); return (B_FALSE); } - rval = ire_local_same_ill_group(ire_local, alt_ire); + rval = ire_local_same_lan(ire_local, alt_ire); ire_refrele(alt_ire); return (rval); } /* - * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers - * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get - * to the hidden ones. + * Lookup cache * * In general the zoneid has to match (where ALL_ZONES match all of them). * But for IRE_LOCAL we also need to handle the case where L2 should @@ -4220,8 +4140,7 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, * Ethernet drivers nor Ethernet hardware loops back packets sent to their * own MAC address. This loopback is needed when the normal * routes (ignoring IREs with different zoneids) would send out the packet on - * the same ill (or ill group) as the ill with which this IRE_LOCAL is - * associated. + * the same ill as the ill with which this IRE_LOCAL is associated. * * Earlier versions of this code always matched an IRE_LOCAL independently of * the zoneid. 
We preserve that earlier behavior when @@ -4239,7 +4158,7 @@ ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl, rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { if (ire->ire_marks & (IRE_MARK_CONDEMNED | - IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) { + IRE_MARK_TESTHIDDEN | IRE_MARK_PRIVATE_ADDR)) { continue; } if (ire->ire_addr == addr) { @@ -4284,7 +4203,7 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) ire_t *ire; /* - * Lets look for an ire in the cachetable whose + * Look for an ire in the cachetable whose * ire_addr matches the destination. * Since we are being called by forwarding fastpath * no need to check for Trusted Solaris label. @@ -4293,8 +4212,8 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) dst, ipst->ips_ip_cache_table_size)]; rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & (IRE_MARK_CONDEMNED | - IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) { + if (ire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN | + IRE_MARK_PRIVATE_ADDR)) { continue; } if (ire->ire_addr == dst) { @@ -4307,7 +4226,6 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) return (NULL); } - /* * Locate the interface ire that is tied to the cache ire 'cire' via * cire->ire_ihandle. @@ -4333,13 +4251,8 @@ ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire) * because the ihandle refers to an ipif which can be in only one zone. */ match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - /* - * ip_newroute calls ire_ftable_lookup with MATCH_IRE_ILL only - * for on-link hosts. We should never be here for onlink. - * Thus, use MATCH_IRE_ILL_GROUP. - */ if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; /* * We know that the mask of the interface ire equals cire->ire_cmask. 
* (When ip_newroute() created 'cire' for the gateway it set its @@ -4376,7 +4289,7 @@ ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire) */ match_flags = MATCH_IRE_TYPE; if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET, pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); if (ire == NULL) @@ -4411,7 +4324,16 @@ ire_t * ipif_to_ire(const ipif_t *ipif) { ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK; + + /* + * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN + * so that they aren't accidentally returned. However, if the + * caller's ipif is on an ill under IPMP, there's no need to hide 'em. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; ASSERT(!ipif->ipif_isv6); if (ipif->ipif_ire_type == IRE_LOOPBACK) { @@ -4421,13 +4343,12 @@ ipif_to_ire(const ipif_t *ipif) } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { /* In this case we need to lookup destination address. 
*/ ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK), ipst); + IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, match_flags, + ipst); } else { ire = ire_ftable_lookup(ipif->ipif_subnet, ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL, - ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF | - MATCH_IRE_MASK), ipst); + ALL_ZONES, 0, NULL, match_flags, ipst); } return (ire); } @@ -4811,7 +4732,7 @@ ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst) continue; if (cire->ire_addr != dst) continue; - if (cire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) + if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) continue; unres_cnt--; } @@ -4983,7 +4904,7 @@ ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, continue; if (cire->ire_marks & (IRE_MARK_CONDEMNED | - IRE_MARK_HIDDEN)) + IRE_MARK_TESTHIDDEN)) continue; if (cire->ire_gw_secattr != NULL && @@ -5186,7 +5107,7 @@ ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, continue; if (cire->ire_marks & (IRE_MARK_CONDEMNED | - IRE_MARK_HIDDEN)) + IRE_MARK_TESTHIDDEN)) continue; if (cire->ire_gw_secattr != NULL && @@ -5401,7 +5322,7 @@ ire_trace_cleanup(const ire_t *ire) * invoked when the mblk containing fake_ire is freed. 
*/ void -ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) +ire_arpresolve(ire_t *in_ire) { areq_t *areq; ipaddr_t *addrp; @@ -5409,8 +5330,13 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) ire_t *ire, *buf; size_t bufsize; frtn_t *frtnp; - ill_t *ill; - ip_stack_t *ipst = dst_ill->ill_ipst; + ill_t *dst_ill; + ip_stack_t *ipst; + + ASSERT(in_ire->ire_nce != NULL); + + dst_ill = ire_to_ill(in_ire); + ipst = dst_ill->ill_ipst; /* * Construct message chain for the resolver @@ -5431,16 +5357,16 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) */ /* - * We use esballoc to allocate the second part(the ire_t size mblk) - * of the message chain depicted above. THis mblk will be freed - * by arp when there is a timeout, and otherwise passed to IP - * and IP will * free it after processing the ARP response. + * We use esballoc to allocate the second part (IRE_MBLK) + * of the message chain depicted above. This mblk will be freed + * by arp when there is a timeout, and otherwise passed to IP + * and IP will free it after processing the ARP response. 
*/ bufsize = sizeof (ire_t) + sizeof (frtn_t); buf = kmem_alloc(bufsize, KM_NOSLEEP); if (buf == NULL) { - ip1dbg(("ire_arpresolver:alloc buffer failed\n ")); + ip1dbg(("ire_arpresolve: alloc buffer failed\n")); return; } frtnp = (frtn_t *)(buf + 1); @@ -5448,16 +5374,15 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) frtnp->free_func = ire_freemblk; ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp); - if (ire_mp == NULL) { ip1dbg(("ire_arpresolve: esballoc failed\n")); kmem_free(buf, bufsize); return; } - ASSERT(in_ire->ire_nce != NULL); + areq_mp = copyb(dst_ill->ill_resolver_mp); if (areq_mp == NULL) { - kmem_free(buf, bufsize); + freemsg(ire_mp); return; } @@ -5473,9 +5398,8 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) ire->ire_ipif_seqid = in_ire->ire_ipif_seqid; ire->ire_ipif_ifindex = in_ire->ire_ipif_ifindex; ire->ire_ipif = in_ire->ire_ipif; - ire->ire_stq = in_ire->ire_stq; - ill = ire_to_ill(ire); - ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; + ire->ire_stq = dst_ill->ill_wq; + ire->ire_stq_ifindex = dst_ill->ill_phyint->phyint_ifindex; ire->ire_zoneid = in_ire->ire_zoneid; ire->ire_stackid = ipst->ips_netstack->netstack_stackid; ire->ire_ipst = ipst; @@ -5528,7 +5452,6 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) * Note that the ARP/IP merge should replace the functioanlity by providing * direct function calls to clean up unresolved entries in ire/nce lists. */ - void ire_freemblk(ire_t *ire_mp) { @@ -5738,9 +5661,8 @@ retry_nce: * is marked as ND_REACHABLE at this point. * This nce does not undergo any further state changes, * and exists as long as the interface is plumbed. - * Note: we do the ire_nce assignment here for IRE_BROADCAST - * because some functions like ill_mark_bcast() inline the - * ire_add functionality. + * Note: the assignment of ire_nce here is a historical + * artifact of old code that used to inline ire_add(). 
*/ ire->ire_nce = nce; /* @@ -5772,8 +5694,7 @@ ip4_ctable_lookup_impl(ire_ctable_args_t *margs) ire_t *ire; ip_stack_t *ipst = margs->ict_ipst; - if ((margs->ict_flags & - (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && + if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (margs->ict_ipif == NULL)) { return (NULL); } @@ -5802,10 +5723,7 @@ ip4_ctable_lookup_impl(ire_ctable_args_t *margs) /* * This function locates IRE_CACHE entries which were added by the * ire_forward() path. We can fully specify the IRE we are looking for by - * providing the ipif_t AND the ire_stq. This is different to MATCH_IRE_ILL - * which uses the ipif_ill. This is inadequate with IPMP groups where - * illgrp_scheduler() may have been used to select an ill from the group for - * the outgoing interface. + * providing the ipif (MATCH_IRE_IPIF) *and* the stq (MATCH_IRE_WQ). */ ire_t * ire_arpresolve_lookup(ipaddr_t addr, ipaddr_t gw, ipif_t *ipif, diff --git a/usr/src/uts/common/inet/ip/ip_mroute.c b/usr/src/uts/common/inet/ip/ip_mroute.c index ac14adf00d..1a3df02418 100644 --- a/usr/src/uts/common/inet/ip/ip_mroute.c +++ b/usr/src/uts/common/inet/ip/ip_mroute.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -2037,6 +2037,7 @@ static int ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, struct mfc *rt) { + ill_t *vill; vifi_t vifi; struct vif *vifp; ipaddr_t dst = ipha->ipha_dst; @@ -2102,25 +2103,21 @@ ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, } /* * Don't forward if it didn't arrive from the parent vif for its - * origin. But do match on the groups as we nominate only one - * ill in the group for receiving allmulti packets. + * origin. 
*/ - if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill && - (ill->ill_group == NULL || - ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_group != - ill->ill_group)) || + vill = ipst->ips_vifs[vifi].v_ipif->ipif_ill; + if ((vill != ill && !IS_IN_SAME_ILLGRP(vill, ill)) || (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) { /* Came in the wrong interface */ ip1dbg(("ip_mdq: arrived wrong if, vifi %d " "numvifs %d ill %s viftable ill %s\n", (int)vifi, (int)ipst->ips_numvifs, ill->ill_name, - ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name)); + vill->ill_name)); if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "ip_mdq: arrived wrong if, vifi %d ill " "%s viftable ill %s\n", - (int)vifi, ill->ill_name, - ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name); + (int)vifi, ill->ill_name, vill->ill_name); } ipst->ips_mrtstat->mrts_wrong_if++; rt->mfc_wrong_if++; @@ -3047,7 +3044,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp) dst = ipha->ipha_dst; ipif = vifp->v_ipif; - mutex_enter(&ipif->ipif_ill->ill_lock); if (ilm_lookup_ipif(ipif, dst) != NULL) { /* * The packet is not yet reassembled, thus we need to @@ -3057,7 +3053,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp) mblk_t *mp_loop; ire_t *ire; - mutex_exit(&ipif->ipif_ill->ill_lock); if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, @@ -3082,8 +3077,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp) } if (ire != NULL) ire_refrele(ire); - } else { - mutex_exit(&ipif->ipif_ill->ill_lock); } if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, diff --git a/usr/src/uts/common/inet/ip/ip_multi.c b/usr/src/uts/common/inet/ip/ip_multi.c index f3c95ae362..cbea9be165 100644 --- a/usr/src/uts/common/inet/ip/ip_multi.c +++ b/usr/src/uts/common/inet/ip/ip_multi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -68,12 +68,10 @@ static void ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, static ilm_t *ilm_add_v6(ipif_t *ipif, const in6_addr_t *group, ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist, - int orig_ifindex, zoneid_t zoneid); + zoneid_t zoneid); static void ilm_delete(ilm_t *ilm); static int ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *group); static int ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *group); -static ilg_t *ilg_lookup_ill_index_v6(conn_t *connp, - const in6_addr_t *v6group, int index); static ilg_t *ilg_lookup_ipif(conn_t *connp, ipaddr_t group, ipif_t *ipif); static int ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, @@ -91,25 +89,21 @@ static int ip_opt_delete_group_excl(conn_t *connp, ipaddr_t group, static int ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src); +static void ill_ilm_walker_hold(ill_t *ill); +static void ill_ilm_walker_rele(ill_t *ill); /* * MT notes: * * Multicast joins operate on both the ilg and ilm structures. Multiple * threads operating on an conn (socket) trying to do multicast joins - * need to synchronize when operating on the ilg. Multiple threads + * need to synchronize when operating on the ilg. Multiple threads * potentially operating on different conn (socket endpoints) trying to * do multicast joins could eventually end up trying to manipulate the - * ilm simulatenously and need to synchronize on the access to the ilm. - * Both are amenable to standard Solaris MT techniques, but it would be - * complex to handle a failover or failback which needs to manipulate - * ilg/ilms if an applications can also simultaenously join/leave - * multicast groups. Hence multicast join/leave also go through the ipsq_t + * ilm simultaneously and need to synchronize access to the ilm. 
Currently, + * this is done by synchronizing join/leave via per-phyint ipsq_t * serialization. * - * Multicast joins and leaves are single-threaded per phyint/IPMP group - * using the ipsq serialization mechanism. - * * An ilm is an IP data structure used to track multicast join/leave. * An ilm is associated with a <multicast group, ipif> tuple in IPv4 and * with just <multicast group> in IPv6. ilm_refcnt is the number of ilg's @@ -211,12 +205,13 @@ conn_ilg_reap(conn_t *connp) * Returns a pointer to the next available ilg in conn_ilg. Allocs more * buffers in size of ILG_ALLOC_CHUNK ilgs when needed, and updates conn's * ilg tracking fields appropriately (conn_ilg_inuse reflects usage of the - * returned ilg). Returns NULL on failure (ENOMEM). + * returned ilg). Returns NULL on failure, in which case `*errp' will be + * filled in with the reason. * * Assumes connp->conn_lock is held. */ static ilg_t * -conn_ilg_alloc(conn_t *connp) +conn_ilg_alloc(conn_t *connp, int *errp) { ilg_t *new, *ret; int curcnt; @@ -224,10 +219,21 @@ conn_ilg_alloc(conn_t *connp) ASSERT(MUTEX_HELD(&connp->conn_lock)); ASSERT(connp->conn_ilg_inuse <= connp->conn_ilg_allocated); + /* + * If CONN_CLOSING is set, conn_ilg cleanup has begun and we must not + * create any ilgs. + */ + if (connp->conn_state_flags & CONN_CLOSING) { + *errp = EINVAL; + return (NULL); + } + if (connp->conn_ilg == NULL) { connp->conn_ilg = GETSTRUCT(ilg_t, ILG_ALLOC_CHUNK); - if (connp->conn_ilg == NULL) + if (connp->conn_ilg == NULL) { + *errp = ENOMEM; return (NULL); + } connp->conn_ilg_allocated = ILG_ALLOC_CHUNK; connp->conn_ilg_inuse = 0; } @@ -241,12 +247,15 @@ conn_ilg_alloc(conn_t *connp) * ilg_delete_all() will have to be changed when * this logic is changed. 
*/ + *errp = EBUSY; return (NULL); } curcnt = connp->conn_ilg_allocated; new = GETSTRUCT(ilg_t, curcnt + ILG_ALLOC_CHUNK); - if (new == NULL) + if (new == NULL) { + *errp = ENOMEM; return (NULL); + } bcopy(connp->conn_ilg, new, sizeof (ilg_t) * curcnt); mi_free((char *)connp->conn_ilg); connp->conn_ilg = new; @@ -378,42 +387,6 @@ ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, slist_t *flist) } } -/* - * If the given interface has failed, choose a new one to join on so - * that we continue to receive packets. ilg_orig_ifindex remembers - * what the application used to join on so that we know the ilg to - * delete even though we change the ill here. Callers will store the - * ilg returned from this function in ilg_ill. Thus when we receive - * a packet on ilg_ill, conn_wantpacket_v6 will deliver the packets. - * - * This function must be called as writer so we can walk the group - * list and examine flags without holding a lock. - */ -ill_t * -ip_choose_multi_ill(ill_t *ill, const in6_addr_t *grp) -{ - ill_t *till; - ill_group_t *illgrp = ill->ill_group; - - ASSERT(IAM_WRITER_ILL(ill)); - - if (IN6_IS_ADDR_UNSPECIFIED(grp) || illgrp == NULL) - return (ill); - - if ((ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE)) == 0) - return (ill); - - till = illgrp->illgrp_ill; - while (till != NULL && - (till->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE))) { - till = till->ill_group_next; - } - if (till != NULL) - return (till); - - return (ill); -} - static int ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist, boolean_t isv6) @@ -560,8 +533,7 @@ ilm_update_del(ilm_t *ilm, boolean_t isv6) } /* - * INADDR_ANY means all multicast addresses. This is only used - * by the multicast router. + * INADDR_ANY means all multicast addresses. * INADDR_ANY is stored as IPv6 unspecified addr. 
*/ int @@ -578,40 +550,31 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat, if (!CLASSD(group) && group != INADDR_ANY) return (EINVAL); + if (IS_UNDER_IPMP(ill)) + return (EINVAL); + /* - * INADDR_ANY is represented as the IPv6 unspecifed addr. + * INADDR_ANY is represented as the IPv6 unspecified addr. */ if (group == INADDR_ANY) v6group = ipv6_all_zeros; else IN6_IPADDR_TO_V4MAPPED(group, &v6group); - mutex_enter(&ill->ill_lock); ilm = ilm_lookup_ipif(ipif, group); - mutex_exit(&ill->ill_lock); /* * Since we are writer, we know the ilm_flags itself cannot * change at this point, and ilm_lookup_ipif would not have * returned a DELETED ilm. However, the data path can free - * ilm->next via ilm_walker_cleanup() so we can safely + * ilm->ilm_next via ilm_walker_cleanup() so we can safely * access anything in ilm except ilm_next (for safe access to - * ilm_next we'd have to take the ill_lock). + * ilm_next we'd have to take the ill_lock). */ if (ilm != NULL) return (ilm_update_add(ilm, ilgstat, ilg_flist, B_FALSE)); - /* - * ilms are associated with ipifs in IPv4. It moves with the - * ipif if the ipif moves to a new ill when the interface - * fails. Thus we really don't check whether the ipif_ill - * has failed like in IPv6. If it has FAILED the ipif - * will move (daemon will move it) and hence the ilm, if the - * ipif is not IPIF_NOFAILOVER. For the IPIF_NOFAILOVER ipifs, - * we continue to receive in the same place even if the - * interface fails. 
- */ ilm = ilm_add_v6(ipif, &v6group, ilgstat, ilg_fmode, ilg_flist, - ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid); + ipif->ipif_zoneid); if (ilm == NULL) return (ENOMEM); @@ -623,10 +586,7 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat, */ if (ilm_numentries_v6(ill, &v6group) > 1) return (0); - if (ill->ill_group == NULL) - ret = ill_join_allmulti(ill); - else - ret = ill_nominate_mcast_rcv(ill->ill_group); + ret = ill_join_allmulti(ill); if (ret != 0) ilm_delete(ilm); return (ret); @@ -646,12 +606,8 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat, /* * The unspecified address means all multicast addresses. - * This is only used by the multicast router. * - * ill identifies the interface to join on; it may not match the - * interface requested by the application of a failover has taken - * place. orig_ifindex always identifies the interface requested - * by the app. + * ill identifies the interface to join on. * * ilgstat tells us if there's an ilg associated with this join, * and if so, if it's a new ilg or a change to an existing one. @@ -659,9 +615,8 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat, * the ilg (and will be EXCLUDE {NULL} in the case of no ilg). */ int -ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, - zoneid_t zoneid, ilg_stat_t ilgstat, mcast_record_t ilg_fmode, - slist_t *ilg_flist) +ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, + ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist) { ilm_t *ilm; int ret; @@ -673,37 +628,20 @@ ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, return (EINVAL); } + if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_MC_SOLICITEDNODE(v6group)) + return (EINVAL); + /* - * An ilm is uniquely identified by the tuple of (group, ill, - * orig_ill). 
group is the multicast group address, ill is - * the interface on which it is currently joined, and orig_ill - * is the interface on which the application requested the - * join. orig_ill and ill are the same unless orig_ill has - * failed over. - * - * Both orig_ill and ill are required, which means we may have - * 2 ilms on an ill for the same group, but with different - * orig_ills. These must be kept separate, so that when failback - * occurs, the appropriate ilms are moved back to their orig_ill - * without disrupting memberships on the ill to which they had - * been moved. - * - * In order to track orig_ill, we store orig_ifindex in the - * ilm and ilg. + * An ilm is uniquely identified by the tuple of (group, ill) where + * `group' is the multicast group address, and `ill' is the interface + * on which it is currently joined. */ - mutex_enter(&ill->ill_lock); - ilm = ilm_lookup_ill_index_v6(ill, v6group, orig_ifindex, zoneid); - mutex_exit(&ill->ill_lock); + ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid); if (ilm != NULL) return (ilm_update_add(ilm, ilgstat, ilg_flist, B_TRUE)); - /* - * We need to remember where the application really wanted - * to join. This will be used later if we want to failback - * to the original interface. 
- */ ilm = ilm_add_v6(ill->ill_ipif, v6group, ilgstat, ilg_fmode, - ilg_flist, orig_ifindex, zoneid); + ilg_flist, zoneid); if (ilm == NULL) return (ENOMEM); @@ -715,11 +653,7 @@ ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, */ if (ilm_numentries_v6(ill, v6group) > 1) return (0); - if (ill->ill_group == NULL) - ret = ill_join_allmulti(ill); - else - ret = ill_nominate_mcast_rcv(ill->ill_group); - + ret = ill_join_allmulti(ill); if (ret != 0) ilm_delete(ilm); return (ret); @@ -756,6 +690,14 @@ ip_ll_send_enabmulti_req(ill_t *ill, const in6_addr_t *v6groupp) ASSERT(IAM_WRITER_ILL(ill)); /* + * If we're on the IPMP ill, use the nominated multicast interface to + * send and receive DLPI messages, if one exists. (If none exists, + * there are no usable interfaces and thus nothing to do.) + */ + if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + return (0); + + /* * Create a AR_ENTRY_SQUERY message with a dl_enabmulti_req tacked * on. */ @@ -842,9 +784,8 @@ ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *v6groupp) } /* - * INADDR_ANY means all multicast addresses. This is only used - * by the multicast router. - * INADDR_ANY is stored as the IPv6 unspecifed addr. + * INADDR_ANY means all multicast addresses. + * INADDR_ANY is stored as the IPv6 unspecified addr. */ int ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) @@ -859,7 +800,7 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) return (EINVAL); /* - * INADDR_ANY is represented as the IPv6 unspecifed addr. + * INADDR_ANY is represented as the IPv6 unspecified addr. */ if (group == INADDR_ANY) v6group = ipv6_all_zeros; @@ -870,9 +811,7 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) * Look for a match on the ipif. * (IP_DROP_MEMBERSHIP specifies an ipif using an IP address). 
*/ - mutex_enter(&ill->ill_lock); ilm = ilm_lookup_ipif(ipif, group); - mutex_exit(&ill->ill_lock); if (ilm == NULL) return (ENOENT); @@ -897,11 +836,9 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) return (0); /* If we never joined, then don't leave. */ - if (ill->ill_join_allmulti) { + if (ill->ill_join_allmulti) ill_leave_allmulti(ill); - if (ill->ill_group != NULL) - (void) ill_nominate_mcast_rcv(ill->ill_group); - } + return (0); } @@ -921,11 +858,10 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) /* * The unspecified address means all multicast addresses. - * This is only used by the multicast router. */ int -ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, - zoneid_t zoneid, boolean_t no_ilg, boolean_t leaving) +ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, + boolean_t no_ilg, boolean_t leaving) { ipif_t *ipif; ilm_t *ilm; @@ -938,25 +874,8 @@ ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, /* * Look for a match on the ill. - * (IPV6_LEAVE_GROUP specifies an ill using an ifindex). - * - * Similar to ip_addmulti_v6, we should always look using - * the orig_ifindex. - * - * 1) If orig_ifindex is different from ill's ifindex - * we should have an ilm with orig_ifindex created in - * ip_addmulti_v6. We should delete that here. - * - * 2) If orig_ifindex is same as ill's ifindex, we should - * not delete the ilm that is temporarily here because of - * a FAILOVER. Those ilms will have a ilm_orig_ifindex - * different from ill's ifindex. - * - * Thus, always lookup using orig_ifindex. 
*/ - mutex_enter(&ill->ill_lock); - ilm = ilm_lookup_ill_index_v6(ill, v6group, orig_ifindex, zoneid); - mutex_exit(&ill->ill_lock); + ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid); if (ilm == NULL) return (ENOENT); @@ -985,11 +904,9 @@ ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, return (0); /* If we never joined, then don't leave. */ - if (ill->ill_join_allmulti) { + if (ill->ill_join_allmulti) ill_leave_allmulti(ill); - if (ill->ill_group != NULL) - (void) ill_nominate_mcast_rcv(ill->ill_group); - } + return (0); } @@ -1020,6 +937,13 @@ ip_ll_send_disabmulti_req(ill_t *ill, const in6_addr_t *v6groupp) uint32_t addrlen, addroff; ASSERT(IAM_WRITER_ILL(ill)); + + /* + * See comment in ip_ll_send_enabmulti_req(). + */ + if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + return (0); + /* * Create a AR_ENTRY_SQUERY message with a dl_disabmulti_req tacked * on. @@ -1099,16 +1023,16 @@ ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *v6group) } /* - * Make the driver pass up all multicast packets - * - * With ill groups, the caller makes sure that there is only - * one ill joining the allmulti group. + * Make the driver pass up all multicast packets. NOTE: to keep callers + * IPMP-unaware, if an IPMP ill is passed in, the ill_join_allmulti flag is + * set on it (rather than the cast ill). */ int ill_join_allmulti(ill_t *ill) { mblk_t *promiscon_mp, *promiscoff_mp; uint32_t addrlen, addroff; + ill_t *join_ill = ill; ASSERT(IAM_WRITER_ILL(ill)); @@ -1120,7 +1044,13 @@ ill_join_allmulti(ill_t *ill) return (0); } - ASSERT(!ill->ill_join_allmulti); + /* + * See comment in ip_ll_send_enabmulti_req(). 
+ */ + if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + return (0); + + ASSERT(!join_ill->ill_join_allmulti); /* * Create a DL_PROMISCON_REQ message and send it directly to the DLPI @@ -1144,20 +1074,18 @@ ill_join_allmulti(ill_t *ill) ill_dlpi_send(ill, promiscon_mp); } - ill->ill_join_allmulti = B_TRUE; + join_ill->ill_join_allmulti = B_TRUE; return (0); } /* * Make the driver stop passing up all multicast packets - * - * With ill groups, we need to nominate some other ill as - * this ipif->ipif_ill is leaving the group. */ void ill_leave_allmulti(ill_t *ill) { - mblk_t *promiscoff_mp = ill->ill_promiscoff_mp; + mblk_t *promiscoff_mp; + ill_t *leave_ill = ill; ASSERT(IAM_WRITER_ILL(ill)); @@ -1169,7 +1097,13 @@ ill_leave_allmulti(ill_t *ill) return; } - ASSERT(ill->ill_join_allmulti); + /* + * See comment in ip_ll_send_enabmulti_req(). + */ + if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + return; + + ASSERT(leave_ill->ill_join_allmulti); /* * Create a DL_PROMISCOFF_REQ message and send it directly to @@ -1179,12 +1113,13 @@ ill_leave_allmulti(ill_t *ill) */ if ((ill->ill_net_type == IRE_IF_RESOLVER) && !(ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST)) { + promiscoff_mp = ill->ill_promiscoff_mp; ASSERT(promiscoff_mp != NULL); ill->ill_promiscoff_mp = NULL; ill_dlpi_send(ill, promiscoff_mp); } - ill->ill_join_allmulti = B_FALSE; + leave_ill->ill_join_allmulti = B_FALSE; } static ill_t * @@ -1213,22 +1148,35 @@ int ip_join_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) { ill_t *ill; - int ret; + int ret = 0; if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL) return (ENODEV); + + /* + * The ip_addmulti*() functions won't allow IPMP underlying interfaces + * to join allmulti since only the nominated underlying interface in + * the group should receive multicast. 
We silently succeed to avoid + * having to teach IPobs (currently the only caller of this routine) + * to ignore failures in this case. + */ + if (IS_UNDER_IPMP(ill)) + goto out; + if (isv6) { - ret = ip_addmulti_v6(&ipv6_all_zeros, ill, ifindex, - ill->ill_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); + ret = ip_addmulti_v6(&ipv6_all_zeros, ill, ill->ill_zoneid, + ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); } else { ret = ip_addmulti(INADDR_ANY, ill->ill_ipif, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); } ill->ill_ipallmulti_cnt++; +out: ipsq_exit(ill->ill_phyint->phyint_ipsq); return (ret); } + int ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) { @@ -1236,14 +1184,17 @@ ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL) return (ENODEV); - ASSERT(ill->ill_ipallmulti_cnt != 0); - if (isv6) { - (void) ip_delmulti_v6(&ipv6_all_zeros, ill, ifindex, - ill->ill_zoneid, B_TRUE, B_TRUE); - } else { - (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, B_TRUE); + + if (ill->ill_ipallmulti_cnt > 0) { + if (isv6) { + (void) ip_delmulti_v6(&ipv6_all_zeros, ill, + ill->ill_zoneid, B_TRUE, B_TRUE); + } else { + (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, + B_TRUE); + } + ill->ill_ipallmulti_cnt--; } - ill->ill_ipallmulti_cnt--; ipsq_exit(ill->ill_phyint->phyint_ipsq); return (0); } @@ -1260,8 +1211,7 @@ ip_purge_allmulti(ill_t *ill) for (; ill->ill_ipallmulti_cnt > 0; ill->ill_ipallmulti_cnt--) { if (ill->ill_isv6) { (void) ip_delmulti_v6(&ipv6_all_zeros, ill, - ill->ill_phyint->phyint_ifindex, ill->ill_zoneid, - B_TRUE, B_TRUE); + ill->ill_zoneid, B_TRUE, B_TRUE); } else { (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, B_TRUE); @@ -1539,13 +1489,14 @@ void ill_recover_multicast(ill_t *ill) { ilm_t *ilm; + ipif_t *ipif = ill->ill_ipif; char addrbuf[INET6_ADDRSTRLEN]; ASSERT(IAM_WRITER_ILL(ill)); ill->ill_need_recover_multicast = 0; - ILM_WALKER_HOLD(ill); + 
ill_ilm_walker_hold(ill); for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { /* * Check how many ipif's that have members in this group - @@ -1553,47 +1504,45 @@ ill_recover_multicast(ill_t *ill) * in the list. */ if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 && - ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm) + ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE, + ALL_ZONES) != ilm) { continue; - ip1dbg(("ill_recover_multicast: %s\n", - inet_ntop(AF_INET6, &ilm->ilm_v6addr, addrbuf, - sizeof (addrbuf)))); + } + + ip1dbg(("ill_recover_multicast: %s\n", inet_ntop(AF_INET6, + &ilm->ilm_v6addr, addrbuf, sizeof (addrbuf)))); + if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - if (ill->ill_group == NULL) { - (void) ill_join_allmulti(ill); - } else { - /* - * We don't want to join on this ill, - * if somebody else in the group has - * already been nominated. - */ - (void) ill_nominate_mcast_rcv(ill->ill_group); - } + (void) ill_join_allmulti(ill); } else { - (void) ip_ll_addmulti_v6(ill->ill_ipif, - &ilm->ilm_v6addr); + if (ill->ill_isv6) + mld_joingroup(ilm); + else + igmp_joingroup(ilm); + + (void) ip_ll_addmulti_v6(ipif, &ilm->ilm_v6addr); } } - ILM_WALKER_RELE(ill); + ill_ilm_walker_rele(ill); + } /* * The opposite of ill_recover_multicast() -- leaves all multicast groups - * that were explicitly joined. Note that both these functions could be - * disposed of if we enhanced ARP to allow us to handle DL_DISABMULTI_REQ - * and DL_ENABMULTI_REQ messages when an interface is down. + * that were explicitly joined. */ void ill_leave_multicast(ill_t *ill) { ilm_t *ilm; + ipif_t *ipif = ill->ill_ipif; char addrbuf[INET6_ADDRSTRLEN]; ASSERT(IAM_WRITER_ILL(ill)); ill->ill_need_recover_multicast = 1; - ILM_WALKER_HOLD(ill); + ill_ilm_walker_hold(ill); for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { /* * Check how many ipif's that have members in this group - @@ -1601,25 +1550,26 @@ ill_leave_multicast(ill_t *ill) * in the list. 
*/ if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 && - ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm) + ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE, + ALL_ZONES) != ilm) { continue; - ip1dbg(("ill_leave_multicast: %s\n", - inet_ntop(AF_INET6, &ilm->ilm_v6addr, addrbuf, - sizeof (addrbuf)))); + } + + ip1dbg(("ill_leave_multicast: %s\n", inet_ntop(AF_INET6, + &ilm->ilm_v6addr, addrbuf, sizeof (addrbuf)))); + if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { ill_leave_allmulti(ill); - /* - * If we were part of an IPMP group, then - * ill_handoff_responsibility() has already - * nominated a new member (so we don't). - */ - ASSERT(ill->ill_group == NULL); } else { - (void) ip_ll_delmulti_v6(ill->ill_ipif, - &ilm->ilm_v6addr); + if (ill->ill_isv6) + mld_leavegroup(ilm); + else + igmp_leavegroup(ilm); + + (void) ip_ll_delmulti_v6(ipif, &ilm->ilm_v6addr); } } - ILM_WALKER_RELE(ill); + ill_ilm_walker_rele(ill); } /* Find an ilm for matching the ill */ @@ -1628,91 +1578,79 @@ ilm_lookup_ill(ill_t *ill, ipaddr_t group, zoneid_t zoneid) { in6_addr_t v6group; - ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock)); /* - * INADDR_ANY is represented as the IPv6 unspecifed addr. + * INADDR_ANY is represented as the IPv6 unspecified addr. */ if (group == INADDR_ANY) v6group = ipv6_all_zeros; else IN6_IPADDR_TO_V4MAPPED(group, &v6group); - return (ilm_lookup_ill_v6(ill, &v6group, zoneid)); + return (ilm_lookup_ill_v6(ill, &v6group, B_TRUE, zoneid)); } /* - * Find an ilm for matching the ill. All the ilm lookup functions - * ignore ILM_DELETED ilms. These have been logically deleted, and - * igmp and linklayer disable multicast have been done. Only mi_free - * yet to be done. Still there in the list due to ilm_walkers. The - * last walker will release it. + * Find an ilm for address `v6group' on `ill' and zone `zoneid' (which may be + * ALL_ZONES). In general, if `ill' is in an IPMP group, we will match + * against any ill in the group. 
However, if `restrict_solicited' is set, + * then specifically for IPv6 solicited-node multicast, the match will be + * restricted to the specified `ill'. */ ilm_t * -ilm_lookup_ill_v6(ill_t *ill, const in6_addr_t *v6group, zoneid_t zoneid) +ilm_lookup_ill_v6(ill_t *ill, const in6_addr_t *v6group, + boolean_t restrict_solicited, zoneid_t zoneid) { ilm_t *ilm; + ilm_walker_t ilw; + boolean_t restrict_ill = B_FALSE; - ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock)); + /* + * In general, underlying interfaces cannot have multicast memberships + * and thus lookups always match across the illgrp. However, we must + * allow IPv6 solicited-node multicast memberships on underlying + * interfaces, and thus an IPMP meta-interface and one of its + * underlying ills may have the same solicited-node multicast address. + * In that case, we need to restrict the lookup to the requested ill. + * However, we may receive packets on an underlying interface that + * are for the corresponding IPMP interface's solicited-node multicast + * address, and thus in that case we need to match across the group -- + * hence the unfortunate `restrict_solicited' argument. 
+ */ + if (IN6_IS_ADDR_MC_SOLICITEDNODE(v6group) && restrict_solicited) + restrict_ill = (IS_IPMP(ill) || IS_UNDER_IPMP(ill)); - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group)) continue; - if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) && - (zoneid == ALL_ZONES || zoneid == ilm->ilm_zoneid)) - return (ilm); - } - return (NULL); -} - -ilm_t * -ilm_lookup_ill_index_v6(ill_t *ill, const in6_addr_t *v6group, int index, - zoneid_t zoneid) -{ - ilm_t *ilm; - - ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock)); - - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) + if (zoneid != ALL_ZONES && zoneid != ilm->ilm_zoneid) continue; - if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) && - (zoneid == ALL_ZONES || zoneid == ilm->ilm_zoneid) && - ilm->ilm_orig_ifindex == index) { - return (ilm); + if (!restrict_ill || ill == (ill->ill_isv6 ? + ilm->ilm_ill : ilm->ilm_ipif->ipif_ill)) { + break; } } - return (NULL); + ilm_walker_finish(&ilw); + return (ilm); } - /* - * Found an ilm for the ipif. Only needed for IPv4 which does + * Find an ilm for the ipif. Only needed for IPv4 which does * ipif specific socket options. */ ilm_t * ilm_lookup_ipif(ipif_t *ipif, ipaddr_t group) { - ill_t *ill = ipif->ipif_ill; - ilm_t *ilm; - in6_addr_t v6group; - - ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock)); - /* - * INADDR_ANY is represented as the IPv6 unspecifed addr. 
- */ - if (group == INADDR_ANY) - v6group = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(group, &v6group); + ilm_t *ilm; + ilm_walker_t ilw; - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) - continue; - if (ilm->ilm_ipif == ipif && - IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, &v6group)) - return (ilm); + ilm = ilm_walker_start(&ilw, ipif->ipif_ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (ilm->ilm_ipif == ipif && ilm->ilm_addr == group) + break; } - return (NULL); + ilm_walker_finish(&ilw); + return (ilm); } /* @@ -1739,8 +1677,7 @@ ilm_numentries_v6(ill_t *ill, const in6_addr_t *v6group) /* Caller guarantees that the group is not already on the list */ static ilm_t * ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat, - mcast_record_t ilg_fmode, slist_t *ilg_flist, int orig_ifindex, - zoneid_t zoneid) + mcast_record_t ilg_fmode, slist_t *ilg_flist, zoneid_t zoneid) { ill_t *ill = ipif->ipif_ill; ilm_t *ilm; @@ -1783,19 +1720,10 @@ ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat, (char *), "ilm", (void *), ilm); ipif->ipif_ilm_cnt++; } + ASSERT(ill->ill_ipst); ilm->ilm_ipst = ill->ill_ipst; /* No netstack_hold */ - /* - * After this if ilm moves to a new ill, we don't change - * the ilm_orig_ifindex. Thus, if ill_index != ilm_orig_ifindex, - * it has been moved. Indexes don't match even when the application - * wants to join on a FAILED/INACTIVE interface because we choose - * a new interface to join in. This is considered as an implicit - * move. 
- */ - ilm->ilm_orig_ifindex = orig_ifindex; - ASSERT(!(ipif->ipif_state_flags & IPIF_CONDEMNED)); ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED)); @@ -1969,6 +1897,108 @@ ilm_delete(ilm_t *ilm) } } +/* Increment the ILM walker count for `ill' */ +static void +ill_ilm_walker_hold(ill_t *ill) +{ + mutex_enter(&ill->ill_lock); + ill->ill_ilm_walker_cnt++; + mutex_exit(&ill->ill_lock); +} + +/* Decrement the ILM walker count for `ill' */ +static void +ill_ilm_walker_rele(ill_t *ill) +{ + mutex_enter(&ill->ill_lock); + ill->ill_ilm_walker_cnt--; + if (ill->ill_ilm_walker_cnt == 0 && ill->ill_ilm_cleanup_reqd) + ilm_walker_cleanup(ill); /* drops ill_lock */ + else + mutex_exit(&ill->ill_lock); +} + +/* + * Start walking the ILMs associated with `ill'; the first ILM in the walk + * (if any) is returned. State associated with the walk is stored in `ilw'. + * Note that walks associated with interfaces under IPMP also walk the ILMs + * on the associated IPMP interface; this is handled transparently to callers + * via ilm_walker_step(). (Usually with IPMP all ILMs will be on the IPMP + * interface; the only exception is to support IPv6 test addresses, which + * require ILMs for their associated solicited-node multicast addresses.) + */ +ilm_t * +ilm_walker_start(ilm_walker_t *ilw, ill_t *ill) +{ + ilw->ilw_ill = ill; + if (IS_UNDER_IPMP(ill)) + ilw->ilw_ipmp_ill = ipmp_ill_hold_ipmp_ill(ill); + else + ilw->ilw_ipmp_ill = NULL; + + ill_ilm_walker_hold(ill); + if (ilw->ilw_ipmp_ill != NULL) + ill_ilm_walker_hold(ilw->ilw_ipmp_ill); + + if (ilw->ilw_ipmp_ill != NULL && ilw->ilw_ipmp_ill->ill_ilm != NULL) + ilw->ilw_walk_ill = ilw->ilw_ipmp_ill; + else + ilw->ilw_walk_ill = ilw->ilw_ill; + + return (ilm_walker_step(ilw, NULL)); +} + +/* + * Helper function for ilm_walker_step() that returns the next ILM + * associated with `ilw', regardless of whether it's deleted. 
+ */ +static ilm_t * +ilm_walker_step_all(ilm_walker_t *ilw, ilm_t *ilm) +{ + if (ilm == NULL) + return (ilw->ilw_walk_ill->ill_ilm); + + if (ilm->ilm_next != NULL) + return (ilm->ilm_next); + + if (ilw->ilw_ipmp_ill != NULL && IS_IPMP(ilw->ilw_walk_ill)) { + ilw->ilw_walk_ill = ilw->ilw_ill; + /* + * It's possible that ilw_ill left the group during our walk, + * so we can't ASSERT() that it's under IPMP. Callers that + * care will be writer on the IPSQ anyway. + */ + return (ilw->ilw_walk_ill->ill_ilm); + } + return (NULL); +} + +/* + * Step to the next ILM associated with `ilw'. + */ +ilm_t * +ilm_walker_step(ilm_walker_t *ilw, ilm_t *ilm) +{ + while ((ilm = ilm_walker_step_all(ilw, ilm)) != NULL) { + if (!(ilm->ilm_flags & ILM_DELETED)) + break; + } + return (ilm); +} + +/* + * Finish the ILM walk associated with `ilw'. + */ +void +ilm_walker_finish(ilm_walker_t *ilw) +{ + ill_ilm_walker_rele(ilw->ilw_ill); + if (ilw->ilw_ipmp_ill != NULL) { + ill_ilm_walker_rele(ilw->ilw_ipmp_ill); + ill_refrele(ilw->ilw_ipmp_ill); + } + bzero(&ilw, sizeof (ilw)); +} /* * Looks up the appropriate ipif given a v4 multicast group and interface @@ -2256,16 +2286,15 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, * didn't find an ilg, there's nothing to do. */ if (!leave_grp) - ilg = conn_ilg_alloc(connp); + ilg = conn_ilg_alloc(connp, &err); if (leave_grp || ilg == NULL) { mutex_exit(&connp->conn_lock); - return (leave_grp ? 0 : ENOMEM); + return (leave_grp ? 
0 : err); } ilgstat = ILGSTAT_NEW; IN6_IPADDR_TO_V4MAPPED(grp, &ilg->ilg_v6group); ilg->ilg_ipif = ipif; ilg->ilg_ill = NULL; - ilg->ilg_orig_ifindex = 0; } else if (leave_grp) { ilg_delete(connp, ilg, NULL); mutex_exit(&connp->conn_lock); @@ -2389,7 +2418,7 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, const struct in6_addr *grp, ill_t *ill) { ilg_t *ilg; - int i, orig_ifindex, orig_fmode, new_fmode, err; + int i, orig_fmode, new_fmode, err; slist_t *orig_filter = NULL; slist_t *new_filter = NULL; struct sockaddr_storage *sl; @@ -2409,65 +2438,31 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, ASSERT(IAM_WRITER_ILL(ill)); - /* - * Use the ifindex to do the lookup. We can't use the ill - * directly because ilg_ill could point to a different ill - * if things have moved. - */ - orig_ifindex = ill->ill_phyint->phyint_ifindex; - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_index_v6(connp, grp, orig_ifindex); + ilg = ilg_lookup_ill_v6(connp, grp, ill); if (ilg == NULL) { /* * if the request was actually to leave, and we * didn't find an ilg, there's nothing to do. */ if (!leave_grp) - ilg = conn_ilg_alloc(connp); + ilg = conn_ilg_alloc(connp, &err); if (leave_grp || ilg == NULL) { mutex_exit(&connp->conn_lock); - return (leave_grp ? 0 : ENOMEM); + return (leave_grp ? 0 : err); } ilgstat = ILGSTAT_NEW; ilg->ilg_v6group = *grp; ilg->ilg_ipif = NULL; - /* - * Choose our target ill to join on. This might be - * different from the ill we've been given if it's - * currently down and part of a group. - * - * new ill is not refheld; we are writer. - */ - ill = ip_choose_multi_ill(ill, grp); - ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED)); ilg->ilg_ill = ill; - /* - * Remember the index that we joined on, so that we can - * successfully delete them later on and also search for - * duplicates if the application wants to join again. 
- */ - ilg->ilg_orig_ifindex = orig_ifindex; } else if (leave_grp) { - /* - * Use the ilg's current ill for the deletion, - * we might have failed over. - */ - ill = ilg->ilg_ill; ilg_delete(connp, ilg, NULL); mutex_exit(&connp->conn_lock); - (void) ip_delmulti_v6(grp, ill, orig_ifindex, - connp->conn_zoneid, B_FALSE, B_TRUE); + (void) ip_delmulti_v6(grp, ill, connp->conn_zoneid, B_FALSE, + B_TRUE); return (0); } else { ilgstat = ILGSTAT_CHANGE; - /* - * The current ill might be different from the one we were - * asked to join on (if failover has occurred); we should - * join on the ill stored in the ilg. The original ill - * is noted in ilg_orig_ifindex, which matched our request. - */ - ill = ilg->ilg_ill; /* preserve existing state in case ip_addmulti() fails */ orig_fmode = ilg->ilg_fmode; if (ilg->ilg_filter == NULL) { @@ -2531,8 +2526,8 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, mutex_exit(&connp->conn_lock); - err = ip_addmulti_v6(grp, ill, orig_ifindex, connp->conn_zoneid, - ilgstat, new_fmode, new_filter); + err = ip_addmulti_v6(grp, ill, connp->conn_zoneid, ilgstat, new_fmode, + new_filter); if (err != 0) { /* * Restore the original filter state, or delete the @@ -2541,7 +2536,7 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, * conn_lock. */ mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_index_v6(connp, grp, orig_ifindex); + ilg = ilg_lookup_ill_v6(connp, grp, ill); ASSERT(ilg != NULL); if (ilgstat == ILGSTAT_NEW) { ilg_delete(connp, ilg, NULL); @@ -3043,20 +3038,12 @@ ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src) { ilg_t *ilg; - ill_t *ilg_ill; - uint_t ilg_orig_ifindex; boolean_t leaving = B_TRUE; ASSERT(IAM_WRITER_ILL(ill)); - /* - * Use the index that we originally used to join. We can't - * use the ill directly because ilg_ill could point to - * a new ill if things have moved. 
- */ mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_index_v6(connp, v6group, - ill->ill_phyint->phyint_ifindex); + ilg = ilg_lookup_ill_v6(connp, v6group, ill); if ((ilg == NULL) || (ilg->ilg_flags & ILG_DELETED)) { mutex_exit(&connp->conn_lock); return (EADDRNOTAVAIL); @@ -3087,12 +3074,10 @@ ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group, leaving = B_FALSE; } - ilg_ill = ilg->ilg_ill; - ilg_orig_ifindex = ilg->ilg_orig_ifindex; ilg_delete(connp, ilg, v6src); mutex_exit(&connp->conn_lock); - (void) ip_delmulti_v6(v6group, ilg_ill, ilg_orig_ifindex, - connp->conn_zoneid, B_FALSE, leaving); + (void) ip_delmulti_v6(v6group, ill, connp->conn_zoneid, B_FALSE, + leaving); return (0); } @@ -3345,10 +3330,10 @@ ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, mcast_record_t fmode, if (ilg == NULL) { ilgstat = ILGSTAT_NEW; - if ((ilg = conn_ilg_alloc(connp)) == NULL) { + if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) { mutex_exit(&connp->conn_lock); l_free(new_filter); - return (ENOMEM); + return (error); } if (src != INADDR_ANY) { ilg->ilg_filter = l_alloc(); @@ -3369,7 +3354,6 @@ ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, mcast_record_t fmode, } ilg->ilg_ipif = ipif; ilg->ilg_ill = NULL; - ilg->ilg_orig_ifindex = 0; ilg->ilg_fmode = fmode; } else { int index; @@ -3437,7 +3421,6 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src) { int error = 0; - int orig_ifindex; ilg_t *ilg; ilg_stat_t ilgstat; slist_t *new_filter = NULL; @@ -3456,13 +3439,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, */ mutex_enter(&connp->conn_lock); - /* - * Use the ifindex to do the lookup. We can't use the ill - * directly because ilg_ill could point to a different ill if - * things have moved. 
- */ - orig_ifindex = ill->ill_phyint->phyint_ifindex; - ilg = ilg_lookup_ill_index_v6(connp, v6group, orig_ifindex); + ilg = ilg_lookup_ill_v6(connp, v6group, ill); /* * Depending on the option we're handling, may or may not be okay @@ -3501,10 +3478,10 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, } if (ilg == NULL) { - if ((ilg = conn_ilg_alloc(connp)) == NULL) { + if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) { mutex_exit(&connp->conn_lock); l_free(new_filter); - return (ENOMEM); + return (error); } if (!IN6_IS_ADDR_UNSPECIFIED(v6src)) { ilg->ilg_filter = l_alloc(); @@ -3521,22 +3498,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, ilg->ilg_v6group = *v6group; ilg->ilg_fmode = fmode; ilg->ilg_ipif = NULL; - /* - * Choose our target ill to join on. This might be different - * from the ill we've been given if it's currently down and - * part of a group. - * - * new ill is not refheld; we are writer. - */ - ill = ip_choose_multi_ill(ill, v6group); - ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED)); ilg->ilg_ill = ill; - /* - * Remember the orig_ifindex that we joined on, so that we - * can successfully delete them later on and also search - * for duplicates if the application wants to join again. - */ - ilg->ilg_orig_ifindex = orig_ifindex; } else { int index; if (ilg->ilg_fmode != fmode || IN6_IS_ADDR_UNSPECIFIED(v6src)) { @@ -3560,13 +3522,6 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, ilgstat = ILGSTAT_CHANGE; index = ilg->ilg_filter->sl_numsrc++; ilg->ilg_filter->sl_addr[index] = *v6src; - /* - * The current ill might be different from the one we were - * asked to join on (if failover has occurred); we should - * join on the ill stored in the ilg. The original ill - * is noted in ilg_orig_ifindex, which matched our request. 
- */ - ill = ilg->ilg_ill; } /* @@ -3584,8 +3539,8 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, * info for the ill, which involves looking at the status of * all the ilgs associated with this group/interface pair. */ - error = ip_addmulti_v6(v6group, ill, orig_ifindex, connp->conn_zoneid, - ilgstat, new_fmode, new_filter); + error = ip_addmulti_v6(v6group, ill, connp->conn_zoneid, ilgstat, + new_fmode, new_filter); if (error != 0) { /* * But because we waited, we have to undo the ilg update @@ -3595,7 +3550,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, in6_addr_t delsrc = (ilgstat == ILGSTAT_NEW) ? ipv6_all_zeros : *v6src; mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_index_v6(connp, v6group, orig_ifindex); + ilg = ilg_lookup_ill_v6(connp, v6group, ill); ASSERT(ilg != NULL); ilg_delete(connp, ilg, &delsrc); mutex_exit(&connp->conn_lock); @@ -3639,7 +3594,7 @@ ilg_lookup_ill_withsrc(conn_t *connp, ipaddr_t group, ipaddr_t src, ill_t *ill) ASSERT(ilg->ilg_ill == NULL); ilg_ill = ipif->ipif_ill; ASSERT(!ilg_ill->ill_isv6); - if (ilg_ill == ill && + if (IS_ON_SAME_LAN(ilg_ill, ill) && IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, &v6group)) { if (SLIST_IS_EMPTY(ilg->ilg_filter)) { /* no source filter, so this is a match */ @@ -3692,7 +3647,7 @@ ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group, continue; ASSERT(ilg->ilg_ipif == NULL); ASSERT(ilg_ill->ill_isv6); - if (ilg_ill == ill && + if (IS_ON_SAME_LAN(ilg_ill, ill) && IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) { if (SLIST_IS_EMPTY(ilg->ilg_filter)) { /* no source filter, so this is a match */ @@ -3724,35 +3679,6 @@ ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group, } /* - * Get the ilg whose ilg_orig_ifindex is associated with ifindex. - * This is useful when the interface fails and we have moved - * to a new ill, but still would like to locate using the index - * that we originally used to join. Used only for IPv6 currently. 
- */ -static ilg_t * -ilg_lookup_ill_index_v6(conn_t *connp, const in6_addr_t *v6group, int ifindex) -{ - ilg_t *ilg; - int i; - - ASSERT(MUTEX_HELD(&connp->conn_lock)); - for (i = 0; i < connp->conn_ilg_inuse; i++) { - ilg = &connp->conn_ilg[i]; - if (ilg->ilg_ill == NULL || - (ilg->ilg_flags & ILG_DELETED) != 0) - continue; - /* ilg_ipif is NULL for V6 */ - ASSERT(ilg->ilg_ipif == NULL); - ASSERT(ilg->ilg_orig_ifindex != 0); - if (IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group) && - ilg->ilg_orig_ifindex == ifindex) { - return (ilg); - } - } - return (NULL); -} - -/* * Find an IPv6 ilg matching group and ill */ ilg_t * @@ -3863,32 +3789,28 @@ ilg_delete_all(conn_t *connp) in6_addr_t v6group; boolean_t success; ipsq_t *ipsq; - int orig_ifindex; mutex_enter(&connp->conn_lock); retry: ILG_WALKER_HOLD(connp); - for (i = connp->conn_ilg_inuse - 1; i >= 0; ) { + for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { ilg = &connp->conn_ilg[i]; /* * Since this walk is not atomic (we drop the * conn_lock and wait in ipsq_enter) we need * to check for the ILG_DELETED flag. */ - if (ilg->ilg_flags & ILG_DELETED) { - /* Go to the next ilg */ - i--; + if (ilg->ilg_flags & ILG_DELETED) continue; - } - v6group = ilg->ilg_v6group; - if (IN6_IS_ADDR_V4MAPPED(&v6group)) { + if (IN6_IS_ADDR_V4MAPPED(&ilg->ilg_v6group)) { ipif = ilg->ilg_ipif; ill = ipif->ipif_ill; } else { ipif = NULL; ill = ilg->ilg_ill; } + /* * We may not be able to refhold the ill if the ill/ipif * is changing. But we need to make sure that the ill will @@ -3897,11 +3819,9 @@ retry: * in which case the unplumb thread will handle the cleanup, * and we move on to the next ilg. 
*/ - if (!ill_waiter_inc(ill)) { - /* Go to the next ilg */ - i--; + if (!ill_waiter_inc(ill)) continue; - } + mutex_exit(&connp->conn_lock); /* * To prevent deadlock between ill close which waits inside @@ -3916,51 +3836,31 @@ retry: ipsq = ill->ill_phyint->phyint_ipsq; ill_waiter_dcr(ill); mutex_enter(&connp->conn_lock); - if (!success) { - /* Go to the next ilg */ - i--; + if (!success) continue; - } /* - * Make sure that nothing has changed under. For eg. - * a failover/failback can change ilg_ill while we were - * waiting to become exclusive above + * Move on if the ilg was deleted while conn_lock was dropped. */ - if (IN6_IS_ADDR_V4MAPPED(&v6group)) { - ipif = ilg->ilg_ipif; - ill = ipif->ipif_ill; - } else { - ipif = NULL; - ill = ilg->ilg_ill; - } - if (!IAM_WRITER_ILL(ill) || (ilg->ilg_flags & ILG_DELETED)) { - /* - * The ilg has changed under us probably due - * to a failover or unplumb. Retry on the same ilg. - */ + if (ilg->ilg_flags & ILG_DELETED) { mutex_exit(&connp->conn_lock); ipsq_exit(ipsq); mutex_enter(&connp->conn_lock); continue; } v6group = ilg->ilg_v6group; - orig_ifindex = ilg->ilg_orig_ifindex; ilg_delete(connp, ilg, NULL); mutex_exit(&connp->conn_lock); - if (ipif != NULL) + if (ipif != NULL) { (void) ip_delmulti(V4_PART_OF_V6(v6group), ipif, B_FALSE, B_TRUE); - - else - (void) ip_delmulti_v6(&v6group, ill, orig_ifindex, + } else { + (void) ip_delmulti_v6(&v6group, ill, connp->conn_zoneid, B_FALSE, B_TRUE); - + } ipsq_exit(ipsq); mutex_enter(&connp->conn_lock); - /* Go to the next ilg */ - i--; } ILG_WALKER_RELE(connp); @@ -4063,7 +3963,6 @@ conn_delete_ill(conn_t *connp, caddr_t arg) int i; char group_buf[INET6_ADDRSTRLEN]; in6_addr_t v6group; - int orig_ifindex; ilg_t *ilg; /* @@ -4097,11 +3996,10 @@ conn_delete_ill(conn_t *connp, caddr_t arg) ill->ill_name)); v6group = ilg->ilg_v6group; - orig_ifindex = ilg->ilg_orig_ifindex; ilg_delete(connp, ilg, NULL); mutex_exit(&connp->conn_lock); - (void) ip_delmulti_v6(&v6group, ill, 
orig_ifindex, + (void) ip_delmulti_v6(&v6group, ill, connp->conn_zoneid, B_FALSE, B_TRUE); mutex_enter(&connp->conn_lock); } @@ -4115,7 +4013,6 @@ conn_delete_ill(conn_t *connp, caddr_t arg) if (connp->conn_multicast_ill == ill) { /* Revert to late binding */ connp->conn_multicast_ill = NULL; - connp->conn_orig_multicast_ifindex = 0; } mutex_exit(&connp->conn_lock); } diff --git a/usr/src/uts/common/inet/ip/ip_ndp.c b/usr/src/uts/common/inet/ip/ip_ndp.c index b53897cefe..895cc74bd2 100644 --- a/usr/src/uts/common/inet/ip/ip_ndp.c +++ b/usr/src/uts/common/inet/ip/ip_ndp.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -83,8 +83,9 @@ static boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr, static void nce_ire_delete(nce_t *nce); static void nce_ire_delete1(ire_t *ire, char *nce_arg); static void nce_set_ll(nce_t *nce, uchar_t *ll_addr); -static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *); -static nce_t *nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr); +static nce_t *nce_lookup_addr(ill_t *, boolean_t, const in6_addr_t *, + nce_t *); +static nce_t *nce_lookup_mapping(ill_t *, const in6_addr_t *); static void nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr); static int nce_set_multicast(ill_t *ill, const in6_addr_t *addr); @@ -93,11 +94,16 @@ static mblk_t *nce_udreq_alloc(ill_t *ill); static void nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr); static uint32_t nce_solicit(nce_t *nce, mblk_t *mp); -static boolean_t nce_xmit(ill_t *ill, uint32_t operation, - ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender, +static boolean_t nce_xmit(ill_t *ill, uint8_t type, + boolean_t use_lla_addr, const in6_addr_t *sender, const in6_addr_t *target, int flag); +static boolean_t nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, + 
const in6_addr_t *target, uint_t flags); +static boolean_t nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, + const in6_addr_t *src, uint_t flags); static int ndp_add_v4(ill_t *, const in_addr_t *, uint16_t, nce_t **, nce_t *); +static ipif_t *ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill); #ifdef DEBUG static void nce_trace_cleanup(const nce_t *); @@ -110,22 +116,6 @@ static void nce_trace_cleanup(const nce_t *); (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \ NCE_TABLE_SIZE)])) -/* - * Compute default flags to use for an advertisement of this nce's address. - */ -static int -nce_advert_flags(const nce_t *nce) -{ - int flag = 0; - - if (nce->nce_flags & NCE_F_ISROUTER) - flag |= NDP_ISROUTER; - if (!(nce->nce_flags & NCE_F_ANYCAST)) - flag |= NDP_ORIDE; - - return (flag); -} - /* Non-tunable probe interval, based on link capabilities */ #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500) @@ -262,8 +252,7 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, mutex_exit(&ipst->ips_ndp6->ndp_g_lock); nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; mutex_exit(&nce->nce_lock); - dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE, - &ipv6_all_zeros, addr, NDP_PROBE); + dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE); if (dropped) { mutex_enter(&nce->nce_lock); nce->nce_pcnt++; @@ -282,23 +271,20 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, mutex_exit(&ipst->ips_ndp6->ndp_g_lock); nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1; mutex_exit(&nce->nce_lock); - dropped = nce_xmit(ill, - ND_NEIGHBOR_ADVERT, - ill, /* ill to be used for extracting ill_nd_lla */ - B_TRUE, /* use ill_nd_lla */ - addr, /* Source and target of the advertisement pkt */ - &ipv6_all_hosts_mcast, /* Destination of the packet */ - nce_advert_flags(nce)); + dropped = nce_xmit_advert(nce, B_TRUE, &ipv6_all_hosts_mcast, + 0); mutex_enter(&nce->nce_lock); if (dropped) nce->nce_unsolicit_count++; if 
(nce->nce_unsolicit_count != 0) { + ASSERT(nce->nce_timeout_id == 0); nce->nce_timeout_id = timeout(ndp_timer, nce, MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval)); } mutex_exit(&nce->nce_lock); mutex_enter(&ipst->ips_ndp6->ndp_g_lock); } + /* * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then * we call nce_fastpath as soon as the nce is resolved in ndp_process. @@ -311,10 +297,10 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, } int -ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, - const in6_addr_t *mask, const in6_addr_t *extract_mask, - uint32_t hw_extract_start, uint16_t flags, uint16_t state, - nce_t **newnce) +ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr, + const in6_addr_t *addr, const in6_addr_t *mask, + const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags, + uint16_t state, nce_t **newnce) { int err = 0; nce_t *nce; @@ -325,7 +311,7 @@ ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, /* Get head of v6 hash table */ nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - nce = nce_lookup_addr(ill, addr, nce); + nce = nce_lookup_addr(ill, match_illgrp, addr, nce); if (nce == NULL) { err = ndp_add_v6(ill, hw_addr, @@ -562,13 +548,11 @@ nce_ire_delete_list(nce_t *nce) if (nce->nce_ipversion == IPV4_VERSION) { ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, nce_ire_delete1, - (char *)nce, nce->nce_ill); + IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill); } else { ASSERT(nce->nce_ipversion == IPV6_VERSION); ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, nce_ire_delete1, - (char *)nce, nce->nce_ill); + IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill); } NCE_REFRELE_NOTR(nce); nce = nce_next; @@ -628,8 +612,7 @@ ndp_restart_dad(nce_t *nce) nce->nce_state = ND_PROBE; nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1; mutex_exit(&nce->nce_lock); - dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL, - 
B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE); + dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE); if (dropped) { mutex_enter(&nce->nce_lock); nce->nce_pcnt++; @@ -649,22 +632,19 @@ ndp_restart_dad(nce_t *nce) * If one is found, the refcnt on the nce will be incremented. */ nce_t * -ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock) +ndp_lookup_v6(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr, + boolean_t caller_holds_lock) { nce_t *nce; - ip_stack_t *ipst; - - ASSERT(ill != NULL); - ipst = ill->ill_ipst; + ip_stack_t *ipst = ill->ill_ipst; - ASSERT(ill != NULL && ill->ill_isv6); - if (!caller_holds_lock) { + ASSERT(ill->ill_isv6); + if (!caller_holds_lock) mutex_enter(&ipst->ips_ndp6->ndp_g_lock); - } /* Get head of v6 hash table */ nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - nce = nce_lookup_addr(ill, addr, nce); + nce = nce_lookup_addr(ill, match_illgrp, addr, nce); if (nce == NULL) nce = nce_lookup_mapping(ill, addr); if (!caller_holds_lock) @@ -685,14 +665,17 @@ ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock) in6_addr_t addr6; ip_stack_t *ipst = ill->ill_ipst; - if (!caller_holds_lock) { + if (!caller_holds_lock) mutex_enter(&ipst->ips_ndp4->ndp_g_lock); - } /* Get head of v4 hash table */ nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); - nce = nce_lookup_addr(ill, &addr6, nce); + /* + * NOTE: IPv4 never matches across the illgrp since the NCE's we're + * looking up have fastpath headers that are inherently per-ill. + */ + nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce); if (!caller_holds_lock) mutex_exit(&ipst->ips_ndp4->ndp_g_lock); return (nce); @@ -706,7 +689,8 @@ ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock) * lock (ndp_g_lock). 
*/ static nce_t * -nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce) +nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr, + nce_t *nce) { ndp_g_t *ndp; ip_stack_t *ipst = ill->ill_ipst; @@ -716,12 +700,12 @@ nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce) else ndp = ipst->ips_ndp4; - ASSERT(ill != NULL); ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); if (IN6_IS_ADDR_UNSPECIFIED(addr)) return (NULL); for (; nce != NULL; nce = nce->nce_next) { - if (nce->nce_ill == ill) { + if (nce->nce_ill == ill || + match_illgrp && IS_IN_SAME_ILLGRP(ill, nce->nce_ill)) { if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) && IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) { @@ -771,8 +755,8 @@ nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr) * Process passed in parameters either from an incoming packet or via * user ioctl. */ -void -ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) +static void +nce_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) { ill_t *ill = nce->nce_ill; uint32_t hw_addr_len = ill->ill_nd_lla_len; @@ -852,7 +836,7 @@ ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) } else { /* * Send locally originated packets back - * into * ip_wput_v6. + * into ip_wput_v6. */ put(ill->ill_wq, mp); } @@ -918,6 +902,65 @@ ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) } /* + * Walker state structure used by ndp_process() / ndp_process_entry(). + */ +typedef struct ndp_process_data { + ill_t *np_ill; /* ill/illgrp to match against */ + const in6_addr_t *np_addr; /* IPv6 address to match */ + uchar_t *np_hw_addr; /* passed to nce_process() */ + uint32_t np_flag; /* passed to nce_process() */ + boolean_t np_is_adv; /* passed to nce_process() */ +} ndp_process_data_t; + +/* + * Walker callback used by ndp_process() for IPMP groups: calls nce_process() + * for each NCE with a matching address that's in the same IPMP group. 
+ */ +static void +ndp_process_entry(nce_t *nce, void *arg) +{ + ndp_process_data_t *npp = arg; + + if (IS_IN_SAME_ILLGRP(nce->nce_ill, npp->np_ill) && + IN6_ARE_ADDR_EQUAL(&nce->nce_addr, npp->np_addr) && + IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) { + nce_process(nce, npp->np_hw_addr, npp->np_flag, npp->np_is_adv); + } +} + +/* + * Wrapper around nce_process() that handles IPMP. In particular, for IPMP, + * NCEs are per-underlying-ill (because of nce_fp_mp) and thus we may have + * more than one NCE for a given IPv6 address to tend to. In that case, we + * need to walk all NCEs and callback nce_process() for each one. Since this + * is expensive, in the non-IPMP case we just directly call nce_process(). + * Ultimately, nce_fp_mp needs to be moved out of the nce_t so that all IP + * interfaces in an IPMP group share the same NCEs -- at which point this + * function can be removed entirely. + */ +void +ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) +{ + ill_t *ill = nce->nce_ill; + struct ndp_g_s *ndp = ill->ill_ipst->ips_ndp6; + ndp_process_data_t np; + + if (ill->ill_grp == NULL) { + nce_process(nce, hw_addr, flag, is_adv); + return; + } + + /* IPMP case: walk all NCEs */ + np.np_ill = ill; + np.np_addr = &nce->nce_addr; + np.np_flag = flag; + np.np_is_adv = is_adv; + np.np_hw_addr = hw_addr; + + ndp_walk_common(ndp, NULL, (pfi_t)ndp_process_entry, &np, ALL_ZONES); +} + +/* * Pass arg1 to the pfi supplied, along with each nce in existence. * ndp_walk() places a REFHOLD on the nce and drops the lock when * walking the hash list. 
@@ -926,7 +969,6 @@ void ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, boolean_t trace) { - nce_t *nce; nce_t *nce1; nce_t **ncep; @@ -1021,27 +1063,58 @@ ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst) int ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid) { - nce_t *nce; - int err = 0; + nce_t *nce, *hw_nce = NULL; + int err; + ill_t *ipmp_ill; + uint16_t nce_flags; uint32_t ms; mblk_t *mp_nce = NULL; ip_stack_t *ipst = ill->ill_ipst; + uchar_t *hwaddr = NULL; ASSERT(ill->ill_isv6); - if (IN6_IS_ADDR_MULTICAST(dst)) { - err = nce_set_multicast(ill, dst); - return (err); + + if (IN6_IS_ADDR_MULTICAST(dst)) + return (nce_set_multicast(ill, dst)); + + nce_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0; + + /* + * If `ill' is under IPMP, then first check to see if there's an NCE + * for `dst' on the IPMP meta-interface (e.g., because an application + * explicitly did an SIOCLIFSETND to tie a hardware address to `dst'). + * If so, we use that hardware address when creating the NCE below. + * Note that we don't yet have a mechanism to remove these NCEs if the + * NCE for `dst' on the IPMP meta-interface is subsequently removed -- + * but rather than build such a beast, we should fix NCEs so that they + * can be properly shared across an IPMP group. + */ + if (IS_UNDER_IPMP(ill)) { + if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { + hw_nce = ndp_lookup_v6(ipmp_ill, B_FALSE, dst, B_FALSE); + if (hw_nce != NULL && hw_nce->nce_res_mp != NULL) { + hwaddr = hw_nce->nce_res_mp->b_rptr + + NCE_LL_ADDR_OFFSET(ipmp_ill); + nce_flags |= hw_nce->nce_flags; + } + ill_refrele(ipmp_ill); + } } + err = ndp_lookup_then_add_v6(ill, - NULL, /* No hardware address */ + B_FALSE, /* NCE fastpath is per ill; don't match across group */ + hwaddr, dst, &ipv6_all_ones, &ipv6_all_zeros, 0, - (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0, - ND_INCOMPLETE, + nce_flags, + hwaddr != NULL ? 
ND_REACHABLE : ND_INCOMPLETE, &nce); + if (hw_nce != NULL) + NCE_REFRELE(hw_nce); + switch (err) { case 0: /* @@ -1057,11 +1130,10 @@ ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid) NCE_REFRELE(nce); return (0); } - rw_enter(&ipst->ips_ill_g_lock, RW_READER); + mutex_enter(&nce->nce_lock); if (nce->nce_state != ND_INCOMPLETE) { mutex_exit(&nce->nce_lock); - rw_exit(&ipst->ips_ill_g_lock); NCE_REFRELE(nce); return (0); } @@ -1069,14 +1141,11 @@ ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid) if (mp_nce == NULL) { /* The caller will free mp */ mutex_exit(&nce->nce_lock); - rw_exit(&ipst->ips_ill_g_lock); ndp_delete(nce); NCE_REFRELE(nce); return (ENOMEM); } - ms = nce_solicit(nce, mp_nce); - rw_exit(&ipst->ips_ill_g_lock); - if (ms == 0) { + if ((ms = nce_solicit(nce, mp_nce)) == 0) { /* The caller will free mp */ if (mp_nce != mp) freeb(mp_nce); @@ -1143,6 +1212,7 @@ ndp_noresolver(ill_t *ill, const in6_addr_t *dst) } err = ndp_lookup_then_add_v6(ill, + B_FALSE, /* NCE fastpath is per ill; don't match across group */ NULL, /* hardware address */ dst, &ipv6_all_ones, @@ -1191,7 +1261,7 @@ nce_set_multicast(ill_t *ill, const in6_addr_t *dst) mutex_enter(&ipst->ips_ndp6->ndp_g_lock); nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst)); - nce = nce_lookup_addr(ill, dst, nce); + nce = nce_lookup_addr(ill, B_FALSE, dst, nce); if (nce != NULL) { mutex_exit(&ipst->ips_ndp6->ndp_g_lock); NCE_REFRELE(nce); @@ -1259,7 +1329,13 @@ ndp_query(ill_t *ill, struct lif_nd_req *lnr) sin6 = (sin6_t *)&lnr->lnr_addr; addr = &sin6->sin6_addr; - nce = ndp_lookup_v6(ill, addr, B_FALSE); + /* + * NOTE: if the ill is an IPMP interface, then match against the whole + * illgrp. This e.g. allows in.ndpd to retrieve the link layer + * addresses for the data addresses on an IPMP interface even though + * ipif_ndp_up() created them with an nce_ill of ipif_bound_ill. 
+ */ + nce = ndp_lookup_v6(ill, IS_IPMP(ill), addr, B_FALSE); if (nce == NULL) return (ESRCH); /* If in INCOMPLETE state, no link layer address is available yet */ @@ -1347,24 +1423,14 @@ ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len, uint32_t nce_solicit(nce_t *nce, mblk_t *mp) { - ill_t *ill; - ill_t *src_ill; ip6_t *ip6h; - in6_addr_t src; - in6_addr_t dst; - ipif_t *ipif; - ip6i_t *ip6i; - boolean_t dropped = B_FALSE; - ip_stack_t *ipst = nce->nce_ill->ill_ipst; + in6_addr_t sender; + boolean_t dropped; - ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); ASSERT(MUTEX_HELD(&nce->nce_lock)); - ill = nce->nce_ill; - ASSERT(ill != NULL); - if (nce->nce_rcnt == 0) { + if (nce->nce_rcnt == 0) return (0); - } if (mp == NULL) { ASSERT(nce->nce_qd_mp != NULL); @@ -1385,60 +1451,22 @@ nce_solicit(nce_t *nce, mblk_t *mp) * could be from the nce_qd_mp which could have b_next/b_prev * non-NULL. */ - ip6i = (ip6i_t *)ip6h; - ASSERT((mp->b_wptr - (uchar_t *)ip6i) >= - sizeof (ip6i_t) + IPV6_HDR_LEN); + ASSERT(MBLKL(mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN); ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); } - src = ip6h->ip6_src; - /* - * If the src of outgoing packet is one of the assigned interface - * addresses use it, otherwise we will pick the source address below. - */ - src_ill = ill; - if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { - if (ill->ill_group != NULL) - src_ill = ill->ill_group->illgrp_ill; - for (; src_ill != NULL; src_ill = src_ill->ill_group_next) { - for (ipif = src_ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (IN6_ARE_ADDR_EQUAL(&src, - &ipif->ipif_v6lcl_addr)) { - break; - } - } - if (ipif != NULL) - break; - } - /* - * If no relevant ipif can be found, then it's not one of our - * addresses. Reset to :: and let nce_xmit. If an ipif can be - * found, but it's not yet done with DAD verification, then - * just postpone this transmission until later. 
- */ - if (src_ill == NULL) - src = ipv6_all_zeros; - else if (!ipif->ipif_addr_ready) - return (ill->ill_reachable_retrans_time); - } - dst = nce->nce_addr; + /* - * If source address is unspecified, nce_xmit will choose - * one for us and initialize the hardware address also - * appropriately. + * Need to copy the sender address into a local since `mp' can + * go away once we drop nce_lock. */ - if (IN6_IS_ADDR_UNSPECIFIED(&src)) - src_ill = NULL; + sender = ip6h->ip6_src; nce->nce_rcnt--; mutex_exit(&nce->nce_lock); - rw_exit(&ipst->ips_ill_g_lock); - dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src, - &dst, 0); - rw_enter(&ipst->ips_ill_g_lock, RW_READER); + dropped = nce_xmit_solicit(nce, B_TRUE, &sender, 0); mutex_enter(&nce->nce_lock); if (dropped) nce->nce_rcnt++; - return (ill->ill_reachable_retrans_time); + return (nce->nce_ill->ill_reachable_retrans_time); } /* @@ -1475,7 +1503,7 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) */ mutex_enter(&ill->ill_lock); if (!(ipif->ipif_flags & IPIF_DUPLICATE) || - (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) { + (ipif->ipif_state_flags & IPIF_CONDEMNED)) { mutex_exit(&ill->ill_lock); continue; } @@ -1485,8 +1513,8 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) mutex_exit(&ill->ill_lock); ipif->ipif_was_dup = B_TRUE; - if (ipif_ndp_up(ipif) != EINPROGRESS) - (void) ipif_up_done_v6(ipif); + VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS); + (void) ipif_up_done_v6(ipif); } freeb(mp); } @@ -1515,7 +1543,7 @@ ipif6_dup_recovery(void *arg) /* * No lock, because this is just an optimization. 
*/ - if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED)) + if (ipif->ipif_state_flags & IPIF_CONDEMNED) return; /* If the link is down, we'll retry this later */ @@ -1542,13 +1570,20 @@ ndp_do_recovery(ipif_t *ipif) if (mp == NULL) { mutex_enter(&ill->ill_lock); if (ipif->ipif_recovery_id == 0 && - !(ipif->ipif_state_flags & (IPIF_MOVING | - IPIF_CONDEMNED))) { + !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); } mutex_exit(&ill->ill_lock); } else { + /* + * A recovery timer may still be running if we got here from + * ill_restart_dad(); cancel that timer. + */ + if (ipif->ipif_recovery_id != 0) + (void) untimeout(ipif->ipif_recovery_id); + ipif->ipif_recovery_id = 0; + bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, sizeof (ipif->ipif_v6lcl_addr)); ill_refhold(ill); @@ -1558,41 +1593,51 @@ ndp_do_recovery(ipif_t *ipif) } /* - * Find the solicitation in the given message, and extract printable details - * (MAC and IP addresses) from it. + * Find the MAC and IP addresses in an NA/NS message. */ -static nd_neighbor_solicit_t * -ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf, - size_t hlen, char *sbuf, size_t slen, uchar_t **haddr) +static void +ip_ndp_find_addresses(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, in6_addr_t *targp, + uchar_t **haddr, uint_t *haddrlenp) { - nd_neighbor_solicit_t *ns; - ip6_t *ip6h; + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); + nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; + nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; uchar_t *addr; - int alen; + int alen = 0; - alen = 0; - ip6h = (ip6_t *)mp->b_rptr; if (dl_mp == NULL) { nd_opt_hdr_t *opt; - int nslen; + int len; /* * If it's from the fast-path, then it can't be a probe - * message, and thus must include the source linkaddr option. + * message, and thus must include a linkaddr option. 
* Extract that here. */ - ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN); - nslen = mp->b_wptr - (uchar_t *)ns; - if ((nslen -= sizeof (*ns)) > 0) { - opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen, - ND_OPT_SOURCE_LINKADDR); - if (opt != NULL && - opt->nd_opt_len * 8 - sizeof (*opt) >= - ill->ill_nd_lla_len) { - addr = (uchar_t *)(opt + 1); - alen = ill->ill_nd_lla_len; + switch (icmp6->icmp6_type) { + case ND_NEIGHBOR_SOLICIT: + len = mp->b_wptr - (uchar_t *)ns; + if ((len -= sizeof (*ns)) > 0) { + opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), + len, ND_OPT_SOURCE_LINKADDR); } + break; + case ND_NEIGHBOR_ADVERT: + len = mp->b_wptr - (uchar_t *)na; + if ((len -= sizeof (*na)) > 0) { + opt = ndp_get_option((nd_opt_hdr_t *)(na + 1), + len, ND_OPT_TARGET_LINKADDR); + } + break; + } + + if (opt != NULL && opt->nd_opt_len * 8 - sizeof (*opt) >= + ill->ill_nd_lla_len) { + addr = (uchar_t *)(opt + 1); + alen = ill->ill_nd_lla_len; } + /* * We cheat a bit here for the sake of printing usable log * messages in the rare case where the reply we got was unicast @@ -1624,16 +1669,17 @@ ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf, } } } + if (alen > 0) { *haddr = addr; - (void) mac_colon_addr(addr, alen, hbuf, hlen); + *haddrlenp = alen; } else { *haddr = NULL; - (void) strcpy(hbuf, "?"); + *haddrlenp = 0; } - ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN); - (void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen); - return (ns); + + /* nd_ns_target and nd_na_target are at the same offset, so we cheat */ + *targp = ns->nd_ns_target; } /* @@ -1646,68 +1692,80 @@ ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) { ill_t *ill = rq->q_ptr; ipif_t *ipif; - char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */ - char hbuf[MAC_STR_LEN]; - char sbuf[INET6_ADDRSTRLEN]; - nd_neighbor_solicit_t *ns; - mblk_t *dl_mp = NULL; - uchar_t *haddr; + mblk_t *dl_mp = NULL; + uchar_t *haddr; + uint_t 
haddrlen; ip_stack_t *ipst = ill->ill_ipst; + in6_addr_t targ; if (DB_TYPE(mp) != M_DATA) { dl_mp = mp; mp = mp->b_cont; } - ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf, - sizeof (sbuf), &haddr); - if (haddr != NULL && - bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { + + ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen); + if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) { /* - * Ignore conflicts generated by misbehaving switches that just - * reflect our own messages back to us. + * Ignore conflicts generated by misbehaving switches that + * just reflect our own messages back to us. For IPMP, we may + * see reflections across any ill in the illgrp. */ - goto ignore_conflict; + if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || + IS_UNDER_IPMP(ill) && + ipmp_illgrp_find_ill(ill->ill_grp, haddr, haddrlen) != NULL) + goto ignore_conflict; } - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + /* + * Look up the appropriate ipif. + */ + ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, NULL, NULL, NULL, + NULL, ipst); + if (ipif == NULL) + goto ignore_conflict; - if ((ipif->ipif_flags & IPIF_POINTOPOINT) || - !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, - &ns->nd_ns_target)) { - continue; - } + /* Reload the ill to match the ipif */ + ill = ipif->ipif_ill; - /* If it's already marked, then don't do anything. */ - if (ipif->ipif_flags & IPIF_DUPLICATE) - continue; + /* If it's already duplicate or ineligible, then don't do anything. */ + if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) { + ipif_refrele(ipif); + goto ignore_conflict; + } - /* - * If this is a failure during duplicate recovery, then don't - * complain. It may take a long time to recover. 
- */ - if (!ipif->ipif_was_dup) { - ipif_get_name(ipif, ibuf, sizeof (ibuf)); - cmn_err(CE_WARN, "%s has duplicate address %s (in " - "use by %s); disabled", ibuf, sbuf, hbuf); - } - mutex_enter(&ill->ill_lock); - ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); - ipif->ipif_flags |= IPIF_DUPLICATE; - ill->ill_ipif_dup_count++; - mutex_exit(&ill->ill_lock); - (void) ipif_down(ipif, NULL, NULL); - ipif_down_tail(ipif); - mutex_enter(&ill->ill_lock); - if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && - ill->ill_net_type == IRE_IF_RESOLVER && - !(ipif->ipif_state_flags & (IPIF_MOVING | - IPIF_CONDEMNED)) && - ipst->ips_ip_dup_recovery > 0) { - ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, - ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); - } - mutex_exit(&ill->ill_lock); + /* + * If this is a failure during duplicate recovery, then don't + * complain. It may take a long time to recover. + */ + if (!ipif->ipif_was_dup) { + char ibuf[LIFNAMSIZ]; + char hbuf[MAC_STR_LEN]; + char sbuf[INET6_ADDRSTRLEN]; + + ipif_get_name(ipif, ibuf, sizeof (ibuf)); + cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);" + " disabled", ibuf, + inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), + mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf))); } + mutex_enter(&ill->ill_lock); + ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); + ipif->ipif_flags |= IPIF_DUPLICATE; + ill->ill_ipif_dup_count++; + mutex_exit(&ill->ill_lock); + (void) ipif_down(ipif, NULL, NULL); + ipif_down_tail(ipif); + mutex_enter(&ill->ill_lock); + if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && + ill->ill_net_type == IRE_IF_RESOLVER && + !(ipif->ipif_state_flags & IPIF_CONDEMNED) && + ipst->ips_ip_dup_recovery > 0) { + ASSERT(ipif->ipif_recovery_id == 0); + ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, + ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); + } + mutex_exit(&ill->ill_lock); + ipif_refrele(ipif); ignore_conflict: if (dl_mp != NULL) freeb(dl_mp); @@ -1721,7 +1779,7 
@@ ignore_conflict: * we start a timer on the ipif. */ static void -ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) +ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) { if ((mp = copymsg(mp)) != NULL) { if (dl_mp == NULL) @@ -1736,7 +1794,6 @@ ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) B_FALSE); } } - ndp_delete(nce); } /* @@ -1757,6 +1814,7 @@ ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) NULL, NULL, ipst); if (ipif == NULL) return; + /* * First, figure out if this address is disposable. */ @@ -1786,19 +1844,21 @@ ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) * sending out an unsolicited Neighbor Advertisement. */ if (defs >= maxdefense) { - ip_ndp_failure(ill, mp, dl_mp, nce); + ip_ndp_failure(ill, mp, dl_mp); } else { char hbuf[MAC_STR_LEN]; char sbuf[INET6_ADDRSTRLEN]; uchar_t *haddr; + uint_t haddrlen; + in6_addr_t targ; - (void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, - sizeof (hbuf), sbuf, sizeof (sbuf), &haddr); + ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen); cmn_err(CE_WARN, "node %s is using our IP address %s on %s", - hbuf, sbuf, ill->ill_name); - (void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE, - &nce->nce_addr, &ipv6_all_hosts_mcast, - nce_advert_flags(nce)); + mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)), + inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), + ill->ill_name); + + (void) nce_xmit_advert(nce, B_FALSE, &ipv6_all_hosts_mcast, 0); } } @@ -1843,6 +1903,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) bad_solicit = B_TRUE; goto done; } + } if (IN6_IS_ADDR_UNSPECIFIED(&src)) { /* Check to see if this is a valid DAD solicitation */ @@ -1859,7 +1920,13 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) } } - our_nce = ndp_lookup_v6(ill, &target, B_FALSE); + /* + * NOTE: with IPMP, it's possible the nominated multicast ill (which + * received this packet if it's multicast) is not the ill tied to + * 
e.g. the IPMP ill's data link-local. So we match across the illgrp + * to ensure we find the associated NCE. + */ + our_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE); /* * If this is a valid Solicitation, a permanent * entry should exist in the cache @@ -1883,7 +1950,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) haddr = (uchar_t *)&opt[1]; if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || hlen == 0) { - ip1dbg(("ndp_input_advert: bad SLLA\n")); + ip1dbg(("ndp_input_solicit: bad SLLA\n")); bad_solicit = B_TRUE; goto done; } @@ -1934,6 +2001,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) goto no_source; err = ndp_lookup_then_add_v6(ill, + B_FALSE, haddr, &src, /* Soliciting nodes address */ &ipv6_all_ones, @@ -1949,8 +2017,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) break; case EEXIST: /* - * B_FALSE indicates this is not an - * an advertisement. + * B_FALSE indicates this is not an an advertisement. */ ndp_process(nnce, haddr, 0, B_FALSE); NCE_REFRELE(nnce); @@ -1985,7 +2052,7 @@ no_source: * If someone else is probing our address, then * we've crossed wires. Declare failure. 
*/ - ip_ndp_failure(ill, mp, dl_mp, our_nce); + ip_ndp_failure(ill, mp, dl_mp); } goto done; } @@ -1995,15 +2062,8 @@ no_source: */ src = ipv6_all_hosts_mcast; } - flag |= nce_advert_flags(our_nce); /* Response to a solicitation */ - (void) nce_xmit(ill, - ND_NEIGHBOR_ADVERT, - ill, /* ill to be used for extracting ill_nd_lla */ - B_TRUE, /* use ill_nd_lla */ - &target, /* Source and target of the advertisement pkt */ - &src, /* IP Destination (source of original pkt) */ - flag); + (void) nce_xmit_advert(our_nce, B_TRUE, &src, flag); done: if (bad_solicit) BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations); @@ -2023,8 +2083,8 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) in6_addr_t target; nd_opt_hdr_t *opt = NULL; int len; - mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; ip_stack_t *ipst = ill->ill_ipst; + mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; ip6h = (ip6_t *)mp->b_rptr; icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); @@ -2067,66 +2127,62 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) } /* - * If this interface is part of the group look at all the + * NOTE: we match across the illgrp since we need to do DAD for all of + * our local addresses, and those are spread across all the active * ills in the group. 
*/ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if (ill->ill_group != NULL) - ill = ill->ill_group->illgrp_ill; + if ((dst_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE)) == NULL) + return; - for (; ill != NULL; ill = ill->ill_group_next) { - mutex_enter(&ill->ill_lock); - if (!ILL_CAN_LOOKUP(ill)) { - mutex_exit(&ill->ill_lock); - continue; - } - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - dst_nce = ndp_lookup_v6(ill, &target, B_FALSE); - /* We have to drop the lock since ndp_process calls put* */ - rw_exit(&ipst->ips_ill_g_lock); - if (dst_nce != NULL) { - if ((dst_nce->nce_flags & NCE_F_PERMANENT) && - dst_nce->nce_state == ND_PROBE) { - /* - * Someone else sent an advertisement for an - * address that we're trying to configure. - * Tear it down. Note that dl_mp might be NULL - * if we're getting a unicast reply. This - * isn't typically done (multicast is the norm - * in response to a probe), but ip_ndp_failure - * will handle the dl_mp == NULL case as well. - */ - ip_ndp_failure(ill, mp, dl_mp, dst_nce); - } else if (dst_nce->nce_flags & NCE_F_PERMANENT) { - /* - * Someone just announced one of our local - * addresses. If it wasn't us, then this is a - * conflict. Defend the address or shut it - * down. - */ - if (dl_mp != NULL && - (haddr == NULL || - nce_cmp_ll_addr(dst_nce, haddr, - ill->ill_nd_lla_len))) { - ip_ndp_conflict(ill, mp, dl_mp, - dst_nce); - } - } else { - if (na->nd_na_flags_reserved & - ND_NA_FLAG_ROUTER) { - dst_nce->nce_flags |= NCE_F_ISROUTER; + if (dst_nce->nce_flags & NCE_F_PERMANENT) { + /* + * Someone just advertised one of our local addresses. First, + * check it it was us -- if so, we can safely ignore it. + */ + if (haddr != NULL) { + if (!nce_cmp_ll_addr(dst_nce, haddr, hlen)) + goto out; /* from us -- no conflict */ + + /* + * If we're in an IPMP group, check if this is an echo + * from another ill in the group. Use the double- + * checked locking pattern to avoid grabbing + * ill_g_lock in the non-IPMP case. 
+ */ + if (IS_UNDER_IPMP(ill)) { + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill( + ill->ill_grp, haddr, hlen) != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + goto out; } - /* B_TRUE indicates this an advertisement */ - ndp_process(dst_nce, haddr, - na->nd_na_flags_reserved, B_TRUE); + rw_exit(&ipst->ips_ill_g_lock); } - NCE_REFRELE(dst_nce); } - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill_refrele(ill); + + /* + * This appears to be a real conflict. If we're trying to + * configure this NCE (ND_PROBE), then shut it down. + * Otherwise, handle the discovered conflict. + * + * Note that dl_mp might be NULL if we're getting a unicast + * reply. This isn't typically done (multicast is the norm in + * response to a probe), but we can handle the dl_mp == NULL + * case as well. + */ + if (dst_nce->nce_state == ND_PROBE) + ip_ndp_failure(ill, mp, dl_mp); + else + ip_ndp_conflict(ill, mp, dl_mp, dst_nce); + } else { + if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER) + dst_nce->nce_flags |= NCE_F_ISROUTER; + + /* B_TRUE indicates this an advertisement */ + ndp_process(dst_nce, haddr, na->nd_na_flags_reserved, B_TRUE); } - rw_exit(&ipst->ips_ill_g_lock); +out: + NCE_REFRELE(dst_nce); } /* @@ -2194,6 +2250,40 @@ done: } /* + * Utility routine to send an advertisement. Assumes that the NCE cannot + * go away (e.g., because it's refheld). + */ +static boolean_t +nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *target, + uint_t flags) +{ + ASSERT((flags & NDP_PROBE) == 0); + + if (nce->nce_flags & NCE_F_ISROUTER) + flags |= NDP_ISROUTER; + if (!(nce->nce_flags & NCE_F_ANYCAST)) + flags |= NDP_ORIDE; + + return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_ADVERT, use_nd_lla, + &nce->nce_addr, target, flags)); +} + +/* + * Utility routine to send a solicitation. Assumes that the NCE cannot + * go away (e.g., because it's refheld). 
+ */ +static boolean_t +nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender, + uint_t flags) +{ + if (flags & NDP_PROBE) + sender = &ipv6_all_zeros; + + return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, use_nd_lla, + sender, &nce->nce_addr, flags)); +} + +/* * nce_xmit is called to form and transmit a ND solicitation or * advertisement ICMP packet. * @@ -2207,88 +2297,79 @@ done: * corresponding ill's ill_wq otherwise returns B_TRUE. */ static boolean_t -nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, - boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target, - int flag) +nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla, + const in6_addr_t *sender, const in6_addr_t *target, int flag) { + ill_t *hwaddr_ill; uint32_t len; icmp6_t *icmp6; mblk_t *mp; ip6_t *ip6h; nd_opt_hdr_t *opt; - uint_t plen; + uint_t plen, maxplen; ip6i_t *ip6i; ipif_t *src_ipif = NULL; uint8_t *hw_addr; zoneid_t zoneid = GLOBAL_ZONEID; + char buf[INET6_ADDRSTRLEN]; + + ASSERT(!IS_IPMP(ill)); /* - * If we have a unspecified source(sender) address, select a - * proper source address for the solicitation here itself so - * that we can initialize the h/w address correctly. This is - * needed for interface groups as source address can come from - * the whole group and the h/w address initialized from ill will - * be wrong if the source address comes from a different ill. - * - * If the sender is specified then we use this address in order - * to lookup the zoneid before calling ip_output_v6(). This is to - * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly - * by IP (we cannot guarantee that the global zone has an interface - * route to the destination). - * - * Note that the NA never comes here with the unspecified source - * address. The following asserts that whenever the source - * address is specified, the haddr also should be specified. 
+ * Check that the sender is actually a usable address on `ill', and if + * so, track that as the src_ipif. If not, for solicitations, set the + * sender to :: so that a new one will be picked below; for adverts, + * drop the packet since we expect nce_xmit_advert() to always provide + * a valid sender. */ - ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL)); + if (!IN6_IS_ADDR_UNSPECIFIED(sender)) { + if ((src_ipif = ip_ndp_lookup_addr_v6(sender, ill)) == NULL || + !src_ipif->ipif_addr_ready) { + if (src_ipif != NULL) { + ipif_refrele(src_ipif); + src_ipif = NULL; + } + if (type == ND_NEIGHBOR_ADVERT) { + ip1dbg(("nce_xmit: No source ipif for src %s\n", + inet_ntop(AF_INET6, sender, buf, + sizeof (buf)))); + return (B_TRUE); + } + sender = &ipv6_all_zeros; + } + } + /* + * If we still have an unspecified source (sender) address and this + * isn't a probe, select a source address from `ill'. + */ if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) { - ASSERT(operation != ND_NEIGHBOR_ADVERT); + ASSERT(type != ND_NEIGHBOR_ADVERT); /* - * Pick a source address for this solicitation, but - * restrict the selection to addresses assigned to the - * output interface (or interface group). We do this - * because the destination will create a neighbor cache - * entry for the source address of this packet, so the - * source address had better be a valid neighbor. + * Pick a source address for this solicitation, but restrict + * the selection to addresses assigned to the output + * interface. We do this because the destination will create + * a neighbor cache entry for the source address of this + * packet, so the source address needs to be a valid neighbor. 
*/ - src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL, + src_ipif = ipif_select_source_v6(ill, target, B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES); if (src_ipif == NULL) { - char buf[INET6_ADDRSTRLEN]; - ip1dbg(("nce_xmit: No source ipif for dst %s\n", - inet_ntop(AF_INET6, (char *)target, buf, - sizeof (buf)))); + inet_ntop(AF_INET6, target, buf, sizeof (buf)))); return (B_TRUE); } sender = &src_ipif->ipif_v6src_addr; - hwaddr_ill = src_ipif->ipif_ill; - } else if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) { - zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ill->ill_ipst); - /* - * It's possible for ipif_lookup_addr_zoneid_v6() to return - * ALL_ZONES if it cannot find a matching ipif for the address - * we are trying to use. In this case we err on the side of - * trying to send the packet by defaulting to the GLOBAL_ZONEID. - */ - if (zoneid == ALL_ZONES) - zoneid = GLOBAL_ZONEID; } /* - * Always make sure that the NS/NA packets don't get load - * spread. This is needed so that the probe packets sent - * by the in.mpathd daemon can really go out on the desired - * interface. Probe packets are made to go out on a desired - * interface by including a ip6i with ATTACH_IF flag. As these - * packets indirectly end up sending/receiving NS/NA packets - * (neighbor doing NUD), we have to make sure that NA - * also go out on the same interface. + * We're either sending a probe or we have a source address. 
*/ - plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8; + ASSERT((flag & NDP_PROBE) || src_ipif != NULL); + + maxplen = roundup(sizeof (nd_opt_hdr_t) + ND_MAX_HDW_LEN, 8); len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) + - plen * 8; + maxplen; mp = allocb(len, BPRI_LO); if (mp == NULL) { if (src_ipif != NULL) @@ -2301,28 +2382,27 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, ip6i = (ip6i_t *)mp->b_rptr; ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; ip6i->ip6i_nxt = IPPROTO_RAW; - ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT; + ip6i->ip6i_flags = IP6I_HOPLIMIT; if (flag & NDP_PROBE) ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); ip6h->ip6_nxt = IPPROTO_ICMPV6; ip6h->ip6_hops = IPV6_MAX_HOPS; + ip6h->ip6_src = *sender; ip6h->ip6_dst = *target; icmp6 = (icmp6_t *)&ip6h[1]; opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); - if (operation == ND_NEIGHBOR_SOLICIT) { + if (type == ND_NEIGHBOR_SOLICIT) { nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; if (!(flag & NDP_PROBE)) opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; - ip6h->ip6_src = *sender; ns->nd_ns_target = *target; if (!(flag & NDP_UNICAST)) { /* Form multicast address of the target */ @@ -2335,7 +2415,6 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, ASSERT(!(flag & NDP_PROBE)); opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; - ip6h->ip6_src = *sender; na->nd_na_target = *sender; if (flag & NDP_ISROUTER) na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER; @@ -2347,22 +2426,48 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, hw_addr = NULL; if (!(flag & NDP_PROBE)) { + /* + * Use our source address to find the hardware address to put + * in the packet, so that the hardware address and IP address + * 
will match up -- even if that hardware address doesn't + * match the ill we actually transmit the packet through. + */ + if (IS_IPMP(src_ipif->ipif_ill)) { + hwaddr_ill = ipmp_ipif_hold_bound_ill(src_ipif); + if (hwaddr_ill == NULL) { + ip1dbg(("nce_xmit: no bound ill!\n")); + ipif_refrele(src_ipif); + freemsg(mp); + return (B_TRUE); + } + } else { + hwaddr_ill = src_ipif->ipif_ill; + ill_refhold(hwaddr_ill); /* for symmetry */ + } + + plen = roundup(sizeof (nd_opt_hdr_t) + + hwaddr_ill->ill_nd_lla_len, 8); + hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla : hwaddr_ill->ill_phys_addr; if (hw_addr != NULL) { /* Fill in link layer address and option len */ - opt->nd_opt_len = (uint8_t)plen; + opt->nd_opt_len = (uint8_t)(plen / 8); bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len); } + + ill_refrele(hwaddr_ill); } - if (hw_addr == NULL) { - /* If there's no link layer address option, then strip it. */ - len -= plen * 8; - mp->b_wptr = mp->b_rptr + len; - ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); - } - icmp6->icmp6_type = (uint8_t)operation; + if (hw_addr == NULL) + plen = 0; + + /* Fix up the length of the packet now that plen is known */ + len -= (maxplen - plen); + mp->b_wptr = mp->b_rptr + len; + ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); + + icmp6->icmp6_type = type; icmp6->icmp6_code = 0; /* * Prepare for checksum by putting icmp length in the icmp @@ -2370,8 +2475,17 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, */ icmp6->icmp6_cksum = ip6h->ip6_plen; - if (src_ipif != NULL) + /* + * Before we toss the src_ipif, look up the zoneid to pass to + * ip_output_v6(). This is to ensure unicast ND_NEIGHBOR_ADVERT + * packets to be routed correctly by IP (we cannot guarantee that the + * global zone has an interface route to the destination). 
+ */ + if (src_ipif != NULL) { + if ((zoneid = src_ipif->ipif_zoneid) == ALL_ZONES) + zoneid = GLOBAL_ZONEID; ipif_refrele(src_ipif); + } ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT); return (B_FALSE); @@ -2448,7 +2562,6 @@ ndp_timer(void *arg) ill_t *ill = nce->nce_ill; uint32_t ms; char addrbuf[INET6_ADDRSTRLEN]; - mblk_t *mp; boolean_t dropped = B_FALSE; ip_stack_t *ipst = ill->ill_ipst; @@ -2460,11 +2573,6 @@ ndp_timer(void *arg) */ ASSERT(nce != NULL); - /* - * Grab the ill_g_lock now itself to avoid lock order problems. - * nce_solicit needs ill_g_lock to be able to traverse ills - */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); mutex_enter(&nce->nce_lock); NCE_REFHOLD_LOCKED(nce); nce->nce_timeout_id = 0; @@ -2474,11 +2582,10 @@ ndp_timer(void *arg) */ switch (nce->nce_state) { case ND_DELAY: - rw_exit(&ipst->ips_ill_g_lock); nce->nce_state = ND_PROBE; mutex_exit(&nce->nce_lock); - (void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE, - &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST); + (void) nce_xmit_solicit(nce, B_FALSE, &ipv6_all_zeros, + NDP_UNICAST); if (ip_debug > 3) { /* ip2dbg */ pr_addr_dbg("ndp_timer: state for %s changed " @@ -2489,7 +2596,6 @@ ndp_timer(void *arg) return; case ND_PROBE: /* must be retransmit timer */ - rw_exit(&ipst->ips_ill_g_lock); nce->nce_pcnt--; ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT && nce->nce_pcnt >= -1); @@ -2504,8 +2610,8 @@ ndp_timer(void *arg) nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr, addrbuf, sizeof (addrbuf)))); mutex_exit(&nce->nce_lock); - dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, - B_FALSE, &ipv6_all_zeros, &nce->nce_addr, + dropped = nce_xmit_solicit(nce, B_FALSE, + &ipv6_all_zeros, (nce->nce_flags & NCE_F_PERMANENT) ? 
NDP_PROBE : NDP_UNICAST); if (dropped) { @@ -2542,8 +2648,8 @@ ndp_timer(void *arg) */ nce->nce_state = ND_REACHABLE; mutex_exit(&nce->nce_lock); - ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, - ALL_ZONES, NULL, NULL, NULL, NULL, ipst); + ipif = ip_ndp_lookup_addr_v6(&nce->nce_addr, + nce->nce_ill); if (ipif != NULL) { if (ipif->ipif_was_dup) { char ibuf[LIFNAMSIZ + 10]; @@ -2566,9 +2672,8 @@ ndp_timer(void *arg) } /* Begin defending our new address */ nce->nce_unsolicit_count = 0; - dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, - B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast, - nce_advert_flags(nce)); + dropped = nce_xmit_advert(nce, B_FALSE, + &ipv6_all_hosts_mcast, 0); if (dropped) { nce->nce_unsolicit_count = 1; NDP_RESTART_TIMER(nce, @@ -2589,51 +2694,40 @@ ndp_timer(void *arg) } NCE_REFRELE(nce); return; - case ND_INCOMPLETE: + case ND_INCOMPLETE: { + ip6_t *ip6h; + ip6i_t *ip6i; + mblk_t *mp, *datamp, *nextmp, **prevmpp; + /* - * Must be resolvers retransmit timer. + * Per case (2) in the nce_queue_mp() comments, scan nce_qd_mp + * for any IPMP probe packets, and toss 'em. IPMP probe + * packets will always be at the head of nce_qd_mp and always + * have an ip6i_t header, so we can stop at the first queued + * ND packet without an ip6i_t. */ - for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) { - ip6i_t *ip6i; - ip6_t *ip6h; - mblk_t *data_mp; - - /* - * Walk the list of packets queued, and see if there - * are any multipathing probe packets. Such packets - * are always queued at the head. Since this is a - * retransmit timer firing, mark such packets as - * delayed in ND resolution. This info will be used - * in ip_wput_v6(). Multipathing probe packets will - * always have an ip6i_t. Once we hit a packet without - * it, we can break out of this loop. 
- */ - if (mp->b_datap->db_type == M_CTL) - data_mp = mp->b_cont; - else - data_mp = mp; - - ip6h = (ip6_t *)data_mp->b_rptr; + prevmpp = &nce->nce_qd_mp; + for (mp = nce->nce_qd_mp; mp != NULL; mp = nextmp) { + nextmp = mp->b_next; + datamp = (DB_TYPE(mp) == M_CTL) ? mp->b_cont : mp; + ip6h = (ip6_t *)datamp->b_rptr; if (ip6h->ip6_nxt != IPPROTO_RAW) break; - /* - * This message should have been pulled up already in - * ip_wput_v6. We can't do pullups here because the - * b_next/b_prev is non-NULL. - */ ip6i = (ip6i_t *)ip6h; - ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >= - sizeof (ip6i_t) + IPV6_HDR_LEN); - - /* Mark this packet as delayed due to ND resolution */ - if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) - ip6i->ip6i_flags |= IP6I_ND_DELAYED; + if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) { + inet_freemsg(mp); + *prevmpp = nextmp; + } else { + prevmpp = &mp->b_next; + } } + + /* + * Must be resolver's retransmit timer. + */ if (nce->nce_qd_mp != NULL) { - ms = nce_solicit(nce, NULL); - rw_exit(&ipst->ips_ill_g_lock); - if (ms == 0) { + if ((ms = nce_solicit(nce, NULL)) == 0) { if (nce->nce_state != ND_REACHABLE) { mutex_exit(&nce->nce_lock); nce_resolv_failed(nce); @@ -2649,11 +2743,10 @@ ndp_timer(void *arg) return; } mutex_exit(&nce->nce_lock); - rw_exit(&ipst->ips_ill_g_lock); NCE_REFRELE(nce); break; - case ND_REACHABLE : - rw_exit(&ipst->ips_ill_g_lock); + } + case ND_REACHABLE: if (((nce->nce_flags & NCE_F_UNSOL_ADV) && nce->nce_unsolicit_count != 0) || ((nce->nce_flags & NCE_F_PERMANENT) && @@ -2661,13 +2754,8 @@ ndp_timer(void *arg) if (nce->nce_unsolicit_count > 0) nce->nce_unsolicit_count--; mutex_exit(&nce->nce_lock); - dropped = nce_xmit(ill, - ND_NEIGHBOR_ADVERT, - ill, /* ill to be used for hw addr */ - B_FALSE, /* use ill_phys_addr */ - &nce->nce_addr, - &ipv6_all_hosts_mcast, - nce_advert_flags(nce)); + dropped = nce_xmit_advert(nce, B_FALSE, + &ipv6_all_hosts_mcast, 0); if (dropped) { mutex_enter(&nce->nce_lock); nce->nce_unsolicit_count++; @@ 
-2686,7 +2774,6 @@ ndp_timer(void *arg) NCE_REFRELE(nce); break; default: - rw_exit(&ipst->ips_ill_g_lock); mutex_exit(&nce->nce_lock); NCE_REFRELE(nce); break; @@ -2819,23 +2906,20 @@ void nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert) { uint_t count = 0; - mblk_t **mpp; + mblk_t **mpp, *tmp; ASSERT(MUTEX_HELD(&nce->nce_lock)); - for (mpp = &nce->nce_qd_mp; *mpp != NULL; - mpp = &(*mpp)->b_next) { - if (++count > - nce->nce_ill->ill_max_buf) { - mblk_t *tmp = nce->nce_qd_mp->b_next; - + for (mpp = &nce->nce_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { + if (++count > nce->nce_ill->ill_max_buf) { + tmp = nce->nce_qd_mp->b_next; nce->nce_qd_mp->b_next = NULL; nce->nce_qd_mp->b_prev = NULL; freemsg(nce->nce_qd_mp); nce->nce_qd_mp = tmp; } } - /* put this on the list */ + if (head_insert) { mp->b_next = nce->nce_qd_mp; nce->nce_qd_mp = mp; @@ -2849,8 +2933,8 @@ nce_queue_mp(nce_t *nce, mblk_t *mp) { boolean_t head_insert = B_FALSE; ip6_t *ip6h; - ip6i_t *ip6i; - mblk_t *data_mp; + ip6i_t *ip6i; + mblk_t *data_mp; ASSERT(MUTEX_HELD(&nce->nce_lock)); @@ -2867,43 +2951,28 @@ nce_queue_mp(nce_t *nce, mblk_t *mp) * non-NULL. */ ip6i = (ip6i_t *)ip6h; - ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >= - sizeof (ip6i_t) + IPV6_HDR_LEN); + ASSERT(MBLKL(data_mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN); + /* - * Multipathing probe packets have IP6I_DROP_IFDELAYED set. - * This has 2 aspects mentioned below. - * 1. Perform head insertion in the nce_qd_mp for these packets. - * This ensures that next retransmit of ND solicitation - * will use the interface specified by the probe packet, - * for both NS and NA. This corresponds to the src address - * in the IPv6 packet. If we insert at tail, we will be - * depending on the packet at the head for successful - * ND resolution. 
This is not reliable, because the interface - * on which the NA arrives could be different from the interface - * on which the NS was sent, and if the receiving interface is - * failed, it will appear that the sending interface is also - * failed, causing in.mpathd to misdiagnose this as link - * failure. - * 2. Drop the original packet, if the ND resolution did not - * succeed in the first attempt. However we will create the - * nce and the ire, as soon as the ND resolution succeeds. - * We don't gain anything by queueing multiple probe packets - * and sending them back-to-back once resolution succeeds. - * It is sufficient to send just 1 packet after ND resolution - * succeeds. Since mpathd is sending down probe packets at a - * constant rate, we don't need to send the queued packet. We - * need to queue it only for NDP resolution. The benefit of - * dropping the probe packets that were delayed in ND - * resolution, is that in.mpathd will not see inflated - * RTT. If the ND resolution does not succeed within - * in.mpathd's failure detection time, mpathd may detect - * a failure, and it does not matter whether the packet - * was queued or dropped. + * If this packet is marked IP6I_IPMP_PROBE, then we need to: + * + * 1. Insert it at the head of the nce_qd_mp list. Consider + * the normal (non-probe) load-speading case where the + * source address of the ND packet is not tied to nce_ill. + * If the ill bound to the source address cannot receive, + * the response to the ND packet will not be received. + * However, if ND packets for nce_ill's probes are queued + * behind that ND packet, those probes will also fail to + * be sent, and thus in.mpathd will erroneously conclude + * that nce_ill has also failed. + * + * 2. Drop the probe packet in ndp_timer() if the ND did + * not succeed on the first attempt. This ensures that + * ND problems do not manifest as probe RTT spikes. 
*/ - if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) + if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) head_insert = B_TRUE; } - nce_queue_mp_common(nce, mp, head_insert); } @@ -2988,13 +3057,17 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) (lnr->lnr_state_create != ND_STALE)) return (EINVAL); + if (lnr->lnr_hdw_len > ND_MAX_HDW_LEN) + return (EINVAL); + sin6 = (sin6_t *)&lnr->lnr_addr; addr = &sin6->sin6_addr; mutex_enter(&ipst->ips_ndp6->ndp_g_lock); /* We know it can not be mapping so just look in the hash table */ nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - nce = nce_lookup_addr(ill, addr, nce); + /* See comment in ndp_query() regarding IS_IPMP(ill) usage */ + nce = nce_lookup_addr(ill, IS_IPMP(ill), addr, nce); if (nce != NULL) new_flags = nce->nce_flags; @@ -3065,7 +3138,7 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) * the link layer address passed in to determine the state * much like incoming packets. */ - ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); + nce_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); NCE_REFRELE(nce); return (0); } @@ -3463,7 +3536,11 @@ ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, mutex_enter(&ipst->ips_ndp4->ndp_g_lock); nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); - nce = nce_lookup_addr(ill, &addr6, nce); + /* + * NOTE: IPv4 never matches across the illgrp since the NCE's we're + * looking up have fastpath headers that are inherently per-ill. + */ + nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce); if (nce == NULL) { err = ndp_add_v4(ill, addr, flags, newnce, src_nce); } else { @@ -3718,3 +3795,26 @@ ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns) mutex_exit(&ipst->ips_ndp4->ndp_g_lock); return (nce != NULL); } + +/* + * Wrapper around ipif_lookup_addr_exact_v6() that allows ND to work properly + * with IPMP. 
Specifically, since neighbor discovery is always done on + * underlying interfaces (even for addresses owned by an IPMP interface), we + * need to check for `v6addrp' on both `ill' and on the IPMP meta-interface + * associated with `ill' (if it exists). + */ +static ipif_t * +ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill) +{ + ipif_t *ipif; + ip_stack_t *ipst = ill->ill_ipst; + + ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst); + if (ipif == NULL && IS_UNDER_IPMP(ill)) { + if ((ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { + ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst); + ill_refrele(ill); + } + } + return (ipif); +} diff --git a/usr/src/uts/common/inet/ip/ip_netinfo.c b/usr/src/uts/common/inet/ip/ip_netinfo.c index 53665593be..e81c7a0e1f 100644 --- a/usr/src/uts/common/inet/ip/ip_netinfo.c +++ b/usr/src/uts/common/inet/ip/ip_netinfo.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -568,33 +568,17 @@ ip_getifname_impl(phy_if_t phy_ifdata, char *buffer, const size_t buflen, boolean_t isv6, ip_stack_t *ipst) { ill_t *ill; - char *name; ASSERT(buffer != NULL); ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, NULL, NULL, NULL, NULL, ipst); - if (ill != NULL) { - name = ill->ill_name; - } else { - /* Fallback to group names only if hook_emulation is set */ - if (ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_ifindex((uint_t)phy_ifdata, - isv6, ipst); - } - if (ill == NULL) - return (1); - name = ill->ill_phyint->phyint_groupname; - } - if (name != NULL) { - (void) strlcpy(buffer, name, buflen); - ill_refrele(ill); - return (0); - } else { - ill_refrele(ill); + if (ill == NULL) return (1); - } + (void) strlcpy(buffer, ill->ill_name, buflen); + ill_refrele(ill); + return (0); } /* @@ -625,9 +609,6 @@ ipv6_getmtu(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata) /* * Shared implementation to determine the MTU of a network interface - * - * Note: this does not handle a non-zero ifdata when ipmp_hook_emulation is set. - * But IP Filter only uses a zero ifdata. */ /* ARGSUSED */ static int @@ -653,16 +634,7 @@ ip_getmtu_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6, if ((ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, NULL, NULL, NULL, NULL, ipst)) == NULL) { - /* - * Fallback to group names only if hook_emulation - * is set - */ - if (ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_ifindex( - (uint_t)phy_ifdata, isv6, ipst); - } - if (ill == NULL) - return (0); + return (0); } mtu = ill->ill_max_frag; ill_refrele(ill); @@ -686,9 +658,6 @@ ip_getpmtuenabled(net_handle_t neti) /* * Get next interface from the current list of IPv4 physical network interfaces - * - * Note: this does not handle the case when ipmp_hook_emulation is set. - * But IP Filter does not use this function. 
*/ static phy_if_t ip_phygetnext(net_handle_t neti, phy_if_t phy_ifdata) @@ -752,15 +721,10 @@ ip_phylookup_impl(const char *name, boolean_t isv6, ip_stack_t *ipst) ill = ill_lookup_on_name((char *)name, B_FALSE, isv6, NULL, NULL, NULL, NULL, NULL, ipst); - - /* Fallback to group names only if hook_emulation is set */ - if (ill == NULL && ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_name((char *)name, isv6, ipst); - } if (ill == NULL) return (0); - phy = ill->ill_phyint->phyint_hook_ifindex; + phy = ill->ill_phyint->phyint_ifindex; ill_refrele(ill); @@ -798,9 +762,6 @@ ipv6_lifgetnext(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata) /* * Shared implementation to get next interface from the current list of * logical network interfaces - * - * Note: this does not handle the case when ipmp_hook_emulation is set. - * But IP Filter does not use this function. */ static lif_if_t ip_lifgetnext_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6, @@ -834,7 +795,7 @@ ip_lifgetnext_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6, /* * It's safe to iterate the ill_ipif list when holding an ill_lock. * And it's also safe to access ipif_id without ipif refhold. - * See ipif_get_id(). + * See the field access rules in ip.h. 
*/ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (!IPIF_CAN_LOOKUP(ipif)) @@ -1013,8 +974,8 @@ ip_inject_impl(inject_t style, net_inject_t *packet, boolean_t isv6, if (ire->ire_nce == NULL || ire->ire_nce->nce_fp_mp == NULL && ire->ire_nce->nce_res_mp == NULL) { - ip_newroute_v6(ire->ire_stq, mp, - &sin6->sin6_addr, NULL, NULL, ALL_ZONES, ipst); + ip_newroute_v6(ire->ire_stq, mp, &sin6->sin6_addr, + &ip6h->ip6_src, NULL, ALL_ZONES, ipst); ire_refrele(ire); return (0); @@ -1170,7 +1131,7 @@ ip_routeto_impl(struct sockaddr *address, struct sockaddr *nexthop, } ASSERT(ill != NULL); - phy_if = (phy_if_t)ill->ill_phyint->phyint_hook_ifindex; + phy_if = (phy_if_t)ill->ill_phyint->phyint_ifindex; if (sire != NULL) ire_refrele(sire); ire_refrele(ire); @@ -1305,9 +1266,6 @@ ipv6_getlifaddr(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata, /* * Shared implementation to determine the network addresses for an interface - * - * Note: this does not handle a non-zero ifdata when ipmp_hook_emulation is set. - * But IP Filter only uses a zero ifdata. */ /* ARGSUSED */ static int @@ -1531,12 +1489,6 @@ ip_ni_queue_func_impl(injection_t *inject, boolean_t out) ill = ill_lookup_on_ifindex((uint_t)packet->ni_physical, B_FALSE, NULL, NULL, NULL, NULL, ipst); - - /* Fallback to group names only if hook_emulation is set */ - if (ill == NULL && ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_ifindex((uint_t)packet->ni_physical, - B_FALSE, ipst); - } if (ill == NULL) { kmem_free(inject, sizeof (*inject)); return; @@ -1613,65 +1565,3 @@ done: kmem_free(info->hnei_event.hne_data, info->hnei_event.hne_datalen); kmem_free(arg, sizeof (hook_nic_event_int_t)); } - -/* - * Temporary function to support IPMP emulation for IP Filter. - * Lookup an ill based on the ifindex assigned to the group. - * Skips unusable ones i.e. 
where any of these flags are set: - * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE) - */ -ill_t * -ill_group_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) -{ - ill_t *ill; - phyint_t *phyi; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - phyi = phyint_lookup_group_ifindex(index, ipst); - if (phyi != NULL) { - ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4; - if (ill != NULL) { - mutex_enter(&ill->ill_lock); - if (ILL_CAN_LOOKUP(ill)) { - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (ill); - } - mutex_exit(&ill->ill_lock); - } - } - rw_exit(&ipst->ips_ill_g_lock); - return (NULL); -} - -/* - * Temporary function to support IPMP emulation for IP Filter. - * Lookup an ill based on the group name. - * Skips unusable ones i.e. where any of these flags are set: - * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE) - */ -ill_t * -ill_group_lookup_on_name(char *name, boolean_t isv6, ip_stack_t *ipst) -{ - ill_t *ill; - phyint_t *phyi; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - phyi = phyint_lookup_group(name, B_TRUE, ipst); - if (phyi != NULL) { - ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4; - if (ill != NULL) { - mutex_enter(&ill->ill_lock); - if (ILL_CAN_LOOKUP(ill)) { - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (ill); - } - mutex_exit(&ill->ill_lock); - } - } - rw_exit(&ipst->ips_ill_g_lock); - return (NULL); -} diff --git a/usr/src/uts/common/inet/ip/ip_opt_data.c b/usr/src/uts/common/inet/ip/ip_opt_data.c index bb6e98a99e..1c91ea667f 100644 --- a/usr/src/uts/common/inet/ip/ip_opt_data.c +++ b/usr/src/uts/common/inet/ip/ip_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -119,9 +119,6 @@ opdes_t ip_opt_arr[] = { { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, - sizeof (struct in_addr), 0 /* not initialized */ }, - { IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, @@ -199,12 +196,6 @@ opdes_t ip_opt_arr[] = { { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 /* no ifindex */ }, - -{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 /* no ifindex */ }, - { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, diff --git a/usr/src/uts/common/inet/ip/ip_rts.c b/usr/src/uts/common/inet/ip/ip_rts.c index 3324d1d833..77ab2cc220 100644 --- a/usr/src/uts/common/inet/ip/ip_rts.c +++ b/usr/src/uts/common/inet/ip/ip_rts.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -93,34 +93,52 @@ static void rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics); static void ip_rts_request_retry(ipsq_t *, queue_t *q, mblk_t *mp, void *); /* - * Send the ack to all the routing queues. In case of the originating queue, - * send it only if the loopback is set. - * - * Messages are sent upstream only on routing sockets that did not specify an - * address family when they were created or when the address family matches the - * one specified by the caller. + * Send `mp' to all eligible routing queues. A queue is ineligible if: * + * 1. SO_USELOOPBACK is off and it is not the originating queue. + * 2. RTAW_UNDER_IPMP is on and RTSQ_UNDER_IPMP is clear in `flags'. + * 3. RTAW_UNDER_IPMP is off and RTSQ_NORMAL is clear in `flags'. + * 4. It is not the same address family as `af', and `af' isn't AF_UNSPEC. 
*/ void -rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst) +rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, + ip_stack_t *ipst) { mblk_t *mp1; conn_t *connp, *next_connp; + /* + * Since we don't have an ill_t here, RTSQ_DEFAULT must already be + * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP by now. + */ + ASSERT(!(flags & RTSQ_DEFAULT)); + mutex_enter(&ipst->ips_rts_clients->connf_lock); connp = ipst->ips_rts_clients->connf_head; - while (connp != NULL) { + for (; connp != NULL; connp = next_connp) { + next_connp = connp->conn_next; + /* * If there was a family specified when this routing socket was * created and it doesn't match the family of the message to * copy, then continue. */ if ((connp->conn_proto != AF_UNSPEC) && - (connp->conn_proto != af)) { - connp = connp->conn_next; + (connp->conn_proto != af)) continue; + + /* + * Queue the message only if the conn_t and flags match. + */ + if (connp->conn_rtaware & RTAW_UNDER_IPMP) { + if (!(flags & RTSQ_UNDER_IPMP)) + continue; + } else { + if (!(flags & RTSQ_NORMAL)) + continue; } + /* * For the originating queue, we only copy the message upstream * if loopback is set. For others reading on the routing @@ -128,8 +146,8 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst) * message. */ if ((o_connp == connp) && connp->conn_loopback == 0) { - connp = connp->conn_next; - continue; + connp = connp->conn_next; + continue; } CONN_INC_REF(connp); mutex_exit(&ipst->ips_rts_clients->connf_lock); @@ -145,10 +163,9 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst) } mutex_enter(&ipst->ips_rts_clients->connf_lock); - /* Follow the next pointer before releasing the conn. 
*/ + /* reload next_connp since conn_next may have changed */ next_connp = connp->conn_next; CONN_DEC_REF(connp); - connp = next_connp; } mutex_exit(&ipst->ips_rts_clients->connf_lock); freemsg(mp); @@ -209,7 +226,7 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) rtm->rtm_errno = error; else rtm->rtm_flags |= RTF_DONE; - rts_queue_input(mp, NULL, af, ipst); + rts_queue_input(mp, NULL, af, RTSQ_ALL, ipst); } /* ARGSUSED */ @@ -430,7 +447,7 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) if (index != 0) { ill_t *ill; - +lookup: /* * IPC must be refheld somewhere in ip_wput_nondata or * ip_wput_ioctl etc... and cleaned up if ioctl is killed. @@ -445,16 +462,33 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) goto done; } - ipif = ipif_get_next_ipif(NULL, ill); - ill_refrele(ill); /* - * If this is replacement ipif, prevent a route from - * being added. + * Since all interfaces in an IPMP group must be equivalent, + * we prevent changes to a specific underlying interface's + * routing configuration. However, for backward compatibility, + * we intepret a request to add a route on an underlying + * interface as a request to add a route on its IPMP interface. 
*/ - if (ipif != NULL && ipif->ipif_replace_zero) { - error = ENETDOWN; - goto done; + if (IS_UNDER_IPMP(ill)) { + switch (rtm->rtm_type) { + case RTM_CHANGE: + case RTM_DELETE: + ill_refrele(ill); + error = EINVAL; + goto done; + case RTM_ADD: + index = ipmp_ill_get_ipmp_ifindex(ill); + ill_refrele(ill); + if (index == 0) { + error = EINVAL; + goto done; + } + goto lookup; + } } + + ipif = ipif_get_next_ipif(NULL, ill); + ill_refrele(ill); match_flags |= MATCH_IRE_ILL; } @@ -1037,7 +1071,7 @@ done: /* OK ACK already set up by caller except this */ ip2dbg(("ip_rts_request: OK ACK\n")); } - rts_queue_input(mp, connp, af, ipst); + rts_queue_input(mp, connp, af, RTSQ_ALL, ipst); } iocp->ioc_error = error; @@ -1724,7 +1758,7 @@ ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask, rtm->rtm_errno = error; rtm->rtm_flags |= RTF_DONE; rtm->rtm_addrs = rtm_addrs; - rts_queue_input(mp, NULL, AF_INET, ipst); + rts_queue_input(mp, NULL, AF_INET, RTSQ_ALL, ipst); } /* @@ -1733,7 +1767,13 @@ ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask, * Message type generated RTM_IFINFO. */ void -ip_rts_ifmsg(const ipif_t *ipif) +ip_rts_ifmsg(const ipif_t *ipif, uint_t flags) +{ + ip_rts_xifmsg(ipif, 0, 0, flags); +} + +void +ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags) { if_msghdr_t *ifm; mblk_t *mp; @@ -1741,12 +1781,12 @@ ip_rts_ifmsg(const ipif_t *ipif) ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; /* - * This message should be generated only - * when the physical device is changing - * state. + * This message should be generated only when the physical interface + * is changing state. 
*/ if (ipif->ipif_id != 0) return; + if (ipif->ipif_isv6) { af = AF_INET6; mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0); @@ -1765,11 +1805,22 @@ ip_rts_ifmsg(const ipif_t *ipif) } ifm = (if_msghdr_t *)mp->b_rptr; ifm->ifm_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; - ifm->ifm_flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags | - ipif->ipif_ill->ill_phyint->phyint_flags; + ifm->ifm_flags = (ipif->ipif_flags | ipif->ipif_ill->ill_flags | + ipif->ipif_ill->ill_phyint->phyint_flags | set) & ~clear; rts_getifdata(&ifm->ifm_data, ipif); ifm->ifm_addrs = RTA_IFP; - rts_queue_input(mp, NULL, af, ipst); + + if (flags & RTSQ_DEFAULT) { + flags = RTSQ_ALL; + /* + * If this message is for an underlying interface, prevent + * "normal" (IPMP-unaware) routing sockets from seeing it. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + flags &= ~RTSQ_NORMAL; + } + + rts_queue_input(mp, NULL, af, flags, ipst); } /* @@ -1778,7 +1829,7 @@ ip_rts_ifmsg(const ipif_t *ipif) * The structure of the code is based on the 4.4BSD-Lite2 <net/rtsock.c>. */ void -ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif) +ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags) { int pass; int ncmd; @@ -1793,6 +1844,17 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif) af = AF_INET6; else af = AF_INET; + + if (flags & RTSQ_DEFAULT) { + flags = RTSQ_ALL; + /* + * If this message is for an underlying interface, prevent + * "normal" (IPMP-unaware) routing sockets from seeing it. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + flags &= ~RTSQ_NORMAL; + } + /* * If the request is DELETE, send RTM_DELETE and RTM_DELADDR. * if the request is ADD, send RTM_NEWADDR and RTM_ADD. @@ -1827,7 +1889,7 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif) ifam->ifam_metric = ipif->ipif_metric; ifam->ifam_flags = ((cmd == RTM_ADD) ? 
RTF_UP : 0); ifam->ifam_addrs = rtm_addrs; - rts_queue_input(mp, NULL, af, ipst); + rts_queue_input(mp, NULL, af, flags, ipst); } if ((cmd == RTM_ADD && pass == 2) || (cmd == RTM_DELETE && pass == 1)) { @@ -1857,7 +1919,7 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif) if (error == 0) rtm->rtm_flags |= RTF_DONE; rtm->rtm_addrs = rtm_addrs; - rts_queue_input(mp, NULL, af, ipst); + rts_queue_input(mp, NULL, af, flags, ipst); } } } diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index 59ddb7461f..5afa70160d 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -2322,11 +2322,8 @@ ipcl_conn_cleanup(conn_t *connp) * We should replace these pointers with ifindex/ipaddr_t to * make the code less complex. */ - ASSERT(connp->conn_xmit_if_ill == NULL); - ASSERT(connp->conn_nofailover_ill == NULL); ASSERT(connp->conn_outgoing_ill == NULL); ASSERT(connp->conn_incoming_ill == NULL); - ASSERT(connp->conn_outgoing_pill == NULL); ASSERT(connp->conn_multicast_ipif == NULL); ASSERT(connp->conn_multicast_ill == NULL); #endif diff --git a/usr/src/uts/common/inet/ip/ipmp.c b/usr/src/uts/common/inet/ip/ipmp.c new file mode 100644 index 0000000000..b8f3768834 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ipmp.c @@ -0,0 +1,2201 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <inet/arp.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_multi.h> +#include <inet/ip_rts.h> +#include <inet/mi.h> +#include <net/if_types.h> +#include <sys/dlpi.h> +#include <sys/kmem.h> +#include <sys/modhash.h> +#include <sys/sdt.h> +#include <sys/strsun.h> +#include <sys/sunddi.h> +#include <sys/types.h> + +/* + * Convenience macros for getting the ip_stack_t associated with an + * ipmp_illgrp_t or ipmp_grp_t. + */ +#define IPMP_GRP_TO_IPST(grp) PHYINT_TO_IPST((grp)->gr_phyint) +#define IPMP_ILLGRP_TO_IPST(illg) ((illg)->ig_ipmp_ill->ill_ipst) + +/* + * Assorted constants that aren't important enough to be tunable. + */ +#define IPMP_GRP_HASH_SIZE 64 +#define IPMP_ILL_REFRESH_TIMEOUT 120 /* seconds */ + +/* + * Templates for IPMP ARP messages. + */ +static const arie_t ipmp_aract_template = { + AR_IPMP_ACTIVATE, + sizeof (arie_t), /* Name offset */ + sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ +}; + +static const arie_t ipmp_ardeact_template = { + AR_IPMP_DEACTIVATE, + sizeof (arie_t), /* Name offset */ + sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ +}; + +/* + * IPMP meta-interface kstats (based on those in PSARC/1997/198). 
+ */ +static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = { + { "obytes", KSTAT_DATA_UINT32 }, + { "obytes64", KSTAT_DATA_UINT64 }, + { "rbytes", KSTAT_DATA_UINT32 }, + { "rbytes64", KSTAT_DATA_UINT64 }, + { "opackets", KSTAT_DATA_UINT32 }, + { "opackets64", KSTAT_DATA_UINT64 }, + { "oerrors", KSTAT_DATA_UINT32 }, + { "ipackets", KSTAT_DATA_UINT32 }, + { "ipackets64", KSTAT_DATA_UINT64 }, + { "ierrors", KSTAT_DATA_UINT32 }, + { "multircv", KSTAT_DATA_UINT32 }, + { "multixmt", KSTAT_DATA_UINT32 }, + { "brdcstrcv", KSTAT_DATA_UINT32 }, + { "brdcstxmt", KSTAT_DATA_UINT32 }, + { "link_up", KSTAT_DATA_UINT32 } +}; + +static void ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t); +static int ipmp_grp_create_kstats(ipmp_grp_t *); +static int ipmp_grp_update_kstats(kstat_t *, int); +static void ipmp_grp_destroy_kstats(ipmp_grp_t *); +static ill_t *ipmp_illgrp_min_ill(ipmp_illgrp_t *); +static ill_t *ipmp_illgrp_max_ill(ipmp_illgrp_t *); +static void ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *); +static void ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t); +static boolean_t ipmp_ill_activate(ill_t *); +static void ipmp_ill_deactivate(ill_t *); +static void ipmp_ill_ire_mark_testhidden(ire_t *, char *); +static void ipmp_ill_ire_clear_testhidden(ire_t *, char *); +static void ipmp_ill_refresh_active_timer_start(ill_t *); +static void ipmp_ill_rtsaddrmsg(ill_t *, int); +static void ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action); +static ipif_t *ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t); +static void ipmp_phyint_get_kstats(phyint_t *, uint64_t *); +static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *); + +/* + * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init(). 
+ */ +void +ipmp_init(ip_stack_t *ipst) +{ + ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash", + IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, + mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); + rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0); +} + +/* + * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini(). + */ +void +ipmp_destroy(ip_stack_t *ipst) +{ + mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash); + rw_destroy(&ipst->ips_ipmp_lock); +} + +/* + * Create an IPMP group named `grname', associate it with IPMP phyint `phyi', + * and add it to the hash. On success, return a pointer to the created group. + * Caller must ensure `grname' is not yet in the hash. Assumes that the IPMP + * meta-interface associated with the group also has the same name (but they + * may differ later via ipmp_grp_rename()). + */ +ipmp_grp_t * +ipmp_grp_create(const char *grname, phyint_t *phyi) +{ + ipmp_grp_t *grp; + ip_stack_t *ipst = PHYINT_TO_IPST(phyi); + mod_hash_hndl_t mh; + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL) + return (NULL); + + (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); + (void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname)); + + /* + * Cache the group's phyint. This is safe since a phyint_t will + * outlive its ipmp_grp_t. + */ + grp->gr_phyint = phyi; + + /* + * Create IPMP group kstats. + */ + if (ipmp_grp_create_kstats(grp) != 0) { + kmem_free(grp, sizeof (ipmp_grp_t)); + return (NULL); + } + + /* + * Insert the group into the hash. + */ + if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) { + ipmp_grp_destroy_kstats(grp); + kmem_free(grp, sizeof (ipmp_grp_t)); + return (NULL); + } + ipmp_grp_insert(grp, mh); + + return (grp); +} + +/* + * Create IPMP kstat structures for `grp'. Return an errno upon failure. 
+ */ +static int +ipmp_grp_create_kstats(ipmp_grp_t *grp) +{ + kstat_t *ksp; + netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; + + ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net", + KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id); + if (ksp == NULL) + return (ENOMEM); + + ksp->ks_update = ipmp_grp_update_kstats; + ksp->ks_private = grp; + bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats)); + + kstat_install(ksp); + grp->gr_ksp = ksp; + return (0); +} + +/* + * Update the IPMP kstats tracked by `ksp'; called by the kstats framework. + */ +static int +ipmp_grp_update_kstats(kstat_t *ksp, int rw) +{ + uint_t i; + kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); + ipmp_grp_t *grp = ksp->ks_private; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + ipsq_t *ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq; + phyint_t *phyi; + uint64_t phyi_kstats[IPMP_KSTAT_MAX]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Start with the group's baseline values. + */ + for (i = 0; i < IPMP_KSTAT_MAX; i++) { + if (kn[i].data_type == KSTAT_DATA_UINT32) { + kn[i].value.ui32 = grp->gr_kstats0[i]; + } else { + ASSERT(kn[i].data_type == KSTAT_DATA_UINT64); + kn[i].value.ui64 = grp->gr_kstats0[i]; + } + } + + /* + * Add in the stats of each phyint currently in the group. Since we + * don't directly track the phyints in a group, we cheat by walking + * the IPSQ set under ill_g_lock. (The IPSQ list cannot change while + * ill_g_lock is held.) + */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + ipsq = grp_ipsq->ipsq_next; + for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) { + phyi = ipsq->ipsq_phyint; + + /* + * If a phyint in a group is being unplumbed, it's possible + * that ill_glist_delete() -> phyint_free() already freed the + * phyint (and set ipsq_phyint to NULL), but the unplumb + * operation has yet to complete (and thus ipsq_dq() has yet + * to remove the phyint's IPSQ from the group IPSQ's phyint + * list). 
We skip those phyints here (note that their kstats + * have already been added to gr_kstats0[]). + */ + if (phyi == NULL) + continue; + + ipmp_phyint_get_kstats(phyi, phyi_kstats); + + for (i = 0; i < IPMP_KSTAT_MAX; i++) { + phyi_kstats[i] -= phyi->phyint_kstats0[i]; + if (kn[i].data_type == KSTAT_DATA_UINT32) + kn[i].value.ui32 += phyi_kstats[i]; + else + kn[i].value.ui64 += phyi_kstats[i]; + } + } + + kn[IPMP_KSTAT_LINK_UP].value.ui32 = + (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0; + + rw_exit(&ipst->ips_ill_g_lock); + return (0); +} + +/* + * Destroy IPMP kstat structures for `grp'. + */ +static void +ipmp_grp_destroy_kstats(ipmp_grp_t *grp) +{ + netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; + + kstat_delete_netstack(grp->gr_ksp, id); + bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0)); + grp->gr_ksp = NULL; +} + +/* + * Look up an IPMP group named `grname' on IP stack `ipst'. Return NULL if it + * does not exist. + */ +ipmp_grp_t * +ipmp_grp_lookup(const char *grname, ip_stack_t *ipst) +{ + ipmp_grp_t *grp; + + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, + (mod_hash_val_t *)&grp) == 0) + return (grp); + + return (NULL); +} + +/* + * Place information about group `grp' into `lifgr'. + */ +void +ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr) +{ + ill_t *ill; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + lifgr->gi_v4 = (grp->gr_v4 != NULL); + lifgr->gi_v6 = (grp->gr_v6 != NULL); + lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4; + lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6; + lifgr->gi_mactype = grp->gr_nif > 0 ? 
grp->gr_mactype : SUNW_DL_IPMP; + (void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ); + lifgr->gi_m4ifname[0] = '\0'; + lifgr->gi_m6ifname[0] = '\0'; + lifgr->gi_bcifname[0] = '\0'; + + if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) { + (void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ); + (void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ); + } + + if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL) + (void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ); +} + +/* + * Insert `grp' into the hash using the reserved hash entry `mh'. + * Caller must ensure `grp' is not yet in the hash. + */ +static void +ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh) +{ + int err; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + /* + * Since grp->gr_name will exist at least as long as `grp' is in the + * hash, we use it directly as the key. + */ + err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash, + (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh); + if (err != 0) { + /* + * This should never happen since `mh' was preallocated. + */ + panic("cannot insert IPMP group \"%s\" (err %d)", + grp->gr_name, err); + } +} + +/* + * Remove `grp' from the hash. Caller must ensure `grp' is in it. + */ +static void +ipmp_grp_remove(ipmp_grp_t *grp) +{ + int err; + mod_hash_val_t val; + mod_hash_key_t key = (mod_hash_key_t)grp->gr_name; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val); + if (err != 0 || val != grp) { + panic("cannot remove IPMP group \"%s\" (err %d)", + grp->gr_name, err); + } +} + +/* + * Attempt to rename `grp' to new name `grname'. Return an errno if the new + * group name already exists or is invalid, or if there isn't enough memory. 
+ */ +int +ipmp_grp_rename(ipmp_grp_t *grp, const char *grname) +{ + mod_hash_hndl_t mh; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + if (grname[0] == '\0') + return (EINVAL); + + if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, + (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND) + return (EEXIST); + + /* + * Before we remove the group from the hash, ensure we'll be able to + * re-insert it by reserving space. + */ + if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) + return (ENOMEM); + + ipmp_grp_remove(grp); + (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); + ipmp_grp_insert(grp, mh); + + return (0); +} + +/* + * Destroy `grp' and remove it from the hash. Caller must ensure `grp' is in + * the hash, and that there are no interfaces on it. + */ +void +ipmp_grp_destroy(ipmp_grp_t *grp) +{ + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + /* + * If there are still interfaces using this group, panic before things + * go really off the rails. + */ + if (grp->gr_nif != 0) + panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name); + + ipmp_grp_remove(grp); + ipmp_grp_destroy_kstats(grp); + + ASSERT(grp->gr_v4 == NULL); + ASSERT(grp->gr_v6 == NULL); + ASSERT(grp->gr_nv4 == 0); + ASSERT(grp->gr_nv6 == 0); + ASSERT(grp->gr_nactif == 0); + ASSERT(grp->gr_linkdownmp == NULL); + grp->gr_phyint = NULL; + + kmem_free(grp, sizeof (ipmp_grp_t)); +} + +/* + * Check whether `ill' is suitable for inclusion into `grp', and return an + * errno describing the problem (if any). NOTE: many of these errno values + * are interpreted by ifconfig, which will take corrective action and retry + * the SIOCSLIFGROUPNAME, so please exercise care when changing them. 
+ */ +static int +ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill) +{ + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + /* + * To sidestep complicated address migration logic in the kernel and + * to force the kernel's all-hosts multicast memberships to be blown + * away, all addresses that had been brought up must be brought back + * down prior to adding an interface to a group. (This includes + * addresses currently down due to DAD.) Once the interface has been + * added to the group, its addresses can then be brought back up, at + * which point they will be moved to the IPMP meta-interface. + * NOTE: we do this before ill_appaddr_cnt() since bringing down the + * link-local causes in.ndpd to remove its ADDRCONF'd addresses. + */ + if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) + return (EADDRINUSE); + + /* + * To avoid confusing applications by changing addresses that are + * under their control, all such control must be removed prior to + * adding an interface into a group. + */ + if (ill_appaddr_cnt(ill) != 0) + return (EADDRNOTAVAIL); + + /* + * Since PTP addresses do not share the same broadcast domain, they + * are not allowed to be in an IPMP group. + */ + if (ill_ptpaddr_cnt(ill) != 0) + return (EINVAL); + + /* + * An ill must support multicast to be allowed into a group. + */ + if (!(ill->ill_flags & ILLF_MULTICAST)) + return (ENOTSUP); + + /* + * An ill must strictly be using ARP and/or ND for address + * resolution for it to be allowed into a group. + */ + if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP | ILLF_XRESOLV)) + return (ENOTSUP); + + /* + * An ill cannot also be using usesrc groups. (Although usesrc uses + * ill_g_usesrc_lock, we don't need to grab it since usesrc also does + * all its modifications as writer.) + */ + if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill)) + return (ENOTSUP); + + /* + * All ills in a group must be the same mactype. 
+ */ + if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype) + return (EINVAL); + + return (0); +} + +/* + * Check whether `phyi' is suitable for inclusion into `grp', and return an + * errno describing the problem (if any). See comment above ipmp_grp_vet_ill() + * regarding errno values. + */ +int +ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi) +{ + int err = 0; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq)); + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + /* + * An interface cannot have address families plumbed that are not + * configured in the group. + */ + if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL || + phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL) + return (EAFNOSUPPORT); + + if (phyi->phyint_illv4 != NULL) + err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4); + if (err == 0 && phyi->phyint_illv6 != NULL) + err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6); + + return (err); +} + +/* + * Create a new illgrp on IPMP meta-interface `ill'. + */ +ipmp_illgrp_t * +ipmp_illgrp_create(ill_t *ill) +{ + uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; + ipmp_illgrp_t *illg; + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(IS_IPMP(ill)); + ASSERT(ill->ill_grp == NULL); + + if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL) + return (NULL); + + list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode)); + list_create(&illg->ig_actif, sizeof (ill_t), + offsetof(ill_t, ill_actnode)); + list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t), + offsetof(ipmp_arpent_t, ia_node)); + + illg->ig_ipmp_ill = ill; + ill->ill_grp = illg; + ipmp_illgrp_set_mtu(illg, mtu); + + return (illg); +} + +/* + * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface. + */ +void +ipmp_illgrp_destroy(ipmp_illgrp_t *illg) +{ + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + ASSERT(IS_IPMP(illg->ig_ipmp_ill)); + + /* + * Verify `illg' is empty. 
+ */ + ASSERT(illg->ig_next_ill == NULL); + ASSERT(illg->ig_cast_ill == NULL); + ASSERT(list_is_empty(&illg->ig_arpent)); + ASSERT(list_is_empty(&illg->ig_if)); + ASSERT(list_is_empty(&illg->ig_actif)); + ASSERT(illg->ig_nactif == 0); + + /* + * Destroy `illg'. + */ + illg->ig_ipmp_ill->ill_grp = NULL; + illg->ig_ipmp_ill = NULL; + list_destroy(&illg->ig_if); + list_destroy(&illg->ig_actif); + list_destroy(&illg->ig_arpent); + kmem_free(illg, sizeof (ipmp_illgrp_t)); +} + +/* + * Add `ipif' to the pool of usable data addresses on `illg' and attempt to + * bind it to an underlying ill, while keeping an even address distribution. + * If the bind is successful, return a pointer to the bound ill. + */ +ill_t * +ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) +{ + ill_t *minill; + ipmp_arpent_t *entp; + + ASSERT(IAM_WRITER_IPIF(ipif)); + ASSERT(ipmp_ipif_is_dataaddr(ipif)); + + /* + * IPMP data address mappings are internally managed by IP itself, so + * delete any existing ARP entries associated with the address. + */ + if (!ipif->ipif_isv6) { + entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr); + if (entp != NULL) + ipmp_illgrp_destroy_arpent(illg, entp); + } + + if ((minill = ipmp_illgrp_min_ill(illg)) != NULL) + ipmp_ill_bind_ipif(minill, ipif, Res_act_none); + + return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL); +} + +/* + * Delete `ipif' from the pool of usable data addresses on `illg'. If it's + * bound, unbind it from the underlying ill while keeping an even address + * distribution. 
+ */ +void +ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) +{ + ill_t *maxill, *boundill = ipif->ipif_bound_ill; + + ASSERT(IAM_WRITER_IPIF(ipif)); + + if (boundill != NULL) { + (void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE); + + maxill = ipmp_illgrp_max_ill(illg); + if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) { + ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); + ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind); + } + } +} + +/* + * Return the active ill with the greatest number of data addresses in `illg'. + */ +static ill_t * +ipmp_illgrp_max_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill, *bestill = NULL; + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + ill = list_head(&illg->ig_actif); + for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { + if (bestill == NULL || + ill->ill_bound_cnt > bestill->ill_bound_cnt) { + bestill = ill; + } + } + return (bestill); +} + +/* + * Return the active ill with the fewest number of data addresses in `illg'. + */ +static ill_t * +ipmp_illgrp_min_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill, *bestill = NULL; + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + ill = list_head(&illg->ig_actif); + for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { + if (bestill == NULL || + ill->ill_bound_cnt < bestill->ill_bound_cnt) { + if (ill->ill_bound_cnt == 0) + return (ill); /* can't get better */ + bestill = ill; + } + } + return (bestill); +} + +/* + * Return a pointer to IPMP meta-interface for `illg' (which must exist). + * Since ig_ipmp_ill never changes for a given illg, no locks are needed. + */ +ill_t * +ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg) +{ + return (illg->ig_ipmp_ill); +} + +/* + * Return a pointer to the next available underlying ill in `illg', or NULL if + * one doesn't exist. Caller must be inside the IPSQ. 
+ */ +ill_t * +ipmp_illgrp_next_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + if ((ill = illg->ig_next_ill) != NULL) { + illg->ig_next_ill = list_next(&illg->ig_actif, ill); + if (illg->ig_next_ill == NULL) + illg->ig_next_ill = list_head(&illg->ig_actif); + } + rw_exit(&ipst->ips_ipmp_lock); + + return (ill); +} + +/* + * Return a held pointer to the next available underlying ill in `illg', or + * NULL if one doesn't exist. Caller need not be inside the IPSQ. + */ +ill_t * +ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill; + uint_t i; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + for (i = 0; i < illg->ig_nactif; i++) { + ill = illg->ig_next_ill; + illg->ig_next_ill = list_next(&illg->ig_actif, ill); + if (illg->ig_next_ill == NULL) + illg->ig_next_ill = list_head(&illg->ig_actif); + + if (ILL_CAN_LOOKUP(ill)) { + ill_refhold(ill); + rw_exit(&ipst->ips_ipmp_lock); + return (ill); + } + } + rw_exit(&ipst->ips_ipmp_lock); + + return (NULL); +} + +/* + * Return a pointer to the nominated multicast ill in `illg', or NULL if one + * doesn't exist. Caller must be inside the IPSQ. + */ +ill_t * +ipmp_illgrp_cast_ill(ipmp_illgrp_t *illg) +{ + /* + * Since an IPMP ill's ill_grp gets cleared during I_PUNLINK but + * this function can get called after that point, handle NULL. + */ + if (illg == NULL) + return (NULL); + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + return (illg->ig_cast_ill); +} + +/* + * Return a held pointer to the nominated multicast ill in `illg', or NULL if + * one doesn't exist. Caller need not be inside the IPSQ. 
+ */
+ill_t *
+ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
+{
+	ill_t *castill;
+	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+	castill = illg->ig_cast_ill;
+	if (castill != NULL && ILL_CAN_LOOKUP(castill)) {
+		ill_refhold(castill);
+		rw_exit(&ipst->ips_ipmp_lock);
+		return (castill);
+	}
+	rw_exit(&ipst->ips_ipmp_lock);
+	return (NULL);
+}
+
+/*
+ * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL,
+ * any existing nomination is removed. Caller must be inside the IPSQ.
+ *
+ * The steps below run in a fixed order: the old nomination is disabled,
+ * ig_cast_ill is switched while holding ips_ipmp_lock as writer, IRE_CACHE
+ * entries tied to the old nomination are deleted, and only then is the new
+ * nomination enabled. For IPv4 the broadcast IREs are refreshed at the end
+ * whether or not there is a new nomination.
+ */
+static void
+ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
+{
+	ill_t *ocastill = illg->ig_cast_ill;
+	ill_t *ipmp_ill = illg->ig_ipmp_ill;
+	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+	ASSERT(IAM_WRITER_ILL(ipmp_ill));
+
+	/*
+	 * Disable old nominated ill (if any).
+	 */
+	if (ocastill != NULL) {
+		DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *,
+		    illg, ill_t *, ocastill);
+		ASSERT(ocastill->ill_nom_cast);
+		ocastill->ill_nom_cast = B_FALSE;
+		/*
+		 * If the IPMP meta-interface is down, we never did the join,
+		 * so we must not try to leave.
+		 */
+		if (ipmp_ill->ill_dl_up)
+			ill_leave_multicast(ipmp_ill);
+	}
+
+	/*
+	 * Set new nomination.
+	 */
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+	illg->ig_cast_ill = castill;
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	if (ocastill != NULL) {
+		/*
+		 * Delete any IREs tied to the old nomination. We must do
+		 * this after the new castill is set and has reached global
+		 * visibility since the datapath has not been quiesced.
+		 */
+		ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
+		    ill_stq_cache_delete, ocastill, ocastill);
+	}
+
+	/*
+	 * Enable new nominated ill (if any).
+	 */
+	if (castill != NULL) {
+		DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *,
+		    illg, ill_t *, castill);
+		ASSERT(!castill->ill_nom_cast);
+		castill->ill_nom_cast = B_TRUE;
+		/*
+		 * If the IPMP meta-interface is down, the attempt to recover
+		 * will silently fail but ill_need_recover_multicast will be
+		 * erroneously cleared -- so check first.
+		 */
+		if (ipmp_ill->ill_dl_up)
+			ill_recover_multicast(ipmp_ill);
+	}
+
+	/*
+	 * For IPv4, refresh our broadcast IREs. This needs to be done even
+	 * if there's no new nomination since ill_refresh_bcast() still must
+	 * update the IPMP meta-interface's broadcast IREs to point back at
+	 * the IPMP meta-interface itself.
+	 */
+	if (!ipmp_ill->ill_isv6)
+		ill_refresh_bcast(ipmp_ill);
+}
+
+/*
+ * Create an IPMP ARP entry and add it to the set tracked on `illg'. If an
+ * entry for the same IP address already exists, destroy it first. Return the
+ * created IPMP ARP entry, or NULL on failure.
+ *
+ * The entry keeps its own copy of `mp' (made with copyb() and retyped to
+ * M_PROTO); the message passed in is neither stored nor freed here. NULL
+ * is returned if either the entry or the copy cannot be allocated, in which
+ * case nothing is left allocated. `proxyarp' is recorded in ia_proxyarp.
+ *
+ * NOTE(review): the pointer returned by mi_offset_paramc() is handed to
+ * bcopy() without a NULL check -- presumably it cannot fail once the
+ * address-length ASSERT holds; confirm.
+ */
+ipmp_arpent_t *
+ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, mblk_t *mp, boolean_t proxyarp)
+{
+	uchar_t *addrp;
+	area_t *area = (area_t *)mp->b_rptr;
+	ipmp_arpent_t *entp, *oentp;
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+	ASSERT(area->area_proto_addr_length == sizeof (ipaddr_t));
+
+	if ((entp = kmem_zalloc(sizeof (ipmp_arpent_t), KM_NOSLEEP)) == NULL)
+		return (NULL);
+
+	if ((mp = copyb(mp)) == NULL) {
+		kmem_free(entp, sizeof (ipmp_arpent_t));
+		return (NULL);
+	}
+
+	DB_TYPE(mp) = M_PROTO;
+	entp->ia_area_mp = mp;
+	entp->ia_proxyarp = proxyarp;
+	addrp = mi_offset_paramc(mp, area->area_proto_addr_offset,
+	    sizeof (ipaddr_t));
+	bcopy(addrp, &entp->ia_ipaddr, sizeof (ipaddr_t));
+
+	if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL)
+		ipmp_illgrp_destroy_arpent(illg, oentp);
+
+	list_insert_head(&illg->ig_arpent, entp);
+	return (entp);
+}
+
+/*
+ * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it.
+ */
+void
+ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
+{
+	list_t	*arpents = &illg->ig_arpent;
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+	/* Unhook the entry first, then release its message and itself. */
+	list_remove(arpents, entp);
+	freeb(entp->ia_area_mp);
+	kmem_free(entp, sizeof (*entp));
+}
+
+/*
+ * Record on `entp' that ARP has been told about its IP address. `illg' is
+ * not used; it is accepted only so that it shows up as an argument in
+ * DTrace FBT probes.
+ */
+/* ARGSUSED */
+void
+ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
+{
+	entp->ia_notified = B_TRUE;
+}
+
+/*
+ * Find the IPMP ARP entry on `illg' whose IP address equals `*addrp'. A
+ * NULL `addrp' asks for any entry at all, in which case the first entry on
+ * the list is returned. Returns NULL if there is no matching entry.
+ */
+ipmp_arpent_t *
+ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp)
+{
+	ipmp_arpent_t	*cur = list_head(&illg->ig_arpent);
+
+	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
+
+	if (addrp == NULL)
+		return (cur);
+
+	/* Walk until we hit a match or fall off the end of the list. */
+	while (cur != NULL && cur->ia_ipaddr != *addrp)
+		cur = list_next(&illg->ig_arpent, cur);
+
+	return (cur);
+}
+
+/*
+ * Refresh ARP entries on `illg' to be distributed across its active
+ * interfaces. Entries that cannot be refreshed (e.g., because there are no
+ * active interfaces) are marked so that subsequent calls can try again.
+ */
+void
+ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
+{
+	ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill;
+	uint_t paddrlen = ipmp_ill->ill_phys_addr_length;
+	area_t *area;
+	mblk_t *area_mp;
+	uchar_t *physaddr;
+	ipmp_arpent_t *entp;
+
+	ASSERT(IAM_WRITER_ILL(ipmp_ill));
+	ASSERT(!ipmp_ill->ill_isv6);
+
+	ill = list_head(&illg->ig_actif);
+	entp = list_head(&illg->ig_arpent);
+	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) {
+		if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) {
+			entp->ia_notified = B_FALSE;
+			continue;
+		}
+
+		area = (area_t *)entp->ia_area_mp->b_rptr;
+		ASSERT(paddrlen == ill->ill_phys_addr_length);
+		ASSERT(paddrlen == area->area_hw_addr_length);
+		physaddr = mi_offset_paramc(entp->ia_area_mp,
+		    area->area_hw_addr_offset, paddrlen);
+
+		/*
+		 * If this is a proxy ARP entry, we can skip notifying ARP if
+		 * the entry is already up-to-date. If it has changed, we
+		 * update the entry's hardware address before notifying ARP.
+		 *
+		 * Note that every `continue' in this loop bypasses the
+		 * round-robin step at the bottom, so a skipped or failed
+		 * entry leaves `ill' where it is and the next entry is
+		 * considered against the same active ill.
+		 */
+		if (entp->ia_proxyarp) {
+			if (bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0 &&
+			    entp->ia_notified)
+				continue;
+			bcopy(ill->ill_phys_addr, physaddr, paddrlen);
+		}
+
+		if ((area_mp = copyb(entp->ia_area_mp)) == NULL) {
+			entp->ia_notified = B_FALSE;
+			continue;
+		}
+
+		putnext(ipmp_ill->ill_rq, area_mp);
+		ipmp_illgrp_mark_arpent(illg, entp);
+
+		if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
+			ill = list_head(&illg->ig_actif);
+	}
+}
+
+/*
+ * Return an interface in `illg' with the specified `physaddr', or NULL if one
+ * doesn't exist. Caller must hold ill_g_lock if it's not inside the IPSQ.
+ */
+ill_t *
+ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen)
+{
+	ill_t *ill;
+	ill_t *ipmp_ill = illg->ig_ipmp_ill;
+	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+	ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock));
+
+	ill = list_head(&illg->ig_if);
+	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
+		if (ill->ill_phys_addr_length == paddrlen &&
+		    bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0)
+			return (ill);
+	}
+	return (NULL);
+}
+
+/*
+ * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND.
+ * Caller must be inside the IPSQ unless this is initialization.
+ *
+ * A DL_NOTE_SDU_SIZE notification carrying `mtu' is put on the IPMP ill's
+ * read queue. ig_mtu is only updated if that notification could be
+ * allocated; on allocation failure the call has no effect.
+ */
+static void
+ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu)
+{
+	ill_t *ill = illg->ig_ipmp_ill;
+	mblk_t *mp;
+
+	ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill));
+
+	/*
+	 * If allocation fails, we have bigger problems than MTU.
+	 */
+	if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) {
+		illg->ig_mtu = mtu;
+		put(ill->ill_rq, mp);
+	}
+}
+
+/*
+ * Recalculate the IPMP group MTU for `illg', and update its associated IPMP
+ * ill MTU if necessary.
+ *
+ * The group MTU is the smallest ill_max_mtu of the ills on ig_if, but never
+ * less than the minimum MTU for the IPMP ill's address family (which is
+ * also the result when ig_if is empty). ipmp_illgrp_set_mtu() is only
+ * called if the result differs from ig_mtu.
+ */
+void
+ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
+{
+	ill_t *ill;
+	ill_t *ipmp_ill = illg->ig_ipmp_ill;
+	uint_t mtu = 0;
+
+	ASSERT(IAM_WRITER_ILL(ipmp_ill));
+
+	/*
+	 * Since ill_max_mtu can only change under ill_lock, we hold ill_lock
+	 * for each ill as we iterate through the list. Any changes to the
+	 * ill_max_mtu will also trigger an update, so even if we missed it
+	 * this time around, the update will catch it.
+	 */
+	ill = list_head(&illg->ig_if);
+	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
+		mutex_enter(&ill->ill_lock);
+		if (mtu == 0 || ill->ill_max_mtu < mtu)
+			mtu = ill->ill_max_mtu;
+		mutex_exit(&ill->ill_lock);
+	}
+
+	/*
+	 * MTU must be at least the minimum MTU.
+	 */
+	mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
+
+	if (illg->ig_mtu != mtu)
+		ipmp_illgrp_set_mtu(illg, mtu);
+}
+
+/*
+ * Link illgrp `illg' to IPMP group `grp'. To simplify the caller, silently
+ * allow the same link to be established more than once.
+ *
+ * The illgrp is stored in gr_v6 or gr_v4 according to the address family
+ * of its IPMP ill. Caller must hold ips_ipmp_lock as writer (asserted).
+ */
+void
+ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp)
+{
+	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+	if (illg->ig_ipmp_ill->ill_isv6) {
+		ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg);
+		grp->gr_v6 = illg;
+	} else {
+		ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg);
+		grp->gr_v4 = illg;
+	}
+}
+
+/*
+ * Unlink illgrp `illg' from its IPMP group. Return an errno if the illgrp
+ * cannot be unlinked (e.g., because there are still interfaces using it).
+ *
+ * EBUSY is returned, and the link left intact, if the group's count of
+ * joined plus pending ills for this address family is nonzero; otherwise
+ * the group's gr_v6 or gr_v4 pointer is cleared and 0 is returned. Caller
+ * must hold ips_ipmp_lock as writer (asserted).
+ */
+int
+ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg)
+{
+	ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp;
+	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
+
+	if (illg->ig_ipmp_ill->ill_isv6) {
+		if (grp->gr_nv6 + grp->gr_pendv6 != 0)
+			return (EBUSY);
+		grp->gr_v6 = NULL;
+	} else {
+		if (grp->gr_nv4 + grp->gr_pendv4 != 0)
+			return (EBUSY);
+		grp->gr_v4 = NULL;
+	}
+	return (0);
+}
+
+/*
+ * Place `ill' into `illg', and rebalance the data addresses on `illg'
+ * to be spread evenly across the ills now in it. Also, adjust the IPMP
+ * ill as necessary to account for `ill' (e.g., MTU).
+ *
+ * The rebalancing itself happens via the ipmp_ill_refresh_active() call at
+ * the end, once `ill' is on ig_if and its ill_grp has been set.
+ */
+void
+ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
+{
+	ill_t *ipmp_ill;
+	ipif_t *ipif;
+	ip_stack_t *ipst = ill->ill_ipst;
+
+	/* IS_UNDER_IPMP() requires ill_grp to be non-NULL */
+	ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL);
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(ill->ill_grp == NULL);
+
+	ipmp_ill = illg->ig_ipmp_ill;
+
+	/*
+	 * Account for `ill' joining the illgrp.
+	 */
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+	if (ill->ill_isv6)
+		ill->ill_phyint->phyint_grp->gr_nv6++;
+	else
+		ill->ill_phyint->phyint_grp->gr_nv4++;
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	/*
+	 * Ensure the ILLF_ROUTER flag remains consistent across the group.
+	 */
+	mutex_enter(&ill->ill_lock);
+	if (ipmp_ill->ill_flags & ILLF_ROUTER)
+		ill->ill_flags |= ILLF_ROUTER;
+	else
+		ill->ill_flags &= ~ILLF_ROUTER;
+	mutex_exit(&ill->ill_lock);
+
+	/*
+	 * Blow away all multicast memberships that currently exist on `ill'.
+	 * This may seem odd, but it's consistent with the application view
+	 * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
+	 */
+	if (ill->ill_isv6) {
+		reset_conn_ill(ill);
+		reset_mrt_ill(ill);
+	} else {
+		ipif = ill->ill_ipif;
+		for (; ipif != NULL; ipif = ipif->ipif_next) {
+			reset_conn_ipif(ipif);
+			reset_mrt_vif_ipif(ipif);
+		}
+	}
+	ip_purge_allmulti(ill);
+
+	/*
+	 * Borrow the first ill's ill_phys_addr_length value for the illgrp's
+	 * physical address length. All other ills must have the same value,
+	 * since they are required to all be the same mactype. Also update
+	 * the IPMP ill's MTU and CoS marking, if necessary.
+	 */
+	if (list_is_empty(&illg->ig_if)) {
+		ASSERT(ipmp_ill->ill_phys_addr_length == 0);
+		/*
+		 * NOTE: we leave ill_phys_addr NULL since the IPMP group
+		 * doesn't have a physical address. This means that code must
+		 * not assume that ill_phys_addr is non-NULL just because
+		 * ill_phys_addr_length is non-zero. Likewise for ill_nd_lla.
+		 */
+		ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
+		ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
+		ipmp_ill->ill_type = ill->ill_type;
+
+		if (ill->ill_flags & ILLF_COS_ENABLED) {
+			mutex_enter(&ipmp_ill->ill_lock);
+			ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
+			mutex_exit(&ipmp_ill->ill_lock);
+		}
+		ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu);
+	} else {
+		ASSERT(ipmp_ill->ill_phys_addr_length ==
+		    ill->ill_phys_addr_length);
+		ASSERT(ipmp_ill->ill_type == ill->ill_type);
+
+		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
+			mutex_enter(&ipmp_ill->ill_lock);
+			ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
+			mutex_exit(&ipmp_ill->ill_lock);
+		}
+		if (illg->ig_mtu > ill->ill_max_mtu)
+			ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu);
+	}
+
+	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
+	list_insert_tail(&illg->ig_if, ill);
+	ill->ill_grp = illg;
+	rw_exit(&ipst->ips_ill_g_lock);
+
+	/*
+	 * Hide the IREs on `ill' so that we don't accidentally find them when
+	 * sending data traffic.
+	 */
+	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);
+
+	/*
+	 * Merge any broadcast IREs, if need be.
+	 */
+	if (!ill->ill_isv6)
+		ill_refresh_bcast(ill);
+
+	ipmp_ill_refresh_active(ill);
+}
+
+/*
+ * Remove `ill' from its illgrp, and rebalance the data addresses in that
+ * illgrp to be spread evenly across the remaining ills. Also, adjust the
+ * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
+ */
+void
+ipmp_ill_leave_illgrp(ill_t *ill)
+{
+	ill_t *ipmp_ill;
+	ipif_t *ipif;
+	ipmp_arpent_t *entp;
+	ipmp_illgrp_t *illg = ill->ill_grp;
+	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+	ASSERT(IS_UNDER_IPMP(ill));
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(illg != NULL);
+
+	ipmp_ill = illg->ig_ipmp_ill;
+
+	/*
+	 * Cancel IPMP-specific ill timeouts.
+	 */
+	(void) untimeout(ill->ill_refresh_tid);
+
+	/*
+	 * Expose any previously-hidden IREs on `ill'.
+	 */
+	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);
+
+	/*
+	 * Ensure the multicast state for each ipif on `ill' is down so that
+	 * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
+	 * all eligible groups.
+	 */
+	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
+		if (ipif->ipif_flags & IPIF_UP)
+			ipif_multicast_down(ipif);
+
+	/*
+	 * Account for `ill' leaving the illgrp.
+	 */
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+	if (ill->ill_isv6)
+		ill->ill_phyint->phyint_grp->gr_nv6--;
+	else
+		ill->ill_phyint->phyint_grp->gr_nv4--;
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	/*
+	 * Pull `ill' out of the interface lists.
+	 */
+	if (list_link_active(&ill->ill_actnode))
+		ipmp_ill_deactivate(ill);
+	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
+	list_remove(&illg->ig_if, ill);
+	ill->ill_grp = NULL;
+	rw_exit(&ipst->ips_ill_g_lock);
+
+	/*
+	 * Recreate any broadcast IREs that had been shared, if need be.
+	 */
+	if (!ill->ill_isv6)
+		ill_refresh_bcast(ill);
+
+	/*
+	 * Re-establish multicast memberships that were previously being
+	 * handled by the IPMP meta-interface.
+	 */
+	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
+		if (ipif->ipif_flags & IPIF_UP)
+			ipif_multicast_up(ipif);
+
+	/*
+	 * Refresh the group MTU based on the new interface list.
+	 */
+	ipmp_illgrp_refresh_mtu(illg);
+
+	if (list_is_empty(&illg->ig_if)) {
+		/*
+		 * No ills left in the illgrp; we no longer have a physical
+		 * address length, nor can we support ARP, CoS, or anything
+		 * else that depends on knowing the link layer type.
+		 */
+		while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
+			ipmp_illgrp_destroy_arpent(illg, entp);
+
+		ipmp_ill->ill_phys_addr_length = 0;
+		ipmp_ill->ill_nd_lla_len = 0;
+		ipmp_ill->ill_type = IFT_OTHER;
+		mutex_enter(&ipmp_ill->ill_lock);
+		ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
+		mutex_exit(&ipmp_ill->ill_lock);
+	} else {
+		/*
+		 * If `ill' didn't support CoS, see if it can now be enabled.
+		 * Note that `ill' is reused as the list cursor below, so it
+		 * no longer names the departing ill once the walk starts; it
+		 * ends up NULL only if every remaining ill has CoS enabled.
+		 */
+		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
+			ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));
+
+			ill = list_head(&illg->ig_if);
+			do {
+				if (!(ill->ill_flags & ILLF_COS_ENABLED))
+					break;
+			} while ((ill = list_next(&illg->ig_if, ill)) != NULL);
+
+			if (ill == NULL) {
+				mutex_enter(&ipmp_ill->ill_lock);
+				ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
+				mutex_exit(&ipmp_ill->ill_lock);
+			}
+		}
+	}
+}
+
+/*
+ * Check if `ill' should be active, and activate or deactivate if need be.
+ * Return B_FALSE if a refresh was necessary but could not be performed.
+ *
+ * Only a failed ipmp_ill_activate() can produce B_FALSE; deactivation has
+ * no failure return, and B_TRUE is also returned when no change is needed.
+ */
+static boolean_t
+ipmp_ill_try_refresh_active(ill_t *ill)
+{
+	boolean_t refreshed = B_TRUE;
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(IS_UNDER_IPMP(ill));
+
+	if (ipmp_ill_is_active(ill)) {
+		if (!list_link_active(&ill->ill_actnode))
+			refreshed = ipmp_ill_activate(ill);
+	} else {
+		if (list_link_active(&ill->ill_actnode))
+			ipmp_ill_deactivate(ill);
+	}
+
+	return (refreshed);
+}
+
+/*
+ * Check if `ill' should be active, and activate or deactivate if need be.
+ * If the refresh fails, schedule a timer to try again later.
+ */
+void
+ipmp_ill_refresh_active(ill_t *ill)
+{
+	if (!ipmp_ill_try_refresh_active(ill))
+		ipmp_ill_refresh_active_timer_start(ill);
+}
+
+/*
+ * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'.
+ *
+ * This is the callback handed to timeout() by
+ * ipmp_ill_refresh_active_timer_start(). If the IPSQ cannot be entered,
+ * `refreshed' stays B_FALSE and another attempt is scheduled, exactly as if
+ * the refresh itself had failed.
+ */
+static void
+ipmp_ill_refresh_active_timer(void *ill_arg)
+{
+	ill_t *ill = ill_arg;
+	boolean_t refreshed = B_FALSE;
+
+	/*
+	 * Clear ill_refresh_tid to indicate that no timeout is pending
+	 * (another thread could schedule a new timeout while we're still
+	 * running, but that's harmless). If the ill is going away, bail.
+	 */
+	mutex_enter(&ill->ill_lock);
+	ill->ill_refresh_tid = 0;
+	if (ill->ill_state_flags & ILL_CONDEMNED) {
+		mutex_exit(&ill->ill_lock);
+		return;
+	}
+	mutex_exit(&ill->ill_lock);
+
+	if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
+		refreshed = ipmp_ill_try_refresh_active(ill);
+		ipsq_exit(ill->ill_phyint->phyint_ipsq);
+	}
+
+	/*
+	 * If the refresh failed, schedule another attempt.
+	 */
+	if (!refreshed)
+		ipmp_ill_refresh_active_timer_start(ill);
+}
+
+/*
+ * Schedule ipmp_ill_refresh_active_timer() to retry
+ * ipmp_ill_try_refresh_active() on `ill' once IPMP_ILL_REFRESH_TIMEOUT
+ * seconds have passed. Nothing is scheduled if a retry is already pending
+ * (ill_refresh_tid is nonzero) or if `ill' is condemned. ill_lock is held
+ * across both the check and the update of ill_refresh_tid.
+ */
+static void
+ipmp_ill_refresh_active_timer_start(ill_t *ill)
+{
+	mutex_enter(&ill->ill_lock);
+
+	/*
+	 * If the ill is going away or a refresh is already scheduled, bail.
+	 */
+	if (ill->ill_refresh_tid != 0 ||
+	    (ill->ill_state_flags & ILL_CONDEMNED)) {
+		mutex_exit(&ill->ill_lock);
+		return;
+	}
+
+	ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill,
+	    SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));
+
+	mutex_exit(&ill->ill_lock);
+}
+
+/*
+ * Activate `ill' so it will be used to send and receive data traffic. Return
+ * B_FALSE if `ill' cannot be activated. Note that we allocate any messages
+ * needed to deactivate `ill' here as well so that deactivation cannot fail.
+ *
+ * All allocations are done up front, before any state is modified, so the
+ * failure path only has to free whatever messages were obtained.
+ */
+static boolean_t
+ipmp_ill_activate(ill_t *ill)
+{
+	ipif_t *ipif;
+	mblk_t *actmp = NULL, *deactmp = NULL;
+	mblk_t *linkupmp = NULL, *linkdownmp = NULL;
+	ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
+	const char *grifname = grp->gr_ifname;
+	ipmp_illgrp_t *illg = ill->ill_grp;
+	ill_t *maxill;
+	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(IS_UNDER_IPMP(ill));
+
+	/*
+	 * If this will be the first active interface in the group, allocate
+	 * the link-up and link-down messages.
+	 */
+	if (grp->gr_nactif == 0) {
+		linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
+		linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
+		if (linkupmp == NULL || linkdownmp == NULL)
+			goto fail;
+	}
+
+	/*
+	 * For IPv4, allocate the activate/deactivate messages, and tell ARP.
+	 */
+	if (!ill->ill_isv6) {
+		actmp = ill_arie_alloc(ill, grifname, &ipmp_aract_template);
+		deactmp = ill_arie_alloc(ill, grifname, &ipmp_ardeact_template);
+		if (actmp == NULL || deactmp == NULL)
+			goto fail;
+
+		ASSERT(ill->ill_ardeact_mp == NULL);
+		ill->ill_ardeact_mp = deactmp;
+		putnext(illg->ig_ipmp_ill->ill_rq, actmp);
+	}
+
+	if (list_is_empty(&illg->ig_actif)) {
+		/*
+		 * Now that we have an active ill, nominate it for multicast
+		 * and broadcast duties. Do this before ipmp_ill_bind_ipif()
+		 * since that may need to send multicast packets (e.g., IPv6
+		 * neighbor discovery probes).
+		 */
+		ipmp_illgrp_set_cast(illg, ill);
+
+		/*
+		 * This is the first active ill in the illgrp -- add 'em all.
+		 * We can access/walk ig_ipmp_ill's ipif list since we're
+		 * writer on its IPSQ as well.
+		 */
+		ipif = illg->ig_ipmp_ill->ill_ipif;
+		for (; ipif != NULL; ipif = ipif->ipif_next)
+			if (ipmp_ipif_is_up_dataaddr(ipif))
+				ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
+	} else {
+		/*
+		 * Redistribute the addresses by moving them from the ill with
+		 * the most addresses until the ill being activated is at the
+		 * same level as the rest of the ills.
+		 */
+		for (;;) {
+			maxill = ipmp_illgrp_max_ill(illg);
+			ASSERT(maxill != NULL);
+			if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
+				break;
+			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
+			ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
+		}
+
+		/*
+		 * TODO: explore whether it's advantageous to flush IRE_CACHE
+		 * bindings to force existing connections to be redistributed
+		 * to the new ill.
+		 */
+	}
+
+	/*
+	 * Put the interface in the active list.
+	 */
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+	list_insert_tail(&illg->ig_actif, ill);
+	illg->ig_nactif++;
+	illg->ig_next_ill = ill;
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	/*
+	 * Refresh ARP entries to use `ill', if need be.
+	 */
+	if (!ill->ill_isv6)
+		ipmp_illgrp_refresh_arpent(illg);
+
+	/*
+	 * Finally, mark the group link up, if necessary.
+	 */
+	if (grp->gr_nactif++ == 0) {
+		ASSERT(grp->gr_linkdownmp == NULL);
+		grp->gr_linkdownmp = linkdownmp;
+		put(illg->ig_ipmp_ill->ill_rq, linkupmp);
+	}
+	return (B_TRUE);
+fail:
+	freemsg(actmp);
+	freemsg(deactmp);
+	freemsg(linkupmp);
+	freemsg(linkdownmp);
+	return (B_FALSE);
+}
+
+/*
+ * Deactivate `ill' so it will not be used to send or receive data traffic.
+ *
+ * This uses the messages that ipmp_ill_activate() stashed in
+ * ill_ardeact_mp and gr_linkdownmp, so nothing is allocated here.
+ */
+static void
+ipmp_ill_deactivate(ill_t *ill)
+{
+	ill_t *minill;
+	ipif_t *ipif, *ubnextipif, *ubheadipif = NULL;
+	mblk_t *mp;
+	ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
+	ipmp_illgrp_t *illg = ill->ill_grp;
+	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(IS_UNDER_IPMP(ill));
+
+	/*
+	 * Delete IRE_CACHE entries tied to this ill before they become stale.
+	 */
+	ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
+	    ill_stq_cache_delete, ill, ill);
+
+	/*
+	 * Pull the interface out of the active list.
+	 */
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+	list_remove(&illg->ig_actif, ill);
+	illg->ig_nactif--;
+	illg->ig_next_ill = list_head(&illg->ig_actif);
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	/*
+	 * If the ill that's being deactivated had been nominated for
+	 * multicast/broadcast, nominate a new one.
+	 */
+	if (ill == illg->ig_cast_ill)
+		ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));
+
+	/*
+	 * Unbind all of the ipifs bound to this ill, and save 'em in a list;
+	 * we'll rebind them after we tell the resolver the ill is no longer
+	 * active. We must do things in this order or the resolver could
+	 * accidentally rebind to the ill we're trying to remove if multiple
+	 * ills in the group have the same hardware address (which is
+	 * unsupported, but shouldn't lead to a wedged machine).
+	 */
+	while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
+		ipif->ipif_bound_next = ubheadipif;
+		ubheadipif = ipif;
+	}
+
+	if (!ill->ill_isv6) {
+		/*
+		 * Tell ARP `ill' is no longer active in the group.
+		 */
+		mp = ill->ill_ardeact_mp;
+		ill->ill_ardeact_mp = NULL;
+		ASSERT(mp != NULL);
+		putnext(illg->ig_ipmp_ill->ill_rq, mp);
+
+		/*
+		 * Refresh any ARP entries that had been using `ill'.
+		 */
+		ipmp_illgrp_refresh_arpent(illg);
+	}
+
+	/*
+	 * Rebind each ipif from the deactivated ill to the active ill with
+	 * the fewest ipifs. If there are no active ills, the ipifs will
+	 * remain unbound.
+	 */
+	for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
+		ubnextipif = ipif->ipif_bound_next;
+		ipif->ipif_bound_next = NULL;
+
+		if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
+			ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
+	}
+
+	/*
+	 * Finally, mark the group link down, if necessary.
+	 */
+	if (--grp->gr_nactif == 0) {
+		mp = grp->gr_linkdownmp;
+		grp->gr_linkdownmp = NULL;
+		ASSERT(mp != NULL);
+		put(illg->ig_ipmp_ill->ill_rq, mp);
+	}
+}
+
+/*
+ * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD)
+ * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners.
+ *
+ * For RTM_ADD the interface message is sent before the per-address
+ * messages; for RTM_DELETE it is sent after them.
+ */
+static void
+ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
+{
+	ipif_t *ipif;
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);
+
+	/*
+	 * If `ill' is truly down, there are no messages to generate since:
+	 *
+	 * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface
+	 *    and its addresses by bringing them down. But that's already
+	 *    true, so there's nothing to hide.
+	 *
+	 * 2. If cmd == RTM_ADD, then we're supposed to generate messages
+	 *    indicating that any previously-hidden up addresses are again
+	 *    back up (along with the interface). But they aren't, so
+	 *    there's nothing to expose.
+	 */
+	if (ill->ill_ipif_up_count == 0)
+		return;
+
+	if (cmd == RTM_ADD)
+		ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);
+
+	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
+		if (ipif->ipif_flags & IPIF_UP)
+			ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL);
+
+	if (cmd == RTM_DELETE)
+		ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
+}
+
+/*
+ * Bind the address named by `ipif' to the underlying ill named by `ill'.
+ * If `act' is Res_act_none, don't notify the resolver. Otherwise, `act'
+ * will indicate to the resolver whether this is an initial bringup of
+ * `ipif', or just a rebind to another ill.
+ *
+ * `ipif' is pushed onto the head of ill's bound list and ipif_bound_ill is
+ * set under ips_ipmp_lock. On return, ipif_bound records whether the
+ * resolver notification succeeded; it is always set when `act' is
+ * Res_act_none since no notification is attempted.
+ */
+static void
+ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
+{
+	int err = 0;
+	ip_stack_t *ipst = ill->ill_ipst;
+
+	ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
+	ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
+	ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
+	ASSERT(ipif->ipif_bound_ill == NULL);
+	ASSERT(ipif->ipif_bound_next == NULL);
+
+	ipif->ipif_bound_next = ill->ill_bound_ipif;
+	ill->ill_bound_ipif = ipif;
+	ill->ill_bound_cnt++;
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+	ipif->ipif_bound_ill = ill;
+	rw_exit(&ipst->ips_ipmp_lock);
+
+	/*
+	 * If necessary, tell ARP/NDP about the new mapping. Note that
+	 * ipif_resolver_up() cannot fail for non-XRESOLV IPv6 ills.
+	 */
+	if (act != Res_act_none) {
+		if (ill->ill_isv6) {
+			VERIFY(ipif_resolver_up(ipif, act) == 0);
+			err = ipif_ndp_up(ipif, act == Res_act_initial);
+		} else {
+			err = ipif_resolver_up(ipif, act);
+		}
+
+		/*
+		 * Since ipif_ndp_up() never returns EINPROGRESS and
+		 * ipif_resolver_up() only returns EINPROGRESS when the
+		 * associated ill is not up, we should never be here with
+		 * EINPROGRESS. We rely on this to simplify the design.
+		 */
+		ASSERT(err != EINPROGRESS);
+	}
+	/* TODO: retry binding on failure? when? */
+	ipif->ipif_bound = (err == 0);
+}
+
+/*
+ * Unbind the address named by `ipif' from the underlying ill named by `ill'.
+ * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned.
+ * If no ipifs are bound to `ill', NULL is returned. If `notifyres' is
+ * B_TRUE, notify the resolver about the change.
+ *
+ * When `ipif' is NULL the ipif at the head of ill's bound list is chosen.
+ * On return the unbound ipif has ipif_bound_ill and ipif_bound_next cleared
+ * and ipif_bound set to B_FALSE.
+ */
+static ipif_t *
+ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
+{
+	ill_t *ipmp_ill;
+	ipif_t *previpif;
+	ip_stack_t *ipst = ill->ill_ipst;
+
+	ASSERT(IAM_WRITER_ILL(ill));
+	ASSERT(IS_UNDER_IPMP(ill));
+
+	ipmp_ill = ill->ill_grp->ig_ipmp_ill;
+
+	/*
+	 * If necessary, find an ipif to unbind.
+	 */
+	if (ipif == NULL) {
+		if ((ipif = ill->ill_bound_ipif) == NULL) {
+			ASSERT(ill->ill_bound_cnt == 0);
+			return (NULL);
+		}
+	}
+
+	ASSERT(IAM_WRITER_IPIF(ipif));
+	ASSERT(IS_IPMP(ipif->ipif_ill));
+	ASSERT(ipif->ipif_bound_ill == ill);
+	ASSERT(ill->ill_bound_cnt > 0);
+
+	/*
+	 * Unbind it.
+	 */
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+	ipif->ipif_bound_ill = NULL;
+	rw_exit(&ipst->ips_ipmp_lock);
+	ill->ill_bound_cnt--;
+
+	if (ill->ill_bound_ipif == ipif) {
+		ill->ill_bound_ipif = ipif->ipif_bound_next;
+	} else {
+		previpif = ill->ill_bound_ipif;
+		while (previpif->ipif_bound_next != ipif)
+			previpif = previpif->ipif_bound_next;
+
+		previpif->ipif_bound_next = ipif->ipif_bound_next;
+	}
+	ipif->ipif_bound_next = NULL;
+
+	/*
+	 * If requested, notify the resolvers (provided we're bound).
+	 */
+	if (notifyres && ipif->ipif_bound) {
+		if (ill->ill_isv6) {
+			ipif_ndp_down(ipif);
+		} else {
+			ASSERT(ipif->ipif_arp_del_mp != NULL);
+			putnext(ipmp_ill->ill_rq, ipif->ipif_arp_del_mp);
+			ipif->ipif_arp_del_mp = NULL;
+		}
+	}
+	ipif->ipif_bound = B_FALSE;
+
+	return (ipif);
+}
+
+/*
+ * Check if `ill' is active. Caller must hold ill_lock and phyint_lock if
+ * it's not inside the IPSQ. Since ipmp_ill_try_refresh_active() calls this
+ * to determine whether an ill should be considered active, other consumers
+ * may race and learn about an ill that should be deactivated/activated before
+ * IPMP has performed the activation/deactivation. This should be safe though
+ * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that
+ * would've been cleaned up by ipmp_ill_deactivate().
+ *
+ * An ill counts as active when it has at least one ipif up and none of
+ * PHYI_OFFLINE, PHYI_INACTIVE or PHYI_FAILED is set on its phyint.
+ */
+boolean_t
+ipmp_ill_is_active(ill_t *ill)
+{
+	phyint_t *phyi = ill->ill_phyint;
+
+	ASSERT(IS_UNDER_IPMP(ill));
+	ASSERT(IAM_WRITER_ILL(ill) ||
+	    (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));
+
+	/*
+	 * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to
+	 * set PHYI_FAILED whenever PHYI_RUNNING is cleared. This allows the
+	 * link flapping logic to be just in in.mpathd and allows us to ignore
+	 * changes to PHYI_RUNNING.
+	 */
+	return (!(ill->ill_ipif_up_count == 0 ||
+	    (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED))));
+}
+
+/*
+ * IRE walker callback: set IRE_MARK_TESTHIDDEN on cache/interface/offsubnet
+ * IREs with a source address on `ill_arg'.
+ */ +static void +ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg) +{ + ill_t *ill = (ill_t *)ill_arg; + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(!IS_IPMP(ill)); + + if (ire->ire_ipif->ipif_ill != ill) + return; + + switch (ire->ire_type) { + case IRE_HOST: + case IRE_PREFIX: + case IRE_DEFAULT: + case IRE_CACHE: + case IRE_IF_RESOLVER: + case IRE_IF_NORESOLVER: + DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire); + ire->ire_marks |= IRE_MARK_TESTHIDDEN; + break; + default: + break; + } +} + +/* + * IRE walker callback: clear IRE_MARK_TESTHIDDEN if the IRE has a source + * address on `ill_arg'. + */ +static void +ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg) +{ + ill_t *ill = (ill_t *)ill_arg; + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(!IS_IPMP(ill)); + + if (ire->ire_ipif->ipif_ill == ill) { + DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire); + ire->ire_marks &= ~IRE_MARK_TESTHIDDEN; + } +} + +/* + * Return a held pointer to the IPMP ill for underlying interface `ill', or + * NULL if one doesn't exist. (Unfortunately, this function needs to take an + * underlying ill rather than an ipmp_illgrp_t because an underlying ill's + * ill_grp pointer may become stale when not under an IPSQ and not holding + * ipmp_lock.) Caller need not be inside the IPSQ. + */ +ill_t * +ipmp_ill_hold_ipmp_ill(ill_t *ill) +{ + ip_stack_t *ipst = ill->ill_ipst; + ipmp_illgrp_t *illg; + + ASSERT(!IS_IPMP(ill)); + + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + illg = ill->ill_grp; + if (illg != NULL && ILL_CAN_LOOKUP(illg->ig_ipmp_ill)) { + ill_refhold(illg->ig_ipmp_ill); + rw_exit(&ipst->ips_ipmp_lock); + return (illg->ig_ipmp_ill); + } + /* + * Assume `ill' was removed from the illgrp in the meantime. + */ + rw_exit(&ill->ill_ipst->ips_ipmp_lock); + return (NULL); +} + +/* + * Return the interface index for the IPMP ill tied to underlying interface + * `ill', or zero if one doesn't exist. Caller need not be inside the IPSQ. 
+ */
+uint_t
+ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
+{
+	/* Stays zero when the phyint is not in any group. */
+	uint_t ifindex = 0;
+	ip_stack_t *ipst = ill->ill_ipst;
+	ipmp_grp_t *grp;
+
+	ASSERT(!IS_IPMP(ill));
+
+	/*
+	 * phyint_grp is written under ips_ipmp_lock by
+	 * ipmp_phyint_join_grp() and ipmp_phyint_leave_grp(), so read it
+	 * (and follow it to the group's phyint) with the lock held.
+	 */
+	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+	if ((grp = ill->ill_phyint->phyint_grp) != NULL)
+		ifindex = grp->gr_phyint->phyint_ifindex;
+	rw_exit(&ipst->ips_ipmp_lock);
+	return (ifindex);
+}
+
+/*
+ * Place phyint `phyi' into IPMP group `grp'.
+ *
+ * The caller must be the writer on `phyi's IPSQ, and `phyi' must have at
+ * least one of its IPv4/IPv6 ills plumbed (both are asserted).
+ */
+void
+ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
+{
+	ill_t *ill;
+	ipsq_t *ipsq = phyi->phyint_ipsq;
+	ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq;
+	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
+
+	ASSERT(IAM_WRITER_IPSQ(ipsq));
+	ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL);
+
+	/*
+	 * Send routing socket messages indicating that the phyint's ills
+	 * and ipifs vanished.
+	 *
+	 * As a side effect, `ill' ends up pointing at the IPv6 ill if there
+	 * is one and at the IPv4 ill otherwise; it is used below for the
+	 * mactype check.
+	 * NOTE(review): `ill' is only guaranteed to be assigned because of
+	 * the ASSERT above -- confirm callers uphold that on non-DEBUG
+	 * kernels.
+	 */
+	if (phyi->phyint_illv4 != NULL) {
+		ill = phyi->phyint_illv4;
+		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
+	}
+
+	if (phyi->phyint_illv6 != NULL) {
+		ill = phyi->phyint_illv6;
+		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
+	}
+
+	/*
+	 * Snapshot the phyint's initial kstats as a baseline.
+	 */
+	ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+
+	/*
+	 * The first interface to join defines the group's mactype; every
+	 * later joiner is asserted to have the same one.
+	 */
+	phyi->phyint_grp = grp;
+	if (++grp->gr_nif == 1)
+		grp->gr_mactype = ill->ill_mactype;
+	else
+		ASSERT(grp->gr_mactype == ill->ill_mactype);
+
+	/*
+	 * Now that we're in the group, request a switch to the group's xop
+	 * when we ipsq_exit(). All future operations will be exclusive on
+	 * the group xop until ipmp_phyint_leave_grp() is called.
+	 */
+	ASSERT(ipsq->ipsq_swxop == NULL);
+	ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
+	ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;
+
+	rw_exit(&ipst->ips_ipmp_lock);
+}
+
+/*
+ * Remove phyint `phyi' from its current IPMP group.
+ */
+void
+ipmp_phyint_leave_grp(phyint_t *phyi)
+{
+	uint_t i;
+	ipsq_t *ipsq = phyi->phyint_ipsq;
+	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
+	uint64_t phyi_kstats[IPMP_KSTAT_MAX];
+
+	ASSERT(IAM_WRITER_IPSQ(ipsq));
+
+	/*
+	 * If any of the phyint's ills are still in an illgrp, kick 'em out.
+	 */
+	if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
+		ipmp_ill_leave_illgrp(phyi->phyint_illv4);
+	if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
+		ipmp_ill_leave_illgrp(phyi->phyint_illv6);
+
+	/*
+	 * Send routing socket messages indicating that the phyint's ills
+	 * and ipifs have reappeared.
+	 */
+	if (phyi->phyint_illv4 != NULL)
+		ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD);
+	if (phyi->phyint_illv6 != NULL)
+		ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD);
+
+	/*
+	 * Calculate the phyint's cumulative kstats while it was in the group,
+	 * and add that to the group's baseline.
+	 *
+	 * phyint_kstats0 is the snapshot taken by ipmp_phyint_join_grp(), so
+	 * the subtraction leaves the delta accumulated since joining. This
+	 * must happen before phyint_grp is cleared below.
+	 */
+	ipmp_phyint_get_kstats(phyi, phyi_kstats);
+	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
+		phyi_kstats[i] -= phyi->phyint_kstats0[i];
+		atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]);
+	}
+
+	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
+
+	phyi->phyint_grp->gr_nif--;
+	phyi->phyint_grp = NULL;
+
+	/*
+	 * As our final act in leaving the group, request a switch back to our
+	 * IPSQ's own xop when we ipsq_exit().
+	 */
+	ASSERT(ipsq->ipsq_swxop == NULL);
+	ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;
+
+	rw_exit(&ipst->ips_ipmp_lock);
+}
+
+/*
+ * Store the IPMP-related kstats for `phyi' into the array named by `kstats'.
+ * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements.
+ *
+ * The values come from the "link" kstat whose name is `phyi's phyint_name.
+ * Entry `i' receives the value of the named statistic ipmp_kstats[i].name.
+ * If the kstat cannot be found, is not of type KSTAT_TYPE_NAMED, or has no
+ * data, all entries are left at zero; so is any single entry whose name is
+ * missing or whose data type is not handled below.
+ */
+static void
+ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
+{
+	uint_t i, j;
+	const char *name;
+	kstat_t *ksp;
+	kstat_named_t *kn;
+
+	/* Start from all zeroes so every early exit yields a defined result. */
+	bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
+
+	/*
+	 * NOTE: ALL_ZONES here assumes that there's at most one link
+	 * with a given name on a given system (safe for now).
+	 */
+	ksp = kstat_hold_byname("link", 0, phyi->phyint_name, ALL_ZONES);
+	if (ksp == NULL)
+		return;
+
+	/* Paired with KSTAT_EXIT()/kstat_rele() at the bottom. */
+	KSTAT_ENTER(ksp);
+
+	if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
+		/*
+		 * Bring kstats up-to-date before recording.
+		 */
+		(void) KSTAT_UPDATE(ksp, KSTAT_READ);
+
+		kn = KSTAT_NAMED_PTR(ksp);
+		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
+			name = ipmp_kstats[i].name;
+			/* Already zero from the bzero() above. */
+			kstats[i] = 0;
+			for (j = 0; j < ksp->ks_ndata; j++) {
+				if (strcmp(kn[j].name, name) != 0)
+					continue;
+
+				/*
+				 * Widen the value to 64 bits according to
+				 * its declared type; there is no default
+				 * case, so other types leave kstats[i] at 0.
+				 */
+				switch (kn[j].data_type) {
+				case KSTAT_DATA_INT32:
+				case KSTAT_DATA_UINT32:
+					kstats[i] = kn[j].value.ui32;
+					break;
+#ifdef _LP64
+				case KSTAT_DATA_LONG:
+				case KSTAT_DATA_ULONG:
+					kstats[i] = kn[j].value.ul;
+					break;
+#endif
+				case KSTAT_DATA_INT64:
+				case KSTAT_DATA_UINT64:
+					kstats[i] = kn[j].value.ui64;
+					break;
+				}
+				/* First name match wins; stop scanning. */
+				break;
+			}
+		}
+	}
+
+	KSTAT_EXIT(ksp);
+	kstat_rele(ksp);
+}
+
+/*
+ * Refresh the active state of all ills on `phyi'.
+ * Each of the IPv4 and IPv6 ills is refreshed only if it exists.
+ */
+void
+ipmp_phyint_refresh_active(phyint_t *phyi)
+{
+	if (phyi->phyint_illv4 != NULL)
+		ipmp_ill_refresh_active(phyi->phyint_illv4);
+	if (phyi->phyint_illv6 != NULL)
+		ipmp_ill_refresh_active(phyi->phyint_illv6);
+}
+
+/*
+ * Return a held pointer to the underlying ill bound to `ipif', or NULL if one
+ * doesn't exist. Caller need not be inside the IPSQ.
+ *
+ * NULL is also returned when a bound ill exists but fails ILL_CAN_LOOKUP().
+ * The reference on the returned ill is taken with ill_refhold().
+ */
+ill_t *
+ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
+{
+	ill_t *boundill;
+	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+
+	ASSERT(IS_IPMP(ipif->ipif_ill));
+
+	/*
+	 * ipif_bound_ill is cleared under ips_ipmp_lock (as writer) by
+	 * ipmp_ill_unbind_ipif(), so sample it and take the hold with the
+	 * lock held as reader.
+	 */
+	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+	boundill = ipif->ipif_bound_ill;
+	if (boundill != NULL && ILL_CAN_LOOKUP(boundill)) {
+		ill_refhold(boundill);
+		rw_exit(&ipst->ips_ipmp_lock);
+		return (boundill);
+	}
+	rw_exit(&ipst->ips_ipmp_lock);
+	return (NULL);
+}
+
+/*
+ * Return a pointer to the underlying ill bound to `ipif', or NULL if one
+ * doesn't exist. Caller must be inside the IPSQ.
+ */ +ill_t * +ipmp_ipif_bound_ill(const ipif_t *ipif) +{ + ASSERT(IAM_WRITER_ILL(ipif->ipif_ill)); + ASSERT(IS_IPMP(ipif->ipif_ill)); + + return (ipif->ipif_bound_ill); +} + +/* + * Check if `ipif' is a "stub" (placeholder address not being used). + */ +boolean_t +ipmp_ipif_is_stubaddr(const ipif_t *ipif) +{ + if (ipif->ipif_flags & IPIF_UP) + return (B_FALSE); + if (ipif->ipif_ill->ill_isv6) + return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)); + else + return (ipif->ipif_lcl_addr == INADDR_ANY); +} + +/* + * Check if `ipif' is an IPMP data address. + */ +boolean_t +ipmp_ipif_is_dataaddr(const ipif_t *ipif) +{ + if (ipif->ipif_flags & IPIF_NOFAILOVER) + return (B_FALSE); + if (ipif->ipif_ill->ill_isv6) + return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)); + else + return (ipif->ipif_lcl_addr != INADDR_ANY); +} + +/* + * Check if `ipif' is an IPIF_UP IPMP data address. + */ +static boolean_t +ipmp_ipif_is_up_dataaddr(const ipif_t *ipif) +{ + return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP)); +} diff --git a/usr/src/uts/common/inet/ip/rts.c b/usr/src/uts/common/inet/ip/rts.c index 4999f28d1e..2751b19993 100644 --- a/usr/src/uts/common/inet/ip/rts.c +++ b/usr/src/uts/common/inet/ip/rts.c @@ -561,7 +561,6 @@ rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) case SO_TYPE: *i1 = SOCK_RAW; break; - /* * The following three items are available here, * but are only meaningful to IP. 
@@ -597,6 +596,15 @@ rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) return (-1); } break; + case SOL_ROUTE: + switch (name) { + case RT_AWARE: + mutex_enter(&connp->conn_lock); + *i1 = connp->conn_rtaware; + mutex_exit(&connp->conn_lock); + break; + } + break; default: return (-1); } @@ -701,6 +709,20 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, return (EINVAL); } break; + case SOL_ROUTE: + switch (name) { + case RT_AWARE: + if (!checkonly) { + mutex_enter(&connp->conn_lock); + connp->conn_rtaware = *i1; + mutex_exit(&connp->conn_lock); + } + break; /* goto sizeof (int) option return */ + default: + *outlenp = 0; + return (EINVAL); + } + break; default: *outlenp = 0; return (EINVAL); diff --git a/usr/src/uts/common/inet/ip/rts_opt_data.c b/usr/src/uts/common/inet/ip/rts_opt_data.c index bac0eabdc4..7397b53b9e 100644 --- a/usr/src/uts/common/inet/ip/rts_opt_data.c +++ b/usr/src/uts/common/inet/ip/rts_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -60,6 +60,7 @@ opdes_t rts_opt_arr[] = { { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, { SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ RT_AWARE, SOL_ROUTE, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, }; /* diff --git a/usr/src/uts/common/inet/ip/spd.c b/usr/src/uts/common/inet/ip/spd.c index f785d8a3f6..8a3aa86d60 100644 --- a/usr/src/uts/common/inet/ip/spd.c +++ b/usr/src/uts/common/inet/ip/spd.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -3989,7 +3989,7 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h) ipsec_out_t *io; boolean_t v4; mblk_t *mp; - boolean_t secure, attach_if; + boolean_t secure; uint_t ifindex; ipsec_selector_t sel; ipsec_action_t *reflect_action = NULL; @@ -4012,7 +4012,6 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h) } else if (!ii->ipsec_in_loopback) reflect_action = ipsec_in_to_out_action(ii); secure = ii->ipsec_in_secure; - attach_if = ii->ipsec_in_attach_if; ifindex = ii->ipsec_in_ill_index; zoneid = ii->ipsec_in_zoneid; ASSERT(zoneid != ALL_ZONES); @@ -4057,7 +4056,6 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h) io->ipsec_out_proc_begin = B_FALSE; io->ipsec_out_secure = secure; io->ipsec_out_v4 = v4; - io->ipsec_out_attach_if = attach_if; io->ipsec_out_ill_index = ifindex; io->ipsec_out_zoneid = zoneid; io->ipsec_out_ns = ns; /* No netstack_hold */ @@ -4549,7 +4547,6 @@ ipsec_out_to_in(mblk_t *ipsec_mp) ii->ipsec_in_secure = B_TRUE; ii->ipsec_in_v4 = v4; ii->ipsec_in_icmp_loopback = icmp_loopback; - ii->ipsec_in_attach_if = B_FALSE; } /* diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h index d463c3f6ee..ad331d5706 100644 --- a/usr/src/uts/common/inet/ip6.h +++ b/usr/src/uts/common/inet/ip6.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -133,10 +133,8 @@ typedef struct ip6_info ip6i_t; #define IP6I_RAW_CHECKSUM 0x10 /* Compute checksum and stuff in ip6i_checksum_off */ #define IP6I_VERIFY_SRC 0x20 /* Verify ip6_src. Used when IPV6_PKTINFO */ -#define IP6I_ATTACH_IF 0x40 /* Bind to no failover address or BOUND_PIF. 
*/ -#define IP6I_DROP_IFDELAYED 0x80 - /* Drop the packet if delayed in ndp resolver */ -#define IP6I_ND_DELAYED 0x100 /* Packet was delayed in ndp resolver */ +#define IP6I_IPMP_PROBE 0x40 /* IPMP (in.mpathd) probe packet */ + /* 0x80 - 0x100 available */ #define IP6I_DONTFRAG 0x200 /* Don't fragment this packet */ #define IP6I_HOPLIMIT 0x400 /* hoplimit has been set by the sender */ @@ -340,7 +338,7 @@ extern void icmp_time_exceeded_v6(queue_t *, mblk_t *, uint8_t, extern void icmp_unreachable_v6(queue_t *, mblk_t *, uint8_t, boolean_t, boolean_t, zoneid_t, ip_stack_t *); extern void icmp_inbound_error_fanout_v6(queue_t *, mblk_t *, ip6_t *, - icmp6_t *, ill_t *, boolean_t, zoneid_t); + icmp6_t *, ill_t *, ill_t *, boolean_t, zoneid_t); extern boolean_t conn_wantpacket_v6(conn_t *, ill_t *, ip6_t *, int, zoneid_t); extern mblk_t *ip_add_info_v6(mblk_t *, ill_t *, const in6_addr_t *); extern in6addr_scope_t ip_addr_scope_v6(const in6_addr_t *); @@ -382,7 +380,7 @@ extern int ip_multirt_apply_membership_v6(int (*fn)(conn_t *, boolean_t, ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t, const in6_addr_t *, mblk_t *); extern void ip_newroute_ipif_v6(queue_t *, mblk_t *, ipif_t *, - in6_addr_t, int, zoneid_t); + const in6_addr_t *, const in6_addr_t *, int, zoneid_t); extern void ip_newroute_v6(queue_t *, mblk_t *, const in6_addr_t *, const in6_addr_t *, ill_t *, zoneid_t, ip_stack_t *); extern void *ip6_kstat_init(netstackid_t, ip6_stat_t *); diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h index c5982de059..094800197e 100644 --- a/usr/src/uts/common/inet/ip_if.h +++ b/usr/src/uts/common/inet/ip_if.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -80,7 +80,7 @@ extern "C" { */ #define IFF_PHYINT_FLAGS (IFF_LOOPBACK|IFF_RUNNING|IFF_PROMISC| \ IFF_ALLMULTI|IFF_INTELLIGENT|IFF_MULTI_BCAST|IFF_FAILED|IFF_STANDBY| \ - IFF_INACTIVE|IFF_OFFLINE|IFF_VIRTUAL) + IFF_INACTIVE|IFF_OFFLINE|IFF_VIRTUAL|IFF_IPMP) #define IFF_PHYINTINST_FLAGS (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP| \ IFF_MULTICAST|IFF_ROUTER|IFF_NONUD|IFF_NORTEXCH|IFF_IPV4|IFF_IPV6| \ @@ -91,11 +91,6 @@ extern "C" { IFF_DEPRECATED|IFF_ADDRCONF|IFF_ANYCAST|IFF_NOFAILOVER| \ IFF_PREFERRED|IFF_TEMPORARY|IFF_FIXEDMTU|IFF_DUPLICATE) -#define IPIF_REPL_CHECK(to_ipif, failback_cmd) \ - (((to_ipif)->ipif_replace_zero) || ((failback_cmd) && \ - !(to_ipif)->ipif_isv6 && !((to_ipif)->ipif_flags & IPIF_UP) && \ - (to_ipif)->ipif_lcl_addr == INADDR_ANY)) - #define PHYI_LOOPBACK IFF_LOOPBACK /* is a loopback net */ #define PHYI_RUNNING IFF_RUNNING /* resources allocated */ #define PHYI_PROMISC IFF_PROMISC /* receive all packets */ @@ -107,6 +102,7 @@ extern "C" { #define PHYI_INACTIVE IFF_INACTIVE /* Standby active or not ? 
*/ #define PHYI_OFFLINE IFF_OFFLINE /* NIC has been offlined */ #define PHYI_VIRTUAL IFF_VIRTUAL /* Will not send or recv pkts */ +#define PHYI_IPMP IFF_IPMP /* IPMP meta-interface */ #define ILLF_DEBUG IFF_DEBUG /* turn on debugging */ #define ILLF_NOTRAILERS IFF_NOTRAILERS /* avoid use of trailers */ @@ -137,11 +133,6 @@ extern "C" { #define IPIF_FIXEDMTU IFF_FIXEDMTU /* set with SIOCSLIFMTU */ #define IPIF_DUPLICATE IFF_DUPLICATE /* address is in use */ -/* Source selection values for ipif_select_source_v6 */ -#define RESTRICT_TO_NONE 0x0 /* No restriction in source selection */ -#define RESTRICT_TO_GROUP 0x1 /* Restrict to IPMP group */ -#define RESTRICT_TO_ILL 0x2 /* Restrict to ILL */ - #ifdef DEBUG #define ILL_MAC_PERIM_HELD(ill) ill_mac_perim_held(ill) #else @@ -151,24 +142,23 @@ extern "C" { /* for ipif_resolver_up */ enum ip_resolver_action { Res_act_initial, /* initial address establishment */ - Res_act_move, /* address move (IPMP, new DL addr) */ - Res_act_defend /* address defense */ + Res_act_rebind, /* IPMP address rebind (new hwaddr) */ + Res_act_defend, /* address defense */ + Res_act_none /* do nothing */ }; -extern ill_t *illgrp_scheduler(ill_t *); -extern mblk_t *ill_arp_alloc(ill_t *, uchar_t *, caddr_t); -extern mblk_t *ipif_area_alloc(ipif_t *); +extern mblk_t *ill_arp_alloc(ill_t *, const uchar_t *, caddr_t); +extern mblk_t *ipif_area_alloc(ipif_t *, uint_t); extern mblk_t *ipif_ared_alloc(ipif_t *); extern mblk_t *ill_ared_alloc(ill_t *, ipaddr_t); -extern void ill_dlpi_done(ill_t *, t_uscalar_t); +extern mblk_t *ill_arie_alloc(ill_t *, const char *, const void *); extern boolean_t ill_dlpi_pending(ill_t *, t_uscalar_t); +extern void ill_dlpi_done(ill_t *, t_uscalar_t); extern void ill_dlpi_send(ill_t *, mblk_t *); extern void ill_dlpi_send_deferred(ill_t *); extern void ill_capability_done(ill_t *); extern mblk_t *ill_dlur_gen(uchar_t *, uint_t, t_uscalar_t, t_scalar_t); -extern ill_t *ill_group_lookup_on_ifindex(uint_t, boolean_t, 
ip_stack_t *); -extern ill_t *ill_group_lookup_on_name(char *, boolean_t, ip_stack_t *); /* NOTE: Keep unmodified ill_lookup_on_ifindex for ipp for now */ extern ill_t *ill_lookup_on_ifindex_global_instance(uint_t, boolean_t, queue_t *, mblk_t *, ipsq_func_t, int *); @@ -180,6 +170,7 @@ extern ill_t *ill_lookup_on_name(char *, boolean_t, extern uint_t ill_get_next_ifindex(uint_t, boolean_t, ip_stack_t *); extern uint_t ill_get_ifindex_by_name(char *, ip_stack_t *); extern void ill_ipif_cache_delete(ire_t *, char *); +extern void ill_stq_cache_delete(ire_t *, char *); extern void ill_delete(ill_t *); extern void ill_delete_tail(ill_t *); extern int ill_dl_phys(ill_t *, ipif_t *, mblk_t *, queue_t *); @@ -193,9 +184,9 @@ extern void ill_frag_prune(ill_t *, uint_t); extern void ill_frag_free_pkts(ill_t *, ipfb_t *, ipf_t *, int); extern time_t ill_frag_timeout(ill_t *, time_t); extern int ill_init(queue_t *, ill_t *); -extern int ill_nominate_mcast_rcv(ill_group_t *); -extern boolean_t ill_setdefaulttoken(ill_t *); +extern void ill_refresh_bcast(ill_t *); extern void ill_restart_dad(ill_t *, boolean_t); +extern boolean_t ill_setdefaulttoken(ill_t *); extern int ill_set_phys_addr(ill_t *, mblk_t *); extern void ill_set_ndmp(ill_t *, mblk_t *, uint_t, uint_t); @@ -222,11 +213,9 @@ extern void ill_capability_reset(ill_t *, boolean_t); extern void ill_taskq_dispatch(ip_stack_t *); extern void ill_mtu_change(ire_t *, char *); -extern void ill_group_cleanup(ill_t *); -extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *); -extern boolean_t ill_is_probeonly(ill_t *); -extern boolean_t ill_hook_event_create(ill_t *, lif_if_t, nic_event_t, - nic_event_data_t, size_t); +extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *); +extern uint_t ill_appaddr_cnt(const ill_t *); +extern uint_t ill_ptpaddr_cnt(const ill_t *); extern void ip_loopback_cleanup(ip_stack_t *); extern void ipif_get_name(const ipif_t *, char *, int); @@ -239,6 +228,8 @@ extern ipif_t *ipif_lookup_addr_v6(const 
in6_addr_t *, ill_t *, zoneid_t, queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *); extern boolean_t ip_addr_exists_v6(const in6_addr_t *, zoneid_t, ip_stack_t *); +extern ipif_t *ipif_lookup_addr_exact_v6(const in6_addr_t *, ill_t *, + ip_stack_t *); extern zoneid_t ipif_lookup_addr_zoneid(ipaddr_t, ill_t *, ip_stack_t *); extern zoneid_t ipif_lookup_addr_zoneid_v6(const in6_addr_t *, ill_t *, ip_stack_t *); @@ -251,31 +242,30 @@ extern ipif_t *ipif_lookup_multicast(ip_stack_t *, zoneid_t, boolean_t); extern ipif_t *ipif_lookup_remote(ill_t *, ipaddr_t, zoneid_t); extern ipif_t *ipif_lookup_onlink_addr(ipaddr_t, zoneid_t, ip_stack_t *); extern ipif_t *ipif_lookup_seqid(ill_t *, uint_t); -extern boolean_t ipif_lookup_zoneid(ill_t *, zoneid_t, int, - ipif_t **); -extern boolean_t ipif_lookup_zoneid_group(ill_t *, zoneid_t, int, - ipif_t **); +extern boolean_t ipif_lookup_zoneid(ill_t *, zoneid_t, int, ipif_t **); extern ipif_t *ipif_select_source(ill_t *, ipaddr_t, zoneid_t); extern boolean_t ipif_usesrc_avail(ill_t *, zoneid_t); extern void ipif_refhold(ipif_t *); extern void ipif_refhold_locked(ipif_t *); -extern void ipif_refrele(ipif_t *); +extern void ipif_refrele(ipif_t *); extern void ipif_all_down_tail(ipsq_t *, queue_t *, mblk_t *, void *); +extern void ipif_resolver_down(ipif_t *); extern int ipif_resolver_up(ipif_t *, enum ip_resolver_action); extern int ipif_arp_setup_multicast(ipif_t *, mblk_t **); extern int ipif_down(ipif_t *, queue_t *, mblk_t *); extern void ipif_down_tail(ipif_t *); +extern void ipif_multicast_down(ipif_t *); extern void ipif_multicast_up(ipif_t *); extern void ipif_ndp_down(ipif_t *); -extern int ipif_ndp_up(ipif_t *); +extern int ipif_ndp_up(ipif_t *, boolean_t); extern int ipif_ndp_setup_multicast(ipif_t *, struct nce_s **); extern int ipif_up_done(ipif_t *); extern int ipif_up_done_v6(ipif_t *); extern void ipif_up_notify(ipif_t *); -extern void ipif_update_other_ipifs_v6(ipif_t *, ill_group_t *); +extern void 
ipif_update_other_ipifs_v6(ipif_t *); extern void ipif_recreate_interface_routes_v6(ipif_t *, ipif_t *); extern void ill_update_source_selection(ill_t *); -extern ipif_t *ipif_select_source_v6(ill_t *, const in6_addr_t *, uint_t, +extern ipif_t *ipif_select_source_v6(ill_t *, const in6_addr_t *, boolean_t, uint32_t, zoneid_t); extern boolean_t ipif_cant_setlinklocal(ipif_t *); extern int ipif_setlinklocal(ipif_t *); @@ -284,11 +274,8 @@ extern ipif_t *ipif_lookup_on_ifindex(uint_t, boolean_t, zoneid_t, queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *); extern ipif_t *ipif_get_next_ipif(ipif_t *curr, ill_t *ill); extern void ipif_ill_refrele_tail(ill_t *ill); -extern void ipif_arp_down(ipif_t *ipif); extern void ipif_mask_reply(ipif_t *); - -extern int illgrp_insert(ill_group_t **, ill_t *, char *, ill_group_t *, - boolean_t); +extern int ipif_up(ipif_t *, queue_t *, mblk_t *); extern void ipsq_current_start(ipsq_t *, ipif_t *, int); extern void ipsq_current_finish(ipsq_t *); @@ -451,13 +438,13 @@ extern int ip_sioctl_tmyaddr(ipif_t *, sin_t *, queue_t *, mblk_t *, extern int ip_sioctl_tunparam(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); +extern int ip_sioctl_get_binding(ipif_t *, sin_t *, queue_t *, + mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_sioctl_groupname(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_sioctl_get_groupname(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); -extern int ip_sioctl_slifoindex(ipif_t *, sin_t *, queue_t *, - mblk_t *, ip_ioctl_cmd_t *, void *); -extern int ip_sioctl_get_oindex(ipif_t *, sin_t *, queue_t *, +extern int ip_sioctl_groupinfo(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_sioctl_get_lifzone(ipif_t *, sin_t *, queue_t *, @@ -473,15 +460,11 @@ extern int ip_sioctl_slifusesrc(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_sioctl_get_lifsrcof(ipif_t *, sin_t *, queue_t *, 
mblk_t *, ip_ioctl_cmd_t *, void *); -extern int ip_sioctl_set_ipmpfailback(ipif_t *, sin_t *, queue_t *, - mblk_t *, ip_ioctl_cmd_t *, void *); extern void ip_sioctl_copyin_resume(ipsq_t *, queue_t *, mblk_t *, void *); extern void ip_sioctl_copyin_setup(queue_t *, mblk_t *); -extern void ip_sioctl_iocack(queue_t *, mblk_t *); +extern void ip_sioctl_iocack(ipsq_t *, queue_t *, mblk_t *, void *); extern ip_ioctl_cmd_t *ip_sioctl_lookup(int); -extern int ip_sioctl_move(ipif_t *, sin_t *, queue_t *, mblk_t *, - ip_ioctl_cmd_t *, void *); extern void conn_delete_ire(conn_t *, caddr_t); diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h index dae62ab499..369ba60005 100644 --- a/usr/src/uts/common/inet/ip_impl.h +++ b/usr/src/uts/common/inet/ip_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -394,11 +394,9 @@ typedef struct ip_lso_info_s { #define CONN_IS_LSO_MD_FASTPATH(connp) \ ((connp)->conn_dontroute == 0 && /* SO_DONTROUTE */ \ !((connp)->conn_nexthop_set) && /* IP_NEXTHOP */ \ - (connp)->conn_nofailover_ill == NULL && /* IPIF_NOFAILOVER */ \ - (connp)->conn_outgoing_pill == NULL && /* IP{V6}_BOUND_PIF */ \ (connp)->conn_outgoing_ill == NULL) /* IP{V6}_BOUND_IF */ -/* Definitons for fragmenting IP packets using MDT. */ +/* Definitions for fragmenting IP packets using MDT. */ /* * Smaller and private version of pdescinfo_t used specifically for IP, diff --git a/usr/src/uts/common/inet/ip_ire.h b/usr/src/uts/common/inet/ip_ire.h index 7accbbcfa3..0a9f8add85 100644 --- a/usr/src/uts/common/inet/ip_ire.h +++ b/usr/src/uts/common/inet/ip_ire.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* Copyright (c) 1990 Mentat Inc. */ @@ -86,31 +86,17 @@ extern "C" { /* return the ire. No recursive */ /* lookup should be done. */ #define MATCH_IRE_IHANDLE 0x0200 /* Match IRE on ihandle */ -#define MATCH_IRE_MARK_HIDDEN 0x0400 /* Match IRE ire_marks with */ - /* IRE_MARK_HIDDEN. */ +#define MATCH_IRE_MARK_TESTHIDDEN 0x0400 /* Match IRE_MARK_TESTHIDDEN IREs */ + /* - * MATCH_IRE_ILL is used whenever we want to specifically match an IRE - * whose ire_ipif->ipif_ill or (ill_t *)ire_stq->q_ptr matches a given - * ill. When MATCH_IRE_ILL is used to locate an IRE_CACHE, it implies - * that the packet will not be load balanced. This is normally used - * by in.mpathd to send out failure detection probes. - * - * MATCH_IRE_ILL_GROUP is used whenever we are not specific about which - * interface (ill) the packet should be sent out. This implies that the - * packets will be subjected to load balancing and it might go out on - * any interface in the group. When there is only interface in the group, - * MATCH_IRE_ILL_GROUP becomes MATCH_IRE_ILL. Most of the code uses - * MATCH_IRE_ILL_GROUP and MATCH_IRE_ILL is used in very few cases where - * we want to disable load balancing. - * * MATCH_IRE_PARENT is used whenever we unconditionally want to get the * parent IRE (sire) while recursively searching IREs for an offsubnet * destination. With this flag, even if no IRE_CACHETABLE or IRE_INTERFACE * is found to help resolving IRE_OFFSUBNET in lookup routines, the * IRE_OFFSUBNET sire, if any, is returned to the caller. */ -#define MATCH_IRE_ILL_GROUP 0x0800 /* Match IRE on ill or the ill_group. */ -#define MATCH_IRE_ILL 0x1000 /* Match IRE on the ill only */ +/* UNUSED 0x0800 */ +#define MATCH_IRE_ILL 0x1000 /* Match IRE on the ill */ #define MATCH_IRE_PARENT 0x2000 /* Match parent ire, if any, */ /* even if ire is not matched. 
*/ @@ -305,7 +291,7 @@ extern ire_t *ire_ihandle_lookup_onlink(ire_t *); extern ire_t *ire_ihandle_lookup_offlink(ire_t *, ire_t *); extern ire_t *ire_ihandle_lookup_offlink_v6(ire_t *, ire_t *); -extern boolean_t ire_local_same_ill_group(ire_t *, ire_t *); +extern boolean_t ire_local_same_lan(ire_t *, ire_t *); extern boolean_t ire_local_ok_across_zones(ire_t *, zoneid_t, void *, const struct ts_label_s *, ip_stack_t *); @@ -354,7 +340,7 @@ extern ire_t *ipif_lookup_multi_ire_v6(ipif_t *, const in6_addr_t *); extern ire_t *ire_get_next_bcast_ire(ire_t *, ire_t *); extern ire_t *ire_get_next_default_ire(ire_t *, ire_t *); -extern void ire_arpresolve(ire_t *, ill_t *); +extern void ire_arpresolve(ire_t *); extern void ire_freemblk(ire_t *); extern boolean_t ire_match_args(ire_t *, ipaddr_t, ipaddr_t, ipaddr_t, int, const ipif_t *, zoneid_t, uint32_t, const struct ts_label_s *, int, diff --git a/usr/src/uts/common/inet/ip_multi.h b/usr/src/uts/common/inet/ip_multi.h index a3f4282cc7..7dee133967 100644 --- a/usr/src/uts/common/inet/ip_multi.h +++ b/usr/src/uts/common/inet/ip_multi.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -49,6 +49,15 @@ typedef enum { } ilg_stat_t; /* + * Flags shared via ips_mrt_flags, used by mcast_restart_timers_thread(). 
+ */ +typedef enum { + IP_MRT_STOP = 0x1, /* request to stop thread */ + IP_MRT_DONE = 0x2, /* indication that thread is stopped */ + IP_MRT_RUN = 0x4 /* request to restart timers */ +} ip_mrt_flags_t; + +/* * Extern functions */ extern mblk_t *igmp_input(queue_t *, mblk_t *, ill_t *); @@ -78,9 +87,7 @@ extern int ip_get_dlpi_mbcast(ill_t *, mblk_t *); extern void ilm_free(ipif_t *); extern ilm_t *ilm_lookup_ill(ill_t *, ipaddr_t, zoneid_t); extern ilm_t *ilm_lookup_ill_v6(ill_t *, const in6_addr_t *, - zoneid_t); -extern ilm_t *ilm_lookup_ill_index_v6(ill_t *, const in6_addr_t *, - int, zoneid_t); + boolean_t, zoneid_t); extern ilm_t *ilm_lookup_ipif(ipif_t *, ipaddr_t); extern int ilm_numentries_v6(ill_t *, const in6_addr_t *); @@ -92,10 +99,10 @@ extern int ip_ll_send_enabmulti_req(ill_t *, const in6_addr_t *); extern int ip_addmulti(ipaddr_t, ipif_t *, ilg_stat_t, mcast_record_t, slist_t *); -extern int ip_addmulti_v6(const in6_addr_t *, ill_t *, int, +extern int ip_addmulti_v6(const in6_addr_t *, ill_t *, zoneid_t, ilg_stat_t, mcast_record_t, slist_t *); extern int ip_delmulti(ipaddr_t, ipif_t *, boolean_t, boolean_t); -extern int ip_delmulti_v6(const in6_addr_t *, ill_t *, int, +extern int ip_delmulti_v6(const in6_addr_t *, ill_t *, zoneid_t, boolean_t, boolean_t); extern int ill_join_allmulti(ill_t *); extern void ill_leave_allmulti(ill_t *); @@ -140,9 +147,11 @@ extern void reset_conn_ipif(ipif_t *); extern void reset_conn_ill(ill_t *); extern void reset_mrt_ill(ill_t *); extern void reset_mrt_vif_ipif(ipif_t *); -extern void igmp_start_timers(unsigned, ip_stack_t *); -extern void mld_start_timers(unsigned, ip_stack_t *); +extern void mcast_restart_timers_thread(ip_stack_t *); extern void ilm_inactive(ilm_t *); +extern ilm_t *ilm_walker_start(ilm_walker_t *, ill_t *); +extern ilm_t *ilm_walker_step(ilm_walker_t *, ilm_t *); +extern void ilm_walker_finish(ilm_walker_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_ndp.h 
b/usr/src/uts/common/inet/ip_ndp.h index 4dbb56a884..5eda155c0e 100644 --- a/usr/src/uts/common/inet/ip_ndp.h +++ b/usr/src/uts/common/inet/ip_ndp.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _INET_IP_NDP_H #define _INET_IP_NDP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/mutex.h> #include <sys/stream.h> #include <netinet/in.h> @@ -318,7 +316,8 @@ extern nd_opt_hdr_t *ndp_get_option(nd_opt_hdr_t *, int, int); extern void ndp_inactive(nce_t *); extern void ndp_input(ill_t *, mblk_t *, mblk_t *); extern boolean_t ndp_lookup_ipaddr(in_addr_t, netstack_t *); -extern nce_t *ndp_lookup_v6(ill_t *, const in6_addr_t *, boolean_t); +extern nce_t *ndp_lookup_v6(ill_t *, boolean_t, const in6_addr_t *, + boolean_t); extern nce_t *ndp_lookup_v4(ill_t *, const in_addr_t *, boolean_t); extern int ndp_mcastreq(ill_t *, const in6_addr_t *, uint32_t, uint32_t, mblk_t *); @@ -346,7 +345,7 @@ extern void nce_fastpath(nce_t *); extern int ndp_add_v6(ill_t *, uchar_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t, nce_t **); -extern int ndp_lookup_then_add_v6(ill_t *, uchar_t *, +extern int ndp_lookup_then_add_v6(ill_t *, boolean_t, uchar_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t, nce_t **); extern int ndp_lookup_then_add_v4(ill_t *, diff --git a/usr/src/uts/common/inet/ip_rts.h b/usr/src/uts/common/inet/ip_rts.h index 70b33e0278..61bc451995 100644 --- a/usr/src/uts/common/inet/ip_rts.h +++ b/usr/src/uts/common/inet/ip_rts.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -37,19 +37,28 @@ extern "C" { */ #define TSOL_RTSA_REQUEST_MAX 1 /* one per route destination */ +/* + * Flags for RTS queuing operations. + */ +#define RTSQ_UNDER_IPMP 0x01 /* send only on RTAW_UNDER_IPMP queues */ +#define RTSQ_NORMAL 0x02 /* send only on normal queues */ +#define RTSQ_ALL (RTSQ_UNDER_IPMP|RTSQ_NORMAL) /* send on all queues */ +#define RTSQ_DEFAULT 0x04 /* use standard filtering */ + #ifdef _KERNEL extern void ip_rts_change(int, ipaddr_t, ipaddr_t, - ipaddr_t, ipaddr_t, ipaddr_t, int, int, - int, ip_stack_t *); + ipaddr_t, ipaddr_t, ipaddr_t, int, int, int, ip_stack_t *); extern void ip_rts_change_v6(int, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, int, int, int, ip_stack_t *); -extern void ip_rts_ifmsg(const ipif_t *); +extern void ip_rts_ifmsg(const ipif_t *, uint_t); -extern void ip_rts_newaddrmsg(int, int, const ipif_t *); +extern void ip_rts_xifmsg(const ipif_t *, uint64_t, uint64_t, uint_t); + +extern void ip_rts_newaddrmsg(int, int, const ipif_t *, uint_t); extern int ip_rts_request(queue_t *, mblk_t *, cred_t *); @@ -70,9 +79,11 @@ extern void rts_fill_msg_v6(int, int, const in6_addr_t *, extern size_t rts_header_msg_size(int); -extern void rts_queue_input(mblk_t *, conn_t *, sa_family_t, ip_stack_t *); +extern void rts_queue_input(mblk_t *, conn_t *, sa_family_t, uint_t, + ip_stack_t *); extern int ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *, cred_t *); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h index 3c53e1a3d3..750378f587 100644 --- a/usr/src/uts/common/inet/ip_stack.h +++ b/usr/src/uts/common/inet/ip_stack.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -33,6 +33,7 @@ extern "C" { #include <sys/netstack.h> #include <netinet/igmp_var.h> +#include <sys/modhash.h> #ifdef _KERNEL #include <sys/list.h> @@ -172,9 +173,6 @@ struct ip_stack { krwlock_t ips_ill_g_usesrc_lock; - struct ill_group *ips_illgrp_head_v4; /* Head of IPv4 ill groups */ - struct ill_group *ips_illgrp_head_v6; /* Head of IPv6 ill groups */ - /* Taskq dispatcher for capability operations */ kmutex_t ips_capab_taskq_lock; kcondvar_t ips_capab_taskq_cv; @@ -204,7 +202,6 @@ struct ip_stack { int ips_igmp_timer_scheduled_last; int ips_igmp_deferred_next; timeout_id_t ips_igmp_timeout_id; - kthread_t *ips_igmp_timer_thread; boolean_t ips_igmp_timer_setter_active; /* Following protected by mld_timer_lock */ @@ -212,7 +209,6 @@ struct ip_stack { int ips_mld_timer_scheduled_last; int ips_mld_deferred_next; timeout_id_t ips_mld_timeout_id; - kthread_t *ips_mld_timer_thread; boolean_t ips_mld_timer_setter_active; /* Protected by igmp_slowtimeout_lock */ @@ -269,8 +265,6 @@ struct ip_stack { int ips_ip_g_forward; int ips_ipv6_forward; - int ips_ipmp_hook_emulation; /* ndd variable */ - time_t ips_ip_g_frag_timeout; clock_t ips_ip_g_frag_timo_ms; @@ -280,8 +274,6 @@ struct ip_stack { clock_t ips_icmp_pkt_err_last; /* Number of packets sent in burst */ uint_t ips_icmp_pkt_err_sent; - /* Used by icmp_send_redirect_v6 for picking random src. */ - uint_t ips_icmp_redirect_v6_src_index; /* Protected by ip_mi_lock */ void *ips_ip_g_head; /* Instance Data List Head */ @@ -356,8 +348,6 @@ struct ip_stack { kstat_t *ips_loopback_ksp; - uint_t ips_ipif_src_random; - struct idl_s *ips_conn_drain_list; /* Array of conn drain lists */ uint_t ips_conn_drain_list_cnt; /* Count of conn_drain_list */ int ips_conn_drain_list_index; /* Next drain_list */ @@ -375,15 +365,6 @@ struct ip_stack { uint64_t ips_ipif_g_seqid; union phyint_list_u *ips_phyint_g_list; /* start of phyint list */ - /* - * Reflects value of FAILBACK variable in IPMP config file - * /etc/default/mpathd. 
Default value is B_TRUE. - * Set to B_FALSE if user disabled failback by configuring - * "FAILBACK=no" in.mpathd uses SIOCSIPMPFAILBACK ioctl to pass this - * information to kernel. - */ - boolean_t ips_ipmp_enable_failback; - /* ip_neti.c */ hook_family_t ips_ipv4root; hook_family_t ips_ipv6root; @@ -427,12 +408,25 @@ struct ip_stack { kcondvar_t ips_ipobs_cb_cv; struct __ldi_ident *ips_ldi_ident; + +/* ipmp.c */ + krwlock_t ips_ipmp_lock; + mod_hash_t *ips_ipmp_grp_hash; + +/* igmp.c */ + /* multicast restart timers thread logic */ + kmutex_t ips_mrt_lock; + uint_t ips_mrt_flags; + kcondvar_t ips_mrt_cv; + kcondvar_t ips_mrt_done_cv; + kthread_t *ips_mrt_thread; }; typedef struct ip_stack ip_stack_t; /* Finding an ip_stack_t */ #define CONNQ_TO_IPST(_q) (Q_TO_CONN(_q)->conn_netstack->netstack_ip) #define ILLQ_TO_IPST(_q) (((ill_t *)(_q)->q_ptr)->ill_ipst) +#define PHYINT_TO_IPST(phyi) ((phyi)->phyint_ipsq->ipsq_ipst) #else /* _KERNEL */ typedef int ip_stack_t; diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index 5fb86a5262..d80123a977 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -249,7 +249,6 @@ struct conn_s { squeue_t *conn_initial_sqp; /* Squeue at open time */ squeue_t *conn_final_sqp; /* Squeue after connect */ - ill_t *conn_nofailover_ill; /* Failover ill */ ill_t *conn_dhcpinit_ill; /* IP_DHCPINIT_IF */ ipsec_latch_t *conn_latch; /* latched state */ ill_t *conn_outgoing_ill; /* IP{,V6}_BOUND_IF */ @@ -295,7 +294,6 @@ struct conn_s { uint_t conn_proto; /* SO_PROTOTYPE state */ ill_t *conn_incoming_ill; /* IP{,V6}_BOUND_IF */ - ill_t *conn_outgoing_pill; /* IP{,V6}_BOUND_PIF */ ill_t *conn_oper_pending_ill; /* pending shared ioctl */ ilg_t *conn_ilg; /* Group memberships */ @@ -307,9 +305,6 @@ struct conn_s { struct ipif_s *conn_multicast_ipif; /* IP_MULTICAST_IF */ ill_t *conn_multicast_ill; /* IPV6_MULTICAST_IF */ - int conn_orig_bound_ifindex; /* BOUND_IF 
before MOVE */ - int conn_orig_multicast_ifindex; - /* IPv6 MC IF before MOVE */ struct conn_s *conn_drain_next; /* Next conn in drain list */ struct conn_s *conn_drain_prev; /* Prev conn in drain list */ idl_t *conn_idl; /* Ptr to the drain list head */ @@ -322,7 +317,7 @@ struct conn_s { uchar_t conn_broadcast_ttl; /* IP_BROADCAST_TTL */ #define conn_nexthop_v4 V4_PART_OF_V6(conn_nexthop_v6) cred_t *conn_peercred; /* Peer credentials, if any */ - + int conn_rtaware; /* RT_AWARE sockopt value */ kcondvar_t conn_sq_cv; /* For non-STREAMS socket IO */ kthread_t *conn_sq_caller; /* Caller of squeue sync ops */ sock_upcalls_t *conn_upcalls; /* Upcalls to sockfs */ diff --git a/usr/src/uts/common/inet/ipnet/ipnet.c b/usr/src/uts/common/inet/ipnet/ipnet.c index 577205f25a..e94af50424 100644 --- a/usr/src/uts/common/inet/ipnet/ipnet.c +++ b/usr/src/uts/common/inet/ipnet/ipnet.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -229,16 +229,19 @@ ipnet_if_init(void) int _init(void) { - int ret; + int ret; + boolean_t netstack_registered = B_FALSE; if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1) return (ENODEV); ipnet_minor_space = id_space_create("ipnet_minor_space", IPNET_MINOR_MIN, MAXMIN32); - netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini); + /* * We call ddi_taskq_create() with nthread == 1 to ensure in-order - * delivery of packets to clients. + * delivery of packets to clients. Note that we need to create the + * taskqs before calling netstack_register() since ipnet_stack_init() + * registers callbacks that use 'em. 
*/ ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0); ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue", @@ -247,6 +250,10 @@ _init(void) ret = ENOMEM; goto done; } + + netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini); + netstack_registered = B_TRUE; + if ((ret = ipnet_if_init()) == 0) ret = mod_install(&modlinkage); done: @@ -255,7 +262,8 @@ done: ddi_taskq_destroy(ipnet_taskq); if (ipnet_nicevent_taskq != NULL) ddi_taskq_destroy(ipnet_nicevent_taskq); - netstack_unregister(NS_IPNET); + if (netstack_registered) + netstack_unregister(NS_IPNET); id_space_destroy(ipnet_minor_space); } return (ret); @@ -268,9 +276,10 @@ _fini(void) if ((err = mod_remove(&modlinkage)) != 0) return (err); + + netstack_unregister(NS_IPNET); ddi_taskq_destroy(ipnet_nicevent_taskq); ddi_taskq_destroy(ipnet_taskq); - netstack_unregister(NS_IPNET); id_space_destroy(ipnet_minor_space); return (0); } @@ -987,6 +996,7 @@ static boolean_t ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src, ipnet_addrp_t *dst) { + boolean_t obsif; uint64_t ifindex = ipnet->ipnet_if->if_index; ipnet_addrtype_t srctype, dsttype; @@ -994,6 +1004,13 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src, dsttype = ipnet_get_addrtype(ipnet, dst); /* + * If the packet's ifindex matches ours, or the packet's group ifindex + * matches ours, it's on the interface we're observing. (Thus, + * observing on the group ifindex matches all ifindexes in the group.) + */ + obsif = (ihd->ihd_ifindex == ifindex || ihd->ihd_grifindex == ifindex); + + /* * Do not allow an ipnet stream to see packets that are not from or to * its zone. The exception is when zones are using the shared stack * model. In this case, streams in the global zone have visibility @@ -1025,7 +1042,7 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src, * have our source address (this allows us to see packets we send). 
*/ if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) { - if (ihd->ihd_ifindex == ifindex || srctype == IPNETADDR_MYADDR) + if (srctype == IPNETADDR_MYADDR || obsif) return (B_TRUE); } @@ -1033,7 +1050,7 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src, * We accept multicast and broadcast packets transmitted or received * on the interface we're observing. */ - if (dsttype == IPNETADDR_MBCAST && ihd->ihd_ifindex == ifindex) + if (dsttype == IPNETADDR_MBCAST && obsif) return (B_TRUE); return (B_FALSE); diff --git a/usr/src/uts/common/inet/ipsec_info.h b/usr/src/uts/common/inet/ipsec_info.h index b014bdade0..0348e10b91 100644 --- a/usr/src/uts/common/inet/ipsec_info.h +++ b/usr/src/uts/common/inet/ipsec_info.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _INET_IPSEC_INFO_H #define _INET_IPSEC_INFO_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -114,12 +112,11 @@ typedef struct ipsec_in_s { ipsec_in_decaps : 1, /* Was this packet decapsulated from */ /* a matching inner packet? */ - ipsec_in_attach_if : 1, /* Don't load spread this packet */ ipsec_in_accelerated : 1, /* hardware accelerated packet */ ipsec_in_icmp_loopback : 1, /* Looped-back ICMP packet, */ /* all should trust this. 
*/ - ipsec_in_pad_bits : 24; + ipsec_in_pad_bits : 25; int ipsec_in_ill_index; /* interface on which ipha_dst was */ /* configured when pkt was recv'd */ @@ -197,12 +194,11 @@ typedef struct ipsec_out_s { ipsec_out_reserved : 1, ipsec_out_v4 : 1, - ipsec_out_attach_if : 1, ipsec_out_unspec_src : 1, /* IPv6 ip6i_t info */ ipsec_out_reachable : 1, /* NDP reachability info */ ipsec_out_failed: 1, - ipsec_out_se_done: 1, + ipsec_out_esp_done: 1, ipsec_out_ah_done: 1, ipsec_out_need_policy: 1, @@ -225,7 +221,7 @@ typedef struct ipsec_out_s { */ ipsec_out_icmp_loopback: 1, ipsec_out_ip_nexthop : 1, /* IP_NEXTHOP option is set */ - ipsec_out_pad_bits : 12; + ipsec_out_pad_bits : 13; cred_t *ipsec_out_cred; uint32_t ipsec_out_capab_ill_index; diff --git a/usr/src/uts/common/inet/mib2.h b/usr/src/uts/common/inet/mib2.h index 5abfc06581..a467abaee9 100644 --- a/usr/src/uts/common/inet/mib2.h +++ b/usr/src/uts/common/inet/mib2.h @@ -17,9 +17,8 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -27,8 +26,6 @@ #ifndef _INET_MIB2_H #define _INET_MIB2_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <netinet/in.h> /* For in6_addr_t */ #include <sys/tsol/label.h> /* For brange_t */ #include <sys/tsol/label_macro.h> /* For brange_t */ @@ -65,9 +62,14 @@ extern "C" { * #define OPTLEN(x) ((((x) + sizeof(long) - 1) / sizeof(long)) * sizeof(long)) * #define OPTVAL(opt) ((char *)(opt + 1)) * - * For get requests (T_NEGOTIATE), any MIB2_xxx value can be used (only + * For get requests (T_CURRENT), any MIB2_xxx value can be used (only * "get all" is supported, so all modules get a copy of the request to - * return everything it knows. Recommend: Use MIB2_IP + * return everything it knows. In general, we use MIB2_IP. 
There is + * one exception: in general, IP will not report information related to + * IRE_MARK_TESTHIDDEN routes (e.g., in the MIB2_IP_ROUTE table). + * However, using the special value EXPER_IP_AND_TESTHIDDEN will cause + * all information to be reported. This special value should only be + * used by IPMP-aware low-level utilities (e.g. in.mpathd). * * IMPORTANT: some fields are grouped in a different structure than * suggested by MIB-II, e.g., checksum error counts. The original MIB-2 @@ -79,7 +81,6 @@ extern "C" { #define IPPROTO_MAX 256 #endif - #define MIB2_SYSTEM (IPPROTO_MAX+1) #define MIB2_INTERFACES (IPPROTO_MAX+2) #define MIB2_AT (IPPROTO_MAX+3) @@ -108,12 +109,13 @@ extern "C" { #define EXPER_IGMP (EXPER+1) #define EXPER_DVMRP (EXPER+2) #define EXPER_RAWIP (EXPER+3) +#define EXPER_IP_AND_TESTHIDDEN (EXPER+4) /* * Define range of levels for experimental use */ #define EXPER_RANGE_START (EXPER+1) -#define EXPER_RANGE_END (EXPER+3) +#define EXPER_RANGE_END (EXPER+4) #define BUMP_MIB(s, x) { \ extern void __dtrace_probe___mib_##x(int, void *); \ diff --git a/usr/src/uts/common/inet/sctp/sctp_addr.c b/usr/src/uts/common/inet/sctp/sctp_addr.c index 1761396031..94cc8e8883 100644 --- a/usr/src/uts/common/inet/sctp/sctp_addr.c +++ b/usr/src/uts/common/inet/sctp/sctp_addr.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/systm.h> #include <sys/stream.h> diff --git a/usr/src/uts/common/inet/sctp_ip.h b/usr/src/uts/common/inet/sctp_ip.h index 16ab99abab..7b20d3fd2b 100644 --- a/usr/src/uts/common/inet/sctp_ip.h +++ b/usr/src/uts/common/inet/sctp_ip.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ #ifndef _INET_SCTP_IP_H #define _INET_SCTP_IP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 488f8ee021..68e0883222 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -31,7 +31,6 @@ #include <sys/strsubr.h> #include <sys/stropts.h> #include <sys/strlog.h> -#include <sys/strsun.h> #define _SUN_TPI_VERSION 2 #include <sys/tihdr.h> #include <sys/timod.h> @@ -4683,18 +4682,10 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, /* ifindex must be already set */ ASSERT(ifindex != 0); - if (ltcp->tcp_bound_if != 0) { - /* - * Set newtcp's bound_if equal to - * listener's value. If ifindex is - * not the same as ltcp->tcp_bound_if, - * it must be a packet for the ipmp group - * of interfaces - */ + if (ltcp->tcp_bound_if != 0) tcp->tcp_bound_if = ltcp->tcp_bound_if; - } else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { + else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) tcp->tcp_bound_if = ifindex; - } tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary; tcp->tcp_recvifindex = 0; @@ -10716,9 +10707,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, ipp->ipp_fields |= IPPF_USE_MIN_MTU; ipp->ipp_use_min_mtu = *i1; break; - case IPV6_BOUND_PIF: - /* Handled at the IP level */ - return (-EINVAL); case IPV6_SEC_OPT: /* * We should not allow policy setting after @@ -18895,7 +18883,6 @@ tcp_zcopy_check(tcp_t *tcp) connp->conn_dontroute == 0 && !connp->conn_nexthop_set && connp->conn_outgoing_ill == NULL && - connp->conn_nofailover_ill == NULL && do_tcpzcopy == 1) { /* * the checks above closely resemble the fast path checks @@ -19139,7 +19126,6 @@ tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp) ipaddr_t dst; ire_t *ire; ill_t *ill; - conn_t *connp = tcp->tcp_connp; mblk_t *ire_fp_mp; tcp_stack_t *tcps = tcp->tcp_tcps; 
@@ -19164,14 +19150,6 @@ tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp) } ill = ire_to_ill(ire); - if (connp->conn_outgoing_ill != NULL) { - ill_t *conn_outgoing_ill = NULL; - /* - * Choose a good ill in the group to send the packets on. - */ - ire = conn_set_outgoing_ill(connp, ire, &conn_outgoing_ill); - ill = ire_to_ill(ire); - } ASSERT(ill != NULL); if (!tcp->tcp_ire_ill_check_done) { diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c index 15b5d04d61..8c8eee3b58 100644 --- a/usr/src/uts/common/inet/tcp/tcp_fusion.c +++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +35,7 @@ #include <inet/common.h> #include <inet/optcom.h> #include <inet/ip.h> +#include <inet/ip_if.h> #include <inet/ip_impl.h> #include <inet/tcp.h> #include <inet/tcp_impl.h> diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index d977c27e53..e2314f8104 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -151,9 +151,6 @@ opdes_t tcp_opt_arr[] = { { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (in_addr_t), -1 /* not initialized */ }, -{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, sizeof (int), 0 }, diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 7c9433caa0..1178315cb5 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -80,6 +80,7 @@ #include <inet/ipp_common.h> #include <sys/squeue_impl.h> #include <inet/ipnet.h> +#include <sys/ethernet.h> /* * The ipsec_info.h header file is here since it has the definition for the @@ -2141,7 +2142,6 @@ udp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) case MCAST_UNBLOCK_SOURCE: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: - case IP_DONTFAILOVER_IF: /* cannot "get" the value for these */ return (-1); case IP_BOUND_IF: @@ -3152,9 +3152,7 @@ udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, ipp->ipp_use_min_mtu = *i1; break; - case IPV6_BOUND_PIF: case IPV6_SEC_OPT: - case IPV6_DONTFAILOVER_IF: case IPV6_SRC_PREFERENCES: case IPV6_V6ONLY: /* Handled at the IP level */ @@ -5351,7 +5349,6 @@ udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, if ((connp->conn_flags & IPCL_CHECK_POLICY) != 0 || CONN_OUTBOUND_POLICY_PRESENT(connp, ipss) || connp->conn_dontroute || - connp->conn_nofailover_ill != NULL || connp->conn_outgoing_ill != NULL || optinfo.ip_opt_flags != 0 || optinfo.ip_opt_ill_index != 0 || ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION || @@ -5419,8 +5416,7 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) ill_t *stq_ill = (ill_t *)ire->ire_stq->q_ptr; ASSERT(ipif != NULL); - if (stq_ill != ipif->ipif_ill && (stq_ill->ill_group == NULL || - stq_ill->ill_group != ipif->ipif_ill->ill_group)) + if 
(!IS_ON_SAME_LAN(stq_ill, ipif->ipif_ill)) retry_caching = B_TRUE; } @@ -5444,7 +5440,7 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) ASSERT(ipif != NULL); ire = ire_ctable_lookup(dst, 0, 0, ipif, connp->conn_zoneid, MBLK_GETLABEL(mp), - MATCH_IRE_ILL_GROUP, ipst); + MATCH_IRE_ILL, ipst); } else { ASSERT(ipif == NULL); ire = ire_cache_lookup(dst, connp->conn_zoneid, @@ -5622,12 +5618,7 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) } if (CLASSD(dst)) { - boolean_t ilm_exists; - - ILM_WALKER_HOLD(ill); - ilm_exists = (ilm_lookup_ill(ill, dst, ALL_ZONES) != NULL); - ILM_WALKER_RELE(ill); - if (ilm_exists) { + if (ilm_lookup_ill(ill, dst, ALL_ZONES) != NULL) { ip_multicast_loopback(q, ill, mp, connp->conn_multicast_loop ? 0 : IP_FF_NO_MCAST_LOOP, zoneid); diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index 0ec5a2c45e..65729b82f1 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -132,9 +132,6 @@ opdes_t udp_opt_arr[] = { { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 /* no ifindex */ }, -{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (struct in_addr), 0 /* not initialized */ }, - { IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (int), 0 }, @@ -191,12 +188,6 @@ opdes_t udp_opt_arr[] = { { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 /* no ifindex */ }, -{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - -{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, sizeof (int), 0 }, diff --git a/usr/src/uts/common/inet/vni/vni.c b/usr/src/uts/common/inet/vni/vni.c deleted file mode 100644 index a370a7b4be..0000000000 --- a/usr/src/uts/common/inet/vni/vni.c +++ /dev/null @@ -1,359 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - - -#include "vni_impl.h" -#include <sys/conf.h> -#include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/dlpi.h> -#include <sys/stat.h> -#include <sys/ethernet.h> -#include <sys/strsun.h> -#include <sys/stropts.h> - -static int vniopen(queue_t *, dev_t *, int, int, cred_t *); -static int vniclose(queue_t *, int, cred_t *); -static int vniwput(queue_t *, mblk_t *); -static int vniattach(dev_info_t *, ddi_attach_cmd_t); -static int vnidetach(dev_info_t *, ddi_detach_cmd_t); - -static struct module_info minfo = { - VNIIDNUM, /* mi_idnum */ - VNINAME, /* mi_idname */ - VNIMINPSZ, /* mi_minpsz */ - VNIMAXPSZ, /* mi_maxpsz */ - VNIHIWAT, /* mi_hiwat */ - VNILOWAT /* mi_lowat */ -}; - -static struct qinit vnirinit = { - NULL, /* qi_putp */ - NULL, /* qi_srvp */ - vniopen, /* qi_qopen */ - vniclose, /* qi_qclose */ - NULL, /* qi_qadmin */ - &minfo, /* qi_minfo */ - NULL /* qi_mstat */ -}; - -static struct qinit vniwinit = { - vniwput, /* qi_putp */ - NULL, /* qi_srvp */ - NULL, /* qi_qopen */ - NULL, /* qi_qclose */ - NULL, /* qi_qadmin */ - &minfo, /* qi_minfo */ - NULL /* qi_mstat */ -}; - -static struct streamtab vni_info = { - &vnirinit, /* st_rdinit */ - &vniwinit, /* st_wrinit */ - NULL, /* st_muxrinit */ - NULL /* st_muxwrinit */ -}; - -DDI_DEFINE_STREAM_OPS(vni_ops, nulldev, nulldev, vniattach, \ - vnidetach, nodev, nodev, VNIFLAGS, &vni_info, ddi_quiesce_not_supported); - -static struct modldrv modldrv = { - &mod_driverops, - "Virtual network interface", - &vni_ops, -}; - -static struct modlinkage modlinkage = { - MODREV_1, &modldrv, NULL -}; - -static vni_str_t *vni_strlist_head; - -/* - * DL_INFO_ACK template for VNI pseudo interface. 
- */ -static dl_info_ack_t dlvni_infoack = { - DL_INFO_ACK, /* dl_primitive */ - 0, /* dl_max_sdu */ - 0, /* dl_min_sdu */ - 0, /* dl_addr_length */ - SUNW_DL_VNI, /* dl_mac_type */ - 0, /* dl_reserved */ - 0, /* dl_current_state */ - 0, /* dl_sap_length */ - DL_CLDLS, /* dl_service_mode */ - 0, /* dl_qos_length */ - 0, /* dl_qos_offset */ - 0, /* dl_range_length */ - 0, /* dl_range_offset */ - DL_STYLE2, /* dl_provider_style */ - 0, /* dl_addr_offset */ - DL_VERSION_2, /* dl_version */ - 0, /* dl_brdcst_addr_length */ - 0, /* dl_brdcst_addr_offset */ - 0 /* dl_growth */ -}; - -int -_init(void) -{ - return (mod_install(&modlinkage)); -} - -int -_fini(void) -{ - return (mod_remove(&modlinkage)); -} - -int -_info(struct modinfo *modinfop) -{ - return (mod_info(&modlinkage, modinfop)); -} - -static int -vniattach(dev_info_t *devi, ddi_attach_cmd_t cmd) -{ - if (cmd != DDI_ATTACH) { - cmn_err(CE_NOTE, "vniattach failure: cmd != DDI_ATTACH\n"); - return (DDI_FAILURE); - } - - if (ddi_create_minor_node(devi, VNINAME, S_IFCHR, - ddi_get_instance(devi), DDI_PSEUDO, CLONE_DEV) == - DDI_FAILURE) { - ddi_remove_minor_node(devi, NULL); - cmn_err(CE_NOTE, "vniattach failure: ddi_create_minor_node\n"); - return (DDI_FAILURE); - } - - return (DDI_SUCCESS); -} - -static int -vnidetach(dev_info_t *devi, ddi_detach_cmd_t cmd) -{ - if (cmd != DDI_DETACH) - return (DDI_FAILURE); - - ddi_remove_minor_node(devi, NULL); - return (DDI_SUCCESS); -} - -/* ARGSUSED */ -static int -vniopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) -{ - vni_str_t *stp, *prevstp; - minor_t minordev = 0; - - if (sflag != CLONEOPEN) - return (EINVAL); - - prevstp = NULL; - - for (stp = vni_strlist_head; stp != NULL; stp = stp->st_next) { - if (minordev < stp->st_minor) - break; - minordev++; - prevstp = stp; - } - - stp = kmem_zalloc(sizeof (vni_str_t), KM_SLEEP); - - *devp = makedevice(getmajor(*devp), minordev); - - stp->st_minor = minordev; - stp->st_state = DL_UNATTACHED; - stp->st_next = 
NULL; - - q->q_ptr = stp; - WR(q)->q_ptr = stp; - - if (prevstp != NULL) { - stp->st_next = prevstp->st_next; - prevstp->st_next = stp; - } else { - stp->st_next = vni_strlist_head; - vni_strlist_head = stp; - } - - qprocson(q); - return (0); -} - -/* ARGSUSED */ -static int -vniclose(queue_t *q, int flag, cred_t *credp) -{ - vni_str_t *stp, **prevstpp; - - qprocsoff(q); - stp = (vni_str_t *)q->q_ptr; - stp->st_state = DL_UNATTACHED; - - /* Unlink the per-stream entry from the list and free it */ - stp = vni_strlist_head; - prevstpp = &vni_strlist_head; - - for (; stp != NULL; stp = stp->st_next) { - if (stp == (vni_str_t *)q->q_ptr) - break; - prevstpp = &stp->st_next; - } - - ASSERT(stp != NULL); - - *prevstpp = stp->st_next; - - kmem_free(stp, sizeof (vni_str_t)); - - q->q_ptr = WR(q)->q_ptr = NULL; - return (0); -} - -static int -vniwput(queue_t *q, mblk_t *mp) -{ - union DL_primitives *dlp; - vni_str_t *stp; - dl_info_ack_t *dlip; - t_scalar_t prim; - - stp = q->q_ptr; - - switch ((mp)->b_datap->db_type) { - case M_PROTO: - case M_PCPROTO: - if (MBLKL(mp) < sizeof (t_scalar_t)) { - dlerrorack(q, mp, DL_PRIM_INVAL, DL_UNSUPPORTED, 0); - return (0); - } - dlp = (void *)mp->b_rptr; - prim = dlp->dl_primitive; - switch (prim) { - case DL_ATTACH_REQ: - if (MBLKL(mp) < DL_ATTACH_REQ_SIZE) { - dlerrorack(q, mp, DL_ATTACH_REQ, DL_BADPRIM, 0); - return (0); - } - if (stp->st_state != DL_UNATTACHED) { - dlerrorack(q, mp, DL_ATTACH_REQ, DL_OUTSTATE, - 0); - return (0); - } - stp->st_ppa = dlp->attach_req.dl_ppa; - stp->st_state = DL_UNBOUND; - dlokack(q, mp, DL_ATTACH_REQ); - break; - case DL_BIND_REQ: - if (MBLKL(mp) < DL_BIND_REQ_SIZE) { - dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0); - return (0); - } - if (stp->st_state != DL_UNBOUND) { - dlerrorack(q, mp, DL_BIND_REQ, DL_OUTSTATE, 0); - return (0); - } - stp->st_state = DL_IDLE; - dlbindack(q, mp, dlp->bind_req.dl_sap, NULL, 0, 0, 0); - break; - case DL_INFO_REQ: - if (MBLKL(mp) < DL_INFO_REQ_SIZE) { - 
dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0); - return (0); - } - if ((mp = mexchange(q, mp, sizeof (dl_info_ack_t), - M_PCPROTO, DL_INFO_ACK)) == NULL) { - return (0); - } - dlip = (void *)mp->b_rptr; - *dlip = dlvni_infoack; - dlip->dl_current_state = stp->st_state; - qreply(q, mp); - break; - case DL_PHYS_ADDR_REQ: - if (MBLKL(mp) < DL_PHYS_ADDR_REQ_SIZE) { - dlerrorack(q, mp, DL_PHYS_ADDR_REQ, DL_BADPRIM, - 0); - return (0); - } - dlphysaddrack(q, mp, NULL, 0); - break; - case DL_UNBIND_REQ: - if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) { - dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0); - return (0); - } - if (stp->st_state != DL_IDLE) { - dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, - 0); - return (0); - } - /* Nothing to flush. But DLPI spec says to; so do it */ - flushq(q, FLUSHALL); - flushq(RD(q), FLUSHALL); - stp->st_state = DL_UNBOUND; - dlokack(q, mp, DL_UNBIND_REQ); - break; - case DL_DETACH_REQ: - if (MBLKL(mp) < DL_DETACH_REQ_SIZE) { - dlerrorack(q, mp, DL_DETACH_REQ, DL_BADPRIM, 0); - return (0); - } - if (stp->st_state != DL_UNBOUND) { - dlerrorack(q, mp, DL_DETACH_REQ, DL_OUTSTATE, - 0); - return (0); - } - stp->st_state = DL_UNATTACHED; - dlokack(q, mp, DL_DETACH_REQ); - break; - default: - dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0); - } - break; - case M_IOCTL: - /* - * No ioctl's currently supported. 
Need to have the NAK since - * ifconfig calls SIOCGTUNPARAM during the end of plumb - */ - miocnak(q, mp, 0, EINVAL); - break; - case M_FLUSH: - /* Really nothing to flush since no msgs enqueued */ - if (*mp->b_rptr & FLUSHR) { - qreply(q, mp); - } else { - freemsg(mp); - } - break; - default: - freemsg(mp); - break; - } - return (0); -} diff --git a/usr/src/uts/common/inet/vni/vni_impl.h b/usr/src/uts/common/inet/vni/vni_impl.h deleted file mode 100644 index ffba1b08bf..0000000000 --- a/usr/src/uts/common/inet/vni/vni_impl.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _INET_VNI_IMPL_H -#define _INET_VNI_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef __cplusplus -extern "C" { -#endif - -#include <sys/modctl.h> -#include <sys/stream.h> - -typedef struct vni_str { - struct vni_str *st_next; /* next in list */ - t_uscalar_t st_state; /* DLPI state */ - minor_t st_minor; /* corresponding minor */ - uint32_t st_ppa; /* physical point of attachment */ -} vni_str_t; - -#define DL_MAXPRIM DL_GET_STATISTICS_ACK -#define VNIIDNUM 0x2a84 -#define VNINAME "vni" -#define VNIFLAGS (D_MP|D_MTPERMOD) -#define VNIHIWAT 1024 -#define VNILOWAT 512 -#define VNIMINPSZ 0 -#define VNIMAXPSZ INFPSZ - -#ifdef __cplusplus -} -#endif - -#endif /* _INET_VNI_IMPL_H */ diff --git a/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c b/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c index 69feb36606..03d82fbcab 100644 --- a/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c +++ b/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c @@ -19,14 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - -#define AF_INET_OFFLOAD 30 - #include <sys/sockio.h> #include <sys/stream.h> #include <sys/errno.h> @@ -34,27 +30,24 @@ #include <sys/strsun.h> #include <inet/common.h> #include <net/if.h> +#include <net/if_types.h> #include <inet/mi.h> #include <sys/t_kuser.h> #include <sys/stropts.h> #include <sys/pathname.h> #include <sys/kstr.h> #include <sys/timod.h> +#include <sys/sunddi.h> #include <sys/ib/clients/rds/rds.h> #include <sys/ib/clients/rds/rds_transport.h> static sin_t sin_null; /* Zero address for quick clears */ -#define isdigit(ch) ((ch) >= '0' && (ch) <= '9') - -#define isalpha(ch) (((ch) >= 'a' && (ch) <= 'z') || \ - ((ch) >= 'A' && (ch) <= 'Z')) - /* * Just pass the ioctl to IP and the result to the caller. 
*/ int -rds_do_ip_ioctl(int cmd, int len, caddr_t arg) +rds_do_ip_ioctl(int cmd, int len, void *arg) { vnode_t *kvp, *vp; TIUSER *tiptr; @@ -62,8 +55,7 @@ rds_do_ip_ioctl(int cmd, int len, caddr_t arg) k_sigset_t smask; int err = 0; - if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, - &kvp) == 0) { + if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) { if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE, &tiptr, CRED()) == 0) { vp = tiptr->fp->f_vnode; @@ -72,13 +64,13 @@ rds_do_ip_ioctl(int cmd, int len, caddr_t arg) return (EPROTO); } } else { - return (EPROTO); + return (EPROTO); } iocb.ic_cmd = cmd; iocb.ic_timout = 0; iocb.ic_len = len; - iocb.ic_dp = arg; + iocb.ic_dp = (caddr_t)arg; sigintr(&smask, 0); err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); sigunintr(&smask); @@ -88,197 +80,166 @@ rds_do_ip_ioctl(int cmd, int len, caddr_t arg) } /* - * Return 0 if the interface is IB. - * Return error (>0) if any error is encountered during processing. - * Return -1 if the interface is not IB and no error. + * Check if the IP interface named by `lifrp' is RDS-capable. */ -static int -rds_is_ib_interface(char *name) +static boolean_t +rds_capable_interface(struct lifreq *lifrp) { + char ifname[LIFNAMSIZ]; + char drv[MAXLINKNAMELEN]; + uint_t ppa; + char *cp; - char dev_path[MAXPATHLEN]; - char devname[MAXNAMELEN]; - ldi_handle_t lh; - dl_info_ack_t info; - int ret = 0; - int i; - k_sigset_t smask; + if (lifrp->lifr_type == IFT_IB) + return (B_TRUE); /* - * ibd devices are only style 2 devices - * so we will open only style 2 devices - * by ignoring the ppa + * Strip off the logical interface portion before getting + * intimate with the name. 
*/ - i = strlen(name) - 1; - while ((i >= 0) && (!isalpha(name[i]))) i--; - if (i < 0) { - /* Invalid interface name, no alphabet */ - return (-1); - } - (void) strncpy(devname, name, i + 1); - devname[i + 1] = '\0'; + (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); + if ((cp = strchr(ifname, ':')) != NULL) + *cp = '\0'; - if (strcmp("lo", devname) == 0) { + if (strcmp("lo0", ifname) == 0) { /* - * loopback interface is considered RDS capable + * loopback is considered RDS-capable */ - return (0); + return (B_TRUE); } - (void) strncpy(dev_path, "/dev/", MAXPATHLEN); - if (strlcat(dev_path, devname, MAXPATHLEN) >= MAXPATHLEN) { - /* string overflow */ - return (-1); - } + return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS && + rds_transport_ops->rds_transport_if_lookup_by_name(drv)); +} - ret = ldi_open_by_name(dev_path, FREAD|FWRITE, kcred, &lh, rds_li); - if (ret != 0) { - return (ret); - } +/* + * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. + * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. + */ +static int +rds_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) +{ + int err; + int nifs; - sigintr(&smask, 0); - ret = dl_info(lh, &info, NULL, NULL, NULL); - sigunintr(&smask); - (void) ldi_close(lh, FREAD|FWRITE, kcred); - if (ret != 0) { - return (ret); - } + if ((err = rds_do_ip_ioctl(SIOCGIFNUM, sizeof (int), &nifs)) != 0) + return (err); - if (info.dl_mac_type != DL_IB && - !rds_transport_ops->rds_transport_if_lookup_by_name(devname)) { - return (-1); + /* + * Pad the interface count to account for additional interfaces that + * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 
+ */ + nifs += 4; + + bzero(lifcp, sizeof (struct lifconf)); + lifcp->lifc_family = AF_INET; + lifcp->lifc_len = *bufsizep = (nifs * sizeof (struct lifreq)); + lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_NOSLEEP); + if (lifcp->lifc_buf == NULL) + return (ENOMEM); + + err = rds_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); + if (err != 0) { + kmem_free(lifcp->lifc_buf, *bufsizep); + return (err); } - return (0); } void rds_ioctl_copyin_done(queue_t *q, mblk_t *mp) { - char *addr; + void *addr; mblk_t *mp1; int err = 0; - struct iocblk *iocp = (struct iocblk *)(uintptr_t)mp->b_rptr; + struct iocblk *iocp = (void *)mp->b_rptr; if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) { err = EPROTO; goto done; } - addr = (char *)mp1->b_rptr; + addr = mp1->b_rptr; switch (iocp->ioc_cmd) { - case SIOCGIFNUM: { - /* Get number of interfaces. */ - struct ifconf kifc; - struct ifreq *ifr; - int num_ifs; - int n; - - err = rds_do_ip_ioctl(iocp->ioc_cmd, sizeof (int), - (char *)&num_ifs); - if (err != 0) { - break; - } + uint_t bufsize; + struct lifconf lifc; + struct lifreq *lifrp; + int i, nifs, retval = 0; - kifc.ifc_len = num_ifs * sizeof (struct ifreq); - kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP); - err = rds_do_ip_ioctl(SIOCGIFCONF, - sizeof (struct ifconf), (caddr_t)&kifc); - if (err != 0) { - kmem_free(kifc.ifc_buf, kifc.ifc_len); + if ((err = rds_do_lifconf(&lifc, &bufsize)) != 0) break; - } - ifr = kifc.ifc_req; - n = num_ifs; - for (num_ifs = 0; n > 0; ifr++) { - err = rds_is_ib_interface(ifr->ifr_name); - if (err == 0) { - num_ifs++; - } else if (err > 0) { - num_ifs = 0; - break; - } else { - err = 0; + + nifs = lifc.lifc_len / sizeof (struct lifreq); + for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { + if (strlen(lifrp->lifr_name) <= IFNAMSIZ && + rds_capable_interface(lifrp)) { + retval++; } - n--; } - *((int *)(uintptr_t)addr) = num_ifs; - kmem_free(kifc.ifc_buf, kifc.ifc_len); - } + *((int *)addr) = retval; + kmem_free(lifc.lifc_buf, 
bufsize); break; + } case O_SIOCGIFCONF: case SIOCGIFCONF: { STRUCT_HANDLE(ifconf, ifc); caddr_t ubuf_addr; int ubuf_size; - struct ifconf kifc; - struct ifreq *ifr, *ptr; - int num_ifs; - - STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, - (struct ifconf *)(uintptr_t)addr); + uint_t bufsize; + int i, nifs; + struct lifconf lifc; + struct lifreq *lifrp; + struct ifreq *ifrp; + STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, (struct ifconf *)addr); ubuf_size = STRUCT_FGET(ifc, ifc_len); ubuf_addr = STRUCT_FGETP(ifc, ifc_buf); - err = rds_do_ip_ioctl(SIOCGIFNUM, sizeof (int), - (char *)&num_ifs); - if (err != 0) { + if ((err = rds_do_lifconf(&lifc, &bufsize)) != 0) break; - } - kifc.ifc_len = num_ifs * sizeof (struct ifreq); - kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP); - err = rds_do_ip_ioctl(iocp->ioc_cmd, - sizeof (struct ifconf), (caddr_t)&kifc); - if (err != 0) { - kmem_free(kifc.ifc_buf, kifc.ifc_len); - break; - } mp1 = mi_copyout_alloc(q, mp, ubuf_addr, ubuf_size, B_FALSE); if (mp1 == NULL) { err = ENOMEM; - kmem_free(kifc.ifc_buf, ubuf_size); + kmem_free(lifc.lifc_buf, bufsize); break; } - ifr = kifc.ifc_req; - ptr = (struct ifreq *)(uintptr_t)mp1->b_rptr; - for (; num_ifs > 0 && - (int)((uintptr_t)mp1->b_wptr - (uintptr_t)mp1->b_rptr) < - ubuf_size; num_ifs--, ifr++) { - err = rds_is_ib_interface(ifr->ifr_name); - if (err == 0) { - ifr->ifr_addr.sa_family = AF_INET_OFFLOAD; - bcopy((caddr_t)ifr, ptr, sizeof (struct ifreq)); - ptr++; - mp1->b_wptr = (uchar_t *)ptr; - } else if (err > 0) { - break; - } else { - err = 0; + ifrp = (void *)mp1->b_rptr; + nifs = lifc.lifc_len / sizeof (struct lifreq); + for (lifrp = lifc.lifc_req, i = 0; i < nifs && + MBLKTAIL(mp1) >= sizeof (struct ifreq); i++, lifrp++) { + /* + * Skip entries that are impossible to return with + * SIOCGIFCONF, or not RDS-capable. 
+ */ + if (strlen(lifrp->lifr_name) > IFNAMSIZ || + !rds_capable_interface(lifrp)) { + continue; } + + ifrp->ifr_addr = *(struct sockaddr *)&lifrp->lifr_addr; + ifrp->ifr_addr.sa_family = AF_INET_OFFLOAD; + (void) strlcpy(ifrp->ifr_name, lifrp->lifr_name, + IFNAMSIZ); + ifrp++; + mp1->b_wptr += sizeof (struct ifreq); } - STRUCT_FSET(ifc, ifc_len, (int)((uintptr_t)mp1->b_wptr - - (uintptr_t)mp1->b_rptr)); - kmem_free(kifc.ifc_buf, kifc.ifc_len); - } + STRUCT_FSET(ifc, ifc_len, MBLKL(mp1)); + kmem_free(lifc.lifc_buf, bufsize); break; + } case SIOCGIFMTU: - err = rds_do_ip_ioctl(iocp->ioc_cmd, - sizeof (struct ifreq), addr); - break; - case SIOCGIFFLAGS: - err = rds_do_ip_ioctl(iocp->ioc_cmd, - sizeof (struct ifreq), addr); + err = rds_do_ip_ioctl(iocp->ioc_cmd, sizeof (struct ifreq), + addr); break; - case TI_GETMYNAME: { + case TI_GETMYNAME: { rds_t *rds; STRUCT_HANDLE(strbuf, sb); ipaddr_t v4addr; @@ -287,8 +248,7 @@ rds_ioctl_copyin_done(queue_t *q, mblk_t *mp) sin_t *sin; STRUCT_SET_HANDLE(sb, - ((struct iocblk *)(uintptr_t)mp->b_rptr)->ioc_flag, - (void *)(uintptr_t)addr); + ((struct iocblk *)(uintptr_t)mp->b_rptr)->ioc_flag, addr); rds = (rds_t *)q->q_ptr; ASSERT(rds->rds_family == AF_INET_OFFLOAD); addrlen = sizeof (sin_t); @@ -320,7 +280,6 @@ done: mi_copy_done(q, mp, err); } - void rds_ioctl_copyin_setup(queue_t *q, mblk_t *mp) { @@ -383,38 +342,26 @@ rds_ioctl(queue_t *q, mblk_t *mp) boolean_t rds_verify_bind_address(ipaddr_t addr) { - int numifs; - struct ifconf kifc; - struct ifreq *ifr; - boolean_t ret = B_FALSE; - - - if (rds_do_ip_ioctl(SIOCGIFNUM, sizeof (int), (caddr_t)&numifs)) { - return (ret); - } - - kifc.ifc_len = numifs * sizeof (struct ifreq); - kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP); - - if (rds_do_ip_ioctl(SIOCGIFCONF, sizeof (struct ifconf), - (caddr_t)&kifc)) { - goto done; - } - - ifr = kifc.ifc_req; - for (numifs = kifc.ifc_len / sizeof (struct ifreq); - numifs > 0; numifs--, ifr++) { - struct sockaddr_in *sin; - - sin = 
(struct sockaddr_in *)(uintptr_t)&ifr->ifr_addr; - if ((sin->sin_addr.s_addr == addr) && - (rds_is_ib_interface(ifr->ifr_name) == 0)) { - ret = B_TRUE; - break; + int i, nifs; + uint_t bufsize; + struct lifconf lifc; + struct lifreq *lifrp; + struct sockaddr_in *sinp; + boolean_t retval = B_FALSE; + + if (rds_do_lifconf(&lifc, &bufsize) != 0) + return (B_FALSE); + + nifs = lifc.lifc_len / sizeof (struct lifreq); + for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { + sinp = (struct sockaddr_in *)&lifrp->lifr_addr; + if (rds_capable_interface(lifrp) && + sinp->sin_addr.s_addr == addr) { + retval = B_TRUE; + break; } } -done: - kmem_free(kifc.ifc_buf, kifc.ifc_len); - return (ret); + kmem_free(lifc.lifc_buf, bufsize); + return (retval); } diff --git a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c index bcb3c235be..dd7c9554a5 100644 --- a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c +++ b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,6 +33,7 @@ #include <sys/socket.h> #include <sys/stat.h> #include <net/if_arp.h> +#include <net/if_types.h> #include <sys/file.h> #include <sys/sockio.h> #include <sys/pathname.h> @@ -528,62 +529,112 @@ ibcm_arp_get_ibd_insts(ibcm_arp_ibd_insts_t *ibds) } /* - * Return ibd interfaces and ibd instances. + * Issue an ioctl down to IP. There are several similar versions of this + * function (e.g., rpcib_do_ip_ioctl()); clearly a utility routine is needed. 
*/ static int -ibcm_arp_get_ibd_ipaddr(ibcm_arp_ibd_insts_t *ibds) +ibcm_do_ip_ioctl(int cmd, int len, void *arg) { - TIUSER *tiptr; - vnode_t *kvp; - vnode_t *vp = NULL; - struct strioctl iocb; - struct lifreq lif_req; - int k, ip_cnt; - ibcm_arp_ip_t *ipp; + vnode_t *kvp; + TIUSER *tiptr; + struct strioctl iocb; + int err = 0; - if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) { - if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE, - &tiptr, CRED()) == 0) { - vp = tiptr->fp->f_vnode; - } else { - VN_RELE(kvp); - } - } + if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) != 0) + return (EPROTO); - if (vp == NULL) - return (-1); + if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE, &tiptr, CRED()) != 0) { + VN_RELE(kvp); + return (EPROTO); + } - /* Get ibd ip's */ - ip_cnt = 0; - for (k = 0, ipp = ibds->ibcm_arp_ip; k < ibds->ibcm_arp_ibd_cnt; - k++, ipp++) { + iocb.ic_cmd = cmd; + iocb.ic_timout = 0; + iocb.ic_len = len; + iocb.ic_dp = (caddr_t)arg; + err = kstr_ioctl(tiptr->fp->f_vnode, I_STR, (intptr_t)&iocb); + (void) t_kclose(tiptr, 0); + VN_RELE(kvp); + return (err); +} - (void) bzero((void *)&lif_req, sizeof (struct lifreq)); - (void) snprintf(lif_req.lifr_name, sizeof (lif_req.lifr_name), - "%s%d", IBCM_ARP_IBD_NAME, ipp->ip_inst); +/* + * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. + * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. + */ +static int +ibcm_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) +{ + int err; + struct lifnum lifn; + + bzero(&lifn, sizeof (struct lifnum)); + lifn.lifn_family = AF_UNSPEC; + + err = ibcm_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn); + if (err != 0) + return (err); + + /* + * Pad the interface count to account for additional interfaces that + * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 
+ */ + lifn.lifn_count += 4; + + bzero(lifcp, sizeof (struct lifconf)); + lifcp->lifc_family = AF_UNSPEC; + lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); + lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); + + err = ibcm_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); + if (err != 0) { + kmem_free(lifcp->lifc_buf, *bufsizep); + return (err); + } + return (0); +} - (void) bzero((void *)&iocb, sizeof (struct strioctl)); - iocb.ic_cmd = SIOCGLIFADDR; - iocb.ic_timout = 0; - iocb.ic_len = sizeof (struct lifreq); - iocb.ic_dp = (caddr_t)&lif_req; +/* + * Fill in `ibds' with IP addresses tied to IFT_IB IP interfaces. Returns + * B_TRUE if at least one address was filled in. + */ +static boolean_t +ibcm_arp_get_ibd_ipaddr(ibcm_arp_ibd_insts_t *ibds) +{ + int i, nifs, naddr = 0; + uint_t bufsize; + struct lifconf lifc; + struct lifreq *lifrp; + ibcm_arp_ip_t *ipp; + + if (ibcm_do_lifconf(&lifc, &bufsize) != 0) + return (B_FALSE); + + nifs = lifc.lifc_len / sizeof (struct lifreq); + for (lifrp = lifc.lifc_req, i = 0; + i < nifs && naddr < ibds->ibcm_arp_ibd_cnt; i++, lifrp++) { + if (lifrp->lifr_type != IFT_IB) + continue; - if (kstr_ioctl(vp, I_STR, (intptr_t)&iocb) == 0) { + ipp = &ibds->ibcm_arp_ip[naddr]; + switch (lifrp->lifr_addr.ss_family) { + case AF_INET: ipp->ip_inet_family = AF_INET; - bcopy(&lif_req.lifr_addr, &ipp->ip_cm_sin, + bcopy(&lifrp->lifr_addr, &ipp->ip_cm_sin, sizeof (struct sockaddr_in)); - ip_cnt++; - continue; + naddr++; + break; + case AF_INET6: + ipp->ip_inet_family = AF_INET6; + bcopy(&lifrp->lifr_addr, &ipp->ip_cm_sin6, + sizeof (struct sockaddr_in6)); + naddr++; + break; } } - (void) t_kclose(tiptr, 0); - VN_RELE(kvp); - - if (ip_cnt == 0) - return (-1); - else - return (0); + kmem_free(lifc.lifc_buf, bufsize); + return (naddr > 0); } ibt_status_t @@ -600,7 +651,7 @@ ibcm_arp_get_ibds(ibcm_arp_ibd_insts_t *ibdp) return (IBT_SRC_IP_NOT_FOUND); /* Get the IP addresses of active ports. 
*/ - if (ibcm_arp_get_ibd_ipaddr(ibdp) != 0) { + if (!ibcm_arp_get_ibd_ipaddr(ibdp)) { IBTF_DPRINTF_L2(cmlog, "ibcm_arp_get_ibds: failed to get " "ibd instance: IBT_SRC_IP_NOT_FOUND"); return (IBT_SRC_IP_NOT_FOUND); diff --git a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c index af622d5c8f..29b5116446 100644 --- a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c +++ b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #include <sys/dlpi.h> @@ -35,24 +33,13 @@ #include <sys/ddi.h> #include <sys/cmn_err.h> #include <sys/socket.h> -#include <sys/tihdr.h> #include <net/if.h> -#include <net/if_arp.h> #include <net/if_types.h> -#include <net/if_dl.h> -#include <net/route.h> -#include <sys/sockio.h> #include <netinet/in.h> -#include <netinet/ip6.h> -#include <netinet/icmp6.h> #include <sys/ethernet.h> -#include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ -#include <inet/mi.h> #include <inet/arp.h> #include <inet/ip.h> -#include <inet/ip_multi.h> #include <inet/ip_ire.h> -#include <inet/ip_rts.h> #include <inet/ip_if.h> #include <sys/ib/mgt/ibcm/ibcm_arp.h> #include <inet/ip_ftable.h> @@ -389,21 +376,16 @@ ibcm_arp_pr_callback(ibcm_arp_prwqn_t *wqnp, int status) wqnp->func((void *)wqnp, status); } +/* + * Check if the interface is loopback or IB. 
+ */ static int -ibcm_arp_check_interface(ibcm_arp_prwqn_t *wqnp, int length) +ibcm_arp_check_interface(ill_t *ill) { - /* - * if the i/f is not ib or lo device, fail the request - */ - if (bcmp(wqnp->ifname, "ibd", 3) == 0) { - if (length != IPOIB_ADDRL) { - return (EINVAL); - } - } else if (bcmp(wqnp->ifname, "lo", 2)) { - return (ETIMEDOUT); - } + if (IS_LOOPBACK(ill) || ill->ill_type == IFT_IB) + return (0); - return (0); + return (ETIMEDOUT); } #define IBTL_IPV4_ADDR(a) (a->un.ip4addr) @@ -414,11 +396,10 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, ibcm_arp_pr_comp_func_t func) { ibcm_arp_prwqn_t *wqnp; - ire_t *ire; - ire_t *src_ire; + ire_t *ire = NULL; + ire_t *src_ire = NULL; ipif_t *ipif; - ill_t *ill; - int length; + ill_t *ill, *hwaddr_ill = NULL; ip_stack_t *ipst; IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup(src %p dest %p)", @@ -449,13 +430,10 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, if (src_ire == NULL) { IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " "ire_ctable_lookup failed"); - netstack_rele(ipst->ips_netstack); - ibcm_arp_prwqn_delete(wqnp); ib_s->status = EFAULT; - return (1); + goto fail; } - /* * get an ire for the destination adress with the matching source * address @@ -463,16 +441,11 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, ire = ire_ftable_lookup(IBTL_IPV4_ADDR(dst_addr), 0, 0, 0, src_ire->ire_ipif, 0, src_ire->ire_zoneid, 0, NULL, MATCH_IRE_SRC, ipst); - - netstack_rele(ipst->ips_netstack); - if (ire == NULL) { IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " "ire_ftable_lookup failed"); - IRE_REFRELE(src_ire); - ibcm_arp_prwqn_delete(wqnp); ib_s->status = EFAULT; - return (1); + goto fail; } wqnp->src_addr.un.ip4addr = ire->ire_src_addr; @@ -480,35 +453,56 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, ipif = src_ire->ire_ipif; ill = ipif->ipif_ill; - length = ill->ill_name_length; - bcopy(ill->ill_name, &wqnp->ifname, 
ill->ill_name_length); - wqnp->ifname[length] = '\0'; - bcopy(ill->ill_phys_addr, &wqnp->src_mac, - ill->ill_phys_addr_length); + (void) strlcpy(wqnp->ifname, ill->ill_name, sizeof (wqnp->ifname)); - IRE_REFRELE(ire); - IRE_REFRELE(src_ire); + /* + * For IPMP data addresses, we need to use the hardware address of the + * interface bound to the given address. + */ + if (IS_IPMP(ill)) { + if ((hwaddr_ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { + IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: no bound " + "ill for IPMP interface %s", ill->ill_name); + ib_s->status = EFAULT; + goto fail; + } + } else { + hwaddr_ill = ill; + ill_refhold(hwaddr_ill); /* for symmetry */ + } - ib_s->status = - ibcm_arp_check_interface(wqnp, ill->ill_phys_addr_length); - if (ib_s->status) { + bcopy(hwaddr_ill->ill_phys_addr, &wqnp->src_mac, + hwaddr_ill->ill_phys_addr_length); + + if ((ib_s->status = ibcm_arp_check_interface(hwaddr_ill)) != 0) { IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " "ibcm_arp_check_interface failed"); - ibcm_arp_prwqn_delete(wqnp); - return (1); + goto fail; } - ib_s->status = ibcm_arp_squery_arp(wqnp); - if (ib_s->status) { + if ((ib_s->status = ibcm_arp_squery_arp(wqnp)) != 0) { IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " "ibcm_arp_squery_arp failed"); - ibcm_arp_prwqn_delete(wqnp); - return (1); + goto fail; } - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup: Return: 0x%p", wqnp); + ill_refrele(hwaddr_ill); + IRE_REFRELE(ire); + IRE_REFRELE(src_ire); + netstack_rele(ipst->ips_netstack); + IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup: Return: 0x%p", wqnp); return (0); +fail: + if (hwaddr_ill != NULL) + ill_refrele(hwaddr_ill); + if (ire != NULL) + IRE_REFRELE(ire); + if (src_ire != NULL) + IRE_REFRELE(src_ire); + ibcm_arp_prwqn_delete(wqnp); + netstack_rele(ipst->ips_netstack); + return (1); } #define IBCM_H2N_GID(gid) \ diff --git a/usr/src/uts/common/ipp/ipgpc/classifier-objects.h b/usr/src/uts/common/ipp/ipgpc/classifier-objects.h index f1cb20b88d..4002a39573 
100644 --- a/usr/src/uts/common/ipp/ipgpc/classifier-objects.h +++ b/usr/src/uts/common/ipp/ipgpc/classifier-objects.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IPP_IPGPC_CLASSIFIER_OBJECTS_H #define _IPP_IPGPC_CLASSIFIER_OBJECTS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/time.h> #include <ipp/ipp.h> #include <ipp/ipgpc/ipgpc.h> @@ -64,14 +61,12 @@ extern "C" { #define IPGPC_TABLE_UID 8 #define IPGPC_TABLE_PROJID 9 #define IPGPC_TABLE_IF 10 -#define IPGPC_TABLE_IF_GRPNM 11 -#define IPGPC_TABLE_DIR 12 +#define IPGPC_TABLE_DIR 11 #define TABLE_ID_OFFSET IPGPC_TABLE_PROTOID #define PROTOID_IDX (IPGPC_TABLE_PROTOID - TABLE_ID_OFFSET) #define UID_IDX (IPGPC_TABLE_UID - TABLE_ID_OFFSET) #define PROJID_IDX (IPGPC_TABLE_PROJID - TABLE_ID_OFFSET) #define IF_IDX (IPGPC_TABLE_IF - TABLE_ID_OFFSET) -#define IF_GRPNM_IDX (IPGPC_TABLE_IF_GRPNM - TABLE_ID_OFFSET) #define DIR_IDX (IPGPC_TABLE_DIR - TABLE_ID_OFFSET) /* Match types for selector searching */ @@ -91,11 +86,10 @@ extern "C" { #define UID_MASK 0x40 #define PROJID_MASK 0x80 #define IF_MASK 0x100 -#define IF_GRPNM_MASK 0x200 -#define DIR_MASK 0x400 +#define DIR_MASK 0x200 #define ALL_MATCH_MASK (DS_MASK | PROTO_MASK | SADDR_MASK | DADDR_MASK | \ SPORT_MASK | DPORT_MASK | UID_MASK | PROJID_MASK | \ - IF_MASK | IF_GRPNM_MASK | DIR_MASK) + 
IF_MASK | DIR_MASK) #define HASH_SIZE 11 /* default hash table size */ @@ -108,7 +102,6 @@ typedef struct ipgpc_filter_s { char filter_name[MAXNAMELEN]; /* null terminated name of filter */ /* exact match selectors */ - char if_groupname[LIFNAMSIZ]; /* null terminated iface groupname */ uid_t uid; /* uid key, value = exact or IPGPC_WILDCARD */ projid_t projid; /* project id, " " */ uint_t if_index; /* interface index, " " or 0 for wildcard */ diff --git a/usr/src/uts/common/ipp/ipgpc/classifier.c b/usr/src/uts/common/ipp/ipgpc/classifier.c index bb09a3ca89..9137fcba9a 100644 --- a/usr/src/uts/common/ipp/ipgpc/classifier.c +++ b/usr/src/uts/common/ipp/ipgpc/classifier.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/kmem.h> #include <sys/systm.h> #include <sys/socket.h> @@ -78,7 +76,6 @@ common_classify(ipgpc_packet_t *packet, ht_match_t *fid_table, uint16_t *slctrs_srchd) { int match_status; - int if_grpnm_hv; /* Find on packet direction */ match_status = @@ -96,19 +93,6 @@ common_classify(ipgpc_packet_t *packet, ht_match_t *fid_table, return (match_status); } - /* Find on IF_GRPNM of packet */ - if (packet->if_groupname_len > 0) { - if_grpnm_hv = name_hash(packet->if_groupname, TABLE_SIZE); - } else { - if_grpnm_hv = IPGPC_WILDCARD; - } - match_status = - ipgpc_findfilters(IPGPC_TABLE_IF_GRPNM, if_grpnm_hv, fid_table); - if (CHECK_MATCH_STATUS(match_status, slctrs_srchd, - ipgpc_table_list[IF_GRPNM_IDX].info.mask) != NORMAL_MATCH) { - return (match_status); - } - /* Find on DS field */ match_status = ipgpc_findfilters(IPGPC_BA_DSID, packet->dsfield, fid_table); @@ -149,9 +133,8 @@ common_classify(ipgpc_packet_t *packet, ht_match_t *fid_table, /* Find on IP Source Port field */ if (packet->sport > 0) { - match_status = - ipgpc_findfilters(IPGPC_TRIE_SPORTID, packet->sport, 
- fid_table); + match_status = ipgpc_findfilters(IPGPC_TRIE_SPORTID, + packet->sport, fid_table); if (CHECK_MATCH_STATUS(match_status, slctrs_srchd, ipgpc_trie_list[IPGPC_TRIE_SPORTID].info.mask) != NORMAL_MATCH) { @@ -164,9 +147,8 @@ common_classify(ipgpc_packet_t *packet, ht_match_t *fid_table, /* Find on IP Destination Port field */ if (packet->dport > 0) { - match_status = - ipgpc_findfilters(IPGPC_TRIE_DPORTID, packet->dport, - fid_table); + match_status = ipgpc_findfilters(IPGPC_TRIE_DPORTID, + packet->dport, fid_table); if (CHECK_MATCH_STATUS(match_status, slctrs_srchd, ipgpc_trie_list[IPGPC_TRIE_DPORTID].info.mask) != NORMAL_MATCH) { @@ -261,12 +243,11 @@ ipgpc_classify(int af, ipgpc_packet_t *packet) match_status = 0; slctrs_srchd = ALL_MATCH_MASK; - bzero(fid_table, sizeof (ht_match_t) * HASH_SIZE); /* first search all address family independent selectors */ - if ((rc = common_classify(packet, fid_table, &slctrs_srchd)) != - NORMAL_MATCH) { + rc = common_classify(packet, fid_table, &slctrs_srchd); + if (rc != NORMAL_MATCH) { /* free all dynamic allocated memory */ FREE_FID_TABLE(fid_table, p, q, i); if (rc == NO_MATCHES) { @@ -453,7 +434,7 @@ bestmatch(ht_match_t *fid_table, uint16_t bestmask) */ real_prio = ((uint64_t)ipgpc_fid_list[key].filter.priority - << 32) | + << 32) | (uint64_t)~ipgpc_fid_list[key].filter.precedence; /* check to see if this is the new bestmatch */ @@ -689,35 +670,32 @@ parse_packet6(ipgpc_packet_t *packet, mblk_t *mp) void print_packet(int af, ipgpc_packet_t *pkt) { + char saddrbuf[INET6_ADDRSTRLEN]; + char daddrbuf[INET6_ADDRSTRLEN]; + if (af == AF_INET) { - char saddrbuf[INET_ADDRSTRLEN]; - char daddrbuf[INET_ADDRSTRLEN]; + (void) inet_ntop(af, &V4_PART_OF_V6(pkt->saddr), saddrbuf, + sizeof (saddrbuf)); + (void) inet_ntop(af, &V4_PART_OF_V6(pkt->daddr), daddrbuf, + sizeof (daddrbuf)); + ipgpc4dbg(("print_packet: saddr = %s, daddr = %s, sport = %u" \ ", dport = %u, proto = %u, dsfield = %x, uid = %d," \ - " if_index = %d, 
if_groupname = %s, projid = %d, " \ - "direction = %d", - inet_ntop(af, &V4_PART_OF_V6(pkt->saddr), saddrbuf, - sizeof (saddrbuf)), - inet_ntop(af, &V4_PART_OF_V6(pkt->daddr), daddrbuf, - sizeof (daddrbuf)), - ntohs(pkt->sport), ntohs(pkt->dport), pkt->proto, + " if_index = %d, projid = %d, direction = %d", saddrbuf, + daddrbuf, ntohs(pkt->sport), ntohs(pkt->dport), pkt->proto, pkt->dsfield, pkt->uid, pkt->if_index, - (pkt->if_groupname != NULL) ? pkt->if_groupname : "NULL", pkt->projid, pkt->direction)); } else if (af == AF_INET6) { - char saddrbuf[INET6_ADDRSTRLEN]; - char daddrbuf[INET6_ADDRSTRLEN]; + (void) inet_ntop(af, pkt->saddr.s6_addr32, saddrbuf, + sizeof (saddrbuf)); + (void) inet_ntop(af, pkt->daddr.s6_addr32, daddrbuf, + sizeof (daddrbuf)); + ipgpc4dbg(("print_packet: saddr = %s, daddr = %s, sport = %u" \ ", dport = %u, proto = %u, dsfield = %x, uid = %d," \ - " if_index = %d, if_groupname = %s, projid = %d, " \ - "direction = %d", - inet_ntop(af, pkt->saddr.s6_addr32, saddrbuf, - sizeof (saddrbuf)), - inet_ntop(af, pkt->daddr.s6_addr32, daddrbuf, - sizeof (daddrbuf)), - ntohs(pkt->sport), ntohs(pkt->dport), pkt->proto, + " if_index = %d, projid = %d, direction = %d", saddrbuf, + daddrbuf, ntohs(pkt->sport), ntohs(pkt->dport), pkt->proto, pkt->dsfield, pkt->uid, pkt->if_index, - (pkt->if_groupname != NULL) ? pkt->if_groupname : "NULL", pkt->projid, pkt->direction)); } } diff --git a/usr/src/uts/common/ipp/ipgpc/classifier.h b/usr/src/uts/common/ipp/ipgpc/classifier.h index 4ee36ae32b..629aeab2f5 100644 --- a/usr/src/uts/common/ipp/ipgpc/classifier.h +++ b/usr/src/uts/common/ipp/ipgpc/classifier.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IPP_IPGPC_CLASSIFIER_H #define _IPP_IPGPC_CLASSIFIER_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/cmn_err.h> #include <ipp/ipgpc/filters.h> @@ -74,8 +71,6 @@ typedef struct ipgpc_packet_s { projid_t projid; /* project id for packet */ uint_t if_index; /* interface index */ uint32_t direction; /* packet direction */ - char *if_groupname; /* interface group name */ - uint_t if_groupname_len; /* interface group name length */ uint_t len; /* length of packet */ } ipgpc_packet_t; diff --git a/usr/src/uts/common/ipp/ipgpc/classifierddi.c b/usr/src/uts/common/ipp/ipgpc/classifierddi.c index d9955d84a6..4d31da6396 100644 --- a/usr/src/uts/common/ipp/ipgpc/classifierddi.c +++ b/usr/src/uts/common/ipp/ipgpc/classifierddi.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/systm.h> #include <sys/socket.h> #include <netinet/in.h> @@ -433,12 +431,6 @@ ipgpc_invoke_action(ipp_action_id_t aid, ipp_packet_t *packet) } } - /* The ill_index could be 0 when called from forwarding (read) path */ - if (ill_idx > 0) { - ill = ill_lookup_on_ifindex_global_instance(ill_idx, B_FALSE, - NULL, NULL, NULL, NULL); - } - /* parse the packet from the message block */ ipha = (ipha_t *)mp->b_rptr; /* Determine IP Header Version */ @@ -452,23 +444,27 @@ ipgpc_invoke_action(ipp_action_id_t aid, ipp_packet_t *packet) pkt.direction = callout_pos; /* set packet direction */ + /* The ill_index could be 0 when called from forwarding (read) path */ + if (ill_idx > 0) { + ill = ill_lookup_on_ifindex_global_instance(ill_idx, B_FALSE, + NULL, NULL, NULL, NULL); + } if (ill != NULL) { - pkt.if_index = ill->ill_phyint->phyint_ifindex; - pkt.if_groupname_len = - ill->ill_phyint->phyint_groupname_len; - if (pkt.if_groupname_len > 0) { - pkt.if_groupname = - ill->ill_phyint->phyint_groupname; - } else { - pkt.if_groupname = NULL; - } - /* Got the fields from the ILL, go ahead and refrele */ + /* + * Since all IPP actions in an IPMP group are performed + * relative to the IPMP group interface, if this is an + * underlying interface in an IPMP group, use the IPMP + * group interface's index. 
+ */ + if (IS_UNDER_IPMP(ill)) + pkt.if_index = ipmp_ill_get_ipmp_ifindex(ill); + else + pkt.if_index = ill->ill_phyint->phyint_ifindex; + /* Got the field from the ILL, go ahead and refrele */ ill_refrele(ill); } else { - /* unknown if_index and if_group */ + /* unknown if_index */ pkt.if_index = IPGPC_UNSPECIFIED; - pkt.if_groupname = NULL; - pkt.if_groupname_len = 0; } if (ipgpc_debug > 5) { diff --git a/usr/src/uts/common/ipp/ipgpc/filters.c b/usr/src/uts/common/ipp/ipgpc/filters.c index 7dd4dce48e..3a2f954d0a 100644 --- a/usr/src/uts/common/ipp/ipgpc/filters.c +++ b/usr/src/uts/common/ipp/ipgpc/filters.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/atomic.h> #include <sys/types.h> #include <sys/systm.h> @@ -83,7 +81,6 @@ static ht_node_t proto_table[TABLE_SIZE]; /* protocol table */ static ht_node_t uid_table[TABLE_SIZE]; /* IPGPC_UID table */ static ht_node_t projid_table[TABLE_SIZE]; /* IPGPC_PROJID table */ static ht_node_t if_table[TABLE_SIZE]; /* Interface ID table */ -static ht_node_t if_grpnm_table[TABLE_SIZE]; /* Interface Group Name table */ static ht_node_t dir_table[TABLE_SIZE]; /* packet direction table */ static ipp_action_id_t ipgpc_aid; /* the action id for ipgpc */ @@ -262,9 +259,6 @@ initialize_tables(void) /* IF_INDEX selector structure */ insert_ipgpc_table_list_info(IF_IDX, if_table, IPGPC_UNSPECIFIED, IF_MASK); - /* IF_GRPNM_INDEX selector structure */ - insert_ipgpc_table_list_info(IF_GRPNM_IDX, if_grpnm_table, - IPGPC_WILDCARD, IF_GRPNM_MASK); /* DIR selector structure */ insert_ipgpc_table_list_info(DIR_IDX, dir_table, IPGPC_UNSPECIFIED, DIR_MASK); @@ -617,19 +611,6 @@ ipgpc_parse_filter(ipgpc_filter_t *filter, nvlist_t *nvlp) bcopy(s, filter->filter_name, (strlen(s) + 1)); - /* parse interface group name */ - if 
(nvlist_lookup_string(nvlp, IPGPC_IF_GROUPNAME, &s) != 0) { - filter->if_groupname[0] = '\0'; - } else { - /* check max interface group name lenght */ - if ((strlen(s) + 1) > LIFNAMSIZ) { - ipgpc0dbg(("ipgpc_parse_filter: interface group name" \ - " > LIFNAMSIZ")); - return (EINVAL); - } - bcopy(s, filter->if_groupname, (strlen(s) + 1)); - } - /* parse uid */ if (nvlist_lookup_uint32(nvlp, IPGPC_UID, &filter->uid) != 0) { filter->uid = (uid_t)IPGPC_WILDCARD; @@ -976,8 +957,6 @@ insertfid(int filter_id, ipgpc_filter_t *filter, uint_t class_id) static void common_addfilter(fid_t *fid, int filter_id) { - int if_grpnm_hv; - /* start trie inserts */ /* add source port selector */ if (t_insert(&ipgpc_trie_list[IPGPC_TRIE_SPORTID], filter_id, @@ -1025,17 +1004,6 @@ common_addfilter(fid_t *fid, int filter_id) fid->insert_map |= IF_MASK; } - /* add interface groupname selector */ - if (fid->filter.if_groupname[0] == '\0') { - if_grpnm_hv = IPGPC_WILDCARD; - } else { - if_grpnm_hv = name_hash(fid->filter.if_groupname, TABLE_SIZE); - } - if (ht_insert(&ipgpc_table_list[IF_GRPNM_IDX], filter_id, if_grpnm_hv) - == NORMAL_VALUE) { - fid->insert_map |= IF_GRPNM_MASK; - } - /* add direction selector */ if (ht_insert(&ipgpc_table_list[DIR_IDX], filter_id, fid->filter.direction) == NORMAL_VALUE) { @@ -1102,8 +1070,8 @@ ipgpc_addfilter(ipgpc_filter_t *filter, char *class_name, ipp_flags_t flags) fid_t *fid; unsigned class_id; - if ((err = class_name2id(&class_id, class_name, ipgpc_num_cls)) != - EEXIST) { + err = class_name2id(&class_id, class_name, ipgpc_num_cls); + if (err != EEXIST) { ipgpc0dbg(("ipgpc_addfilter: class lookup error %d", err)); return (err); } @@ -1376,9 +1344,8 @@ insertcid(ipgpc_class_t *in_class, int *out_class_id) /* init kstat entry */ if ((rc = class_statinit(in_class, class_id)) != 0) { ipgpc_cid_list[class_id].info = -1; - ipgpc0dbg(("insertcid: " \ - "class_statinit failed with " \ - "error %d", rc)); + ipgpc0dbg(("insertcid: " + "class_statinit failed 
with error %d", rc)); mutex_exit(&ipgpc_cid_list_lock); return (rc); } @@ -1409,8 +1376,6 @@ insertcid(ipgpc_class_t *in_class, int *out_class_id) static void common_removefilter(int in_filter_id, fid_t *fid) { - int if_grpnm_hv; - /* start trie removes */ t_remove(&ipgpc_trie_list[IPGPC_TRIE_SPORTID], in_filter_id, fid->filter.sport, fid->filter.sport_mask); @@ -1438,14 +1403,6 @@ common_removefilter(int in_filter_id, fid_t *fid) /* remove id from interface id table */ ht_remove(&ipgpc_table_list[IF_IDX], in_filter_id, fid->filter.if_index); - - /* remove id from interface group name table */ - if (fid->filter.if_groupname[0] == '\0') { - if_grpnm_hv = IPGPC_WILDCARD; - } else { - if_grpnm_hv = name_hash(fid->filter.if_groupname, TABLE_SIZE); - } - ht_remove(&ipgpc_table_list[IF_GRPNM_IDX], in_filter_id, if_grpnm_hv); /* remove id from direction table */ ht_remove(&ipgpc_table_list[DIR_IDX], in_filter_id, fid->filter.direction); @@ -1782,7 +1739,6 @@ int ipgpc_modifyclass(nvlist_t **nvlpp, ipp_flags_t flags) { unsigned class_id; - ipp_stat_t *cl_stats; ipgpc_class_t in_class; char *name; int rc; @@ -1837,15 +1793,14 @@ ipgpc_modifyclass(nvlist_t **nvlpp, ipp_flags_t flags) /* check to see if gather_stats booleans differ */ if ((ipgpc_cid_list[class_id].aclass.gather_stats != in_class.gather_stats)) { - if (ipgpc_cid_list[class_id].aclass.gather_stats == - B_TRUE) { - /* delete kstat entry */ - if (ipgpc_cid_list[class_id].cl_stats != NULL) { - cl_stats = - ipgpc_cid_list[class_id].cl_stats; - ipp_stat_destroy(cl_stats); - ipgpc_cid_list[class_id].cl_stats = NULL; - } + if (ipgpc_cid_list[class_id].aclass.gather_stats) { + /* delete kstat entry */ + if (ipgpc_cid_list[class_id].cl_stats != NULL) { + ipp_stat_destroy( + ipgpc_cid_list[class_id].cl_stats); + ipgpc_cid_list[class_id].cl_stats = + NULL; + } } else { /* gather_stats == B_FALSE */ if ((rc = class_statinit(&in_class, class_id)) != 0) { @@ -2326,14 +2281,6 @@ build_filter_nvlist(nvlist_t **nvlpp, 
ipgpc_filter_t *in_filter, return (rc); } - /* add interface groupname */ - if (in_filter->if_groupname[0] != '\0') { - if ((rc = nvlist_add_string(nvlp, IPGPC_IF_GROUPNAME, - in_filter->if_groupname)) != 0) { - return (rc); - } - } - /* add uid */ if (in_filter->uid != IPGPC_WILDCARD) { if ((rc = nvlist_add_uint32(nvlp, IPGPC_UID, in_filter->uid)) diff --git a/usr/src/uts/common/ipp/ipgpc/ipgpc.h b/usr/src/uts/common/ipp/ipgpc/ipgpc.h index f2e1354132..51edc313f8 100644 --- a/usr/src/uts/common/ipp/ipgpc/ipgpc.h +++ b/usr/src/uts/common/ipp/ipgpc/ipgpc.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IPP_IPGPC_IPGPC_H #define _IPP_IPGPC_IPGPC_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/kmem.h> #include <sys/socket.h> @@ -48,7 +45,6 @@ extern "C" { #define IPGPC_NAME "ipgpc" /* config names of name-value pairs and type */ -#define IPGPC_IF_GROUPNAME "ipgpc.if_groupname" /* string */ #define IPGPC_UID "ipgpc.user" /* int32_t */ #define IPGPC_PROJID "ipgpc.projid" /* int32_t */ #define IPGPC_IF_INDEX "ipgpc.if_index" /* uint32_t */ diff --git a/usr/src/uts/common/net/if.h b/usr/src/uts/common/net/if.h index 904fe078cb..05f013e4dc 100644 --- a/usr/src/uts/common/net/if.h +++ b/usr/src/uts/common/net/if.h @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -12,7 +12,6 @@ #ifndef _NET_IF_H #define _NET_IF_H -#pragma ident "%Z%%M% %I% %E% SMI" /* if.h 1.26 90/05/29 SMI; from UCB 7.1 6/4/86 */ #include <sys/feature_tests.h> @@ -105,7 +104,7 @@ struct ifnet { * If you define a flag here, you need to define one in ip_if.h before * using the new flag in IP. Don't use these flags directly in IP. */ -#define IFF_UP 0x0000000001 /* interface is up */ +#define IFF_UP 0x0000000001 /* address is up */ #define IFF_BROADCAST 0x0000000002 /* broadcast address valid */ #define IFF_DEBUG 0x0000000004 /* turn on debugging */ #define IFF_LOOPBACK 0x0000000008 /* is a loopback net */ @@ -138,7 +137,7 @@ struct ifnet { */ #define IFF_NOXMIT 0x0000010000 /* Do not transmit packets */ #define IFF_NOLOCAL 0x0000020000 /* No address - just on-link subnet */ -#define IFF_DEPRECATED 0x0000040000 /* interface address deprecated */ +#define IFF_DEPRECATED 0x0000040000 /* Address is deprecated */ #define IFF_ADDRCONF 0x0000080000 /* address from stateless addrconf */ #define IFF_ROUTER 0x0000100000 /* router on this interface */ @@ -149,14 +148,12 @@ struct ifnet { #define IFF_IPV4 0x0001000000 /* IPv4 interface */ #define IFF_IPV6 0x0002000000 /* IPv6 interface */ /* 0x0004000000 was IFF_MIPRUNNING */ -#define IFF_NOFAILOVER 0x0008000000 /* Don't failover on NIC failure */ +#define IFF_NOFAILOVER 0x0008000000 /* in.mpathd(1M) test address */ -#define IFF_FAILED 0x0010000000 /* NIC has failed */ -#define IFF_STANDBY 0x0020000000 /* Standby NIC to be used on failures */ -#define IFF_INACTIVE 0x0040000000 /* NIC active or not ? 
*/ - /* Used for Standby NIC or */ - /* when FAILBACK is disabled by user */ -#define IFF_OFFLINE 0x0080000000 /* NIC has been offlined */ +#define IFF_FAILED 0x0010000000 /* Interface has failed */ +#define IFF_STANDBY 0x0020000000 /* Interface is a hot-spare */ +#define IFF_INACTIVE 0x0040000000 /* Functioning but not used for data */ +#define IFF_OFFLINE 0x0080000000 /* Interface is offline */ /* * The IFF_XRESOLV flag is an evolving interface and is subject @@ -170,14 +167,22 @@ struct ifnet { #define IFF_FIXEDMTU 0x1000000000ll /* MTU manually set with SIOCSLIFMTU */ #define IFF_VIRTUAL 0x2000000000ll /* Does not send or receive packets */ #define IFF_DUPLICATE 0x4000000000ll /* Local address already in use */ +#define IFF_IPMP 0x8000000000ll /* IPMP IP interface */ -/* flags set internally only: */ +/* flags that cannot be changed by userland on any interface */ #define IFF_CANTCHANGE \ (IFF_BROADCAST | IFF_POINTOPOINT | IFF_RUNNING | IFF_PROMISC | \ IFF_MULTICAST | IFF_MULTI_BCAST | IFF_UNNUMBERED | IFF_IPV4 | \ - IFF_IPV6 | IFF_INACTIVE | IFF_FIXEDMTU | IFF_VIRTUAL | \ + IFF_IPV6 | IFF_IPMP | IFF_FIXEDMTU | IFF_VIRTUAL | \ IFF_LOOPBACK | IFF_ALLMULTI | IFF_DUPLICATE | IFF_COS_ENABLED) +/* flags that cannot be changed by userland on an IPMP interface */ +#define IFF_IPMP_CANTCHANGE IFF_FAILED + +/* flags that can never be set on an IPMP interface */ +#define IFF_IPMP_INVALID (IFF_STANDBY | IFF_INACTIVE | IFF_OFFLINE | \ + IFF_NOFAILOVER | IFF_NOARP | IFF_NONUD | IFF_XRESOLV) + /* * Output queues (ifp->if_snd) and internetwork datagram level (pup level 1) * input routines have queues of messages stored on ifqueue structures @@ -354,7 +359,7 @@ struct lifreq { } lifr_lifru1; #define lifr_addrlen lifr_lifru1.lifru_addrlen #define lifr_ppa lifr_lifru1.lifru_ppa /* Driver's ppa */ - uint_t lifr_movetoindex; /* FAILOVER/FAILBACK ifindex */ + uint_t lifr_type; /* IFT_ETHER, ... 
*/ union { struct sockaddr_storage lifru_addr; struct sockaddr_storage lifru_dstaddr; @@ -371,6 +376,7 @@ struct lifreq { struct lif_nd_req lifru_nd_req; struct lif_ifinfo_req lifru_ifinfo_req; char lifru_groupname[LIFGRNAMSIZ]; /* SIOC[GS]LIFGROUPNAME */ + char lifru_binding[LIFNAMSIZ]; /* SIOCGLIFBINDING */ uint_t lifru_delay; /* SIOC[GS]LIFNOTIFYDELAY */ zoneid_t lifru_zoneid; /* SIOC[GS]LIFZONE */ } lifr_lifru; @@ -392,6 +398,7 @@ struct lifreq { #define lifr_nd lifr_lifru.lifru_nd_req /* SIOCLIF*ND */ #define lifr_ifinfo lifr_lifru.lifru_ifinfo_req /* SIOC[GS]LIFLNKINFO */ #define lifr_groupname lifr_lifru.lifru_groupname +#define lifr_binding lifr_lifru.lifru_binding #define lifr_delay lifr_lifru.lifru_delay #define lifr_zoneid lifr_lifru.lifru_zoneid }; @@ -556,6 +563,7 @@ struct lifsrcof { #define LIFC_TEMPORARY 0x04 /* Include IFF_TEMPORARY interfaces */ #define LIFC_ALLZONES 0x08 /* Include all zones */ /* (must be issued from global zone) */ +#define LIFC_UNDER_IPMP 0x10 /* Include underlying IPMP interfaces */ #if defined(_SYSCALL32) @@ -582,6 +590,22 @@ struct lifsrcof32 { #endif /* _SYSCALL32 */ /* + * IPMP group information, for use with SIOCGLIFGROUPINFO. + */ +typedef struct lifgroupinfo { + char gi_grname[LIFGRNAMSIZ]; /* group name (set by caller) */ + char gi_grifname[LIFNAMSIZ]; /* IPMP meta-interface name */ + char gi_m4ifname[LIFNAMSIZ]; /* v4 mcast interface name */ + char gi_m6ifname[LIFNAMSIZ]; /* v6 mcast interface name */ + char gi_bcifname[LIFNAMSIZ]; /* v4 bcast interface name */ + boolean_t gi_v4; /* group is plumbed for v4 */ + boolean_t gi_v6; /* group is plumbed for v6 */ + uint_t gi_nv4; /* # of underlying v4 if's */ + uint_t gi_nv6; /* # of underlying v6 if's */ + uint_t gi_mactype; /* DLPI mac type of group */ +} lifgroupinfo_t; + +/* * OBSOLETE: Structure used in SIOCGIFCONF request. 
* Used to retrieve interface configuration * for machine (useful for programs which diff --git a/usr/src/uts/common/net/route.h b/usr/src/uts/common/net/route.h index 078971918d..3e4307f25e 100644 --- a/usr/src/uts/common/net/route.h +++ b/usr/src/uts/common/net/route.h @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -45,7 +45,6 @@ #ifndef _NET_ROUTE_H #define _NET_ROUTE_H -#pragma ident "%Z%%M% %I% %E% SMI" /* from UCB 8.5 (Berkeley) 2/8/95 */ #include <sys/tsol/label.h> @@ -254,6 +253,18 @@ typedef struct tsol_rtsecattr_s { #define RTSA_CIPSO 0x100 /* CIPSO protocol */ #define RTSA_SLRANGE (RTSA_MINSL|RTSA_MAXSL) +/* + * Routing socket options. + */ +#define RT_AWARE 0x0001 /* set awareness of hidden interfaces */ + +/* + * Supported RT_AWARE values. As a convenience, the bit-values here mirror + * the LIFC_* values. + */ +#define RTAW_DEFAULT 0x0000 /* unaware application */ +#define RTAW_UNDER_IPMP 0x0010 /* aware of underlying IPMP interfaces */ + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/netinet/in.h b/usr/src/uts/common/netinet/in.h index 782e2dc340..fc2c750ba7 100644 --- a/usr/src/uts/common/netinet/in.h +++ b/usr/src/uts/common/netinet/in.h @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -932,15 +932,7 @@ typedef struct ipsec_req { #define IP_BOUND_IF 0x41 /* bind socket to an ifindex */ #define IP_UNSPEC_SRC 0x42 /* use unspecified source address */ #define IP_BROADCAST_TTL 0x43 /* use specific TTL for broadcast */ - -/* - * IP_DONTFAILOVER_IF option is used to indicate that outbound unicast and - * multicast packets go through the specified interface, no load spreading, - * no failover. - * This is a Sun private interface. 
- */ -#define IP_DONTFAILOVER_IF 0x44 - +/* can be reused 0x44 */ #define IP_DHCPINIT_IF 0x45 /* accept all unicast DHCP traffic */ /* @@ -1258,15 +1250,6 @@ typedef struct { #define IPV6_BOUND_IF 0x41 /* bind to an ifindex */ #define IPV6_UNSPEC_SRC 0x42 /* source of packets set to */ /* unspecified (all zeros) */ -#define IPV6_BOUND_PIF 0x43 /* Bind to Physical interface */ - /* No load balancing or failover */ -/* - * IPV6_DONTFAILOVER_IF option is used to indicate that outbound unicast and - * multicast packets go through the specified interface, no load spreading, - * no failover. - * This is a Sun private interface. - */ -#define IPV6_DONTFAILOVER_IF 0x44 /* * Miscellaneous IPv6 constants. diff --git a/usr/src/uts/common/rpc/rpcib.c b/usr/src/uts/common/rpc/rpcib.c index d0edb2e8f0..aba7803131 100644 --- a/usr/src/uts/common/rpc/rpcib.c +++ b/usr/src/uts/common/rpc/rpcib.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -56,7 +56,6 @@ #include <sys/errno.h> #include <sys/kmem.h> #include <sys/debug.h> -#include <sys/systm.h> #include <sys/pathname.h> #include <sys/kstat.h> #include <sys/t_lock.h> @@ -67,47 +66,43 @@ #include <sys/callb.h> #include <sys/sunddi.h> #include <sys/sunndi.h> -#include <sys/sunldi.h> #include <sys/sdt.h> -#include <sys/dlpi.h> #include <sys/ib/ibtl/ibti.h> #include <rpc/rpc.h> #include <rpc/ib.h> - #include <sys/modctl.h> - -#include <sys/pathname.h> #include <sys/kstr.h> #include <sys/sockio.h> #include <sys/vnode.h> #include <sys/tiuser.h> #include <net/if.h> +#include <net/if_types.h> #include <sys/cred.h> #include <rpc/rpc_rdma.h> - #include <nfs/nfs.h> -#include <sys/kstat.h> #include <sys/atomic.h> #define NFS_RDMA_PORT 2050 -extern char *inet_ntop(int, const void *, char *, int); - +/* + * Convenience structure used by rpcib_get_ib_addresses() + */ +typedef struct rpcib_ipaddrs { + void *ri_list; /* pointer to list of addresses */ + uint_t ri_count; /* number of addresses in list */ + uint_t ri_size; /* size of ri_list in bytes */ +} rpcib_ipaddrs_t; /* * Prototype declarations for driver ops */ - static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t); static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t); -static int rpcib_is_ib_interface(char *); -static int rpcib_dl_info(ldi_handle_t, dl_info_ack_t *); -static int rpcib_do_ip_ioctl(int, int, caddr_t); -static boolean_t rpcib_get_ib_addresses(struct sockaddr_in *, - struct sockaddr_in6 *, uint_t *, uint_t *); -static uint_t rpcib_get_number_interfaces(void); +static boolean_t rpcib_rdma_capable_interface(struct lifreq *); +static int rpcib_do_ip_ioctl(int, int, void *); +static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *); static int rpcib_cache_kstat_update(kstat_t *, int); static void rib_force_cleanup(void *); @@ -147,9 +142,6 @@ static struct cb_ops rpcib_cbops = { nodev /* int 
(*cb_awrite)() */ }; - - - /* * Device options */ @@ -205,8 +197,7 @@ typedef struct cache_struct { avl_node_t avl_link; } cache_avl_struct_t; - -static uint64_t rib_total_buffers = 0; +static uint64_t rib_total_buffers = 0; uint64_t cache_limit = 100 * 1024 * 1024; static volatile uint64_t cache_allocation = 0; static uint64_t cache_watermark = 80 * 1024 * 1024; @@ -409,12 +400,10 @@ rpcib_t rpcib; */ int rib_debug = 0; - int _init(void) { - int error; - int ret; + int error; error = mod_install((struct modlinkage *)&rib_modlinkage); if (error != 0) { @@ -423,11 +412,7 @@ _init(void) */ return (error); } - ret = ldi_ident_from_mod(&rib_modlinkage, &rpcib_li); - if (ret != 0) - rpcib_li = NULL; mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); - return (0); } @@ -448,7 +433,6 @@ _fini() return (status); } mutex_destroy(&plugin_state_lock); - ldi_ident_release(rpcib_li); return (0); } @@ -458,7 +442,6 @@ _info(struct modinfo *modinfop) return (mod_info(&rib_modlinkage, modinfop)); } - /* * rpcib_getinfo() * Given the device number, return the devinfo pointer or the @@ -1822,124 +1805,100 @@ refresh: rdma_stat rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca) { - struct sockaddr_in *sin4, *sin4arr; - struct sockaddr_in6 *sin6, *sin6arr; - uint_t nif, nif4, nif6, i; + uint_t i; ibt_path_info_t path; ibt_status_t ibt_status; uint8_t num_paths_p; ibt_ip_path_attr_t ipattr; ibt_ip_addr_t dstip; ibt_path_ip_src_t srcip; - + rpcib_ipaddrs_t addrs4; + rpcib_ipaddrs_t addrs6; + struct sockaddr_in *sinp; + struct sockaddr_in6 *sin6p; + rdma_stat retval = RDMA_SUCCESS; *hca = NULL; - ASSERT(raddr->buf != NULL); bzero(&path, sizeof (ibt_path_info_t)); bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); bzero(&srcip, sizeof (ibt_path_ip_src_t)); - /* Obtain the source IP addresses for the system */ - nif = rpcib_get_number_interfaces(); - sin4arr = (struct sockaddr_in *) - kmem_zalloc(sizeof (struct sockaddr_in) * nif, KM_SLEEP); - sin6arr = (struct 
sockaddr_in6 *) - kmem_zalloc(sizeof (struct sockaddr_in6) * nif, KM_SLEEP); - - (void) rpcib_get_ib_addresses(sin4arr, sin6arr, &nif4, &nif6); - - /* Are there really any IB interfaces available */ - if (nif4 == 0 && nif6 == 0) { - kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif); - kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif); - return (RDMA_FAILED); + if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || + (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { + retval = RDMA_FAILED; + goto done; } /* Prep the destination address */ switch (addr_type) { case AF_INET: - sin4 = (struct sockaddr_in *)raddr->buf; + sinp = (struct sockaddr_in *)raddr->buf; dstip.family = AF_INET; - dstip.un.ip4addr = sin4->sin_addr.s_addr; + dstip.un.ip4addr = sinp->sin_addr.s_addr; + sinp = addrs4.ri_list; - for (i = 0; i < nif4; i++) { + for (i = 0; i < addrs4.ri_count; i++) { num_paths_p = 0; ipattr.ipa_dst_ip = &dstip; ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; ipattr.ipa_ndst = 1; ipattr.ipa_max_paths = 1; ipattr.ipa_src_ip.family = dstip.family; - ipattr.ipa_src_ip.un.ip4addr = - sin4arr[i].sin_addr.s_addr; + ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr; ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, - IBT_PATH_NO_FLAGS, - &ipattr, - &path, - &num_paths_p, + IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p, &srcip); if (ibt_status == IBT_SUCCESS && num_paths_p != 0 && path.pi_hca_guid == rib_stat->hca->hca_guid) { *hca = rib_stat->hca; - - kmem_free(sin4arr, - sizeof (struct sockaddr_in) * nif); - kmem_free(sin6arr, - sizeof (struct sockaddr_in6) * nif); - - return (RDMA_SUCCESS); + goto done; } } + retval = RDMA_FAILED; break; case AF_INET6: - sin6 = (struct sockaddr_in6 *)raddr->buf; + sin6p = (struct sockaddr_in6 *)raddr->buf; dstip.family = AF_INET6; - dstip.un.ip6addr = sin6->sin6_addr; + dstip.un.ip6addr = sin6p->sin6_addr; + sin6p = addrs6.ri_list; - for (i = 0; i < nif6; i++) { + for (i = 0; i < addrs6.ri_count; i++) { num_paths_p = 0; 
ipattr.ipa_dst_ip = &dstip; ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; ipattr.ipa_ndst = 1; ipattr.ipa_max_paths = 1; ipattr.ipa_src_ip.family = dstip.family; - ipattr.ipa_src_ip.un.ip6addr = sin6arr[i].sin6_addr; + ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr; ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, - IBT_PATH_NO_FLAGS, - &ipattr, - &path, - &num_paths_p, + IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p, &srcip); if (ibt_status == IBT_SUCCESS && num_paths_p != 0 && path.pi_hca_guid == rib_stat->hca->hca_guid) { *hca = rib_stat->hca; - - kmem_free(sin4arr, - sizeof (struct sockaddr_in) * nif); - kmem_free(sin6arr, - sizeof (struct sockaddr_in6) * nif); - - return (RDMA_SUCCESS); + goto done; } } - + retval = RDMA_FAILED; break; default: - kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif); - kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif); - return (RDMA_INVAL); + retval = RDMA_INVAL; + break; } - - kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif); - kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif); - return (RDMA_FAILED); +done: + if (addrs4.ri_size > 0) + kmem_free(addrs4.ri_list, addrs4.ri_size); + if (addrs6.ri_size > 0) + kmem_free(addrs6.ri_list, addrs6.ri_size); + return (retval); } /* @@ -4668,123 +4627,31 @@ rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle) return (RDMA_SUCCESS); } - /* - * Return 0 if the interface is IB. - * Return error (>0) if any error is encountered during processing. - * Return -1 if the interface is not IB and no error. + * Check if the IP interface named by `lifrp' is RDMA-capable. 
*/ -#define isalpha(ch) (((ch) >= 'a' && (ch) <= 'z') || \ - ((ch) >= 'A' && (ch) <= 'Z')) -static int -rpcib_is_ib_interface(char *name) +static boolean_t +rpcib_rdma_capable_interface(struct lifreq *lifrp) { + char ifname[LIFNAMSIZ]; + char *cp; - char dev_path[MAXPATHLEN]; - char devname[MAXNAMELEN]; - ldi_handle_t lh; - dl_info_ack_t info; - int ret = 0; - int i; + if (lifrp->lifr_type == IFT_IB) + return (B_TRUE); /* - * ibd devices are only style 2 devices - * so we will open only style 2 devices - * by ignoring the ppa + * Strip off the logical interface portion before getting + * intimate with the name. */ + (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); + if ((cp = strchr(ifname, ':')) != NULL) + *cp = '\0'; - i = strlen(name) - 1; - while ((i >= 0) && (!isalpha(name[i]))) i--; - - if (i < 0) { - /* Invalid interface name, no alphabet */ - return (-1); - } - - (void) strncpy(devname, name, i + 1); - devname[i + 1] = '\0'; - - if (strcmp("lo", devname) == 0) { - /* - * loopback interface not rpc/rdma capable - */ - return (-1); - } - - (void) strncpy(dev_path, "/dev/", MAXPATHLEN); - if (strlcat(dev_path, devname, MAXPATHLEN) >= MAXPATHLEN) { - /* string overflow */ - return (-1); - } - - ret = ldi_open_by_name(dev_path, FREAD|FWRITE, kcred, &lh, rpcib_li); - if (ret != 0) { - return (ret); - } - ret = rpcib_dl_info(lh, &info); - (void) ldi_close(lh, FREAD|FWRITE, kcred); - if (ret != 0) { - return (ret); - } - - if (info.dl_mac_type != DL_IB) { - return (-1); - } - - return (0); + return (strcmp("lo0", ifname) == 0); } static int -rpcib_dl_info(ldi_handle_t lh, dl_info_ack_t *info) -{ - dl_info_req_t *info_req; - union DL_primitives *dl_prim; - mblk_t *mp; - k_sigset_t smask; - int error; - - if ((mp = allocb(sizeof (dl_info_req_t), BPRI_MED)) == NULL) { - return (ENOMEM); - } - - mp->b_datap->db_type = M_PROTO; - - info_req = (dl_info_req_t *)(uintptr_t)mp->b_wptr; - mp->b_wptr += sizeof (dl_info_req_t); - info_req->dl_primitive = DL_INFO_REQ; - - 
sigintr(&smask, 0); - if ((error = ldi_putmsg(lh, mp)) != 0) { - sigunintr(&smask); - return (error); - } - if ((error = ldi_getmsg(lh, &mp, (timestruc_t *)NULL)) != 0) { - sigunintr(&smask); - return (error); - } - sigunintr(&smask); - - dl_prim = (union DL_primitives *)(uintptr_t)mp->b_rptr; - switch (dl_prim->dl_primitive) { - case DL_INFO_ACK: - if (((uintptr_t)mp->b_wptr - (uintptr_t)mp->b_rptr) < - sizeof (dl_info_ack_t)) { - error = -1; - } else { - *info = *(dl_info_ack_t *)(uintptr_t)mp->b_rptr; - error = 0; - } - break; - default: - error = -1; - break; - } - - freemsg(mp); - return (error); -} -static int -rpcib_do_ip_ioctl(int cmd, int len, caddr_t arg) +rpcib_do_ip_ioctl(int cmd, int len, void *arg) { vnode_t *kvp, *vp; TIUSER *tiptr; @@ -4792,23 +4659,22 @@ rpcib_do_ip_ioctl(int cmd, int len, caddr_t arg) k_sigset_t smask; int err = 0; - if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, - &kvp) == 0) { - if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE, + if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) { + if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE, &tiptr, CRED()) == 0) { - vp = tiptr->fp->f_vnode; - } else { - VN_RELE(kvp); - return (EPROTO); + vp = tiptr->fp->f_vnode; + } else { + VN_RELE(kvp); + return (EPROTO); } } else { - return (EPROTO); + return (EPROTO); } iocb.ic_cmd = cmd; iocb.ic_timout = 0; iocb.ic_len = len; - iocb.ic_dp = arg; + iocb.ic_dp = (caddr_t)arg; sigintr(&smask, 0); err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); sigunintr(&smask); @@ -4817,65 +4683,89 @@ rpcib_do_ip_ioctl(int cmd, int len, caddr_t arg) return (err); } -static uint_t rpcib_get_number_interfaces(void) { -uint_t numifs; - if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (uint_t), (caddr_t)&numifs)) { - return (0); +/* + * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. + * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. 
+ */ +static int +rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) +{ + int err; + struct lifnum lifn; + + bzero(&lifn, sizeof (struct lifnum)); + lifn.lifn_family = AF_UNSPEC; + + err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn); + if (err != 0) + return (err); + + /* + * Pad the interface count to account for additional interfaces that + * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. + */ + lifn.lifn_count += 4; + + bzero(lifcp, sizeof (struct lifconf)); + lifcp->lifc_family = AF_UNSPEC; + lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); + lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); + + err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); + if (err != 0) { + kmem_free(lifcp->lifc_buf, *bufsizep); + return (err); } - return (numifs); + return (0); } static boolean_t -rpcib_get_ib_addresses( - struct sockaddr_in *saddr4, - struct sockaddr_in6 *saddr6, - uint_t *number4, - uint_t *number6) +rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6) { - int numifs; - struct ifconf kifc; - struct ifreq *ifr; - boolean_t ret = B_FALSE; + uint_t i, nifs; + uint_t bufsize; + struct lifconf lifc; + struct lifreq *lifrp; + struct sockaddr_in *sinp; + struct sockaddr_in6 *sin6p; - *number4 = 0; - *number6 = 0; + bzero(addrs4, sizeof (rpcib_ipaddrs_t)); + bzero(addrs6, sizeof (rpcib_ipaddrs_t)); - if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (int), (caddr_t)&numifs)) { - return (ret); + if (rpcib_do_lifconf(&lifc, &bufsize) != 0) + return (B_FALSE); + + if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) { + kmem_free(lifc.lifc_buf, bufsize); + return (B_FALSE); } - kifc.ifc_len = numifs * sizeof (struct ifreq); - kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP); + /* + * Worst case is that all of the addresses are IB-capable and have + * the same address family, so size our buffers accordingly. 
+ */ + addrs4->ri_size = nifs * sizeof (struct sockaddr_in); + addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP); + addrs6->ri_size = nifs * sizeof (struct sockaddr_in6); + addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP); - if (rpcib_do_ip_ioctl(SIOCGIFCONF, sizeof (struct ifconf), - (caddr_t)&kifc)) { - goto done; - } + for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { + if (!rpcib_rdma_capable_interface(lifrp)) + continue; - ifr = kifc.ifc_req; - for (numifs = kifc.ifc_len / sizeof (struct ifreq); - numifs > 0; numifs--, ifr++) { - struct sockaddr_in *sin4; - struct sockaddr_in6 *sin6; - - if ((rpcib_is_ib_interface(ifr->ifr_name) == 0)) { - sin4 = (struct sockaddr_in *)(uintptr_t)&ifr->ifr_addr; - sin6 = (struct sockaddr_in6 *)(uintptr_t)&ifr->ifr_addr; - if (sin4->sin_family == AF_INET) { - saddr4[*number4] = *(struct sockaddr_in *) - (uintptr_t)&ifr->ifr_addr; - *number4 = *number4 + 1; - } else if (sin6->sin6_family == AF_INET6) { - saddr6[*number6] = *(struct sockaddr_in6 *) - (uintptr_t)&ifr->ifr_addr; - *number6 = *number6 + 1; - } + if (lifrp->lifr_addr.ss_family == AF_INET) { + sinp = addrs4->ri_list; + bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++], + sizeof (struct sockaddr_in)); + } else if (lifrp->lifr_addr.ss_family == AF_INET6) { + sin6p = addrs6->ri_list; + bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++], + sizeof (struct sockaddr_in6)); } } - ret = B_TRUE; -done: - kmem_free(kifc.ifc_buf, kifc.ifc_len); - return (ret); + + kmem_free(lifc.lifc_buf, bufsize); + return (B_TRUE); } /* ARGSUSED */ diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index aa01ddeed6..9f9c95c78d 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -266,13 +266,16 @@ typedef struct dl_ipnetinfo { #define DL_OTHER 0x09 /* Any other medium not listed above */ /* * Private media types. These must be above the value 0x80000000 as - * stated in the DLPI specification. + * stated in the DLPI specification. NOTE: The SUNW_ prefix is used + * to denote synthetic DLPI types that are internal to the stack. */ #define DL_IPV4 0x80000001ul /* IPv4 Tunnel Link */ #define DL_IPV6 0x80000002ul /* IPv6 Tunnel Link */ #define SUNW_DL_VNI 0x80000003ul /* Virtual network interface */ #define DL_WIFI 0x80000004ul /* IEEE 802.11 */ #define DL_IPNET 0x80000005ul /* ipnet(7D) link */ +#define SUNW_DL_IPMP 0x80000006ul /* IPMP stub interface */ + /* * DLPI provider service supported. * These must be allowed to be bitwise-OR for dl_service_mode in diff --git a/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h b/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h index e421c0b9c0..7bb54ad12e 100644 --- a/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h +++ b/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h @@ -19,34 +19,23 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_IB_MGT_IBCM_IBCM_ARP_H #define _SYS_IB_MGT_IBCM_IBCM_ARP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif - #include <sys/ib/mgt/ibcm/ibcm_impl.h> #include <sys/modhash.h> #include <sys/ib/clients/ibd/ibd.h> #include <sys/strsun.h> -#include <sys/strsubr.h> #include <sys/socket.h> #include <sys/stat.h> /* for S_IFCHR */ -#include <inet/common.h> -#include <inet/ip.h> -#include <inet/ip_if.h> -#include <inet/ip_ire.h> -#include <inet/ip_rts.h> -#include <sys/dlpi.h> -#include <net/route.h> /* * IPoIB addr lookup completion function @@ -103,7 +92,6 @@ typedef struct ibcm_arp_streams_s { /* GID to IP-Addr and Ip-Addr to GID look-up functions. 
*/ -#define IBCM_ARP_IBD_NAME "ibd" #define IBCM_ARP_IBD_INSTANCES 4 typedef struct ibcm_arp_ip_s { diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h index 593505a426..4e3b2b5778 100644 --- a/usr/src/uts/common/sys/socket.h +++ b/usr/src/uts/common/sys/socket.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -156,12 +156,10 @@ struct so_snd_bufinfo { /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x1010 /* access rights (array of int) */ - #define SO_SECATTR 0x1011 /* socket's security attributes */ #define SCM_UCRED 0x1012 /* sender's ucred */ #define SO_TIMESTAMP 0x1013 /* socket-level timestamp option */ #define SCM_TIMESTAMP SO_TIMESTAMP /* socket control message timestamp */ - #define SO_ALLZONES 0x1014 /* bind in all zones */ #define SO_EXCLBIND 0x1015 /* exclusive binding */ @@ -203,9 +201,12 @@ struct linger { }; /* - * Level number for (get/set)sockopt() to apply to socket itself. + * Levels for (get/set)sockopt() that don't apply to a specific protocol. */ #define SOL_SOCKET 0xffff /* options for socket level */ +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) +#define SOL_ROUTE 0xfffe /* options for routing socket level */ +#endif /* * Address families. diff --git a/usr/src/uts/common/sys/sockio.h b/usr/src/uts/common/sys/sockio.h index 9e107ff3ef..0ef5394fea 100644 --- a/usr/src/uts/common/sys/sockio.h +++ b/usr/src/uts/common/sys/sockio.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -175,7 +175,7 @@ extern "C" { #define SIOCSLIFNETMASK _IOW('i', 126, struct lifreq) /* set subnetmask */ #define SIOCGLIFMETRIC _IOWR('i', 127, struct lifreq) /* get if metric */ #define SIOCSLIFMETRIC _IOW('i', 128, struct lifreq) /* set if metric */ -#define SIOCSLIFNAME _IOWR('i', 129, struct lifreq) /* set interface name */ +#define SIOCSLIFNAME _IOWR('i', 129, struct lifreq) /* set interface name */ #define SIOCGLIFNUM _IOWR('i', 130, struct lifnum) /* get number of ifs */ #define SIOCGLIFMUXID _IOWR('i', 131, struct lifreq) /* get if muxid */ #define SIOCSLIFMUXID _IOW('i', 132, struct lifreq) /* set if muxid */ @@ -223,22 +223,21 @@ extern "C" { #define SIOCLIPSECONFIG _IOW('i', 152, 0) /* List Policy */ /* - * IOCTLS for implementing load balancing and failover within IP. + * 153 can be reused (was consolidation-private SIOCLIFFAILOVER). */ -#define SIOCLIFFAILOVER _IOW('i', 153, struct lifreq) /* Failover */ -#define SIOCLIFFAILBACK _IOW('i', 154, struct lifreq) /* Failback */ -#define SIOCSLIFGROUPNAME _IOW('i', 155, struct lifreq) /* Group interfaces */ -#define SIOCGLIFGROUPNAME _IOWR('i', 156, struct lifreq) /* Get group name */ -#define SIOCGLIFOINDEX _IOWR('i', 157, struct lifreq) /* get orig if index */ /* - * Leave 158 - 160 unused; used to be SIOC*IFARP ioctls. + * IP Multipathing ioctls. */ +#define SIOCGLIFBINDING _IOWR('i', 154, struct lifreq) +#define SIOCSLIFGROUPNAME _IOW('i', 155, struct lifreq) +#define SIOCGLIFGROUPNAME _IOWR('i', 156, struct lifreq) +#define SIOCGLIFGROUPINFO _IOWR('i', 157, struct lifgroupinfo) /* - * IOCTL for implementing load balancing and failover within IP. + * Leave 158 - 160 unused; used to be SIOC*IFARP ioctls. + * However, 161 can be reused (was consolidation-private SIOCSLIFOINDEX). */ -#define SIOCSLIFOINDEX _IOWR('i', 161, struct lifreq) /* set orig if index */ /* * IOCTLS which provide an interface to the IPv6 address selection policy. 
@@ -309,10 +308,8 @@ extern "C" { #define SIOCSIPMSFILTER _IOW('i', 181, 0) /* - * IOCTL for implementing "disable FAILBACK" IPMP configuration. + * 182 can be reused (was consolidation-private SIOCSIPMPFAILBACK). */ -#define SIOCSIPMPFAILBACK _IOW('i', 182, int) /* enable/disable */ - /* FAILBACK */ #define SIOCSENABLESDP _IOWR('i', 183, int) /* Enable SDP */ diff --git a/usr/src/uts/common/sys/sysevent/eventdefs.h b/usr/src/uts/common/sys/sysevent/eventdefs.h index ac21686e84..dcf36f748c 100644 --- a/usr/src/uts/common/sys/sysevent/eventdefs.h +++ b/usr/src/uts/common/sys/sysevent/eventdefs.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -179,6 +179,8 @@ extern "C" { /* Interface within an IPMP group has changed state or type */ #define ESC_IPMP_IF_CHANGE "ESC_ipmp_if_change" +/* IPMP probe has changed state */ +#define ESC_IPMP_PROBE_STATE "ESC_ipmp_probe_state" /* * EC_DEV_ADD and EC_DEV_REMOVE subclass definitions - supporting attributes diff --git a/usr/src/uts/common/sys/sysevent/ipmp.h b/usr/src/uts/common/sys/sysevent/ipmp.h index 137fa918cd..ba39a5bb2b 100644 --- a/usr/src/uts/common/sys/sysevent/ipmp.h +++ b/usr/src/uts/common/sys/sysevent/ipmp.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,16 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SYSEVENT_IPMP_H #define _SYS_SYSEVENT_IPMP_H -#pragma ident "%Z%%M% %I% %E% SMI" - - /* * IPMP sysevent definitions. Note that all of these definitions are * Sun-private and are subject to change at any time. @@ -39,13 +35,18 @@ extern "C" { #endif +/* + * Event channel associated with these events + */ +#define IPMP_EVENT_CHAN "com.sun:ipmp:events" /* * Event type EC_IPMP/ESC_IPMP_GROUP_STATE event schema * * Event Class - EC_IPMP * Event Sub-Class - ESC_IPMP_GROUP_STATE - * Event Publisher - SUNW:usr:in.mpathd + * Event Vendor - com.sun + * Event Publisher - in.mpathd * * Attribute Name - IPMP_EVENT_VERSION * Attribute Type - SE_DATA_TYPE_UINT32 @@ -70,18 +71,20 @@ extern "C" { #define IPMP_GROUP_STATE "ipmp_group_state" typedef enum { - IPMP_GROUP_OK, /* at least one interface in group is ok */ - IPMP_GROUP_FAILED /* all interfaces in the group have failed */ + IPMP_GROUP_OK, /* all interfaces in the group are ok */ + IPMP_GROUP_FAILED, /* all interfaces in the group are unusable */ + IPMP_GROUP_DEGRADED /* some interfaces in the group are unusable */ } ipmp_group_state_t; -#define IPMP_EVENT_CUR_VERSION 1 +#define IPMP_EVENT_CUR_VERSION 2 /* * Event type EC_IPMP/ESC_IPMP_GROUP_CHANGE event schema * * Event Class - EC_IPMP * Event Sub-Class - ESC_IPMP_GROUP_CHANGE - * Event Publisher - SUNW:usr:in.mpathd + * Event Vendor - com.sun + * Event Publisher - in.mpathd * * Attribute Name - IPMP_GROUP_NAME * Attribute Type - SE_DATA_TYPE_STRING @@ -113,7 +116,8 @@ typedef enum { * * Event Class - EC_IPMP * Event Sub-Class - ESC_IPMP_GROUP_MEMBER_CHANGE - * Event Publisher - SUNW:usr:in.mpathd + * Event Vendor - com.sun + * Event Publisher - in.mpathd * * Attribute Name - IPMP_GROUP_NAME * Attribute Type - SE_DATA_TYPE_STRING @@ -171,7 +175,8 @@ typedef enum { * * Event Class - EC_IPMP * Event Sub-Class - ESC_IPMP_IF_CHANGE - * Event Publisher - SUNW:usr:in.mpathd + * Event Vendor - com.sun + * Event 
Publisher - in.mpathd * * Attribute Name - IPMP_GROUP_NAME * Attribute Type - SE_DATA_TYPE_STRING @@ -198,6 +203,75 @@ typedef enum { * Attribute Value - <if-type> */ +#define IPMP_PROBE_ID "ipmp_probe_id" +#define IPMP_PROBE_STATE "ipmp_probe_state" +#define IPMP_PROBE_START_TIME "ipmp_probe_start_time" +#define IPMP_PROBE_SENT_TIME "ipmp_probe_sent_time" +#define IPMP_PROBE_ACKRECV_TIME "ipmp_probe_ackrecv_time" +#define IPMP_PROBE_ACKPROC_TIME "ipmp_probe_ackproc_time" +#define IPMP_PROBE_TARGET "ipmp_probe_target" +#define IPMP_PROBE_TARGET_RTTAVG "ipmp_probe_target_rttavg" +#define IPMP_PROBE_TARGET_RTTDEV "ipmp_probe_target_rttdev" + +typedef enum { + IPMP_PROBE_SENT, /* the probe has been sent */ + IPMP_PROBE_ACKED, /* the probe has been acked */ + IPMP_PROBE_LOST /* the probe has been lost */ +} ipmp_probe_state_t; + +/* + * Event type EC_IPMP/ESC_IPMP_PROBE_STATE event schema + * + * Event Class - EC_IPMP + * Event Sub-Class - ESC_IPMP_PROBE_STATE + * Event Vendor - com.sun + * Event Publisher - in.mpathd + * + * Attribute Name - IPMP_PROBE_ID + * Attribute Type - SE_DATA_TYPE_UINT32 + * Attribute Value - <probe-id> + * + * Attribute Name - IPMP_EVENT_VERSION + * Attribute Type - SE_DATA_TYPE_UINT32 + * Attribute Value - <version> + * + * Attribute Name - IPMP_IF_NAME + * Attribute Type - SE_DATA_TYPE_STRING + * Attribute Value - <if-name> + * + * Attribute Name - IPMP_PROBE_STATE + * Attribute Type - SE_DATA_TYPE_UINT32 + * Attribute Value - <probe-state> + * + * Attribute Name - IPMP_PROBE_START_TIME + * Attribute Type - SE_DATA_TYPE_TIME + * Attribute Value - <probe-start-time> + * + * Attribute Name - IPMP_PROBE_SENT_TIME + * Attribute Type - SE_DATA_TYPE_TIME + * Attribute Value - <probe-sent-time> + * + * Attribute Name - IPMP_PROBE_ACKRECV_TIME + * Attribute Type - SE_DATA_TYPE_TIME + * Attribute Value - <probe-ackrecv-time> + * + * Attribute Name - IPMP_PROBE_ACKPROC_TIME + * Attribute Type - SE_DATA_TYPE_TIME + * Attribute Value - 
<probe-ackproc-time> + * + * Attribute Name - IPMP_PROBE_TARGET + * Attribute Type - SE_DATA_TYPE_BYTES + * Attribute Value - <probe-target-ip> + * + * Attribute Name - IPMP_PROBE_TARGET_RTTAVG + * Attribute Type - SE_DATA_TYPE_UINT32 + * Attribute Value - <probe-target-rttavg> + * + * Attribute Name - IPMP_PROBE_TARGET_RTTDEV + * Attribute Type - SE_DATA_TYPE_UINT32 + * Attribute Value - <probe-target-rttdev> + */ + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/intel/Makefile.intel.shared b/usr/src/uts/intel/Makefile.intel.shared index 9585034efb..b25c2fb0cc 100644 --- a/usr/src/uts/intel/Makefile.intel.shared +++ b/usr/src/uts/intel/Makefile.intel.shared @@ -216,6 +216,7 @@ DRV_KMODS += cryptoadm DRV_KMODS += dda DRV_KMODS += devinfo DRV_KMODS += dld +DRV_KMODS += dlpistub DRV_KMODS += dmd DRV_KMODS_32 += dnet DRV_KMODS += dump @@ -321,7 +322,6 @@ DRV_KMODS += udp6 DRV_KMODS += ucode DRV_KMODS += ural DRV_KMODS += vgatext -DRV_KMODS += vni DRV_KMODS += vnic DRV_KMODS += vscan DRV_KMODS += wc diff --git a/usr/src/uts/intel/vni/Makefile b/usr/src/uts/intel/dlpistub/Makefile index aa32704615..53cf2092a7 100644 --- a/usr/src/uts/intel/vni/Makefile +++ b/usr/src/uts/intel/dlpistub/Makefile @@ -18,18 +18,11 @@ # # CDDL HEADER END # -# -# uts/intel/vni/Makefile -# -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# -# This makefile drives the production of the vni streams kernel -# module. -# -# intel architecture dependent +# This makefile drives the production of the dlpistub STREAMS module. +# intel architecture dependent # # @@ -40,11 +33,11 @@ UTSBASE = ../.. # # Define the module and object file sets. 
# -MODULE = vni -OBJECTS = $(VNI_OBJS:%=$(OBJS_DIR)/%) -LINTS = $(VNI_OBJS:%.o=$(LINTS_DIR)/%.ln) +MODULE = dlpistub +OBJECTS = $(DLPISTUB_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(DLPISTUB_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) -CONF_SRCDIR = $(UTSBASE)/common/inet/vni +CONF_SRCDIR = $(UTSBASE)/common/inet/dlpistub # # Include common rules. diff --git a/usr/src/uts/intel/ip/ip.global-objs.debug64 b/usr/src/uts/intel/ip/ip.global-objs.debug64 index 3972f1b4ec..d89224677b 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.debug64 +++ b/usr/src/uts/intel/ip/ip.global-objs.debug64 @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -161,6 +161,9 @@ ipinfov4 ipinfov6 iplrinit iplwinit +ipmp_aract_template +ipmp_ardeact_template +ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache diff --git a/usr/src/uts/intel/ip/ip.global-objs.obj64 b/usr/src/uts/intel/ip/ip.global-objs.obj64 index f6a97be29b..0e58fdc219 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.obj64 +++ b/usr/src/uts/intel/ip/ip.global-objs.obj64 @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# @@ -160,6 +160,9 @@ ipinfov4 ipinfov6 iplrinit iplwinit +ipmp_aract_template +ipmp_ardeact_template +ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache diff --git a/usr/src/uts/intel/os/name_to_major b/usr/src/uts/intel/os/name_to_major index 3d58c314b7..eb70695abd 100644 --- a/usr/src/uts/intel/os/name_to_major +++ b/usr/src/uts/intel/os/name_to_major @@ -102,7 +102,7 @@ kmdb 171 sctp 172 sctp6 173 scsi_vhci 174 -vni 175 +dlpistub 175 cpuid 176 bmc 177 dld 178 diff --git a/usr/src/uts/sparc/Makefile.sparc.shared b/usr/src/uts/sparc/Makefile.sparc.shared index 3723be6f32..39fba551aa 100644 --- a/usr/src/uts/sparc/Makefile.sparc.shared +++ b/usr/src/uts/sparc/Makefile.sparc.shared @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This makefile contains the common definitions for all sparc @@ -216,7 +216,8 @@ DRV_KMODS += ippctl sctp sctp6 DRV_KMODS += dld DRV_KMODS += ipf DRV_KMODS += rpcib -DRV_KMODS += vni vnic +DRV_KMODS += dlpistub +DRV_KMODS += vnic DRV_KMODS += xge DRV_KMODS += rds DRV_KMODS += chxge diff --git a/usr/src/uts/sparc/vni/Makefile b/usr/src/uts/sparc/dlpistub/Makefile index 6a96edc17e..548361738a 100644 --- a/usr/src/uts/sparc/vni/Makefile +++ b/usr/src/uts/sparc/dlpistub/Makefile @@ -18,18 +18,11 @@ # # CDDL HEADER END # -# -# uts/sparc/vni/Makefile -# -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# -# This makefile drives the production of the vni streams kernel -# module. -# -# sparc architecture dependent +# This makefile drives the production of the dlpistub STREAMS module. +# sparc architecture dependent # # @@ -40,11 +33,11 @@ UTSBASE = ../.. # # Define the module and object file sets. 
# -MODULE = vni -OBJECTS = $(VNI_OBJS:%=$(OBJS_DIR)/%) -LINTS = $(VNI_OBJS:%.o=$(LINTS_DIR)/%.ln) +MODULE = dlpistub +OBJECTS = $(DLPISTUB_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(DLPISTUB_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) -CONF_SRCDIR = $(UTSBASE)/common/inet/vni +CONF_SRCDIR = $(UTSBASE)/common/inet/dlpistub # # Include common rules. diff --git a/usr/src/uts/sparc/ip/ip.global-objs.debug64 b/usr/src/uts/sparc/ip/ip.global-objs.debug64 index 279bd92d0b..6606b472bf 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.debug64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.debug64 @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -161,6 +161,9 @@ ipinfov4 ipinfov6 iplrinit iplwinit +ipmp_aract_template +ipmp_ardeact_template +ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache diff --git a/usr/src/uts/sparc/ip/ip.global-objs.obj64 b/usr/src/uts/sparc/ip/ip.global-objs.obj64 index 4f4bc3e376..89d40afbbb 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.obj64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.obj64 @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -160,6 +160,9 @@ ipinfov4 ipinfov6 iplrinit iplwinit +ipmp_aract_template +ipmp_ardeact_template +ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache diff --git a/usr/src/uts/sparc/os/name_to_major b/usr/src/uts/sparc/os/name_to_major index ff58cf5113..9702d00ad7 100644 --- a/usr/src/uts/sparc/os/name_to_major +++ b/usr/src/uts/sparc/os/name_to_major @@ -182,7 +182,7 @@ pic16f819 233 kmdb 234 sctp 235 sctp6 236 -vni 237 +dlpistub 237 cpuid 238 did 239 ntwdt 240 |